In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ramil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ramil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
from nltk.stem import PorterStemmer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [3]:
# load data
df = pd.read_csv('labeledTrainData.tsv', sep='\t', encoding='utf-8')

In [4]:
df.to_dict('records')[0]

{'id': '5814_8',
 'sentiment': 1,
 'review': "With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />Th

In [5]:
texts = df.review.tolist()
sentiment = df.sentiment.tolist()

In [6]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import math

def pmiCalc(classProb, allProb):
    if classProb == 0 or allProb == 0: 
        return 0
    return math.log10(classProb / allProb)

cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')

def cleanhtml(text):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', text)
    return cleantext

def word_tokenize(text):
    tokenizer = RegexpTokenizer(r'\w+')
    clean = tokenizer.tokenize(text)
    return clean
   
def stop_words(tokenize_words):
    stopWords = set(stopwords.words('english'))
    wordsFiltered = []
    for w in tokenize_words:
        if w not in stopWords:
            wordsFiltered.append(w.lower())
    return wordsFiltered


def preprocess_text(text):
    #clean html 
    clean = cleanhtml(text)
    
    # tokenize and remove punctuation 
    tokenize = word_tokenize(clean)
    
    # stop words and lower
    stop = stop_words(tokenize)
    
    return stop


def space_tokenize(text):
    # a_b_c -> ['a', 'b', 'c']
    split_array = text.split(' ');
    return split_array
    
    
def get_vocab(texts, rare = False, percent = 10):
    # unique words
    all_word_dict = {}
    
    all_words_set = set()
    for text in texts: 
#       all_words = space_tokenize(text)
        
        all_words = preprocess_text(text)
        for word in all_words: 
            wd = all_word_dict.get(word)
            if wd == None:
                all_word_dict.update([(word, 1)])
            else:
                all_word_dict.update([(word, wd+1)])
                
        words_set = all_words_set.update(all_words)
    all_sorted = {k: v for k, v in sorted(all_word_dict.items(), key=lambda item: item[1])}
    
    words_amount = len(all_sorted)
    delta_word_amount = round(words_amount / percent)
    
    
    words = []
    for k, v in all_sorted.items():
        words.append([k, v])
      
    print("all: ", len(words), "persent: ", percent)
#     remove rare and popular 
    without_rare_and_popular = words[delta_word_amount: len(words) - delta_word_amount]
    print("without_rare_and_popular", len(without_rare_and_popular))

    return dict(without_rare_and_popular)






class BOWencoder:
    def __init__(self, vocab=None, tokenize=preprocess_text):
        self.vocab = vocab
        self.vocab2idx = self.get_vocab2idx(vocab)
        self.tokenize = tokenize
        
    def encode_single_text(self, text):
        """
            text -> vector
        """
#         words = space_tokenize(text)
        words = self.tokenize(text)
        vector = [0 for i in range(len(self.vocab))]
        
        for text_word in words:
            if self.vocab2idx.get(text_word) != None:
                vector[self.vocab2idx.get(text_word)] = 1   
        return vector

        
    def encode_texts(self, texts):
        """
            multiple text
        """
        vector_text_data = []
        for text in texts:
            vector_text_data.append(self.encode_single_text(text))
        return vector_text_data
        
    def get_vocab2idx(self, vocab):
        """ 
           id -> word dict
        """
        dictV = {word: idx for idx, word in zip(range(len(vocab)), vocab)}
        return dictV

In [12]:
class NaiveBayesClassifier:
    def __init__(self, n_classes):
        self.n_classes = n_classes
        self.class_probs = {}
        self.conditional_probabilities = {}
        self.pmi = {}
        self.idf = []

    def fit(self, X, y):
        """
            X - matrix
            y - result
        """
#         pmi 
        # class_probs
        y_len = len(y)
        positive_amount = sum(y)
        positive = positive_amount / y_len
        negative = 1 - positive
        self.class_probs.update([(0, negative), (1, positive)])
        print(positive, negative)
        # conditional_probabilities
        arr_len = len(X[0])
        pos_f_in_neg = [0 for i in range(arr_len)]
        pos_f_in_pos = [0 for i in range(arr_len)]
        pos_f_in_all = [0 for i in range(arr_len)]
        for row, y_idx in zip(X, range(y_len)):
            for item_idx in range(len(row)):
                pos_f_in_all[item_idx] += row[item_idx]
                if y[y_idx] == 0:
                    pos_f_in_neg[item_idx] += row[item_idx]
                else:
                    pos_f_in_pos[item_idx] += row[item_idx]
                    
        pos_f_in_negative = pos_f_in_neg
        pos_f_in_positive = pos_f_in_pos
        pos_f_in_all_words = pos_f_in_all
        
        for idx in range(len(pos_f_in_neg)):
            pos_f_in_neg[idx] /= (y_len - positive_amount)
            pos_f_in_pos[idx] /= (positive_amount)
            pos_f_in_all[idx] /= y_len
        self.conditional_probabilities.update([(0, pos_f_in_neg), (1, pos_f_in_pos)])
        
#         pmi
        neg_pmi = []
        pos_pmi = []
        for idx in range(len(pos_f_in_neg)):
            neg_pmi.append({idx: pmiCalc(pos_f_in_neg[idx], pos_f_in_all[idx])})
            pos_pmi.append({idx: pmiCalc(pos_f_in_pos[idx], pos_f_in_all[idx])})
        
        self.pmi = dict([(0, neg_pmi), (1, pos_pmi)])

#         idf
        idf_temp = []
        for idx in range(len(pos_f_in_neg)):
            temp = pos_f_in_negative[idx]+pos_f_in_positive[idx]
            if temp != 0:
                idf_temp.append(math.log10(y_len/(temp)))


        
    def predict(self, X):
        result_array = []
        f = open('a.txt', 'w')
        for row in X:
            positive_value = self.class_probs.get(1)
            negative_value = self.class_probs.get(0)
            f.write(str(positive_value) + " " +  str(negative_value) + "\n")
            for item_idx in range(len(row)):
                
                if row[item_idx]:
                    if self.conditional_probabilities.get(1)[item_idx] != 0.0:
                        positive_value *= self.conditional_probabilities.get(1)[item_idx] 
                    if self.conditional_probabilities.get(0)[item_idx] != 0.0:
                        negative_value *= self.conditional_probabilities.get(0)[item_idx] 
#                 else:
#                     positive_value *= (1 - self.conditional_probabilities.get(1)[item_idx])
#                     negative_value *= (1 - self.conditional_probabilities.get(0)[item_idx])
                f.write(str(positive_value) + " " +  str(negative_value) + " " + str(row[item_idx]) + " "+ str(self.conditional_probabilities.get(1)[item_idx])+ " "+ str(self.conditional_probabilities.get(0)[item_idx]) + "\n")
            
            result_array.append(1 if positive_value > negative_value else 0)
            f.write("---\n")
        return result_array


In [19]:
amount = 1000
X_train, X_test, y_train, y_test = train_test_split(texts[:amount], sentiment[:amount])
train_texts = X_train
vocab = get_vocab(X_train)
vocab.update(get_vocab(X_test[:amount]))
# pmi_vocab = get_pmi_vocab(vocab, X_train, y_train);
bow_encoder = BOWencoder(vocab=vocab)
print(f'len vocab: {len(vocab)}')
X_train = bow_encoder.encode_texts(X_train)
X_test = bow_encoder.encode_texts(X_test)
print(len(X_train), len(y_train))

all:  11330 persent:  10
without_rare_and_popular 9064
all:  6466 persent:  10
without_rare_and_popular 5172
len vocab: 11189
750 750


In [13]:
model = NaiveBayesClassifier(n_classes=2)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(y_pred)
print(y_test[:amount])

0.492 0.508
[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0]
[0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1,

In [14]:
print(classification_report(y_pred, y_test[:amount]))

              precision    recall  f1-score   support

           0       0.45      0.42      0.43       146
           1       0.25      0.27      0.26       104

    accuracy                           0.36       250
   macro avg       0.35      0.34      0.34       250
weighted avg       0.36      0.36      0.36       250



In [15]:
# clean text
import re
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer

cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')

def cleanhtml(text):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', text)
    return cleantext

def word_tokenize(text):
    tokenizer = RegexpTokenizer(r'\w+')#fix
    clean = tokenizer.tokenize(text)
    return clean
   
def stop_words(tokenize_words):
    stopWords = set(stopwords.words('english'))
    wordsFiltered = []
    for w in tokenize_words:
        if w not in stopWords:
            wordsFiltered.append(w.lower())
    return wordsFiltered


def stem(text_arr):
    ps = PorterStemmer()
    stem_arr = []
    for word in text_arr:
        stem_arr.append(ps.stem(word))
    return stem_arr

def preprocess_text(text):
    #clean html 
    clean = cleanhtml(text)
    
    # tokenize and remove punctuation 
    tokenize = word_tokenize(clean)
    
    # stop words
    stop = stop_words(tokenize)
    
    # stem 
    stem_arr = stem(stop)
    
    return stem_arr


def preprocess_texts(texts):
    tmp_set = set()
    for text in texts:
        arr = preprocess_text(text) 
        tmp_set.update(arr)
    return tmp_set

print(preprocess_text(df.to_dict('records')[0]['review']))

['with', 'stuff', 'go', 'moment', 'mj', 'start', 'listen', 'music', 'watch', 'odd', 'documentari', 'watch', 'the', 'wiz', 'watch', 'moonwalk', 'mayb', 'want', 'get', 'certain', 'insight', 'guy', 'thought', 'realli', 'cool', 'eighti', 'mayb', 'make', 'mind', 'whether', 'guilti', 'innoc', 'moonwalk', 'part', 'biographi', 'part', 'featur', 'film', 'rememb', 'go', 'see', 'cinema', 'origin', 'releas', 'some', 'subtl', 'messag', 'mj', 'feel', 'toward', 'press', 'also', 'obviou', 'messag', 'drug', 'bad', 'kay', 'visual', 'impress', 'cours', 'michael', 'jackson', 'unless', 'remot', 'like', 'mj', 'anyway', 'go', 'hate', 'find', 'bore', 'some', 'may', 'call', 'mj', 'egotist', 'consent', 'make', 'movi', 'but', 'mj', 'fan', 'would', 'say', 'made', 'fan', 'true', 'realli', 'nice', 'the', 'actual', 'featur', 'film', 'bit', 'final', 'start', '20', 'minut', 'exclud', 'smooth', 'crimin', 'sequenc', 'joe', 'pesci', 'convinc', 'psychopath', 'power', 'drug', 'lord', 'whi', 'want', 'mj', 'dead', 'bad', 'be

In [16]:
TP = 0
TN = 0
FP = 0
FN = 0

for i in range(len(y_pred)):
    if y_test[i] == y_pred[i] and y_test[i] == 1:
        TP += 1
    if y_test[i] == y_pred[i] and y_test[i] == 0:
        TN += 1
    if y_test[i] != y_pred[i] and y_test[i] == 1:
        FN += 1
    if y_test[i] != y_pred[i] and y_test[i] == 0:
        FP += 1

precision_1 = TP / (TP + FP) if (TP + FP) > 0 else 0
precision_0 = TN / (FN + TN) if (FN + TN) > 0 else 0
recall_1 = TP / (TP + FN) if (TP + FN) > 0 else 0
recall_0 = TN / (TN + FP) if (TN + FP) > 0 else 0

print("precision: ", precision_1, precision_0)
print("recall: ", recall_1, recall_0)

f_measure_1 = 2 * precision_1 * recall_1 / (precision_1 + recall_1) if (precision_1 + recall_1) > 0 else 0
f_measure_0 = 2 * precision_0 * recall_0 / (precision_0 + recall_0) if (precision_0 + recall_0) > 0 else 0

print("F: ", f_measure_1, f_measure_0)

balanced_accuracy = 1/ 2 * (TP / ((TP+FN) + TN / (TN+FP)))
print("balanced_accuracy: ", balanced_accuracy)

precision:  0.2692307692307692 0.4178082191780822
recall:  0.24778761061946902 0.44525547445255476
F:  0.25806451612903225 0.43109540636042404
balanced_accuracy:  0.12340754085703257
