In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ramil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ramil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
from nltk.stem import PorterStemmer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [3]:
# load data
df = pd.read_csv('labeledTrainData.tsv', sep='\t', encoding='utf-8')

In [4]:
df.to_dict('records')[0]

{'id': '5814_8',
 'sentiment': 1,
 'review': "With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />Th

In [5]:
texts = df.review.tolist()
sentiment = df.sentiment.tolist()

In [6]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import math

def pmiCalc(classProb, allProb):
    if classProb == 0 or allProb == 0: 
        return 0
    return math.log10(classProb / allProb)

cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')

def cleanhtml(text):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', text)
    return cleantext

def word_tokenize(text):
    tokenizer = RegexpTokenizer(r'\w+')
    clean = tokenizer.tokenize(text)
    return clean
   
def stop_words(tokenize_words):
    stopWords = set(stopwords.words('english'))
    wordsFiltered = []
    for w in tokenize_words:
        if w not in stopWords:
            wordsFiltered.append(w.lower())
    return wordsFiltered


def preprocess_text(text):
    #clean html 
    clean = cleanhtml(text)
    
    # tokenize and remove punctuation 
    tokenize = word_tokenize(clean)
    
    # stop words and lower
    stop = stop_words(tokenize)
    
    return stop


def space_tokenize(text):
    # a_b_c -> ['a', 'b', 'c']
    split_array = text.split(' ');
    return split_array
    
    
def get_vocab(texts, rare = False, percent = 10):
    # unique words
    all_word_dict = {}
    
    all_words_set = set()
    for text in texts: 
#       all_words = space_tokenize(text)
        
        all_words = preprocess_text(text)
        for word in all_words: 
            wd = all_word_dict.get(word)
            if wd == None:
                all_word_dict.update([(word, 1)])
            else:
                all_word_dict.update([(word, wd+1)])
                
        words_set = all_words_set.update(all_words)
    all_sorted = {k: v for k, v in sorted(all_word_dict.items(), key=lambda item: item[1])}
    
    words_amount = len(all_sorted)
    delta_word_amount = round(words_amount / percent)
    
    
    words = []
    for k, v in all_sorted.items():
        words.append([k, v])
      
    print("all: ", len(words), "persent: ", percent)
#     remove rare and popular 
    without_rare_and_popular = words[delta_word_amount: len(words) - delta_word_amount]
    print("without_rare_and_popular", len(without_rare_and_popular))

    return dict(without_rare_and_popular)






class BOWencoder:
    def __init__(self, vocab=None, tokenize=preprocess_text):
        self.vocab = vocab
        self.vocab2idx = self.get_vocab2idx(vocab)
        self.tokenize = tokenize
        
    def encode_single_text(self, text):
        """
            text -> vector
        """
#         words = space_tokenize(text)
        words = self.tokenize(text)
        vector = [0 for i in range(len(self.vocab))]
        
        for text_word in words:
            if self.vocab2idx.get(text_word) != None:
                vector[self.vocab2idx.get(text_word)] = 1   
        return vector

        
    def encode_texts(self, texts):
        """
            multiple text
        """
        vector_text_data = []
        for text in texts:
            vector_text_data.append(self.encode_single_text(text))
        return vector_text_data
        
    def get_vocab2idx(self, vocab):
        """ 
           id -> word dict
        """
        dictV = {word: idx for idx, word in zip(range(len(vocab)), vocab)}
        return dictV

In [7]:
class NaiveBayesClassifier:
    def __init__(self, n_classes):
        self.n_classes = n_classes
        self.class_probs = {}
        self.conditional_probabilities = {}
        self.pmi = {}
        self.idf = []

    def fit(self, X, y):
        """
            X - matrix
            y - result
        """
#         pmi 
        # class_probs
        y_len = len(y)
        positive_amount = sum(y)
        positive = positive_amount / y_len
        negative = 1 - positive
        self.class_probs.update([(0, negative), (1, positive)])
        print(positive, negative)
        # conditional_probabilities
        arr_len = len(X[0])
        pos_f_in_neg = [0 for i in range(arr_len)]
        pos_f_in_pos = [0 for i in range(arr_len)]
        pos_f_in_all = [0 for i in range(arr_len)]
        for row, y_idx in zip(X, range(y_len)):
            for item_idx in range(len(row)):
                pos_f_in_all[item_idx] += row[item_idx]
                if y[y_idx] == 0:
                    pos_f_in_neg[item_idx] += row[item_idx]
#                     pos_f_in_neg[item_idx] += 1
                else:
                    pos_f_in_pos[item_idx] += row[item_idx]
#                     pos_f_in_pos[item_idx] += 1
                    
        pos_f_in_negative = pos_f_in_neg
        pos_f_in_positive = pos_f_in_pos
        pos_f_in_all_words = pos_f_in_all
#         print(pos_f_in_positive, pos_f_in_negative)
        
        for idx in range(len(pos_f_in_neg)):
            pos_f_in_neg[idx] /= (y_len - positive_amount)
            pos_f_in_pos[idx] /= (positive_amount)
            pos_f_in_all[idx] /= y_len
        self.conditional_probabilities.update([(0, pos_f_in_neg), (1, pos_f_in_pos)])
        
#         pmi
        neg_pmi = []
        pos_pmi = []
        for idx in range(len(pos_f_in_neg)):
            neg_pmi.append({idx: pmiCalc(pos_f_in_neg[idx], pos_f_in_all[idx])})
            pos_pmi.append({idx: pmiCalc(pos_f_in_pos[idx], pos_f_in_all[idx])})
        
        self.pmi = dict([(0, neg_pmi), (1, pos_pmi)])
#         print(self.pmi)
#         print(self.conditional_probabilities)

#         idf
        idf_temp = []
        for idx in range(len(pos_f_in_neg)):
            idf_temp.append(math.log10(y_len/(pos_f_in_negative[idx]+pos_f_in_positive[idx])))
#         print(idf_temp)



        
    def predict(self, X):
        result_array = []
        f = open('a.txt', 'w')
        for row in X:
            positive_value = self.class_probs.get(1)
            negative_value = self.class_probs.get(0)
            f.write(str(positive_value) + " " +  str(negative_value) + "\n")
            for item_idx in range(len(row)):
                
                if row[item_idx]:
                    if self.conditional_probabilities.get(1)[item_idx] != 0.0:
                        positive_value *= self.conditional_probabilities.get(1)[item_idx] 
                    if self.conditional_probabilities.get(0)[item_idx] != 0.0:
                        negative_value *= self.conditional_probabilities.get(0)[item_idx] 
#                 else:
#                     positive_value *= (1 - self.conditional_probabilities.get(1)[item_idx])
#                     negative_value *= (1 - self.conditional_probabilities.get(0)[item_idx])
                f.write(str(positive_value) + " " +  str(negative_value) + " " + str(row[item_idx]) + " "+ str(self.conditional_probabilities.get(1)[item_idx])+ " "+ str(self.conditional_probabilities.get(0)[item_idx]) + "\n")
            
            result_array.append(1 if positive_value > negative_value else 0)
            f.write("---\n")
        return result_array


In [8]:
amount = 1000
X_train, X_test, y_train, y_test = train_test_split(texts[:amount], sentiment[:amount])
train_texts = X_train
# vocab = get_vocab(X_train)
# vocab.update(get_vocab(X_test[:amount]))
# pmi_vocab = get_pmi_vocab(vocab, X_train, y_train);
# bow_encoder = BOWencoder(vocab=vocab)
# print(vocab)
# X_train = bow_encoder.encode_texts(X_train)
# X_test = bow_encoder.encode_texts(X_test)
# print(len(X_train), len(y_train))

In [10]:
model = NaiveBayesClassifier(n_classes=2)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(y_pred)
print(y_test[:amount])

0.484 0.516


TypeError: unsupported operand type(s) for +=: 'int' and 'str'

In [11]:
# print(classification_report(y_pred, y_test[:amount]))

In [12]:
# clean text
import re
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer

cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')

def cleanhtml(text):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', text)
    return cleantext

def word_tokenize(text):
    tokenizer = RegexpTokenizer(r'\w+')#fix
    clean = tokenizer.tokenize(text)
    return clean
   
def stop_words(tokenize_words):
    stopWords = set(stopwords.words('english'))
    wordsFiltered = []
    for w in tokenize_words:
        if w not in stopWords:
            wordsFiltered.append(w.lower())
    return wordsFiltered


def stem(text_arr):
    ps = PorterStemmer()
    stem_arr = []
    for word in text_arr:
        stem_arr.append(ps.stem(word))
    return stem_arr

def preprocess_text(text):
    #clean html 
    clean = cleanhtml(text)
    
    # tokenize and remove punctuation 
    tokenize = word_tokenize(clean)
    
    # stop words
    stop = stop_words(tokenize)
    
    # stem 
    stem_arr = stem(stop)
    
    return stem_arr


def preprocess_texts(texts):
    tmp_set = set()
    for text in texts:
        arr = preprocess_text(text) 
        tmp_set.update(arr)
    return tmp_set

print(preprocess_text(df.to_dict('records')[0]['review']))

['with', 'stuff', 'go', 'moment', 'mj', 'start', 'listen', 'music', 'watch', 'odd', 'documentari', 'watch', 'the', 'wiz', 'watch', 'moonwalk', 'mayb', 'want', 'get', 'certain', 'insight', 'guy', 'thought', 'realli', 'cool', 'eighti', 'mayb', 'make', 'mind', 'whether', 'guilti', 'innoc', 'moonwalk', 'part', 'biographi', 'part', 'featur', 'film', 'rememb', 'go', 'see', 'cinema', 'origin', 'releas', 'some', 'subtl', 'messag', 'mj', 'feel', 'toward', 'press', 'also', 'obviou', 'messag', 'drug', 'bad', 'kay', 'visual', 'impress', 'cours', 'michael', 'jackson', 'unless', 'remot', 'like', 'mj', 'anyway', 'go', 'hate', 'find', 'bore', 'some', 'may', 'call', 'mj', 'egotist', 'consent', 'make', 'movi', 'but', 'mj', 'fan', 'would', 'say', 'made', 'fan', 'true', 'realli', 'nice', 'the', 'actual', 'featur', 'film', 'bit', 'final', 'start', '20', 'minut', 'exclud', 'smooth', 'crimin', 'sequenc', 'joe', 'pesci', 'convinc', 'psychopath', 'power', 'drug', 'lord', 'whi', 'want', 'mj', 'dead', 'bad', 'be

In [13]:
TP = 0
TN = 0
FP = 0
FN = 0

for i in range(len(y_pred)):
    if y_test[i] == y_pred[i] and y_test[i] == 1:
        TP += 1
    if y_test[i] == y_pred[i] and y_test[i] == 0:
        TN += 1
    if y_test[i] != y_pred[i] and y_test[i] == 1:
        FN += 1
    if y_test[i] != y_pred[i] and y_test[i] == 0:
        FP += 1

precision_1 = TP / (TP + FP) if (TP + FP) > 0 else 0
precision_0 = TN / (FN + TN) if (FN + TN) > 0 else 0
recall_1 = TP / (TP + FN) if (TP + FN) > 0 else 0
recall_0 = TN / (TN + FP) if (TN + FP) > 0 else 0

print("precision: ", precision_1, precision_0)
print("recall: ", recall_1, recall_0)

f_measure_1 = 2 * precision_1 * recall_1 / (precision_1 + recall_1) if (precision_1 + recall_1) > 0 else 0
f_measure_0 = 2 * precision_0 * recall_0 / (precision_0 + recall_0) if (precision_0 + recall_0) > 0 else 0

print("F: ", f_measure_1, f_measure_0)

balanced_accuracy = 1/ 2 * (TP / ((TP+FN) + TN / (TN+FP)))
print("balanced_accuracy: ", balanced_accuracy)

NameError: name 'y_pred' is not defined

In [14]:
# after 

import math

def pmi(classProb, allProb):
    return math.log10(classProb / allProb)

pmi(1/2, 2/3)


-0.12493873660829995

In [15]:
# tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
# print(train_texts)
corpus = train_texts

vocab_tf = preprocess_texts(corpus)

tfidf = TfidfVectorizer(stop_words = 'english', vocabulary=vocab_tf)
X = tfidf.fit_transform(corpus)
X_dense = X.todense()

# print(tfidf.get_feature_names())
X_test_transform = tfidf.transform(X_test)
X_test_transform_dense = X_test_transform.todense()
    


In [16]:
# LinearRegression
from sklearn.linear_model import LogisticRegression
import numpy as np

temp = []
# x = np.array([5, 15, 25, 35, 45, 55]).reshape((-1, 1))
# y = np.array([5, 20, 14, 32, 22, 38])

# for row in X_train:
#     temp.append([row])
# arr = np.array(temp)
# reg = LinearRegression().fit(x, y)
# r_sq = reg.score(x, y)
# print('coefficient of determination:', r_sq)

clf = LogisticRegression(random_state=0).fit(X_dense, y_train)
pred = clf.predict(X_test_transform_dense)
print(classification_report(pred, y_test[:amount]))

              precision    recall  f1-score   support

           0       0.74      0.71      0.73       136
           1       0.67      0.70      0.69       114

    accuracy                           0.71       250
   macro avg       0.71      0.71      0.71       250
weighted avg       0.71      0.71      0.71       250



In [17]:
# DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

dclf = DecisionTreeClassifier(random_state=0)

dclf.fit(X_dense, y_train)      # Use fit method on the train data

dtc_pred = dclf.predict(X_test_transform_dense)   # Predict the target class of test data
print(classification_report(dtc_pred, y_test[:amount])) 

              precision    recall  f1-score   support

           0       0.64      0.64      0.64       131
           1       0.61      0.61      0.61       119

    accuracy                           0.62       250
   macro avg       0.62      0.62      0.62       250
weighted avg       0.62      0.62      0.62       250



In [18]:
# KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier


neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_dense, y_train)
neigh_pred = neigh.predict(X_test_transform_dense)
print(classification_report(neigh_pred, y_test[:amount]))

              precision    recall  f1-score   support

           0       0.64      0.65      0.65       129
           1       0.62      0.61      0.62       121

    accuracy                           0.63       250
   macro avg       0.63      0.63      0.63       250
weighted avg       0.63      0.63      0.63       250



In [19]:
neigh4 = KNeighborsClassifier(n_neighbors=4)
neigh4.fit(X_dense, y_train)
neigh_pred4 = neigh.predict(X_test_transform_dense)
print(classification_report(neigh_pred4, y_test[:amount]))

              precision    recall  f1-score   support

           0       0.64      0.65      0.65       129
           1       0.62      0.61      0.62       121

    accuracy                           0.63       250
   macro avg       0.63      0.63      0.63       250
weighted avg       0.63      0.63      0.63       250



In [20]:
# KMeans
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

clusters = range(2,10)

for n_clusters in clusters:
    clusterer = KMeans(n_clusters=n_clusters)
    preds = clusterer.fit_predict(X_dense)
    centers = clusterer.cluster_centers_

    score = silhouette_score(X_dense, preds)
    print("n clusters = {}, silhouette score is {})".format(n_clusters, score))


n clusters = 2, silhouette score is 0.001082290848147776)
n clusters = 3, silhouette score is 0.002111464647337573)
n clusters = 4, silhouette score is 0.0013670162761225426)
n clusters = 5, silhouette score is 0.0007808571662161179)
n clusters = 6, silhouette score is 0.0013833071246397003)
n clusters = 7, silhouette score is 0.000584318311731445)
n clusters = 8, silhouette score is 0.001582752481063855)
n clusters = 9, silhouette score is 0.0011253640559257972)


In [21]:
from sklearn.cluster import AgglomerativeClustering    

clusters = range(2,15)

for n_clusters in clusters:
    clusterer = AgglomerativeClustering(n_clusters=n_clusters, linkage='average')
    preds = clusterer.fit_predict(X_dense)

    score = silhouette_score(X_dense, preds)
    print("n clusters = {}, silhouette score is {})".format(n_clusters, score))



n clusters = 2, silhouette score is 0.012089099341894212)
n clusters = 3, silhouette score is 0.008015915180398023)
n clusters = 4, silhouette score is 0.005399846404261093)
n clusters = 5, silhouette score is 0.003625392808416664)
n clusters = 6, silhouette score is 0.002875469549989265)
n clusters = 7, silhouette score is 0.002520163433484808)
n clusters = 8, silhouette score is 0.001365765847717215)
n clusters = 9, silhouette score is 0.0010971044877066919)
n clusters = 10, silhouette score is 0.0009436853901055807)
n clusters = 11, silhouette score is 0.0005209377121339101)
n clusters = 12, silhouette score is 9.346444603566743e-06)
n clusters = 13, silhouette score is -0.0007078377508032418)
n clusters = 14, silhouette score is -0.0008103203460560886)


In [22]:
max_clusterer = KMeans(n_clusters=2)
max_preds = max_clusterer.fit_predict(X_dense)
max_centers = max_clusterer.cluster_centers_
print(max_centers)
print(len(max_centers[0]))

[[ 0.00000000e+00  8.01253341e-04  1.42444805e-03 ...  1.47421436e-03
   1.35525272e-20 -2.43945489e-19]
 [ 0.00000000e+00 -3.25260652e-19  1.17373139e-03 ... -8.67361738e-19
   1.68241264e-04  2.35084258e-04]]
11520


In [23]:
from sklearn.metrics import pairwise_distances_argmin_min
closest, _ = pairwise_distances_argmin_min(max_centers, X_dense)
print(closest)

[143 170]


In [24]:
# vocab_tf contains vocab
# max_preds pred
# corpus - texts 
clean_texts = []
for text_idx in range(len(corpus)):
    clean_texts.append({'words': preprocess_text(corpus[text_idx]), 'ans': max_preds[text_idx]})

In [25]:
pos_amount = sum(max_preds)
neg_amount = len(max_preds) - pos_amount
print(pos_amount, neg_amount, len(max_preds))
vocab_tf_list = list(vocab_tf)
vocab_tf_list_range = range(len(vocab_tf_list))
pos_amount_words = [0 for i in vocab_tf_list_range]
neg_amount_words = [0 for i in vocab_tf_list_range]
all_amount_words = [0 for i in vocab_tf_list_range]
for vocab_word_idx in vocab_tf_list_range:
    for text in clean_texts:
        if(vocab_tf_list[vocab_word_idx] in text['words']):
            all_amount_words[vocab_word_idx] += 1
            if text['ans'] == 1:
                pos_amount_words[vocab_word_idx] += 1
            else: 
                neg_amount_words[vocab_word_idx] += 1

473 277 750


In [26]:
neg_pmi = []
pos_pmi = []
for idx in vocab_tf_list_range:
    pos_pmi_calc = pmiCalc(pos_amount_words[idx], all_amount_words[idx])
    if pos_pmi_calc != 0:
        pos_pmi.append({'idx': idx, 'pmi': pos_pmi_calc})  
    
    neg_pmi_calc = pmiCalc(neg_amount_words[idx], all_amount_words[idx])
    if neg_pmi_calc != 0:
        neg_pmi.append({'idx': idx, 'pmi': neg_pmi_calc})  
            

In [27]:
sorted_pos_pmi = sorted(pos_pmi, key = lambda x : x['pmi'], reverse = True)
sorted_neg_pmi = sorted(neg_pmi, key = lambda x : x['pmi'], reverse = True)

top_amount = 10
print(f'Top {top_amount} positive words')
for word in range(top_amount):
    print(vocab_tf_list[sorted_pos_pmi[word]['idx']])
print('------------')
print(f'Top {top_amount} nagetive words')
for word in range(top_amount):
    print(vocab_tf_list[sorted_neg_pmi[word]['idx']])
    

Top 10 positive words
u
pair
hill
accid
fellow
correct
thrown
camp
gari
disgust
------------
Top 10 nagetive words
epic
credibl
elev
frankli
admittedli
desert
detract
ingeni
lumet
sincer


In [28]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
import numpy as np

n = range(100, 250, 10)

temp = []

clf = LogisticRegression(random_state=0).fit(X_dense, y_train)
pred = clf.predict(X_test_transform_dense)
print(classification_report(pred, y_test[:amount]))

for n_components in n:
    pca = PCA(n_components=n_components)
    pca.fit(X_dense)
    ans = pca.transform(X_dense)
    
    pca.fit(X_test_transform_dense)
    ans_test = pca.transform(X_test_transform_dense)
    
    
    clf = LogisticRegression(random_state=0).fit(ans, y_train)
    pred = clf.predict(ans_test)
    report = classification_report(pred, y_test[:amount], output_dict=True)

    print(f'n clusters = {n_components}, aaccuracy = {report["accuracy"]}')

              precision    recall  f1-score   support

           0       0.74      0.71      0.73       136
           1       0.67      0.70      0.69       114

    accuracy                           0.71       250
   macro avg       0.71      0.71      0.71       250
weighted avg       0.71      0.71      0.71       250

n clusters = 100, aaccuracy = 0.52
n clusters = 110, aaccuracy = 0.516
n clusters = 120, aaccuracy = 0.568
n clusters = 130, aaccuracy = 0.528
n clusters = 140, aaccuracy = 0.532
n clusters = 150, aaccuracy = 0.544
n clusters = 160, aaccuracy = 0.564
n clusters = 170, aaccuracy = 0.564
n clusters = 180, aaccuracy = 0.54
n clusters = 190, aaccuracy = 0.52
n clusters = 200, aaccuracy = 0.548
n clusters = 210, aaccuracy = 0.512
n clusters = 220, aaccuracy = 0.552
n clusters = 230, aaccuracy = 0.536
n clusters = 240, aaccuracy = 0.552


In [29]:

import gensim
from bs4 import BeautifulSoup
import re, string
clean = [ ]

for doc in texts:
    x = doc.lower()                     
    x = BeautifulSoup(x, 'lxml').text   
    x = re.sub('[^A-Za-z0-9]+', ' ', x)
    x = x.split(' ')
    clean.append(x)

In [30]:
print(clean[0])

['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again', 'maybe', 'i', 'just', 'want', 'to', 'get', 'a', 'certain', 'insight', 'into', 'this', 'guy', 'who', 'i', 'thought', 'was', 'really', 'cool', 'in', 'the', 'eighties', 'just', 'to', 'maybe', 'make', 'up', 'my', 'mind', 'whether', 'he', 'is', 'guilty', 'or', 'innocent', 'moonwalker', 'is', 'part', 'biography', 'part', 'feature', 'film', 'which', 'i', 'remember', 'going', 'to', 'see', 'at', 'the', 'cinema', 'when', 'it', 'was', 'originally', 'released', 'some', 'of', 'it', 'has', 'subtle', 'messages', 'about', 'mj', 's', 'feeling', 'towards', 'the', 'press', 'and', 'also', 'the', 'obvious', 'message', 'of', 'drugs', 'are', 'bad', 'm', 'kay', 'visually', 'impressive', 'but', 'of', 'course', 'this', 'is', 'all', 'about', 

In [31]:
model = gensim.models.Word2Vec(sentences=clean, vector_size=200, window=10, min_count=10, negative=15)

In [32]:
market = model.wv.most_similar('market')
print("market", market)
print()

street = model.wv.most_similar('street')
print("street", street)
print()

game = model.wv.most_similar('game')
print("game", game)
print()

lesson = model.wv.most_similar('lesson')
print("lesson", lesson)
print()


market [('usa', 0.6520488858222961), ('studios', 0.6254586577415466), ('cinemas', 0.6106382012367249), ('national', 0.5976017117500305), ('access', 0.5929945111274719), ('europe', 0.5917080044746399), ('uk', 0.588087260723114), ('distributor', 0.5803756713867188), ('theaters', 0.5784987211227417), ('stations', 0.5773553252220154)]

street [('elm', 0.7049148082733154), ('beach', 0.7016922235488892), ('bus', 0.684274435043335), ('streets', 0.6769227385520935), ('park', 0.675114095211029), ('bike', 0.6698469519615173), ('nightmare', 0.6573826670646667), ('roof', 0.6552714705467224), ('pickup', 0.6508170962333679), ('bench', 0.6490457653999329)]

game [('games', 0.6782853603363037), ('football', 0.5202775597572327), ('clip', 0.45584455132484436), ('basketball', 0.42370274662971497), ('store', 0.42357170581817627), ('baseball', 0.4216800928115845), ('stores', 0.4211229383945465), ('player', 0.4094279110431671), ('video', 0.4052484333515167), ('missions', 0.4023422300815582)]

lesson [('stat

In [33]:
from sklearn.neighbors import KDTree

# pca = PCA(n_components=100)
# pca.fit(X_dense)
# ans = pca.transform(X_dense)
print(X_dense[:10])
tree = KDTree(X_dense)
dist, ind = tree.query(X_dense[:10], k=3) 
print(dist)
print(ind)
    
    

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0.         1.33241903 1.33377324]
 [0.         1.25177617 1.27739943]
 [0.         1.24333221 1.27485093]
 [0.         1.27514949 1.28733204]
 [0.         1.29498475 1.30467611]
 [0.         1.29417285 1.31284273]
 [0.         1.2835654  1.30435275]
 [0.         1.28982187 1.33307602]
 [0.         1.29705226 1.30535572]
 [0.         1.3070475  1.31370258]]
[[  0 729  74]
 [  1 746  23]
 [  2 648 665]
 [  3 660 563]
 [  4 174 478]
 [  5 180 583]
 [  6 387  23]
 [  7 464 301]
 [  8 593 549]
 [  9 353 593]]


In [34]:
def preprocessing(corpus):
    # initialize
    clean_text = []

    for row in corpus:
        # tokenize
        tokens = nltk.tokenize.word_tokenize(row)
        # lowercase
        tokens = [token.lower() for token in tokens]
        # isword
        tokens = [token for token in tokens if token.isalpha()]
        clean_sentence = ''
        clean_sentence = ' '.join(token for token in tokens)
        clean_text.append(clean_sentence)
        
    return clean_text
    
all_text = preprocessing(texts)

In [35]:
from sklearn.feature_extraction.text import CountVectorizer
# Convert a collection of text documents to a matrix of token counts
cv = CountVectorizer(ngram_range=(1,1), stop_words = 'english')
# matrix of token counts
X = cv.fit_transform(all_text)
Xc = (X.T * X) # matrix manipulation
Xc.setdiag(0) # set the diagonals to be zeroes as it's pointless to be 1

In [47]:
names = cv.get_feature_names()
print(names.index('market'))

37953


In [57]:
from sklearn.decomposition import TruncatedSVD
clf = TruncatedSVD(100)
Xpca = clf.fit_transform(Xc)
pca = PCA(n_components=100)
pca.fit(Xc)
ans = pca.transform(Xpca)

TypeError: PCA does not support sparse input. See TruncatedSVD for a possible alternative.

In [45]:
print(len(ans[0]))

100


In [51]:
tree = KDTree(ans)

In [52]:
market = names.index('market')
dist, ind = tree.query([ans[market]], k=5) 
print(ind)
for i in ind:
    for j in i:
        print(names[j])

[[37953 46630 67941 48588 51353]]
market
placed
welcome
provided
reputation


In [53]:
street = names.index('street')
dist, ind = tree.query([ans[street]], k=5) 
print(ind)
for i in ind:
    for j in i:
        print(names[j])

[[59429  8267 18064 51947 23115]]
street
business
dream
rich
following


In [54]:
game = names.index('game')
dist, ind = tree.query([ans[game]], k=5) 
print(ind)
for i in ind:
    for j in i:
        print(names[j])

[[24365 22279 27170 52991 68735]]
game
fight
hand
run
wo


In [55]:
lesson = names.index('lesson')
dist, ind = tree.query([ans[lesson]], k=5) 
print(ind)
for i in ind:
    for j in i:
        print(names[j])

[[35526 29019 60507 45126 22571]]
lesson
hopes
survive
passed
fits
