In [None]:
import gensim
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
from stemmer import Stemmer

In [None]:
from gensim.test.utils import datapath
from gensim import utils
import pandas as pd

class MyCorpus(object):
    """This class implements an interator that yields sentences (lists of str) required by gensim. It also performs some preprocessing like: stemming, removing stop words, splittin text to words and finally converting it to numpy arrays for further processing."""
    
    stop_list = ['.', ',', '"', ')', '(', ':', ';', '?', '!', '\'', '–', '-', '{', '}', '\xa0']
    
    def __init__(self):
        self.df = pd.read_csv("data/short_reviews_bert_10_1_50.csv")
        self.df.drop_duplicates(inplace=True)
        self.stemmer = Stemmer("root_dict.json")
        
        self.df_copy = self.df.copy()
        self.df_copy['content'] = self.df_copy['content'].apply(self.sanitize_text)
        self.X = self.df_copy['content'].to_numpy()
        self.y = self.df_copy['rating'].to_numpy()
        
    def __iter__(self):
        for row in self.X:
            yield row.split(" ")       
        
    def sanitize_text(self, text):
        temp = text.lower()
        for element in MyCorpus.stop_list:
            temp = temp.replace(element, " ")

        for j in range(10):
            temp = temp.replace("  ", " ")

        temp = temp.split(" ")
        stemmed_text = ""
        for word in temp:
            found = self.stemmer.find(word) 
            if found:
                stemmed_text += found.replace(" ","") #sometimes word has space in the end
            else:
                stemmed_text += word
            stemmed_text += " "

        return stemmed_text[:-1]
        
    
#     def __iter__(self):
#         corpus_path = datapath('lee_background.cor')
#         for line in open(corpus_path):
#             # assume there's one document per line, tokens separated by whitespace
#             yield utils.simple_preprocess(line)

In [None]:
c = MyCorpus()
#create instance of class

In [None]:
#check if iterator is working properly
for i in c:
    print(i)
    break

In [None]:
model = gensim.models.Word2Vec(sentences=c, min_count=30, size=300, workers=12, window=5)
# create gensim Word2Vec and train it with our data
# parameters:
# min_count - minimal count of occurences in whole dataset to be considered as valid input (we wanted to remove some outliers)
# size - number of dimensions that data words will be represented in
# window - window size for Word2Vec algorithm

In [None]:
from collections import defaultdict
X = []
y = c.y

index_to_word = defaultdict(str)

#converts output from gensim to list, for further processing and store index_to_word from gensim notation
for i, word in enumerate(model.wv.vocab):
    X.append(model.wv[word])
    index_to_word[i] = word



In [None]:
#convert from list to numpy array
X = np.array(X)

In [None]:
print(X.shape)

In [None]:
len(c.df)

In [None]:
from sklearn.neighbors import NearestNeighbors
#create and fit NearestNeighbors model to find N nearest friends in 300 dimensional space of words from Word2Vec
N = 10
nn = NearestNeighbors(n_neighbors = N)
nn.fit(X)

In [None]:
#these words "przeciętniak" and "pokochać" are taken from FinalAnalysis notebook where we found that these are propably highly emotional. They have very high and very low average rating respectively. The idea is that instead of manually picking positive and negative words, we picke one for each of these groups and then with Word2Vec and KNN we will find groups of words that are positive or negative. This way we can later perform sentiment analysis without need to manually search for best words representing positive/negaitve sentiment.

search1 = np.array(model.wv['gówno'])
search2 = np.array(model.wv['widowisko'])
search1 = search1.reshape(1, -1)
search2 = search2.reshape(1, -1)

dist1, out1 = nn.kneighbors(search1)
dist2, out2 = nn.kneighbors(search2)

In [None]:
print("negatywne===============================================")
negative = []
for i, index in enumerate(out1[0]):
    print(index_to_word[index], dist1[0][i])
    negative.append(index_to_word[index])

    
print("pozytywne===============================================")
positive = []
for i, index in enumerate(out2[0]):
    print(index_to_word[index], dist2[0][i])
    positive.append(index_to_word[index])

In [None]:
np.linalg.norm(np.array(model.wv['gówno']) - np.array(model.wv['widowisko']))

#it can be seen here that on average negative words have Euclidean distance to "przeciętniak" about 1.1, and positive to "pokochać" about 1.75

In [76]:
pd.options.display.max_colwidth = 1500
chosen_y = []

#store in lists representations of positive and negative words
positive_representations = [model.wv[x].reshape(1, -1)[0] for x in positive]
negative_representations = [model.wv[x].reshape(1, -1)[0] for x in negative]


def find_closest(word):
    '''Function calculating sum of distances to positive words and to negative words and normalizing it to N.'''
    try:
        w1 = np.array(model.wv[word])
    except:
        return None
    pos_avg = 0
    neg_avg = 0
    mins = np.linalg.norm(w1 - positive_representations[0])
    for vec in positive_representations[1:]:
         pos_avg += np.linalg.norm(w1 - vec)
        
    for vec in negative_representations:
        neg_avg -= np.linalg.norm(w1 - vec)


    return 1/(pos_avg/N) + 1/(neg_avg/N)

outcome = []
#for first 5000 comments calculate output
for i, comment in enumerate(tqdm(c.X[:5000])):
    sums = 0
    for word in comment.split(" "):
        out = find_closest(word)
        if out is None: continue
        sums += out
    outcome.append([c.df.iloc[i]['content'], c.df.iloc[i]['rating'], sums/len(comment.split(" "))])


100%|██████████| 5000/5000 [00:17<00:00, 280.67it/s]


In [80]:
df_out = pd.DataFrame(outcome)

In [78]:
# 20 most positive comments by our algorithm
df_out.sort_values(by=2, ascending=False).head(20)

Unnamed: 0,0,1,2
498,"Natalie- rewelacja, Jean Reno- rewelacja, Gary Oldman- wow, wow, wow ....",10,0.061332
4428,Zwłaszcza Russell Crowe i Jennifer Connelly zagrali znakomicie.,9,0.050307
2733,"Marlon Brando zmiażdżył Pacino palcem. Koniec, kropka.",10,0.046059
4986,"Genialny, ponadczasowy, przezabawny. Jak myślicie???",10,0.044811
3124,"Przesympatyczna, przezabawna i szalona komedia, polecam. No i ten Bradley Cooper ech ...",7,0.044448
601,"Mistrz Gary Oldman,równie dobry Jean Reno,i ta fantastyczna mała wtedy jeszcze Natalie Portman... niesamowity film...i tyle.",10,0.044368
1015,Ta psychodeliczna muza i scena z Bufallo Billem - niezapomniana,10,0.044112
3138,wybuchowa dawka śmiechu :)))zdjęcia na końcu- bezcenne ;P,8,0.043943
1988,"Film jest rewelacyjny, nieziemskie trio: Norton, Pitt, Bonham Carter!!! Warto przeczytać także książkę!",10,0.043488
556,"!1!Gary Oldman-""Leon""!2!Heat h Ledger-""The Dark Knight""!3!Anthony Perkins-""Psycho""",7,0.041759


In [82]:
# 50 most negative comments by our algorithm
df_out.sort_values(by=2, ascending=True).iloc[0:50]

Unnamed: 0,0,1,2
1783,"Do cholery, dlaczego nie ma wersji bluray z polskimi napisami ???? Polscy dystrybutorzy wolą wydawać szmiry na błękitnych krążkach ale o takim filmie ""zapomnieli"". Żenada!!!!",9,-0.001507
3654,cuuuuuuuuudoooooooooooooooo!!! !!!!kochamgoooooooooo,9,0.0
4291,ten film nie zasluzyl na zadne wyróznienia!! jest nudny i przewidywalny jak zupa fasolowa. Stracilem czas ogladaja to indyjskie badziewie,2,0.000258
3168,"NIe pojmuje tych ocen, toz to szmira pierwszego kalibru, ani razu sie nie usmiechnelam, ani ja ani nikt z mojej rodziny...",3,0.000391
4853,"proponuję nie odpisywać na jego żałosne prowokacje, niech jego tematy spadną na sam dół",8,0.000811
1669,shrek to moja współlokatorka też jest zielona i śmierdzi gównem,9,0.001136
2263,"nudny,jakies pierdoły opowiadaja, lepiej obejrzec cos mądrego na przykład wiadomosci w tvp pierdół nie bede słuchał kilka ciekawych scen było 2/10 daje",2,0.001475
4998,Sexmisji może najwyżej podskoczyć jeszcze MIś ewentualnie...,9,0.002346
999,"no i jednak miałem racje ci kretyni z administracji wszystko kasują uprawnienia itd ,przypadkiem takiego obłąkanego kolesia jest socrer_fw ... patrze po innych jego wypowiedziach tam użytkownikom oceny znikają tylko temu debilowi nie.HAHAHAHA ale sie filmweb kompromituje !!!!!!!!!!!!!! sądzicie że ludzie nie potrafia żyć bez filmwebu żeby robić takie jajca ? admini filmweba to jacyś durnie.Wszystkim ...",10,0.002494
4109,Ich habe eine Pistole auf Ihre Eier gerichtet seit Sie hier sitzen.Brylant aktorski!,9,0.002715
