In [2]:
import spacy
nlp = spacy.load('en_core_web_lg')

In [4]:
nlp(u'The quick brown fox jumped.').vector.shape

(300,)

In [5]:
nlp(u'fox').vector.shape

(300,)

In [6]:
tokens = nlp(u'lion cat pet')

In [8]:
for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))

lion lion 1.0
lion cat 0.52654374
lion pet 0.39923766
cat lion 0.52654374
cat cat 1.0
cat pet 0.7505456
pet lion 0.39923766
pet cat 0.7505456
pet pet 1.0


In [10]:
nlp.vocab.vectors.shape

(684831, 300)

In [9]:
tokens = nlp(u"dog cat nargle")
for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

dog True 7.0336733 False
cat True 6.6808186 False
nargle False 0.0 True


In [11]:
from scipy import spatial

cosine_similarity = lambda vec1, vec2: 1 - spatial.distance.cosine(vec1, vec2)

In [12]:
king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector

In [13]:
new_vector = king - man + woman

In [16]:
computed_similarities = []

# for every word in my vocabulary
for word in nlp.vocab:  
    if word.has_vector:
        if word.is_lower:
            if word.is_alpha:                                              # check if all character are a-z letters
                similarity = cosine_similarity(new_vector, word.vector)
                computed_similarities.append((word, similarity))

In [17]:
computed_similarities = sorted(computed_similarities, key=lambda item:-item[1])     # sort by descending order

In [19]:
print([t[0].text for t in computed_similarities[:10]])

['king', 'queen', 'prince', 'kings', 'princess', 'royal', 'throne', 'queens', 'monarch', 'kingdom']


In [20]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\chamin\AppData\Roaming\nltk_data...


True

In [21]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()



In [22]:
a = "This is a good movie"
sid.polarity_scores(a)

{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}

In [23]:
a = "This was the best, most awesome movie EVER MADE!!!"

In [24]:
sid.polarity_scores(a)

{'neg': 0.0, 'neu': 0.425, 'pos': 0.575, 'compound': 0.8877}

In [25]:
a = "This was the WORST movie that has ever disgraced the screen."
sid.polarity_scores(a)

{'neg': 0.465, 'neu': 0.535, 'pos': 0.0, 'compound': -0.8331}

In [26]:
import pandas as pd

In [27]:
%pwd

'C:\\Users\\chamin\\Documents\\NLP'

In [31]:
df = pd.read_csv('..\\NLP\\UPDATED_NLP_COURSE\\TextFiles\\amazonreviews.tsv', sep='\t')

In [32]:
df['label'].value_counts()

neg    5097
pos    4903
Name: label, dtype: int64

In [33]:
df.dropna(inplace=True)

In [35]:
for index, label, review in df.itertuples():
    if type(review) == str:
        if review.isspace():
            blanks.append(index)

In [38]:
df.iloc[0]['review']

'Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'

In [37]:
sid.polarity_scores(df.iloc[0]['review'])

{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'compound': 0.9454}

In [39]:
df['scores'] = df['review'].apply(lambda review: sid.polarity_scores(review))

In [41]:
df.head()

Unnamed: 0,label,review,scores
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co..."
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co..."
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com..."
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com..."
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp..."


In [42]:
df['compound'] = df['scores'].apply(lambda d: d['compound'])

In [43]:
df.head()

Unnamed: 0,label,review,scores,compound
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781


In [44]:
df['compound_score'] = df['compound'].apply(lambda score: 'pos' if score >=0 else 'neg')

In [45]:
df.head()

Unnamed: 0,label,review,scores,compound,compound_score
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454,pos
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957,pos
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858,pos
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814,pos
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781,pos


In [49]:
from sklearn.metrics import accuracy_score, classification_report

In [55]:
accuracy_score(df['label'], df['compound_score'])

0.7091

In [56]:
print(classification_report(df['label'], df['compound_score']))

              precision    recall  f1-score   support

         neg       0.86      0.51      0.64      5097
         pos       0.64      0.91      0.75      4903

   micro avg       0.71      0.71      0.71     10000
   macro avg       0.75      0.71      0.70     10000
weighted avg       0.75      0.71      0.70     10000



In [58]:
movie = pd.read_csv('..\\NLP\\UPDATED_NLP_COURSE\\TextFiles\\moviereviews.tsv', sep='\t')

In [59]:
movie.dropna(inplace=True)

blanks = []
for index, label, review in movie.itertuples():
    if type(review) == str:
        if review.isspace():
            blanks.append(index)
            
len(blanks)

27

In [60]:
movie.drop(blanks, inplace=True)

In [61]:
movie['label'].value_counts()

neg    969
pos    969
Name: label, dtype: int64

In [62]:
movie['scores'] = movie['review'].apply(lambda review: sid.polarity_scores(review))

In [64]:
movie['compound'] = movie['scores'].apply(lambda d: d['compound'])

In [65]:
movie['compound_score'] = movie['compound'].apply(lambda score: 'pos' if score >=0 else 'neg')

In [67]:
movie.head()

Unnamed: 0,label,review,scores,compound,compound_score
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125,neg
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618,neg
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.067, 'neu': 0.783, 'pos': 0.15, 'com...",0.9953,pos
3,pos,according to hollywood movies made in last few...,"{'neg': 0.069, 'neu': 0.786, 'pos': 0.145, 'co...",0.9972,pos
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.09, 'neu': 0.822, 'pos': 0.088, 'com...",-0.7264,neg


In [68]:
accuracy_score(movie['label'], movie['compound_score'])

0.6367389060887513

In [69]:
print(classification_report(df['label'], df['compound_score']))

              precision    recall  f1-score   support

         neg       0.86      0.51      0.64      5097
         pos       0.64      0.91      0.75      4903

   micro avg       0.71      0.71      0.71     10000
   macro avg       0.75      0.71      0.70     10000
weighted avg       0.75      0.71      0.70     10000

