In [3]:
import spacy

In [4]:
nlp = spacy.load('en_core_web_md')

In [10]:
nlp(u'The quick brown fox jumped').vector.shape

(384,)

In [6]:
nlp(u'fox').vector.shape

(300,)

In [7]:
tokens = nlp(u'like love hate')

In [8]:
for token1 in tokens:
    for token2 in tokens:
        print(token1.text,token2.text,token1.similarity(token2))

like like 1.0
like love 0.65790397
like hate 0.6574652
love like 0.65790397
love love 1.0
love hate 0.6393099
hate like 0.6574652
hate love 0.6393099
hate hate 1.0


In [12]:
len(nlp.vocab.vectors.shape)

2

In [10]:
tokens = nlp(u'cat dog nargle')

In [13]:
for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

cat True 6.6808186 False
dog True 7.0336733 False
nargle False 0.0 True


In [14]:
from scipy import spatial

In [15]:
cosine_similarity = lambda vec1, vec2: 1 - spatial.distance.cosine(vec1, vec2)

In [16]:
king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector

In [17]:
new_vector = king-man+woman

In [19]:
computed_similarities = []
for word in nlp.vocab:
    if word.has_vector:
        if word.is_lower:
            if word.is_alpha:
                similarity = cosine_similarity(new_vector,word.vector)
                computed_similarities.append((word,similarity))

In [20]:
computed_similarities = sorted(computed_similarities, key=lambda item:-item[1])

In [21]:
print([t[0].text for t in computed_similarities[:10]])

['king', 'queen', 'commoner', 'highness', 'prince', 'sultan', 'maharajas', 'princes', 'kumbia', 'kings']


In [22]:
import nltk

In [23]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...


True

In [24]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [25]:
sid = SentimentIntensityAnalyzer()

In [26]:
a = "This is a good movie"

In [27]:
sid.polarity_scores(a)

{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}

In [28]:
a = 'This was the best, most aweome movie EVER MADE!!!'

In [29]:
sid.polarity_scores(a)

{'neg': 0.0, 'neu': 0.612, 'pos': 0.388, 'compound': 0.7249}

In [30]:
a = "Worst movie ever"

In [31]:
sid.polarity_scores(a)

{'neg': 0.672, 'neu': 0.328, 'pos': 0.0, 'compound': -0.6249}

In [32]:
a = "What a movie"

In [33]:
sid.polarity_scores(a)

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [34]:
import pandas as pd

In [37]:
df = pd.read_csv('C:\\Users\\adity\\OneDrive\\Desktop\\NLP\\UPDATED-NLP-COURSE\\UPDATED_NLP_COURSE\\TextFiles\\amazonreviews.tsv', sep='\t')

In [38]:
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [39]:
df['label'].value_counts()

neg    5097
pos    4903
Name: label, dtype: int64

In [40]:
df.dropna(inplace=True)

In [45]:
blanks = []
for i,lb,rv in df.itertuples():
    if type(rv) == str:
        if rv.isspace():
            blanks.append(i)

In [50]:
df.drop(blanks, inplace = True)

In [51]:
df.iolc[0]['review']

AttributeError: 'DataFrame' object has no attribute 'iolc'

In [52]:
sid.polarity_scores(df.iloc[0]['review'])

{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'compound': 0.9454}

In [53]:
df['scores'] = df['review'].apply(lambda review: sid.polarity_scores(review))

In [54]:
df.head()

Unnamed: 0,label,review,scores
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co..."
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co..."
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com..."
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com..."
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp..."


In [56]:
df['compound'] = df['scores'].apply(lambda d:d['compound'])

In [59]:
df.head()

Unnamed: 0,label,review,scores,compound
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781


In [60]:
import numpy as np

In [62]:
df['compound'] = np.where(df['compound'] > 0, 'pos', 'neg')

In [63]:
df.head()

Unnamed: 0,label,review,scores,compound
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",pos
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",pos
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",pos
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",pos
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",pos


In [64]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [65]:
accuracy_score(df['label'], df['compound'])

0.7122

In [66]:
print(classification_report(df['label'], df['compound']))

              precision    recall  f1-score   support

         neg       0.85      0.53      0.65      5097
         pos       0.65      0.90      0.75      4903

   micro avg       0.71      0.71      0.71     10000
   macro avg       0.75      0.72      0.70     10000
weighted avg       0.75      0.71      0.70     10000



In [67]:
print(confusion_matrix(df['label'], df['compound']))

[[2709 2388]
 [ 490 4413]]


In [69]:
df = pd.read_csv("C://Users//adity//OneDrive//Desktop//NLP//UPDATED-NLP-COURSE//UPDATED_NLP_COURSE//TextFiles/moviereviews.tsv", sep = '\t')

In [70]:
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [71]:
df.dropna(inplace = True)

In [72]:
blanks = []
for i,lb,rv in df.itertuples():
    if type(rv) == str:
        if rv.isspace():
            blanks.append(i)

In [74]:
df.drop(blanks, inplace=True)

In [75]:
df['label'].value_counts()

neg    969
pos    969
Name: label, dtype: int64

In [76]:
sid = SentimentIntensityAnalyzer()

In [78]:
df['scores'] = df['review'].apply(lambda review:sid.polarity_scores(review))

In [79]:
df['compound'] = df['scores'].apply(lambda d:d['compound'])

In [80]:
df['compound'] = np.where(df['compound'] > 0, 'pos', 'neg')

In [81]:
df.head()

Unnamed: 0,label,review,scores,compound
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",neg
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",neg
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.067, 'neu': 0.783, 'pos': 0.15, 'com...",pos
3,pos,according to hollywood movies made in last few...,"{'neg': 0.069, 'neu': 0.786, 'pos': 0.145, 'co...",pos
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.09, 'neu': 0.822, 'pos': 0.088, 'com...",neg


In [83]:
accuracy_score(df['label'], df['compound'])

0.6367389060887513

In [84]:
print(classification_report(df['label'], df['compound']))

              precision    recall  f1-score   support

         neg       0.72      0.44      0.55       969
         pos       0.60      0.83      0.70       969

   micro avg       0.64      0.64      0.64      1938
   macro avg       0.66      0.64      0.62      1938
weighted avg       0.66      0.64      0.62      1938



NameError: name 'sizeof' is not defined