In [1]:
import spacy 

# Load the 'en_core_web_md' spaCy pipeline; the cells below read word vectors from it.
nlp = spacy.load('en_core_web_md')
# Run a sample sentence through the pipeline to obtain a document of tokens.
doc = nlp("Is this the region, is this the soil, the clime")

In [2]:
# Vector attached to the first token of the document.
vec = doc[0].vector
print(len(vec))  # number of components in the vector (300 in the recorded output)
vec[:10]  # display only the first 10 components

300


array([-8.4961e-02,  5.0200e-01,  2.3823e-03, -1.6755e-01,  3.0721e-01,
       -2.3762e-01,  1.6069e-01, -3.6786e-01, -5.8347e-02,  2.4990e+00],
      dtype=float32)

In [3]:
# The first word is misspelled; the recorded output reports it as out-of-vocabulary.
doc = nlp("Heilo how are you?")

# One row per token: text, out-of-vocabulary flag, vector availability, vector norm.
for token in doc:
    print(token.text, "\t", token.is_oov, token.has_vector, token.vector_norm)

Heilo 	 True False 0.0
how 	 False True 5.2509694
are 	 False True 5.41568
you 	 False True 5.1979666
? 	 False True 5.1608233


 Come possiamo vedere, la parola "Heilo" non è presente nel vocabolario e, visto che non esiste un word embedding per questa parola, la sua norma è pari a 0.

In [4]:
# A single vector for the whole document: the element-wise mean of its token vectors.
# Every token of `doc` takes part in the mean, including ones whose vector norm is 0.
import numpy as np

token_vectors = [token.vector for token in doc]
v = np.mean(token_vectors, axis=0)
v[:10]

array([-0.12565279,  0.1822664 , -0.23188598, -0.14474145,  0.08241   ,
       -0.090287  ,  0.09519981, -0.045398  , -0.0420998 ,  1.88006   ],
      dtype=float32)

In [5]:
# Similarity check: compare each token of a sentence with one reference word.

doc = nlp("Do you like cups or cuppuluni or cupcake?")
word = nlp("cupcakes")

row_template = "{} vs {} \t {:0.2f}"
for token in doc:
    score = token.similarity(word)
    print(row_template.format(token.text, word.text, score))


Do vs cupcakes 	 0.21
you vs cupcakes 	 0.23
like vs cupcakes 	 0.28
cups vs cupcakes 	 0.48
or vs cupcakes 	 0.18
cuppuluni vs cupcakes 	 0.00
or vs cupcakes 	 0.18
cupcake vs cupcakes 	 1.00
? vs cupcakes 	 0.18


  "__main__", mod_spec)


In [6]:
# Similarity can also be computed between whole documents instead of single tokens.

doc1 = nlp("Spacchi e Piriti")
doc2 = nlp("Padroni")
doc3 = nlp("BACARA'")

# Compare the first document against each of the other two.
print("Similarities between document 1 and 2: {:0.2f}".format(doc1.similarity(doc2)))
print("Similarities between document 1 and 3: {:0.2f}".format(doc1.similarity(doc3)))

Similarities between document 1 and 2: 0.00
Similarities between document 1 and 3: 0.26


  "__main__", mod_spec)


# Word Arithmetics

In [7]:
from tqdm import tqdm

# Run each single word through the pipeline; the `.vector` of each result is used below.
w = nlp("brother")
man = nlp("man")
woman = nlp("woman")

# Analogy query vector: brother - man + woman.
v = w.vector - man.vector + woman.vector

def cd(x, y):
    """Return the cosine similarity between the vectors ``x`` and ``y``.

    The value is ``dot(x, y) / (||x|| * ||y||)``, so a higher value means the
    vectors are more similar. The short name ``cd`` is kept for the code that
    already calls it.

    Parameters
    ----------
    x, y : array-like
        One-dimensional vectors of the same length.

    Returns
    -------
    float
        The cosine similarity, or ``0.0`` when either vector has zero norm.
        Without this guard the division yields NaN, and NaN entries sort last
        in ``np.argsort``, so a reversed ranking would list them first.
    """
    norm_product = np.sqrt(np.dot(x, x)) * np.sqrt(np.dot(y, y))
    if norm_product == 0:
        return 0.0
    return np.dot(x, y) / norm_product

# Score every lowercase vocabulary entry that has a vector against the query vector `v`.
words = []
similarities = []
for word in tqdm(nlp.vocab, total=len(nlp.vocab)):
    if not (word.has_vector and word.is_lower):
        continue
    words.append(word.text)
    similarities.append(cd(v, word.vector))

# Vocabulary words ordered from the highest score to the lowest.
ranking = np.argsort(similarities)[::-1]
np.array(words)[ranking]

100%|██████████| 1340248/1340248 [00:18<00:00, 72461.81it/s]


array(['sister', '4-year-old', 'daughter', ..., 'avaiation', 'invizimals',
       'sqli'], dtype='<U66')

# Domanda 6

Probably because "sister", "4-year-old" and "daughter" are words that tend to be used together, i.e. in similar contexts, so their word vectors end up close to each other.

# Sentiment Analysis with VADER

In [8]:
import nltk
# Download the 'vader_lexicon' NLTK package (reported as already up to date in the recorded output).
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/default/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [9]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Analyzer instance reused by the cells below.
sid = SentimentIntensityAnalyzer()

# Returns a dict with 'neg', 'neu', 'pos' and 'compound' entries (see the output below).
sid.polarity_scores("This is kinda shitty")

{'neg': 0.524, 'neu': 0.476, 'pos': 0.0, 'compound': -0.5118}

# Domanda 7

The third one has the highest score because "great" is written in capital letters, which adds emphasis, and the sentence also ends with an exclamation mark.

In [10]:
# Load the reviews dataset that the sentiment analysis below is applied to.
import pandas as pd

# NOTE(review): the CSV is fetched from a remote URL on every run, so this cell needs network access.
reviews = pd.read_csv('http://antoninofurnari.it/downloads/reviews.csv')
reviews.info()  # prints column names, non-null counts and dtypes
reviews.head()  # first rows: author, review text, rating

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5006 entries, 0 to 5005
Data columns (total 3 columns):
author    5006 non-null object
review    5006 non-null object
rating    5006 non-null float64
dtypes: float64(1), object(2)
memory usage: 117.5+ KB


Unnamed: 0,author,review,rating
0,Dennis_Schwartz,"in my opinion , a movie reviewer's most import...",0.1
1,Dennis_Schwartz,"you can watch this movie , that is based on a ...",0.2
2,Dennis_Schwartz,"this is asking a lot to believe , and though i...",0.2
3,Dennis_Schwartz,no heroes and no story are the main attributes...,0.2
4,Dennis_Schwartz,"this is not an art movie , yet i saw it an art...",0.2


In [11]:
# Analyze the first review with VADER.

first_review = reviews.loc[0, 'review']
sid.polarity_scores(first_review)

{'neg': 0.134, 'neu': 0.753, 'pos': 0.113, 'compound': -0.8923}

In [12]:
# Polarity of a text is taken to be VADER's 'compound' score.

def vader_polarity(x):
    """Return the 'compound' entry of the VADER polarity scores for the text ``x``."""
    scores = sid.polarity_scores(x)
    return scores['compound']

vader_polarity(reviews.loc[0, 'review'])

-0.8923

In [13]:
# Compute the compound score for each review and store it in a new 'polarity' column.

# Enables the `progress_apply` method used on the next line (shows a progress bar).
tqdm.pandas()
# NOTE: this adds the column to the existing `reviews` frame in place.
reviews['polarity'] = reviews['review'].progress_apply(vader_polarity)
reviews.head()

  from pandas import Panel
100%|██████████| 5006/5006 [00:27<00:00, 182.44it/s]


Unnamed: 0,author,review,rating,polarity
0,Dennis_Schwartz,"in my opinion , a movie reviewer's most import...",0.1,-0.8923
1,Dennis_Schwartz,"you can watch this movie , that is based on a ...",0.2,0.8927
2,Dennis_Schwartz,"this is asking a lot to believe , and though i...",0.2,0.9772
3,Dennis_Schwartz,no heroes and no story are the main attributes...,0.2,0.0316
4,Dennis_Schwartz,"this is not an art movie , yet i saw it an art...",0.2,0.9904


In [14]:
# Ground truth: a review is positive when its rating is at least 0.5
# (negative when the rating is smaller than 0.5).
# Prediction: a review is positive when its VADER polarity is greater than 0.

reviews['label'] = reviews['rating'].ge(0.5)
reviews['predicted_label'] = reviews['polarity'].gt(0)
reviews.head()

Unnamed: 0,author,review,rating,polarity,label,predicted_label
0,Dennis_Schwartz,"in my opinion , a movie reviewer's most import...",0.1,-0.8923,False,False
1,Dennis_Schwartz,"you can watch this movie , that is based on a ...",0.2,0.8927,False,True
2,Dennis_Schwartz,"this is asking a lot to believe , and though i...",0.2,0.9772,False,True
3,Dennis_Schwartz,no heroes and no story are the main attributes...,0.2,0.0316,False,True
4,Dennis_Schwartz,"this is not an art movie , yet i saw it an art...",0.2,0.9904,False,True


In [15]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Accuracy of the polarity-based prediction against the rating-based label, as a percentage.
accuracy = accuracy_score(reviews['label'], reviews['predicted_label'])
print("Accuracy: {:0.2f}%".format(accuracy * 100))


Accuracy: 73.93%


In [16]:
# The VADER scores are now used as features for training.

def vader_features(x):
    """Return the VADER scores of the text ``x`` as a 1-D feature vector.

    The components are always ordered ``neg, neu, pos, compound``. They are
    selected by key so the feature layout does not depend on the iteration
    order of the dict returned by ``sid.polarity_scores``.
    """
    scores = sid.polarity_scores(x)
    return np.array([scores[key] for key in ('neg', 'neu', 'pos', 'compound')])

vader_features(reviews.iloc[0]['review'])

array([ 0.134 ,  0.753 ,  0.113 , -0.8923])

In [18]:
# Use only a small portion of the data for training.
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG right before splitting.
np.random.seed(123)
# test_size=0.99 leaves about 1% of the rows for training (50 in the recorded output).
reviews_train, reviews_test = train_test_split(reviews, test_size=0.99)
print(len(reviews_train))

50


In [19]:
# Compute a feature vector for each review and stack them into one matrix per split.

def _review_features(frame):
    """Stack the VADER feature vectors of ``frame['review']`` row by row."""
    per_review = frame['review'].progress_apply(vader_features)
    return np.vstack(list(per_review))

x_train = _review_features(reviews_train)
x_test = _review_features(reviews_test)


100%|██████████| 50/50 [00:00<00:00, 162.63it/s]
100%|██████████| 4956/4956 [00:28<00:00, 174.83it/s]


In [21]:
# Save the boolean labels (rating >= 0.5) as the classification targets.
y_train = reviews_train['label']
y_test = reviews_test['label']


In [22]:
#try with a logistic regressor

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler #normalize features

scale = MinMaxScaler()
reg = LogisticRegression()

# Fit the scaler on the training features only, then train the classifier on the scaled data.
x_trains = scale.fit_transform(x_train)
reg.fit(x_trains, y_train)

# Apply the scaling learned on the training set to the test features before scoring.
x_tests = scale.transform(x_test)
reg.score(x_tests, y_test)




0.7495964487489911

In [24]:
# try with a linear regressor

def mae(y_true, y_pred):
    """Return the mean absolute error between targets and predictions.

    Both inputs are converted to NumPy float arrays and compared element by
    element (by position), so lists, NumPy arrays and pandas Series are all
    accepted.

    Parameters
    ----------
    y_true : array-like
        Reference values.
    y_pred : array-like
        Predicted values, with the same length as ``y_true``.

    Returns
    -------
    float
        The mean of ``|y_true - y_pred|``.
    """
    errors = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.abs(errors).mean()

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler #normalize features

# NOTE: y_train, y_test, scale and reg are rebound here; the boolean labels and the
# logistic model from the previous cell are no longer reachable under these names.
# Regression targets: the numeric rating itself.
y_train = reviews_train['rating']
y_test = reviews_test['rating']

scale = MinMaxScaler()
reg = LinearRegression()

# Fit the scaler on the training features only, then fit the regressor on the scaled data.
x_trains = scale.fit_transform(x_train)
reg.fit(x_trains, y_train)

# Apply the scaling learned on the training set to the test features.
x_tests = scale.transform(x_test)

# Predicted ratings for the test reviews.
y_testp = reg.predict(x_tests)

# Mean absolute error between the true and predicted ratings.
mae(y_test, y_testp)

0.1411718850494114