In [58]:
#Importiamo spacy con il modello inglese
import spacy
nlp = spacy.load("it_core_news_sm")
text = ('I giovani non hanno bisogno di sermoni, i giovani hanno bisogno di esempi di onestà, di coerenza e di altruismo.')
doc = nlp(text)

# Statistica

In [59]:
#Frequenza Parole
from collections import Counter
words = [token.text for token in doc if not token.is_stop and not token.is_punct]
word_freq = Counter(words)
common_words = word_freq.most_common()
print (common_words)

[('giovani', 2), ('bisogno', 2), ('I', 1), ('sermoni', 1), ('i', 1), ('esempi', 1), ('onestà', 1), ('coerenza', 1), ('e', 1), ('altruismo', 1)]


In [60]:
#Parole Uniche
unique_words = [word for (word, freq) in word_freq.items() if freq == 1]
print (unique_words)

['I', 'sermoni', 'i', 'esempi', 'onestà', 'coerenza', 'e', 'altruismo']


# Speech Tagging

In [45]:
#Dal risultato possiamo vedere come ad ogni parola sia stata associato il "valore grammaticale"
for token in doc:
    print (token, token.pos_)

I DET
giovani NOUN
non ADV
hanno VERB
bisogno NOUN
di ADP
sermoni NOUN
, PUNCT
i DET
giovani NOUN
hanno VERB
bisogno NOUN
di ADP
esempi NOUN
di ADP
onestà NOUN
, PUNCT
di ADP
coerenza NOUN
e CONJ
di ADP
altruismo NOUN
. PUNCT


In [None]:
#Visualizzazione del tagging grafica
from spacy import displacy
displacy.serve(doc, style='dep')

  "__main__", mod_spec)



Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



# Lemmatizzazione

In [40]:
#Processo di tokenizzazione, ossia divisione di una frase in token
for token in doc:
    print (token, token.lemma_)


I I
giovani giovane
non non
hanno avere
bisogno bisognare
di di
sermoni sermone
, ,
i il
giovani giovane
hanno avere
bisogno bisognare
di di
esempi esempio
di di
onestà onestà
, ,
di di
coerenza coerenza
e e
di di
altruismo altruismo
. .


In [37]:
#Dal risultato superiori possiamo vedere che le parole "complesse", dove possibile possono essere semplificate.
#ESEMPIO:
# esempi -> esempio
# hanno -> avere

# Sentiment Analysis

In [1]:
import pandas as pd

#Carichiamo il nostro df
#Questo df contiene una lista di recensioni provenienti da diversi siti come (Amazon, IMDB e Yelp)
df = pd.read_csv(r"C:\Users\Marco\Downloads\sentiment labelled sentences\sentiment labelled sentences\df.csv", encoding='UTF-8')
df.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Recensione,Esito
0,Yelp,0,"Good case, Excellent value.",1
1,Yelp,1,Great for the jawbone.,1
2,Yelp,2,Tied to charger for conversations lasting more...,0
3,Yelp,3,The mic is great.,1
4,Yelp,4,I have to jiggle the plug to get it to line up...,0


In [24]:
#Importiamo una lista di stopwords(Parole poco significative per il nostro scopo)
nlp = spacy.load('en_core_web_lg')
from spacy.lang.en.stop_words import STOP_WORDS
stopwords = list(STOP_WORDS)


In [3]:
#Esempio di stopwords
stopwords[:10]

['within',
 'around',
 'will',
 'had',
 'while',
 'full',
 'much',
 'anywhere',
 "'d",
 'anyway']

In [4]:
# Importiamo il modulo string per avere la lista di tutta la punteggiatura (./,/;...)
import string
punctuations = string.punctuation

In [23]:
# Creiamo un Parser con spacy impostando lingua inglese
from spacy.lang.en import English
parser = English()


In [6]:
#A questo punto dovremo "pulire" la nostra frase da tutto ciò che non è necessario a fare il training del nostro algoritmo di ML
def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    #A questo punto ogni "token" ossia ogni parola viene lemmatizzata ossia portata alla forma base (Es. words -> word),
    #viene portata in lower case e ad essa vengono tolti eventuali spazi vuoti prima e dopo di essa
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    #In seguito vengono tolte tutte le stopwords e la punteggiatura
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    return mytokens

In [7]:
print(spacy_tokenizer("The commercials are the most misleading.I Hate him"))

['commercials', 'misleading', 'hate']


In [8]:
# ML Packages
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score 
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [9]:
#Custom transformer using spaCy 
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y=None, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

# Basic function to clean the text 
def clean_text(text):     
    return text.strip().lower()

In [10]:
# Vectorization
vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1)) 
classifier = LinearSVC()

In [11]:
# Features and Labels
X = df['Recensione']
ylabels = df['Esito']

In [12]:
# Splittiamo il dataset in test e train
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.2, random_state=42)


In [13]:
# Create the  pipeline to clean, tokenize, vectorize, and classify 
#Creaiamo la pipeline, ossia l'insieme delle azioni che dovranno essere fatte per ottenere i dati elaborati
#Cleaner - Tokenizzazione e pulizia dei dati
#Vettorizzazione dei dati
#Classificazione dei dati
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', classifier)])

In [14]:
# Fit our data
pipe.fit(X_train,y_train)

Pipeline(steps=[('cleaner', <__main__.predictors object at 0x000002CFD57C08C8>),
                ('vectorizer',
                 CountVectorizer(tokenizer=<function spacy_tokenizer at 0x000002CF8AB42558>)),
                ('classifier', LinearSVC())])

In [15]:
# Predicting with a test dataset
#Proviamo a predirre il dataset di test
sample_prediction = pipe.predict(X_test)

In [16]:
# Stampiamo i risultati
# 1 = Recensione Positivia
# 0 = Recensione Negativa
for (sample,pred) in zip(X_test,sample_prediction):
    print(sample,"Prediction=>",pred)

Disappointment.. I hate anything that goes in my ear. Prediction=> 0
It is a true classic.   Prediction=> 1
Great product. Prediction=> 1
Audio Quality is poor, very poor. Prediction=> 0
It finds my cell phone right away when I enter the car. Prediction=> 1
It is simple to use and I like it. Prediction=> 1
Was not happy. Prediction=> 1
The headsets are easy to use and everyone loves them. Prediction=> 1
Their Research and Development division obviously knows what they're doing. Prediction=> 1
Still it's quite interesting and entertaining to follow.   Prediction=> 1
All three broke within two months of use. Prediction=> 0
Oh yeah, and the storyline was pathetic too.   Prediction=> 0
IT'S REALLY EASY. Prediction=> 1
Every element of this story was so over the top, excessively phony and contrived that it was painful to sit through.   Prediction=> 0
The battery works great! Prediction=> 1
I am so tired of clichés that is just lazy writing, and here they come in thick and fast.   Prediction

He owns the film, just as Spacek owned Coal Miner's Daughter" and Quaid owned "Great Balls of Fire.   Prediction=> 0
A standout scene.   Prediction=> 0
Worst hour and a half of my life!Oh my gosh!   Prediction=> 0
Its not user friendly. Prediction=> 0
Also, it's a real treat to see Anthony Quinn playing Crazy Horse.   Prediction=> 1
This frog phone charm is adorable and very eye catching. Prediction=> 1
I am very impressed with the job that Motorola did on the sturdiness of this phone. Prediction=> 1
I received my headset in good time and was happy with it. Prediction=> 1
Would recommend this item. Prediction=> 0
The only good thing was that it fits comfortably on small ears. Prediction=> 1
very good product, well made. Prediction=> 1
The writer, Gorman Bechard, undoubtedly did his homework because all references are industry and character-age appropriate.   Prediction=> 1
I like design and look of Jabra behing the ear headsets and 5020 is pretty comfortible to wear 24 hours a day with

Finally, after three or four times the spring of the latch broke and I could not use it any longer on the visor. Prediction=> 0
the movie is littered with overt racial slurs towards the black cast members and in return the whites are depicted as morons and boobs.   Prediction=> 0
Disapointing Results. Prediction=> 0
These are fabulous! Prediction=> 1
Crisp and Clear. Prediction=> 1
This may be the only bad film he ever made.   Prediction=> 0
The film deserves strong kudos for taking this stand, for having exceptional acting from its mostly lesser-known cast and for the super-intelligent script that doesn't insult the audience or take the easy way out when it comes to white racism.   Prediction=> 1
Don't make the same mistake I did. Prediction=> 0
Even in my BMW 3 series which is fairly quiet, I have trouble hearing what the other person is saying. Prediction=> 0
Virgin Wireless rocks and so does this cheap little phone! Prediction=> 1
In fact, I liked it better than Interview With a Va

It was horrendous.   Prediction=> 0
I can hear while I'm driving in the car, and usually don't even have to put it on it's loudest setting. Prediction=> 0
I highly recommend these and encourage people to give them a try. Prediction=> 1
I have two more years left in this contract and I hate this phone. Prediction=> 0
Jawbone Era is awesome too! Prediction=> 1
If you are looking for a movie with a terrific cast, some good music(including a Shirley Jones rendition of "The Way You Look Tonight"), and an uplifting ending, give this one a try.   Prediction=> 0
Reception is terrible and full of static. Prediction=> 0
Bad Purchase. Prediction=> 0
I Was Hoping for More. Prediction=> 0
I came over from Verizon because cingulair has nicer cell phones.... the first thing I noticed was the really bad service. Prediction=> 0
It lasts less than 3o minutes, if I actually try to use the phone.My wife has the same phone with the same problem. Prediction=> 0
You can't beat the price on these. Prediction=

Even if you love bad movies, do not watch this movie.   Prediction=> 0
This is the phone to get for 2005.... I just bought my S710a and all I can say is WOW! Prediction=> 1
We would recommend these to others. Prediction=> 1
This product is great... it makes working a lot easier I can go to the copier while waiting on hold for something. Prediction=> 1
Then I exchanged for the same phone, even that had the same problem. Prediction=> 0
All the other cases I've tried normally fall apart after a few months but this one seems to be in for the long haul. Prediction=> 1
Excellent sound, battery life and inconspicuous to boot!. Prediction=> 1
They made this case too small and is very difficult to install. Prediction=> 0
Pros:-Good camera - very nice pictures , also has cool styles like black and white, and more. Prediction=> 1
What a waste of time! Prediction=> 0
Love This Phone. Prediction=> 1
You can not answer calls with the unit, never worked once! Prediction=> 0
NOBODY identifies with the

This phone works great. Prediction=> 1
One thing I hate is the mode set button at the side. Prediction=> 0
I have seen many movies starring Jaclyn Smith, but my god this was one of her best, though it came out 12 years ago.   Prediction=> 0
Good transmit audio. Prediction=> 1
Not enough volume. Prediction=> 0
NOT WHAT I WANTED. Prediction=> 1
Good value, works fine - power via USB, car, or wall outlet. Prediction=> 1
I was very excited to get this headset because I thought it was really cute. Prediction=> 0
horrible, had to switch 3 times. Prediction=> 0
I've tried several different earpieces for my cell phone and this Jabra one is the first one I've found that fits my ear comfortably. Prediction=> 1
Worst customer service. Prediction=> 0
So I basically threw my money out the window for nothing. Prediction=> 0
The battery works great! Prediction=> 1
Last time buying from you. Prediction=> 0
Logitech Bluetooth Headset is a 10!. Prediction=> 1
This product is ideal for people like me who

In [17]:
# Accuracy
print("Accuracy: ",pipe.score(X_test,y_test))
print("Accuracy: ",pipe.score(X_test,sample_prediction))

Accuracy:  0.8961748633879781
Accuracy:  1.0


In [18]:
print("Accuracy: ",pipe.score(X_train,y_train))

Accuracy:  0.9904371584699454


In [19]:
# Another random review
pipe.predict(["I've bougth a new mac last year. This is a top laptop"])

array([0], dtype=int64)