### **1) Utilize o arquivo moviesreviews.tsv**

In [174]:
import pandas as pd

In [175]:
file = pd.read_table('moviereviews.tsv',sep='\t')
file.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


### **2) Utilize a análise de sentimentos do nltk (VADER) para classificar os reviews.**

In [176]:
import numpy as np
import nltk

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [177]:
nltk.download('vader_lexicon')

sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/matheus/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [178]:
file['label'].value_counts()

neg    1000
pos    1000
Name: label, dtype: int64

Remoção de valores nulos:

In [179]:
file.dropna(inplace=True)

In [180]:
file['label'].value_counts()

neg    983
pos    982
Name: label, dtype: int64

In [181]:
file['label'] = np.where(file['label'] == 'pos', 1, 0)
file.head()

Unnamed: 0,label,review
0,0,how do films like mouse hunt get into theatres...
1,0,some talented actresses are blessed with a dem...
2,1,this has been an extraordinary year for austra...
3,1,according to hollywood movies made in last few...
4,0,my first press screening of 1998 and already i...


Adicionando coluna de score:

In [182]:
file['score'] = file['review'].apply(lambda reviews: sid.polarity_scores(reviews))
file.head()

Unnamed: 0,label,review,score
0,0,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co..."
1,0,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com..."
2,1,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com..."
3,1,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co..."
4,0,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co..."


Adicionando uma coluna Compound

In [183]:
file['compound'] = file['score'].apply(lambda scores: scores['compound'])
file.head()

Unnamed: 0,label,review,score,compound
0,0,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125
1,0,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618
2,1,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951
3,1,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972
4,0,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484


Adicionando uma coluna Compound resultado

In [200]:
file['compound_result'] = file['compound'].apply(lambda comp: 1 if comp >0 else 0)
file.head()

Unnamed: 0,label,review,score,compound,compound_result
0,0,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125,0
1,0,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618,0
2,1,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951,1
3,1,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972,1
4,0,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484,0


Acurácia:

In [201]:
accuracy_score(file['label'],file['compound_result'])

0.6340966921119593

Classification report:

In [186]:
print(classification_report(file['label'],file['compound_result']))

              precision    recall  f1-score   support

           0       0.71      0.45      0.55       983
           1       0.60      0.82      0.69       982

    accuracy                           0.63      1965
   macro avg       0.66      0.63      0.62      1965
weighted avg       0.66      0.63      0.62      1965



Matriz de confusão:

In [187]:
confusion_matrix(file['label'],file['compound_result'])

array([[441, 542],
       [177, 805]])

### **3) Realize experimentos fazendo pré-processamento.**

In [188]:
import re

In [189]:
X = file['review'].str.lower().tolist()

Limpando o texto:

In [190]:
X2 = []
for sent in X:
    new_sent = re.sub(r'\s+', ' ', new_sent )
    X2.append(new_sent)

Removendo stopwords:

In [219]:
text_tokens = [word_tokenize(text) for text in X ]

In [226]:
nltk.download('stopwords')
stopwords =  nltk.corpus.stopwords.words('english')

text_tokens = [[token for token in text if token not in stopwords] for text in text_tokens]


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/matheus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Lematizando:

In [220]:
# len = WordNetLemmatizer()
# text_len = [[len.lemmatize(t) for t in text] for text in text_tokens]

In [227]:
text_lower = [" ".join(t) for t in text_tokens]

In [228]:
y = file['label']

In [229]:
df_upgrade = pd.DataFrame({"review":text_lower,
                            "label":y})
df_upgrade.head()

Unnamed: 0,review,label
0,films like mouse hunt get theatres ? n't law s...,0
1,talented actresses blessed demonstrated wide a...,0
2,extraordinary year australian films . `` shine...,1
3,"according hollywood movies made last decades ,...",1
4,first press screening 1998 already 've gotten ...,0


In [230]:
df_upgrade['score'] = df_upgrade['review'].apply(lambda reviews: sid.polarity_scores(reviews))

df_upgrade['compound'] = df_upgrade['score'].apply(lambda scores: scores['compound'])

df_upgrade['compound_result'] = df_upgrade['compound'].apply(lambda comp: 1 if comp >0 else 0)

df_upgrade.head()

Unnamed: 0,review,label,score,compound,compound_result
0,films like mouse hunt get theatres ? n't law s...,0,"{'neg': 0.163, 'neu': 0.67, 'pos': 0.167, 'com...",-0.5349,0
1,talented actresses blessed demonstrated wide a...,0,"{'neg': 0.17, 'neu': 0.682, 'pos': 0.147, 'com...",-0.8996,0
2,extraordinary year australian films . `` shine...,1,"{'neg': 0.108, 'neu': 0.667, 'pos': 0.224, 'co...",0.9952,1
3,"according hollywood movies made last decades ,...",1,"{'neg': 0.113, 'neu': 0.69, 'pos': 0.197, 'com...",0.9944,1
4,first press screening 1998 already 've gotten ...,0,"{'neg': 0.156, 'neu': 0.707, 'pos': 0.136, 'co...",-0.9011,0


Acurácia:

In [231]:
accuracy_score(df_upgrade['label'],df_upgrade['compound_result'])

0.6132315521628499

Classification report:


In [None]:
print(classification_report(df_upgrade['label'],df_upgrade['compound_result']))

              precision    recall  f1-score   support

           0       0.70      0.39      0.50       983
           1       0.58      0.83      0.68       982

    accuracy                           0.61      1965
   macro avg       0.64      0.61      0.59      1965
weighted avg       0.64      0.61      0.59      1965



Matriz de confusão:

In [None]:
confusion_matrix(file['label'],file['compound_result'])

array([[427, 556],
       [164, 818]])

### **4) Compare o melhor resultado, com os resultados do exercício 6.**