### SUPERVISED SENTIMENT ANALYSIS

Lo scopo di questo jupyter è quello di classificare le revisioni di Amazon attraverso delle tecniche principalmente utilizzate per i Social Media. Ci si aspetta che tali tecniche siano più sensibili nel riconoscere il sentimento rispetto alla classificazione pura. 

In [4]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [5]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 27.4 gigabytes of available RAM

You are using a high-RAM runtime!


In [6]:
from time import time
import pandas as pd
import numpy as np
import random

import string
import os.path
import re

import seaborn as sns
import matplotlib.pyplot as plt

# Le emoji e il controllo sulla lingua non servono (ci sono pochi documenti in spagnolo)
# perché fineranno nella coda della distribuzione e saranno dimenticati.
import nltk
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
stop_words = stop_words + ['would'] + ['-PRON-'] 

import spacy
nlp = spacy.load('en', disable=['parser', 'ner'])

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# NLP pipeline.
# Check language: it's so slow.
# if detect(doc) != 'en':
# return []

def pipe(doc):
  doc = doc.lower()
  # Drop digits. 
  doc = re.sub(r'\d+', ' ', doc) 
  # Drop punctuation.
  doc = re.sub('['+string.punctuation+']', ' ', doc) 
  # Drop newline.
  doc = re.sub(r'\n+', ' ', doc) 
  # Drop URL.
  if 'www.' in doc or 'http:' in doc or 'https:' in doc or '.com' in doc:
    doc = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", " ", doc)
  # Drop emoji.
  # doc = demoji.replace(string = doc, repl = " ")
  # Drop extra spaces.
  doc = re.sub(r'\s\s+', ' ', doc) 
  # Tpkenize and Lemmatization.
  doc = nlp(doc)
  doc = [token.lemma_ for token in doc]   
  # Tokenize.
  # doc = text_to_word_sequence(doc)
  # Drop short words.
  doc = [word for word in doc if len(word) > 2]
  # Drop stop words.
  doc = [word for word in doc if word not in stop_words]
  return doc

In [8]:

# NLP pipeline.
class preprocessing:
  # Initialize.
  def __init__(self, path, name_df):
    self.path = path
    self.name_df = name_df

  # Load data.
  def loader(self):
    print('IMPORT DATASET ' + self.name_df)
    if os.path.isfile(self.path) == False: 
      print('Set not exists.')
      raise SystemExit("Stop right there!")
    else:
      with open(self.path, encoding="utf8") as file:
       self.df = file.readlines()
    print(f"Size: {len(self.df)}")

 # Sampling.
  def sampling(self, size):
    print('SAMPLING ' + self.name_df)
    random.seed = 20201230
    self.df = random.sample(self.df, size)  
    print(f"Size: {len(self.df)}")

  # From list to data frame
  def data_frame(self):
    print('CREATE DATASET: REVIEWS - LABELS ' + self.name_df)
    X = []
    labels = []

    for rev in self.df:
      _, label, sent = re.split(r'__label__(\d)', rev)
      label = int(label[0]) -1
      labels.append(label)
      X.append(sent)

    self.df = pd.DataFrame(list(zip(X, labels)), columns=['Review', 'Labels']) 

  # NLP pipeline.
  def pipeline(self):
    print('NLP pipeline ' + self.name_df)
    t1 = time()
    self.df["Clean_Review"] = self.df["Review"].apply(pipe)
    t2 = time()
    print(f"Time: {t2-t1}")

  # Empty review,
  def drop_empty_review(self):
    print('DROP EMPTY REVIEW ' + self.name_df)
    self.df.drop(self.df[self.df['Clean_Review'].map(len) == 0].index)

In [9]:
%%time

        #-- IMPORT SET.

train_path = 'drive/MyDrive/Text Mining/train.ft.txt'

train = preprocessing(train_path, 'TRAIN')

train.loader()


      #-- SAMPLING.

train_sample_size = 250000
train.sampling(train_sample_size)

      #-- CREATE A DATASET WITH REVIEW - LABEL.

train.data_frame()

      #-- PREPROCESSING.

train.pipeline()

train.drop_empty_review()

IMPORT DATASET TRAIN
Size: 3600000
SAMPLING TRAIN
Size: 250000
CREATE DATASET: REVIEWS - LABELS TRAIN
NLP pipeline TRAIN
Time: 1432.0943808555603
DROP EMPTY REVIEW TRAIN
CPU times: user 23min 43s, sys: 5.89 s, total: 23min 49s
Wall time: 24min 19s


### ***SENTIMENT SUPERVISED ANALYSIS***

In [10]:
# Define a threshold to define positive, negative and neutral reviews.
def assign_sent(value, thr):
  # dataset doesn't contain neutral comments.
  thr = thr
  if -thr <= value <= thr:
    # put this value to neutral reviews: they will be dropped.
    value = -999
  else: 
    if  value < 0:
      value = 0
    else:
      value = 1
  return value

#### ***AFFIN***

In [11]:
!pip install afinn

Collecting afinn
[?25l  Downloading https://files.pythonhosted.org/packages/86/e5/ffbb7ee3cca21ac6d310ac01944fb163c20030b45bda25421d725d8a859a/afinn-0.1.tar.gz (52kB)
[K     |██████▎                         | 10kB 16.8MB/s eta 0:00:01[K     |████████████▌                   | 20kB 20.9MB/s eta 0:00:01[K     |██████████████████▊             | 30kB 12.0MB/s eta 0:00:01[K     |█████████████████████████       | 40kB 9.1MB/s eta 0:00:01[K     |███████████████████████████████▏| 51kB 5.1MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 3.3MB/s 
[?25hBuilding wheels for collected packages: afinn
  Building wheel for afinn (setup.py) ... [?25l[?25hdone
  Created wheel for afinn: filename=afinn-0.1-cp36-none-any.whl size=53450 sha256=ce5347867d84584299149c965f1fd68a71c85e79ac3e4b3ecb7a67c982895dfb
  Stored in directory: /root/.cache/pip/wheels/b5/1c/de/428301f3333ca509dcf20ff358690eb23a1388fbcbbde008b2
Successfully built afinn
Installing collected packages: afinn
Succ

In [12]:
%%time

from afinn import Afinn

afinn = Afinn()

def affin_exe(review):
  # compute che score provided by Afinn for each review.
  aff = afinn.score(review)
# define the sentiment of the review.
  aff = assign_sent(aff, 2)
  return aff

train.df['Affin_Label'] = train.df['Review'].apply(affin_exe)

# Save the non neutral reviews.
# I have to create an appo matrix! I will delete object of train.df
train_appo = train.df
train_appo = train_appo[train_appo['Affin_Label'] != -999]

train_appo['Affin_Label'].value_counts()

cm = pd.crosstab(train_appo.Affin_Label, train_appo.Labels)
print(f"\n Confusion matrix: \n\n {cm}")
# risultato un po' pessimo.
print(classification_report(train_appo.Labels, train_appo.Affin_Label))

# Si osserva che l'algoritmo riesce a inferire bene i commenti positivi
# ma non è molto performante per quelli negativi.


 Confusion matrix: 

 Labels           0       1
Affin_Label               
0            35884    3960
1            45706  105980
              precision    recall  f1-score   support

           0       0.90      0.44      0.59     81590
           1       0.70      0.96      0.81    109940

    accuracy                           0.74    191530
   macro avg       0.80      0.70      0.70    191530
weighted avg       0.78      0.74      0.72    191530

CPU times: user 11min 17s, sys: 193 ms, total: 11min 17s
Wall time: 11min 22s


#### ***OPINION LEXICON***

In [18]:
nltk.download('opinion_lexicon')
from nltk.corpus import opinion_lexicon
# Liste di termini positivi e negativi offerti. 
pos_list=set(opinion_lexicon.positive())
neg_list=set(opinion_lexicon.negative())

[nltk_data] Downloading package opinion_lexicon to /root/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


In [19]:
%%time

# compute the score of each review on the basis of the word 
# of opinion lexicon whithin each of them.
def sentiment_sum(sentence):
  counter = 0
  for word in sentence:
    if word in pos_list:
      counter += 1
    elif word in neg_list:
      counter -= 1

  counter = assign_sent(counter, 1)
  return counter

train.df['Count_Sent'] = train.df['Clean_Review'].apply(sentiment_sum)

train_appo = train.df
train_appo = train_appo[train_appo['Count_Sent'] != -999]

train_appo['Count_Sent'].value_counts()

cm = pd.crosstab(train_appo.Labels, train_appo.Count_Sent)
print(f"\n Confusion matrix: \n\n {cm}", '\n')

# risultato un po' pessimo.
print(classification_report(train_appo.Count_Sent, train_appo.Labels))
# Anche qui non classifica bene i commenti negativi.


 Confusion matrix: 

 Count_Sent      0      1
Labels                  
0           43344  32157
1            5551  94753 

              precision    recall  f1-score   support

           0       0.57      0.89      0.70     48895
           1       0.94      0.75      0.83    126910

    accuracy                           0.79    175805
   macro avg       0.76      0.82      0.77    175805
weighted avg       0.84      0.79      0.80    175805

CPU times: user 1.77 s, sys: 9.48 ms, total: 1.78 s
Wall time: 1.79 s


#### ***VADER***

In [15]:
 !pip install vaderSentiment

Collecting vaderSentiment
[?25l  Downloading https://files.pythonhosted.org/packages/76/fc/310e16254683c1ed35eeb97386986d6c00bc29df17ce280aed64d55537e9/vaderSentiment-3.3.2-py2.py3-none-any.whl (125kB)
[K     |██▋                             | 10kB 13.3MB/s eta 0:00:01[K     |█████▏                          | 20kB 15.9MB/s eta 0:00:01[K     |███████▉                        | 30kB 14.4MB/s eta 0:00:01[K     |██████████▍                     | 40kB 12.0MB/s eta 0:00:01[K     |█████████████                   | 51kB 5.8MB/s eta 0:00:01[K     |███████████████▋                | 61kB 6.4MB/s eta 0:00:01[K     |██████████████████▏             | 71kB 6.4MB/s eta 0:00:01[K     |████████████████████▉           | 81kB 6.5MB/s eta 0:00:01[K     |███████████████████████▍        | 92kB 7.2MB/s eta 0:00:01[K     |██████████████████████████      | 102kB 7.5MB/s eta 0:00:01[K     |████████████████████████████▋   | 112kB 7.5MB/s eta 0:00:01[K     |███████████████████████████████▏|

In [16]:
%%time
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Define vader score.
analyzer = SentimentIntensityAnalyzer()
train.df['Vader_sent'] = train.df['Review'].apply(lambda x: analyzer.polarity_scores(x)['compound'])

# Encode lab 0 or 1.
train.df['Vader_sent'] = train.df['Vader_sent'].apply(lambda x: assign_sent(x, 0.1))

train_appo = train.df
train_appo = train_appo[train_appo['Vader_sent'] != -999]

train_appo['Vader_sent'].value_counts()

cm = pd.crosstab(train_appo.Labels, train_appo.Vader_sent)
acc = np.sum(np.diag(cm)) / np.sum(cm.values)
print(f"\n Confusion matrix: \n\n {cm}, \n\n Accuracy: {acc*100}%")

# risultato un po' pessimo.
print(classification_report(train_appo.Labels, train_appo.Vader_sent))

# Sbaglia nel classificare i commenti negativi e li scambia per positivi.


 Confusion matrix: 

 Vader_sent      0       1
Labels                   
0           59586   58298
1            7298  115142, 

 Accuracy: 72.70518133852633%
              precision    recall  f1-score   support

           0       0.89      0.51      0.64    117884
           1       0.66      0.94      0.78    122440

    accuracy                           0.73    240324
   macro avg       0.78      0.72      0.71    240324
weighted avg       0.78      0.73      0.71    240324

CPU times: user 3min 31s, sys: 648 ms, total: 3min 31s
Wall time: 3min 33s
