### CLASSIFICAZIONE PURA

Lo scopo di questo lavoro è quello di classificare alcune recensioni Amazon attraverso metodi tradizionali di machine learning. 
In particolare, le rappresentazioni utilizzate sono BOW con pesi binari, frequenze e tf-idf ed un vocabolario costituito da 2-gram. 

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 27.4 gigabytes of available RAM

You are using a high-RAM runtime!


In [3]:
from time import time
import pandas as pd
import numpy as np
import random
import string
import os.path
import re

# Plot.
import seaborn as sns
import matplotlib.pyplot as plt

# NLP pipeline.
import nltk
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
stop_words = stop_words + ['would'] + ['-PRON-'] # -PRON- viene aggiunto dal lemmatizer e va tolto.
import spacy # lemmatization
nlp = spacy.load('en', disable=['parser', 'ner'])

!pip install demoji
import demoji
demoji.download_codes()

# Rappresentazione.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectPercentile

# Classificatori.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import cross_validate

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Collecting demoji
  Downloading https://files.pythonhosted.org/packages/7b/fd/265f1ad2d745d6f46d1ede83d0054327e87154e9f14b252c1e272749e657/demoji-0.3.0-py2.py3-none-any.whl
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Installing collected packages: colorama, demoji
Successfully installed colorama-0.4.4 demoji-0.3.0
Downloading emoji data ...
... OK (Got response in 1.41 seconds)
Writing emoji data to /root/.demoji/codes.json ...
... OK


In [4]:
# Codifica le sigle nei rispettivi classificatori. 
def decode(c):
  decoder = {'KNN': 'KNN', 
             'LSVC': 'Linear SVC', 
             'Logistic': 'Logistic', 
             'GNB': 'Gaussian Naive Bayes', 
             'Tree': 'Decision Tree Classifier', 
             'RF': 'Random Forest',
             'SGD':'Stochastic Gradient Descent Classifier',
             'MNB': 'Multinomial Naive Bayes', 
             'ADAB': 'Adaboost',
             'BNB': 'Bernullian Naive Bayes'}
  return decoder[c] 


# NLP pipeline.

def pipe(doc):
  doc = doc.lower()   # Lower case. 
  doc = re.sub(r'\d+', ' ', doc)  # Drop digits.
  doc = re.sub('['+string.punctuation+']', ' ', doc) # Drop punctuation. 
  doc = re.sub(r'\n+', ' ', doc) # Drop newline.
  if 'www.' in doc or 'http:' in doc or 'https:' in doc or '.com' in doc: # Drop URL.
    doc = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", " ", doc)
  doc = demoji.replace(string = doc, repl = " ")  # Drop emoji.
  doc = re.sub(r'\s\s+', ' ', doc) # Drop extra spaces. 
  doc = nlp(doc)  
  doc = [token.lemma_ for token in doc]  # Tokenize and Lemmatization. 
  doc = [word for word in doc if len(word) > 2] # Drop short words.
  doc = [word for word in doc if word not in stop_words] # Drop stop words.
  return doc

In [5]:
# NLP pipeline: load, sample, clean dataset.

class preprocessing:
  # Initialize.
  def __init__(self, path, name_df):
    self.path = path
    self.name_df = name_df

  # Load data.
  def loader(self):
    print('IMPORT DATASET ' + self.name_df)
    if os.path.isfile(self.path) == False: 
      print('Set not exists.')
      raise SystemExit("Stop right there!")
    else:
      with open(self.path, encoding="utf8") as file:
       self.df = file.readlines()
    print(f"Size: {len(self.df)}")

 # Sampling.
  def sampling(self, size):
    print('SAMPLING ' + self.name_df)
    random.seed = 20201230
    self.df = random.sample(self.df, size)  
    print(f"Size: {len(self.df)}")

  # From list to data frame
  def data_frame(self):
    print('CREATE DATASET: REVIEWS - LABELS ' + self.name_df)
    X = []
    labels = []

    for rev in self.df:
      _, label, sent = re.split(r'__label__(\d)', rev)
      label = int(label[0]) - 1
      labels.append(label)
      X.append(sent)

    self.df = pd.DataFrame(list(zip(X, labels)), columns=['Review', 'Labels']) 

  # NLP pipeline.
  def pipeline(self):
    print('NLP pipeline ' + self.name_df)
    t1 = time()
    self.df["Clean_Review"] = self.df["Review"].apply(pipe)
    t2 = time()
    print(f"Time: {t2-t1}")

  # Empty review,
  def drop_empty_review(self):
    print('DROP EMPTY REVIEW ' + self.name_df)
    self.df.drop(self.df[self.df['Clean_Review'].map(len) == 0].index)

# Set of possible classification techniques.
class classification:
    # Initialize.
    def __init__(self, X_train, y_train, cv):
        self.seed = 20201230
        self.X_train = X_train
        self.y_train = y_train
        self.cv = cv

    # Select the model.
    def select_model(self, class_type):
      possible_class = ['KNN', 'LSVC', 'Logistic', 'Tree', 'RF', 'SGD', 'MNB', 'ADAB', 'BNB']

      if class_type in possible_class:
        if class_type == possible_class[0]:
          clf = KNeighborsClassifier(n_neighbors = 5)
        if class_type == possible_class[1]:
          clf = LinearSVC(random_state=0, max_iter=1000)
        if class_type == possible_class[2]:
          clf = LogisticRegression(random_state=0, max_iter=1000)
        if class_type == possible_class[3]:
          clf = DecisionTreeClassifier()
        if class_type == possible_class[4]:
          clf = RandomForestClassifier()
        if class_type == possible_class[5]:
          clf = SGDClassifier(max_iter=1000, tol=1e-3)
        if class_type == possible_class[6]:
          clf = MultinomialNB()
        if class_type == possible_class[7]:
          clf = AdaBoostClassifier(n_estimators=100, random_state=0)
        if class_type == possible_class[8]:
          clf = BernoulliNB()

        self.clf = clf
        # Cross validation.
        self.score = cross_validate(self.clf, self.X_train, self.y_train, cv=self.cv, scoring = ['accuracy', 'f1'])
        self.N = int(self.X_train.shape[0]/cv)
        # Performance.
        perf = []
        for key in self.score:          
          perf.append([key, self.score[key].mean(), self.score[key].std() * 2/np.sqrt(self.N)])

        perf = np.array(perf)
        perf =  pd.DataFrame(perf, columns = ['Measure', 'Mean', 'StandError'])
        perf['Measure'] = ['TrainTime', 'TestTime', 'F1', 'Accuracy']
        print(perf)
        self.perf = perf
      else:
        raise SystemExit("Select one of the provided classifiers!")


### ***IMPORT AND PREPROCESSING***

In [6]:
 %%time

        #-- IMPORT SET.

train_path = 'drive/MyDrive/Text Mining/train.ft.txt'

train = preprocessing(train_path, 'TRAIN')

train.loader()

      #-- SAMPLING.

train_sample_size = 250000
train.sampling(train_sample_size)

      #-- CREATE A DATASET WITH REVIEW - LABEL.

train.data_frame()

      #-- PREPROCESSING.

train.pipeline()

      #-- DROP EMPTY REVIEWS.

train.drop_empty_review()

IMPORT DATASET TRAIN
Size: 3600000
SAMPLING TRAIN
Size: 250000
CREATE DATASET: REVIEWS - LABELS TRAIN
NLP pipeline TRAIN
Time: 1982.5033721923828
DROP EMPTY REVIEW TRAIN
CPU times: user 33min 3s, sys: 6.36 s, total: 33min 9s
Wall time: 33min 30s


### ***FEATURE EXTRACTION - SELECTION - WEIGHTED***


In [15]:
# Number of min and max gram.
gram_min = 2
gram_max = 2

#### ***BINARY MATRIX***

In [16]:
%%time

    #-- FEATURE EXTRACTION: BINARY BOW.

def dummy(doc):
  return doc

vectorizer = CountVectorizer(
        tokenizer=dummy,
        preprocessor=dummy,
        min_df = 0.0001, # 0.0001*100000 = 10, ragionevole
        # max_df = 0.8, # stop words removal: it's useless if I had removed stop words!
        ngram_range = (gram_min, gram_max),
        binary = True
    )  

train_BOW = vectorizer.fit_transform(train.df['Clean_Review'])

print(train_BOW.shape)

      #-- FEATURE SELECTION

select_percentile = SelectPercentile(chi2, percentile = 10)
train_BOW = select_percentile.fit_transform(train_BOW, train.df['Labels'])

print(train_BOW.shape)

(250000, 35539)
(250000, 3553)
CPU times: user 39.5 s, sys: 805 ms, total: 40.3 s
Wall time: 40.2 s


##### ***CLASSIFICATION***


In [18]:
%%time

cv = 5
learn = classification(train_BOW, train.df['Labels'], cv)

my_class = ['KNN', 'LSVC', 'Logistic', 'SGD', 'MNB', 'ADAB', 'BNB']

for c in my_class:
  print(f"                                                            ***** {decode(c)} ****")
  learn.select_model(c)

# del train_BOW, test_BOW, learn

                                                            ***** KNN ****
     Measure                  Mean              StandError
0  TrainTime  0.029909420013427734    3.48113532821378e-05
1   TestTime    187.78434057235717     0.06976580160524748
2         F1    0.7256199999999999  1.8177788644387045e-05
3   Accuracy    0.7371862454360342  0.00012463130498352245
                                                            ***** Linear SVC ****
     Measure                  Mean              StandError
0  TrainTime    10.362391471862793    0.004675259326221499
1   TestTime  0.028976631164550782  2.3834370009457774e-06
2         F1              0.814432   5.028685712987151e-06
3   Accuracy    0.8268141109264825  6.9776989458652035e-06
                                                            ***** Logistic ****
     Measure                  Mean              StandError
0  TrainTime     4.690478324890137    0.002310207422521882
1   TestTime  0.028639793395996094  3.1327035670318446e

#### ***FREQUENCY MATRIX***

In [19]:
%%time

    #-- FEATURE EXTRACTION: FREQUENCY BOW.
def dummy(doc):
  return doc

vectorizer = CountVectorizer(
        tokenizer=dummy,
        preprocessor=dummy,
        min_df = 0.0001, # 0.001*100000 = 100, ragionevole
        # max_df = 0.7,
        ngram_range = (gram_min, gram_max),
        binary = False, 
        # vocabulary = my_dic
    )  

# Restituisce un amatrice di TIPO sparso quindi non bisogna convertirla. 
train_freq = vectorizer.fit_transform(train.df['Clean_Review'])

print(train_freq.shape)

      #-- FEATURE SELECTION

select_percentile = SelectPercentile(chi2, percentile=10)
train_freq = select_percentile.fit_transform(train_freq, train.df['Labels'])

print(train_freq.shape)

(250000, 35539)
(250000, 3553)
CPU times: user 37.3 s, sys: 780 ms, total: 38.1 s
Wall time: 38 s


##### ***CLASSIFICATION***

In [20]:
%%time

learn = classification(train_freq, train.df['Labels'], cv)

my_class = ['KNN', 'LSVC', 'Logistic', 'SGD', 'MNB', 'ADAB', 'BNB']

for c in my_class:
  print(f"                                                            ***** {decode(c)} ****")
  learn.select_model(c)

                                                            ***** KNN ****
     Measure                 Mean              StandError
0  TrainTime  0.03053746223449707  3.8089382893811635e-05
1   TestTime   191.47071652412416     0.02031208658878715
2         F1   0.7251879999999999  1.4998069209068289e-05
3   Accuracy   0.7366409616866082  0.00011507305517374916
                                                            ***** Linear SVC ****




     Measure                 Mean             StandError
0  TrainTime    32.27579636573792    0.01120961817550929
1   TestTime  0.02977900505065918  5.902192017424547e-06
2         F1   0.8149040000000001  7.046724061576457e-06
3   Accuracy   0.8272321324320162   8.82861213659801e-06
                                                            ***** Logistic ****
     Measure                 Mean              StandError
0  TrainTime    4.785932731628418    0.004766517368205458
1   TestTime  0.02878575325012207    2.81966110114813e-06
2         F1             0.814508  1.0005182657003084e-05
3   Accuracy   0.8265989989243634  1.1170386732967066e-05
                                                            ***** Stochastic Gradient Descent Classifier ****
     Measure                Mean              StandError
0  TrainTime  0.4791764736175537  0.00026696413261475285
1   TestTime  0.0292604923248291   5.231326128745559e-06
2         F1            0.809484   1.098385724597691e-05
3   Acc

#### ***TF-IDF***

In [21]:
%%time

    #-- TF-IDF

def dummy(doc):
  return doc

vectorizer = TfidfVectorizer(
        tokenizer=dummy,
        preprocessor=dummy,
        min_df = 0.0001, # 0.001*100000 = 100, ragionevole
        # max_df = 0.7,
        ngram_range = (gram_min, gram_max),
        binary = False,
        # vocabulary = my_dic
    )  

train_tf_idf = vectorizer.fit_transform(train.df['Clean_Review'])

print(train_tf_idf.shape)

      #-- FEATURE SELECTION

select_percentile = SelectPercentile(chi2, percentile=10)
train_tf_idf = select_percentile.fit_transform(train_tf_idf, train.df['Labels'])

print(train_tf_idf.shape)

(250000, 35539)
(250000, 3554)
CPU times: user 38.8 s, sys: 858 ms, total: 39.6 s
Wall time: 39.6 s


##### ***CLASSIFICATION***


In [22]:
%%time

learn = classification(train_tf_idf, train.df['Labels'], cv)

my_class = ['KNN', 'LSVC', 'Logistic', 'SGD', 'MNB', 'ADAB', 'BNB']

for c in my_class:
  print(f"                                                            ***** {decode(c)} ****")
  learn.select_model(c)


                                                            ***** KNN ****
     Measure                  Mean              StandError
0  TrainTime  0.030478382110595705   4.738568745019273e-06
1   TestTime    195.24522523880006     0.02123646129729317
2         F1              0.755972  1.7249083453911367e-05
3   Accuracy     0.754446958254523  0.00012296331113154975
                                                            ***** Linear SVC ****
     Measure                  Mean              StandError
0  TrainTime     1.965989637374878  0.00024523441824141903
1   TestTime  0.029101991653442384   8.236781509753625e-06
2         F1    0.8185439999999999  1.9718466471812684e-05
3   Accuracy    0.8105136682317232   2.370114760514954e-05
                                                            ***** Logistic ****
     Measure                  Mean              StandError
0  TrainTime    3.4239308834075928    0.009269846231382084
1   TestTime  0.028853225708007812   5.715673299536257e