### CLASSIFICAZIONE PURA

Lo scopo di questo lavoro è quello di classificare alcune recensioni Amazon attraverso metodi tradizionali di machine learning. 
In particolare, le rappresentazioni utilizzate sono BOW con pesi binari, frequenze e tf-idf ed un vocabolario costituito solo da 1-gram. 

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 27.4 gigabytes of available RAM

You are using a high-RAM runtime!


In [12]:
from time import time
import pandas as pd
import numpy as np
import random
import string
import os.path
import re

# Plot.
import seaborn as sns
import matplotlib.pyplot as plt

# NLP pipeline.
import nltk
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
stop_words = stop_words + ['would'] + ['-PRON-'] # -PRON- viene aggiunto dal lemmatizer e va tolto.
import spacy # lemmatization
nlp = spacy.load('en', disable=['parser', 'ner'])

!pip install demoji
import demoji
demoji.download_codes()

# Rappresentazione.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectPercentile

# Classificatori.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import cross_validate

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Downloading emoji data ...
... OK (Got response in 0.12 seconds)
Writing emoji data to /root/.demoji/codes.json ...
... OK


In [4]:
# Codifica le sigle nei rispettivi classificatori. 
def decode(c):
  decoder = {'KNN': 'KNN', 
             'LSVC': 'Linear SVC', 
             'Logistic': 'Logistic', 
             'GNB': 'Gaussian Naive Bayes', 
             'Tree': 'Decision Tree Classifier', 
             'RF': 'Random Forest',
             'SGD':'Stochastic Gradient Descent Classifier',
             'MNB': 'Multinomial Naive Bayes', 
             'ADAB': 'Adaboost',
             'BNB': 'Bernullian Naive Bayes'}
  return decoder[c] 


# NLP pipeline.

def pipe(doc):
  doc = doc.lower()   # Lower case. 
  doc = re.sub(r'\d+', ' ', doc)  # Drop digits.
  doc = re.sub('['+string.punctuation+']', ' ', doc) # Drop punctuation. 
  doc = re.sub(r'\n+', ' ', doc) # Drop newline.
  if 'www.' in doc or 'http:' in doc or 'https:' in doc or '.com' in doc: # Drop URL.
    doc = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", " ", doc)
  doc = demoji.replace(string = doc, repl = " ")  # Drop emoji.
  doc = re.sub(r'\s\s+', ' ', doc) # Drop extra spaces. 
  doc = nlp(doc)  
  doc = [token.lemma_ for token in doc]  # Tokenize and Lemmatization. 
  doc = [word for word in doc if len(word) > 2] # Drop short words.
  doc = [word for word in doc if word not in stop_words] # Drop stop words.
  return doc

In [5]:
# NLP pipeline: load, sample, clean dataset.
class preprocessing:
  # Initialize.
  def __init__(self, path, name_df):
    self.path = path
    self.name_df = name_df

  # Load data.
  def loader(self):
    print('IMPORT DATASET ' + self.name_df)
    if os.path.isfile(self.path) == False: 
      print('Set not exists.')
      raise SystemExit("Stop right there!")
    else:
      with open(self.path, encoding="utf8") as file:
       self.df = file.readlines()
    print(f"Size: {len(self.df)}")

 # Sampling.
  def sampling(self, size):
    print('SAMPLING ' + self.name_df)
    random.seed = 20201230
    self.df = random.sample(self.df, size)  
    print(f"Size: {len(self.df)}")

  # From list to data frame
  def data_frame(self):
    print('CREATE DATASET: REVIEWS - LABELS ' + self.name_df)
    X = []
    labels = []

    for rev in self.df:
      _, label, sent = re.split(r'__label__(\d)', rev)
      label = int(label[0]) - 1
      labels.append(label)
      X.append(sent)

    self.df = pd.DataFrame(list(zip(X, labels)), columns=['Review', 'Labels']) 

  # NLP pipeline.
  def pipeline(self):
    print('NLP pipeline ' + self.name_df)
    t1 = time()
    self.df["Clean_Review"] = self.df["Review"].apply(pipe)
    t2 = time()
    print(f"Time: {t2-t1}")

  # Empty review,
  def drop_empty_review(self):
    print('DROP EMPTY REVIEW ' + self.name_df)
    self.df.drop(self.df[self.df['Clean_Review'].map(len) == 0].index)

# Set of possible classification techniques.
class classification:
    # Initialize.
    def __init__(self, X_train, y_train, cv):
        self.seed = 20201230
        self.X_train = X_train
        self.y_train = y_train
        self.cv = cv

    # Select the model.
    def select_model(self, class_type):
      possible_class = ['KNN', 'LSVC', 'Logistic', 'Tree', 'RF', 'SGD', 'MNB', 'ADAB', 'BNB']

      if class_type in possible_class:
        if class_type == possible_class[0]:
          clf = KNeighborsClassifier(n_neighbors = 5)
        if class_type == possible_class[1]:
          clf = LinearSVC(random_state=0, max_iter=1000)
        if class_type == possible_class[2]:
          clf = LogisticRegression(random_state=0, max_iter=1000)
        if class_type == possible_class[3]:
          clf = DecisionTreeClassifier()
        if class_type == possible_class[4]:
          clf = RandomForestClassifier()
        if class_type == possible_class[5]:
          clf = SGDClassifier(max_iter=1000, tol=1e-3)
        if class_type == possible_class[6]:
          clf = MultinomialNB()
        if class_type == possible_class[7]:
          clf = AdaBoostClassifier(n_estimators=100, random_state=0)
        if class_type == possible_class[8]:
          clf = BernoulliNB()

        self.clf = clf
        # Cross validation.
        self.score = cross_validate(self.clf, self.X_train, self.y_train, cv=self.cv, scoring = ['accuracy', 'f1'])
        self.N = int(self.X_train.shape[0]/cv)
        # Performance dell'algoritmo.
        perf = []
        for key in self.score:          
          perf.append([key, self.score[key].mean(), self.score[key].std() * 2/np.sqrt(self.N)])

        perf = np.array(perf)
        perf =  pd.DataFrame(perf, columns = ['Measure', 'Mean', 'StandError'])
        perf['Measure'] = ['TrainTime', 'TestTime', 'F1', 'Accuracy']
        print(perf)
        self.perf = perf
      else:
        raise SystemExit("Select one of the provided classifiers!")


### ***IMPORT AND PREPROCESSING***

In [6]:
 %%time

        #-- IMPORT SET.

train_path = 'drive/MyDrive/Text Mining/train.ft.txt'

train = preprocessing(train_path, 'TRAIN')

train.loader()

      #-- SAMPLING.

train_sample_size = 250000
train.sampling(train_sample_size)

      #-- CREATE A DATASET WITH REVIEW - LABEL.

train.data_frame()

      #-- PREPROCESSING.

train.pipeline()

      #-- DROP EMPTY REVIEWS.

train.drop_empty_review()

IMPORT DATASET TRAIN
Size: 3600000
SAMPLING TRAIN
Size: 250000
CREATE DATASET: REVIEWS - LABELS TRAIN
NLP pipeline TRAIN
Time: 2074.779326438904
DROP EMPTY REVIEW TRAIN
CPU times: user 34min 32s, sys: 8.47 s, total: 34min 40s
Wall time: 34min 46s


### ***FEATURE EXTRACTION - SELECTION - WEIGHTED***


In [7]:
# Number of min and max gram.
gram_min = 1
gram_max = 1

#### ***BINARY MATRIX***

In [8]:
%%time

    #-- FEATURE EXTRACTION: BINARY BOW.

def dummy(doc):
  return doc

vectorizer = CountVectorizer(
        tokenizer=dummy,
        preprocessor=dummy,
        min_df = 0.0001, # 0.0001*100000 = 10, ragionevole
        # max_df = 0.8, # stop words removal: it's useless if I had removed stop words!
        ngram_range = (gram_min, gram_max),
        binary = True
    )  

train_BOW = vectorizer.fit_transform(train.df['Clean_Review'])

print(train_BOW.shape)

      #-- FEATURE SELECTION

select_percentile = SelectPercentile(chi2, percentile = 10)
train_BOW = select_percentile.fit_transform(train_BOW, train.df['Labels'])

print(train_BOW.shape)

(250000, 14138)
(250000, 1414)
CPU times: user 6.93 s, sys: 454 ms, total: 7.38 s
Wall time: 7.39 s


###### ***CLASSIFICATION***



In [13]:
%%time

cv = 5
learn = classification(train_BOW, train.df['Labels'], cv)

my_class = ['KNN', 'LSVC', 'Logistic', 'SGD', 'MNB', 'ADAB', 'BNB']

for c in my_class:
  print(f"                                                            ***** {decode(c)} ****")
  learn.select_model(c)

# del train_BOW, test_BOW, learn

                                                            ***** KNN ****
     Measure                Mean              StandError
0  TrainTime  0.0650641918182373   0.0001627359304443132
1   TestTime   366.1159992694855     0.20861989063311057
2         F1            0.707724  1.4579942386717673e-05
3   Accuracy  0.7394337598322811   8.368565873105576e-06
                                                            ***** Linear SVC ****
     Measure                 Mean              StandError
0  TrainTime   45.843698263168335    0.003343354167321854
1   TestTime  0.03264493942260742  3.4664502692565706e-06
2         F1             0.874964  1.4487467687625725e-05
3   Accuracy   0.8757086310613117  1.4083834320409967e-05
                                                            ***** Logistic ****
     Measure                  Mean              StandError
0  TrainTime     7.714233684539795   0.0015569618152876516
1   TestTime  0.032081222534179686   4.588141428438142e-06
2         F

#### ***FREQUENCY MATRIX***

In [14]:
%%time

    #-- FEATURE EXTRACTION: FREQUENCY BOW.
    
def dummy(doc):
  return doc

vectorizer = CountVectorizer(
        tokenizer=dummy,
        preprocessor=dummy,
        min_df = 0.0001, # 0.001*100000 = 100, ragionevole
        # max_df = 0.7,
        ngram_range = (gram_min, gram_max),
        binary = False, 
        # vocabulary = my_dic
    )  

# Restituisce un amatrice di TIPO sparso quindi non bisogna convertirla. 
train_freq = vectorizer.fit_transform(train.df['Clean_Review'])

print(train_freq.shape)

      #-- FEATURE SELECTION

select_percentile = SelectPercentile(chi2, percentile=10)
train_freq = select_percentile.fit_transform(train_freq, train.df['Labels'])

print(train_freq.shape)

(250000, 14138)
(250000, 1414)
CPU times: user 6.99 s, sys: 192 ms, total: 7.18 s
Wall time: 7.18 s


##### ***CLASSIFICATION***

In [15]:
%%time

learn = classification(train_freq, train.df['Labels'], cv)

my_class = ['KNN', 'LSVC', 'Logistic', 'SGD', 'MNB', 'ADAB', 'BNB']

for c in my_class:
  print(f"                                                            ***** {decode(c)} ****")
  learn.select_model(c)

                                                            ***** KNN ****
     Measure                 Mean              StandError
0  TrainTime  0.04533343315124512  3.8723125853056595e-05
1   TestTime   362.12245497703555      0.1769799883243751
2         F1             0.705504  1.8402225952313688e-05
3   Accuracy   0.7344875823629196  1.5678424107273437e-05
                                                            ***** Linear SVC ****




     Measure                  Mean              StandError
0  TrainTime     50.42601866722107   0.0063329044665848015
1   TestTime  0.032227706909179685   7.247391530546355e-06
2         F1    0.8767760000000001  1.5295447688773445e-05
3   Accuracy    0.8776897652176585    1.45783084658232e-05
                                                            ***** Logistic ****
     Measure                 Mean              StandError
0  TrainTime   10.180205821990967    0.003383445817106665
1   TestTime  0.03125123977661133   4.465297953818192e-06
2         F1   0.8766679999999999  1.5899927043857842e-05
3   Accuracy   0.8772641482075352  1.5394059815732048e-05
                                                            ***** Stochastic Gradient Descent Classifier ****
     Measure                 Mean              StandError
0  TrainTime   1.0050169944763183  0.00038818613692800463
1   TestTime  0.03099517822265625  2.7079653668464685e-06
2         F1              0.87438    1.193636460569

#### ***TF-IDF***

In [16]:
%%time

    #-- TF-IDF

def dummy(doc):
  return doc

vectorizer = TfidfVectorizer(
        tokenizer=dummy,
        preprocessor=dummy,
        min_df = 0.0001, # 0.001*100000 = 100, ragionevole
        # max_df = 0.7,
        ngram_range = (gram_min, gram_max),
        binary = False, 
        # vocabulary = my_dic
    )  

train_tf_idf = vectorizer.fit_transform(train.df['Clean_Review'])

print(train_tf_idf.shape)

      #-- FEATURE SELECTION

select_percentile = SelectPercentile(chi2, percentile=10)
train_tf_idf = select_percentile.fit_transform(train_tf_idf, train.df['Labels'])

print(train_tf_idf.shape)

(250000, 14138)
(250000, 1414)
CPU times: user 7.35 s, sys: 17 ms, total: 7.37 s
Wall time: 7.37 s


##### CLASSIFICATION

In [17]:
%%time

learn = classification(train_tf_idf, train.df['Labels'], cv)

my_class = ['KNN', 'LSVC', 'Logistic', 'SGD', 'MNB', 'ADAB', 'BNB']

for c in my_class:
  print(f"                                                            ***** {decode(c)} ****")
  learn.select_model(c)


                                                            ***** KNN ****
     Measure                 Mean              StandError
0  TrainTime  0.04699997901916504  1.7428642662618577e-05
1   TestTime   349.25233454704284      0.2603049593730844
2         F1             0.692836   3.726513008161922e-05
3   Accuracy   0.7173839705012711  2.7293960727249858e-05
                                                            ***** Linear SVC ****
     Measure                  Mean              StandError
0  TrainTime    2.5988096237182616  0.00026463158242891774
1   TestTime  0.030098342895507814  3.2561881359059688e-06
2         F1    0.8776119999999998   1.208852679196265e-05
3   Accuracy    0.8778765793122693  1.1761169676723211e-05
                                                            ***** Logistic ****
     Measure                  Mean              StandError
0  TrainTime    3.3754664421081544     0.00801084806040223
1   TestTime  0.030335092544555665   5.927930711854171e-06
2