<a href="https://colab.research.google.com/github/MarioAvolio/Amazon-Fine-Foods-reviews-Transformers-Text-Classification/blob/main/Amazon_Fine_Food_Review_Text_Classification_with_Standard_approaches.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Classification: a "standard" approach


**Mario Avolio: 880995 - https://marioavolio.netlify.app/**

Credits: 
- https://www.oreilly.com/library/view/practical-natural-language/9781492054047/

Dataset:
- https://snap.stanford.edu/data/web-FineFoods.html



In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt # plotting
import numpy as np

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Constants and Methods

In [None]:
PATH_PROJ = "/content/drive/MyDrive/data-proj/"
# if not os.path.exists(PATH_PROJ):
#   PATH_PROJ = "/content/drive/MyDrive/shared/data-proj/"

PATH_DATASET = PATH_PROJ+"preprocessed.csv"


# Data 

In [None]:

df = pd.read_csv(PATH_DATASET)
df[df['text'].isnull()]

Unnamed: 0,text,score


In [None]:
def convert_to_list(row):
  """Split a comma-separated token string into a list of tokens.

  Parameters
  ----------
  row : str
    One cell of the ``text`` column, e.g. ``"bought,several,vitality"``.

  Returns
  -------
  list of str
    The tokens in their original order. A value that is not a string
    (for instance the NaN pandas uses for a missing cell) yields an empty
    list instead of ``None``, so later cells can iterate over every row.
  """
  if not isinstance(row, str):
    # Previously a bare ``except`` printed the row and implicitly returned None,
    # which broke every later cell that loops over the tokens.
    return []
  return row.split(",")

df.text = df.text.apply(convert_to_list)
df

Unnamed: 0,text,score
0,"[bought, several, vitality, canned, dog, food,...",5.0
1,"[product, arrived, labeled, jumbo, salted, pea...",1.0
2,"[this, confection, around, centuries, light, p...",4.0
3,"[if, looking, secret, ingredient, robitussin, ...",2.0
4,"[great, taffy, great, price, there, wide, asso...",5.0
...,...,...
35165,"[once, tasted, hazelnut, coffee, hooked, now, ...",5.0
35166,"[has, maxwell, house, quit, making, coffee, ca...",5.0
35167,"[nutty, smooth, subtle, wonderful, aroma, love...",5.0
35168,"[price, right, taste, good, we, buying, harmon...",5.0


In [None]:
df.text.iloc[1]

['product',
 'arrived',
 'labeled',
 'jumbo',
 'salted',
 'peanuts',
 'peanuts',
 'actually',
 'small',
 'sized',
 'unsalted',
 'not',
 'sure',
 'error',
 'vendor',
 'intended',
 'represent',
 'product',
 'jumbo']

# Model - Text Classification

The challenge of text classification is to “learn” this categorization from a
collection of examples for each of these categories and predict the categories for new,
unseen products and new customer reviews.


## A Simple Classifier Without the Text Classification Pipeline: lexicon-based sentiment analysis

The **AFINN** lexicon is a list of **English terms** manually rated for valence with an integer between -5 (negative) and +5 (positive) by Finn Årup Nielsen between 2009 and 2011.

https://arxiv.org/pdf/1103.2903.pdf


In [None]:
!pip install afinn
from afinn import Afinn
afinn = Afinn(emoticons=True)

In [None]:
def apply_afinn(row):
  """Sum the AFINN valence of every token in one review.

  Parameters
  ----------
  row : iterable of str
    The tokens of a single review.

  Returns
  -------
  number
    The sum of ``afinn.score(word)`` over the tokens; 0 for an empty review.
    If the row (or one of its tokens) cannot be scored, the row is printed
    for inspection and the partial sum accumulated so far is returned.
  """
  score = 0
  try:
    for word in row:
      score += afinn.score(word)
  except (TypeError, AttributeError):
    # Only malformed rows (e.g. None / NaN instead of a token list) are tolerated;
    # a bare ``except`` would also have swallowed KeyboardInterrupt.
    print(row)

  return score

In [None]:
len(df.text.iloc[10])

In [None]:
apply_afinn(df.text.iloc[10])

In [None]:
df['afinn'] = df["text"].apply(apply_afinn) #new attribute/column 
#check out how apply works

df[['score', 'afinn', 'text']].head(10)

In [None]:
df.iloc[0]

In [None]:
df.afinn.value_counts() # df. column_name .value_counts()

In [None]:
#let's compute the range of afinn scores in this dataset

In [None]:
abs(min(df.afinn.value_counts().index.astype(int)) - max(df.afinn.value_counts().index.astype(int)))

In [None]:
#let's visualize the histogram of frequencies

In [None]:
df.afinn.plot(kind='hist', #takes the column name as input
        alpha=0.7,
        bins = abs(min(df.afinn.value_counts().index.astype(int)) - max(df.afinn.value_counts().index.astype(int)))-1,
        title='Histogram Of Afinn scores',
        rot=45,
        grid=True,
        figsize=(12,8),
        fontsize=12, 
        color=['#364F6B'])
plt.xlabel('Afinn Scores')
plt.ylabel("Number of Sentences");

In [None]:
#let's check the distribution of sentiment values

In [None]:
df.score.value_counts() #df. column_name .value_counts()

In [None]:
confusion = pd.crosstab(df.score, df.afinn) #confusion matrix
confusion

Compute a 3-class confusion matrix:
- positive (+1) 
- neutral (0) 
- negative (-1)

In [None]:
# Compute 3-class confusion matrix
confusion = pd.crosstab(np.sign(df.score - 3), 
                        np.sign(df.afinn))
confusion

In [None]:
shw = plt.imshow(confusion)
bar = plt.colorbar(shw)

In [None]:
accuracy_3_class = np.sum(np.diag(confusion)) / np.sum(confusion.values)
accuracy_3_class

We compute a 2-class confusion matrix, excluding neutral sentiments.

In [None]:
# Compute 2-class confusion matrix
confusion_2_class = confusion.iloc[[0, 2], [0, 2]] #beware! 
confusion_2_class

In [None]:
shw = plt.imshow(confusion_2_class)
bar = plt.colorbar(shw)

What accuracy does this give?

In [None]:
accuracy_2_class = np.sum(np.diag(confusion_2_class)) / np.sum(confusion_2_class.values)
accuracy_2_class

Use as **baseline** the most frequent class: it gives better results

In [None]:
# Majority-class baseline: always predict the most frequent TRUE class.
# The crosstab rows hold the true sentiment (np.sign(df.score - 3)) and the columns
# the lexicon prediction, so the true-class totals are the row sums (axis=1).
# The default DataFrame.sum() sums down the rows and returns predicted-class totals.
# NOTE(review): any baseline value quoted in later prose was computed from the
# column sums and should be re-checked after this change.
accuracy_2_class_baseline = confusion_2_class.sum(axis=1).max() / np.sum(confusion_2_class.values)
accuracy_2_class_baseline

### Custom lexicon


In [None]:
!git clone https://github.com/mhbashari/NRC-Persian-Lexicon
!mv "/content/NRC-Persian-Lexicon/NRC-Emotion-Lexicon-v0.92-InManyLanguages-web.xlsx" "/content/NRC-Emotion-Lexicon-v0.92-InManyLanguages-web.xlsx"

In [None]:
import pandas as pd
# import the lexicon
lexicon_df = pd.read_excel("NRC-Emotion-Lexicon-v0.92-InManyLanguages-web.xlsx", engine="openpyxl")

#many languages
#8 emotion types
#https://github.com/sebastianruder/emotion_proposition_store/tree/master/NRC-Emotion-Lexicon-v0.92


#The NRC emotion lexicon is a list of words and their associations with
#eight emotions (anger, fear, anticipation, trust, surprise, sadness,
#joy, and disgust) and two sentiments (negative and positive). The
#annotations were manually done through Amazon's Mechanical Turk. Refer
#to publications below for more details: http://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm 


lexicon_df.head(10)

In [None]:
# Build a word -> polarity lookup from the NRC lexicon: +1 for positive, -1 for negative.
lexicon = {}

#https://www.w3schools.com/python/ref_func_zip.asp

for word, pos, neg in zip(lexicon_df["English Word"], lexicon_df["Positive"], lexicon_df["Negative"]):
  if not (pos or neg):
    continue # words flagged neither positive nor negative are left out
  value = 1 if pos else -1 # the positive flag wins when both flags are set
  lexicon[str(word).lower()] = value # keys are lower-cased

In [None]:
# this custom function will return the sentiment associated to a sentence via the sum of single words
def myscore(sentence):
  """Return the lexicon sentiment of a tokenised sentence.

  Each token is lower-cased and looked up in the module-level ``lexicon``
  dict; tokens that are not in the lexicon contribute 0. The result is the
  sum of the per-token values.
  """
  polarities = (lexicon.get(token.lower()) or 0 for token in sentence)
  return sum(polarities)

In [None]:
# apply myscore function
df['myscore'] = df["text"].apply(myscore)
df[['score', 'afinn','myscore', 'text']].tail(30)

In [None]:
confusion = pd.crosstab(np.sign(df.score - 3), np.sign(df.myscore))
confusion_2_class = confusion.iloc[[0, 2], [0, 2]]
confusion_2_class

In [None]:
shw = plt.imshow(confusion_2_class)
bar = plt.colorbar(shw)

In [None]:
accuracy_2_class = np.sum(np.diag(confusion_2_class)) / np.sum(confusion_2_class.values)
accuracy_2_class

This is lower than the baseline (0.9219343235862253) — beware!

What about the confusion matrix between the AFINN score and the new score?

In [None]:
# Compare the sign of the AFINN score with the sign of the custom lexicon score.
# Both scores are already centred on 0, so no offset is applied: the "- 3" shift is
# only meaningful for the 1-5 star rating (3 stars = neutral) and, applied to AFINN,
# it mislabelled every review scoring below 3 as negative.
confusion = pd.crosstab(np.sign(df.afinn), np.sign(df.myscore))
# Keep the first and last row/column, dropping the middle (neutral) class.
confusion_2_class = confusion.iloc[[0, 2], [0, 2]]
confusion_2_class

In [None]:
shw = plt.imshow(confusion_2_class)
bar = plt.colorbar(shw)

In [None]:
accuracy_2_class = np.sum(np.diag(confusion_2_class)) / np.sum(confusion_2_class.values)
accuracy_2_class

## Feature Engineering and standard ML classifiers


In [None]:
df.score.value_counts()

5.0    22015
4.0     5071
1.0     3218
3.0     2860
2.0     2006
Name: score, dtype: int64

In [None]:
def balance_by_score(score_value, number_of_instances, data):
  """Pick the index labels of a fixed-size random sample of one class.

  Keeps the entries of ``data`` equal to ``score_value``, draws
  ``number_of_instances`` of them without replacement (seeded with
  ``random_state=1`` so the draw is repeatable) and returns their index
  labels as a plain list.
  """
  matching = data.loc[data.eq(score_value)]
  picked = matching.sample(n=number_of_instances, random_state=1)
  return picked.index.tolist()

In [None]:
from sklearn.model_selection import train_test_split
#Step 1: train-test split
X = df.text
#the column text contains textual data to extract features from.
y = df.score
#this is the column we are learning to predict.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2)

In [None]:
# y_train.value_counts()

In [None]:
# indx = []
# for i in range(1,6):
#   indx.extend(balance_by_score(i, 1628, y_train))

# len(indx)

In [None]:
# def intersection(lst1, lst2):
#     return list(set(lst1) & set(lst2))

# len(intersection(X_train.index.to_list(), indx))

In [None]:
# X_train = X_train.loc[indx]
# y_train = y_train.loc[indx]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

#Step 2-3: Pre-process and Vectorize train and test data
# Bag-of-words counts with the vocabulary capped at 5000 features (max_features).
vect = CountVectorizer(stop_words=None, lowercase=True, max_features=5000)
# Each review is a list of tokens, so it is joined back into one space-separated
# string before vectorizing. The vocabulary is fitted on the training split only
# and then reused, unchanged, to transform the test split.
X_train_dtm = vect.fit_transform(X_train.apply(lambda x: " ".join(x)))
X_test_dtm = vect.transform(X_test.apply(lambda x: " ".join(x)))
print(X_train_dtm.shape, X_test_dtm.shape)

(28136, 5000) (7034, 5000)


In [None]:
def analisys(y_test, y_pred_class, labels=None):
  """Print accuracy and a classification report, then plot the confusion matrix.

  Parameters
  ----------
  y_test : array-like
    True class labels.
  y_pred_class : array-like
    Predicted class labels, aligned with ``y_test``.
  labels : array-like, optional
    Class labels, in the order used for the confusion-matrix rows/columns.
    When omitted, the sorted set of labels found in ``y_test`` and
    ``y_pred_class`` is used. The labels used to be hard-coded to
    ``[1, 2, 3, 4, 5]``, which fails as soon as a class is missing from the
    data or the labels are not the five star ratings.

  Returns
  -------
  None
    Output is printed and shown as a matplotlib figure.
  """
  if labels is None:
    labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred_class)]))

  print("\n Accuracy: ", accuracy_score(y_test, y_pred_class))
  print("Report: \n",classification_report(y_test, y_pred_class))

  # Pass the same labels to both calls so the tick labels always match the matrix.
  cm = confusion_matrix(y_test, y_pred_class, labels=labels)

  disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                display_labels=labels)

  disp.plot()

  plt.show()

In [None]:
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

def _fit_and_report(classifier, X_train_balanced, y_train_balanced, X_ts, y_ts):
  """Fit ``classifier`` on the training data, predict the test set and report the results."""
  classifier.fit(X_train_balanced, y_train_balanced) #fit the model with training data
  y_pred_class = classifier.predict(X_ts) #make class predictions for test data
  analisys(y_ts, y_pred_class)

def svm_classifier(X_train_balanced, y_train_balanced, X_ts, y_ts):
  """Train a ``LinearSVC`` with default settings and report its test performance.

  ``X_train_balanced`` / ``y_train_balanced`` are the training features and
  labels, ``X_ts`` / ``y_ts`` the test features and labels. Returns ``None``;
  the metrics are printed and plotted by ``analisys``.
  """
  _fit_and_report(LinearSVC(), X_train_balanced, y_train_balanced, X_ts, y_ts)

def mnb_classifier(X_train_balanced, y_train_balanced, X_ts, y_ts):
  """Train a ``MultinomialNB`` with default settings and report its test performance.

  Same arguments and return value as ``svm_classifier``.
  """
  _fit_and_report(MultinomialNB(), X_train_balanced, y_train_balanced, X_ts, y_ts)

def lr_classifier(X_train_balanced, y_train_balanced, X_ts, y_ts):
  """Train a ``LogisticRegression`` (``max_iter=10000``) and report its test performance.

  Same arguments and return value as ``svm_classifier``.
  """
  _fit_and_report(LogisticRegression(max_iter=10000), X_train_balanced, y_train_balanced, X_ts, y_ts)


### Balancing Data

https://imbalanced-learn.org/stable/

In [None]:
!pip install imbalanced-learn
from imblearn.over_sampling import *
from imblearn.under_sampling import *
from imblearn.combine import *
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, classification_report



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
def balancing_data(method, X_train_not_balanced, y_train_not_balanced):
  """Resample the training set with ``method`` and plot the class shares before and after.

  ``method`` is any object exposing ``fit_resample(X, y)``. Two pie charts are
  drawn side by side (original vs. balanced label distribution), the balanced
  class counts are printed, and the resampled ``(X, y)`` pair is returned.
  """
  pct_format = "%.2f"
  figure, (ax_before, ax_after) = plt.subplots(ncols=2, figsize=(10, 5))

  # Left panel: label distribution as received.
  y_train_not_balanced.value_counts().plot.pie(autopct=pct_format, ax=ax_before)
  ax_before.set_title("Original")

  resampled = method.fit_resample(X_train_not_balanced, y_train_not_balanced)
  X_train_balanced, y_train_balanced = resampled

  # Right panel: label distribution after resampling.
  balanced_counts = y_train_balanced.value_counts()
  balanced_counts.plot.pie(autopct=pct_format, ax=ax_after)
  ax_after.set_title("Balanced")

  figure.tight_layout()
  print(balanced_counts)
  return X_train_balanced, y_train_balanced

In [None]:
X_train_balanced, y_train_balanced = balancing_data(BorderlineSMOTE(random_state=42), X_train_dtm, y_train)

### Naive Bayes Classifier


In [None]:
mnb_classifier(X_train_balanced, y_train_balanced, X_test_dtm, y_test)

### Logistic Regression


In [None]:
lr_classifier(X_train_balanced, y_train_balanced, X_test_dtm, y_test)

### Support Vector Machine


In [None]:
svm_classifier(X_train_balanced, y_train_balanced, X_test_dtm, y_test)

## Using Neural Embeddings in Text Classification


### Word Embeddings
We use [GoogleNews-vectors-negative300](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/view?resourcekey=0-wjGZdNAUop6WykTtMip30g). This is a large model that can be seen as a dictionary where the keys are words in the
vocabulary and the values are their learned embedding representations. Given a
query word, if the word’s embedding is present in the dictionary, the model returns
that embedding.

In [None]:
data_path= PATH_PROJ + "GoogleNews-vectors-negative300.bin" # from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/view?resourcekey=0-wjGZdNAUop6WykTtMip30g
# !cp $data_path "/content/GoogleNews-vectors-negative300.bin"

In [None]:
from gensim.models import KeyedVectors

#Load W2V model. This will take some time.
w2v_model = KeyedVectors.load_word2vec_format(data_path, binary=True)
print('done loading Word2Vec')

done loading Word2Vec


How do we use this pre-learned embedding to represent features? A simple approach is just
to average the embeddings for individual words in text.

In [None]:
# Creating a feature vector by averaging all embeddings for all sentences
def embedding_feats(list_of_lists, w2v_model):
  """Represent each document by the average of its word embeddings.

  Parameters
  ----------
  list_of_lists : iterable of iterable of str
    One token sequence per document.
  w2v_model : mapping-like
    Must support ``token in w2v_model`` and ``w2v_model[token]`` (returning a
    1-D vector). If it exposes ``vector_size`` (as gensim ``KeyedVectors``
    does), that dimension is used; otherwise 300 is assumed.

  Returns
  -------
  list of numpy.ndarray
    One vector per document, in input order. Tokens missing from the model
    are ignored. A document with no known token gets an all-zero vector;
    previously it produced a 0/0 division and therefore a NaN vector.
  """
  DIMENSION = 300 # fallback when the model does not report its own size
  dimension = getattr(w2v_model, "vector_size", DIMENSION)
  feats = []
  for tokens in list_of_lists:
    feat_for_this = np.zeros(dimension)
    count_for_this = 0

    for token in tokens:
      if token in w2v_model:
        feat_for_this += w2v_model[token]
        count_for_this += 1

    # Average only when at least one token was found; otherwise keep the zeros.
    if count_for_this:
      feat_for_this /= count_for_this
    feats.append(feat_for_this)

  return feats

In [None]:
train_vectors = embedding_feats(X_train, w2v_model)
print(len(train_vectors))
test_vectors = embedding_feats(X_test, w2v_model)
print(len(test_vectors))

28136
7034


In [None]:
X_train_balanced, y_train_balanced = balancing_data(BorderlineSMOTE(random_state=42), train_vectors, y_train)

In [None]:
np.argwhere(np.isnan(np.array(train_vectors)))

array([], shape=(0, 2), dtype=int64)

We treat the resulting
embedding vector as the feature vector that represents the entire text. Once this feature engineering is done, the final step is similar to what we did in the previous section: use these features and train a classifier. 


### Naive Bayes Classifier

The multinomial Naive Bayes classifier does not allow negative values in the feature vectors, but averaged word embeddings do contain negative components. As the article credited below notes, the vectors can be translated/scaled uniformly to avoid negatives; here we do exactly that by rescaling every feature to the [0, 1] range with `MinMaxScaler`, fitted on the training data only, before training the classifier.

credits: https://towardsdatascience.com/word-embeddings-and-document-vectors-when-in-doubt-simplify-8c9aaeec244e

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_balanced_for_mnb = scaler.fit_transform(X_train_balanced)
X_ts_for_mnb = scaler.transform(test_vectors)


mnb_classifier(X_train_balanced_for_mnb, y_train_balanced, X_ts_for_mnb, y_test)

### Logistic Regression


In [None]:
lr_classifier(X_train_balanced, y_train_balanced, test_vectors, y_test)

### Support Vector Machine


In [None]:
svm_classifier(X_train_balanced, y_train_balanced, test_vectors, y_test)

### Subword Embeddings and fastText

FastText embeddings are based on the idea of
enriching word embeddings with subword-level information. Thus, the embedding
representation for each word is represented as a sum of the representations of individual character n-grams. While fastText is a general-purpose library to learn the embeddings, it also supports
off-the-shelf text classification by providing end-to-end classifier training and testing;
i.e., we don’t have to handle feature extraction separately. 

https://en.wikipedia.org/wiki/FastText


In [None]:
PATH_DATASET = PATH_PROJ+"food.csv"

In [None]:
df = pd.read_csv(PATH_DATASET)

In [None]:
!pip install fasttext==0.9.2


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


So, the first step
involves cleaning the text to
remove extraneous characters, similar to what we did in the pre-processing steps for
the other classifier examples we’ve seen so far.

In [None]:
# Lets do some cleaning of this text
def clean_it(text,normalize=True):
    """Lower-case a review and space out or strip its punctuation.

    Parameters
    ----------
    text : object
        The raw review; it is converted with ``str()`` first.
    normalize : bool, default True
        When true, the text is NFKD-normalized and every non-ASCII character
        is dropped (so an accented letter keeps only its base letter).

    Returns
    -------
    str
        The cleaned text. ``, : ;`` become a space, double quotes are
        removed, and ``' . ( ) ! ?`` are surrounded by spaces so they become
        separate tokens.
    """
    # Local import: nothing else in the notebook needs this module.
    import unicodedata

    # Replacing possible issues with data. We can add or reduce the replacement in this chain
    s = str(text).replace(',',' ').replace('"','').replace('\'',' \' ').replace('.',' . ').replace('(',' ( ').\
            replace(')',' ) ').replace('!',' ! ').replace('?',' ? ').replace(':',' ').replace(';',' ').lower()

    # normalizing / encoding the text
    if normalize:
        # ``s`` is a plain str, so the pandas-style ``s.normalize(...).str.encode(...)``
        # chain raised AttributeError; this is the scalar equivalent.
        s = unicodedata.normalize('NFKD', s).encode('ascii','ignore').decode('utf-8')

    return s

# Now lets define a small function where we can use above cleaning on datasets
def clean_df(data, cleanit= False, shuffleit=False, encodeit=False, label_prefix='__class__', random_state=None):
    """Build the two-column (text, score) frame that is written out for fastText.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``text`` and a ``score`` column; it is not modified.
    cleanit : bool, default False
        Apply ``clean_it`` to every text.
    shuffleit : bool, default False
        Shuffle the rows and renumber the index from 0.
    encodeit : bool, default False
        Passed to ``clean_it`` as its ``normalize`` flag.
    label_prefix : str, default '__class__'
        Prefix put in front of every score to form the label.
    random_state : int or None, default None
        Seed for the shuffle; set it to make the row order reproducible.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` and ``score``, where ``score`` is
        ``label_prefix + str(score) + ' '``.
    """
    # Defining the new data
    out = data[['text']].copy(deep=True)
    out['score'] = label_prefix + data['score'].astype(str) + ' '

    # cleaning it
    if cleanit:
        out['text'] = out['text'].apply(lambda x: clean_it(x,encodeit))

    # shuffling it
    if shuffleit:
        # sample() returns a new frame; its result used to be discarded,
        # which made the shuffle a silent no-op.
        out = out.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return out

In [None]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, random_state=1, test_size=0.2)

In [None]:
df_train_cleaned = clean_df(df_train, True, True)
df_train_cleaned

Unnamed: 0,text,score
6584,these are really good ! ! ! also be sure ...,__class__5.0
22631,this is among the best sencha i have had . ma...,__class__5.0
10870,great flavor . i snack on these when i need ...,__class__5.0
31464,although the size of the can is deceiving ( o...,__class__5.0
24686,the fda has issued a warning about pet treats ...,__class__1.0
...,...,...
7813,i ' m not vegetarian but i ' m on the fat sma...,__class__4.0
32511,i love this cereal . it is sweetened with mo...,__class__5.0
5192,great price and my cat loves this food . he ...,__class__5.0
12172,i am always feeding my dogs chicken jerky i l...,__class__5.0


In [None]:
df_test_cleaned = clean_df(df_test, True, True)
df_test_cleaned

Unnamed: 0,text,score
15799,i really like nantucket blend . i drink way ...,__class__1.0
5828,hi i ordered this product from another websit...,__class__3.0
23130,smooth coffee with rich flavorful aroma alon...,__class__2.0
9609,fondarific usually has glowing reviews . . ....,__class__1.0
16266,the price was awesome the shipping was incred...,__class__5.0
...,...,...
1785,love these pop-chips ! the variety bag was a ...,__class__5.0
16901,i like the idea behind this product but the t...,__class__3.0
32457,i have been giving these chews to my mini-pinc...,__class__1.0
20844,we brought this product home from jamaica and ...,__class__5.0


In [None]:
# Write files to disk as fastText classifier API reads files from disk.
# PATH_PROJ already ends with "/", so concatenating '/fasttext_...' produced a doubled
# slash; os.path.join builds the path correctly either way.
train_file = os.path.join(PATH_PROJ, 'fasttext_train.csv')
df_train_cleaned.to_csv(train_file, header=None, index=False, columns=['score','text'] )

test_file = os.path.join(PATH_PROJ, 'fasttext_test.csv')
df_test_cleaned.to_csv(test_file, header=None, index=False, columns=['score','text'])

Now that we have the train and test files written into disk in a format fastText wants, we are ready to use it for text classification!

In [None]:
%%time
## Using fastText for feature extraction and training
from fasttext import train_supervised 
"""fastText expects and training file (csv), a model name as input arguments.
label_prefix refers to the prefix before label string in the dataset.
default is __label__. In our dataset, it is __class__. 
There are several other parameters which can be seen in: 
https://pypi.org/project/fasttext/
"""

model = train_supervised(input=train_file, label="__class__", lr=1.0, epoch=75, loss='ova', wordNgrams=2, dim=200, thread=2, verbose=100)

CPU times: user 5min 38s, sys: 4.69 s, total: 5min 43s
Wall time: 3min 58s


In [None]:
def f1_score_compute(precision, recall):
  """Return the F1 score, i.e. the harmonic mean of precision and recall.

  Both arguments must be on the same scale (fractions or percentages); the
  result is on that scale too. When precision and recall are both 0 the
  score is defined as 0.0 instead of raising ``ZeroDivisionError``.
  """
  denominator = precision + recall
  if denominator == 0:
    return 0.0
  return 2 * (precision * recall) / denominator

# Precision = True Positive / (True Positive + False Positive) 
# Recall = True Positive / (True Positive + False Negative) 
# F1 Score = 2 * (Precision * Recall) / (Precision + Recall)

# Evaluate the top-k predictions for k = 1..5 and print precision, recall and F1 for each k.
for k in range(1,6):
  # you should specify k parameter to get the top-k predicted classes. DOC: https://fasttext.cc/docs/en/supervised-tutorial.html#advanced-readers-precision-and-recall
  results = model.test(test_file,k=k)
  # results is indexed as (number of test samples, precision@k, recall@k)
  n_samples, precision_at_k, recall_at_k = results[0], results[1], results[2]
  recall = recall_at_k*100
  precision = precision_at_k*100
  f1 = f1_score_compute(precision, recall)
  print(f"Test Samples: {n_samples} Precision@{k} : {precision:2.4f} Recall@{k} : {recall:2.4f}  --- F1_score@{k}: {f1}")

Test Samples: 7035 Precision@1 : 72.6084 Recall@1 : 72.6084  --- F1_score@1: 72.60838663823739
Test Samples: 7035 Precision@2 : 42.7150 Recall@2 : 85.4300  --- F1_score@2: 56.95332859511965
Test Samples: 7035 Precision@3 : 30.6326 Recall@3 : 91.8977  --- F1_score@3: 45.94882729211087
Test Samples: 7035 Precision@4 : 23.9090 Recall@4 : 95.6361  --- F1_score@4: 38.25444207533759
Test Samples: 7035 Precision@5 : 20.0000 Recall@5 : 100.0000  --- F1_score@5: 33.333333333333336
