<a href="https://colab.research.google.com/github/MWFK/NLP-from-Zero-to-Hero/blob/main/0_Binary_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libs

In [16]:
import re
import string
import pandas as pd
import numpy as np
from functools import lru_cache

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn import metrics

# Data

In [23]:
# Source of the data: https://www.kaggle.com/sid321axn/amazon-alexa-reviews
df_amazon = pd.read_excel(r'/content/amazon_alexa.xlsx')
print(df_amazon.shape)

# Features are the raw review texts; the target is the `feedback` column.
X         = df_amazon['verified_reviews']
ylabels   = df_amazon['feedback']

# Stratify on the labels so train and test keep the same class proportions,
# and pin random_state so the split (and every score computed from it) is
# reproducible on a fresh Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, ylabels, test_size=0.25, shuffle=True, stratify=ylabels, random_state=42
)

(3150, 5)


# Config

In [6]:
# Shared text-processing resources consumed by the tokenizer cell below.
nlp = spacy.load("en_core_web_sm")   # spaCy pipeline used to tokenize and lemmatize
stop_words = STOP_WORDS              # English stop words shipped with spaCy
punctuations = string.punctuation    # punctuation characters, kept as a single string

# Processing

In [7]:
# Matches runs of two or more whitespace characters. Written as a raw string so
# `\s` is a regex escape rather than an (invalid) Python string escape.
_MULTI_WHITESPACE = re.compile(r"\s\s+")


@lru_cache(maxsize=10000)
def spacy_tokenizer(sentence):
    """Turn a sentence into a list of lemmas without stop words or punctuation.

    The sentence is stripped, runs of two or more whitespace characters are
    collapsed to a single space, and the result is passed through the spaCy
    pipeline `nlp`. Each token's lemma is kept unless it is in `stop_words`
    or is found in `punctuations`.

    Results are memoized with `lru_cache`, so `sentence` must be hashable
    (e.g. a `str`), and repeated calls with the same sentence return the
    *same* list object. Treat the returned list as read-only; mutating it
    would corrupt the cached entry.

    Parameters
    ----------
    sentence : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        The surviving lemmas, in document order.
    """
    cleaned = _MULTI_WHITESPACE.sub(" ", sentence.strip())

    # NOTE(review): lemmas are used exactly as spaCy returns them; nothing here
    # lowercases explicitly. When this function is called through
    # CountVectorizer with its default lowercase=True, the text should already
    # be lowercased before it gets here -- confirm if used standalone.
    lemmas = [token.lemma_ for token in nlp(cleaned)]

    # `punctuations` is a plain string, so `in` is a substring test: single
    # punctuation characters (and the empty string) are filtered out.
    return [
        lemma
        for lemma in lemmas
        if lemma not in stop_words and lemma not in punctuations
    ]

# Model 1

In [24]:
%%time

text_clf = Pipeline([('vect' , CountVectorizer(tokenizer=spacy_tokenizer)),
                     ('tfidf', TfidfTransformer()),
                     ('clf'  , LogisticRegression())])

model     = text_clf.fit(X_train, y_train)

CPU times: user 15.7 s, sys: 69.3 ms, total: 15.8 s
Wall time: 15.8 s


In [25]:
# Score Model 1 on the held-out test split.
predicted = model.predict(X_test)

# Labels are padded to a common width so the printed colons line up.
for label, score_fn in (
    ("Accuracy ", metrics.accuracy_score),
    ("Precision", metrics.precision_score),
    ("Recall   ", metrics.recall_score),
):
    print("Logistic Regression {}: {:0.4f}".format(label, score_fn(y_test, predicted)))

Logistic Regression Accuracy : 0.9201
Logistic Regression Precision: 0.9199
Logistic Regression Recall   : 1.0000


# Model 2

In [26]:
# Model 2: TF-IDF features feeding a linear classifier trained with SGD (hinge loss).
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42)),
])

# Search grid: n-gram range x IDF on/off x regularization strength.
# The `<step>__<param>` keys address parameters of the named pipeline steps.
parameters = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': [True, False],
    'clf__alpha': [1e-2, 1e-3],
}

# n_jobs=-1 runs the candidate fits in parallel on all available cores.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
model_gs = gs_clf.fit(X_train, y_train)

# Best cross-validated score and the parameter combination that produced it.
print('{:0.4f}'.format(model_gs.best_score_))
print(model_gs.best_params_)

0.9213
{'clf__alpha': 0.001, 'tfidf__ngram_range': (1, 1), 'tfidf__use_idf': True}


In [27]:
# Score Model 2 (the grid-searched TF-IDF + SGDClassifier pipeline) on the
# held-out test split.
predicted = model_gs.predict(X_test)

# The labels name the model actually being evaluated: this is the SGD
# classifier with hinge loss, not the logistic regression from Model 1.
print("SGD Classifier Accuracy : {:0.4f}".format(metrics.accuracy_score (y_test, predicted)))
print("SGD Classifier Precision: {:0.4f}".format(metrics.precision_score(y_test, predicted)))
print("SGD Classifier Recall   : {:0.4f}".format(metrics.recall_score   (y_test, predicted)))

Logistic Regression Accuracy : 0.9188
Logistic Regression Precision: 0.9188
Logistic Regression Recall   : 1.0000
