# Text classification

In [1]:
import pandas as pd
import numpy as np
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
df = pd.read_csv('data/amazon_alexa.tsv', sep='\t')
df.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [3]:
df.shape

(3150, 5)

## Load language model, punctuation and stopwords

In [4]:
# load (medium-size) English language model and stopwords
nlp = spacy.load('en_core_web_md')
stop_words = spacy.lang.en.stop_words.STOP_WORDS
# list of punctuation
punctuations = string.punctuation

## Custom tokenizer

In [5]:
# creating our tokenizer function
def custom_tokenizer(sentence):
    
    # Creating our token object, which is used to create documents with linguistic annotations.
    tokens = nlp(sentence)

    # Lemmatizing each token and converting each token into lowercase
    tokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens ]

    # Removing stop words
    tokens = [ word for word in tokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return tokens

## Custom transformer

In [6]:
# basic function to clean the text
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()


# custom transformer inherits sklearn class
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # clean text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

In [7]:
X = df['verified_reviews']
y = df['feedback']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

In [8]:
# vec = TfidfVectorizer(tokenizer=custom_tokenizer, max_features=1500)
vec = CountVectorizer(tokenizer=custom_tokenizer)
classifier = LogisticRegression()

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vec),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('cleaner', <__main__.predictors object at 0x0000028A06864100>),
                ('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function custom_tokenizer at 0x0000028A06838E50>,
                                 vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept

In [9]:
# Predicting with a test dataset
predicted = pipe.predict(X_test)

# Model Accuracy
print("Logistic Regression Accuracy:", metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:", metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:", metrics.recall_score(y_test, predicted))

Logistic Regression Accuracy: 0.9248677248677248
Logistic Regression Precision: 0.9363336992316136
Logistic Regression Recall: 0.9849884526558892


In [10]:
print('Confusion matrix:')
pd.DataFrame(metrics.confusion_matrix(y_test, predicted))

Confusion matrix:


Unnamed: 0,0,1
0,21,58
1,13,853


## Document vectors

In [11]:
# custom transformer inherits sklearn class
class doc_vectorizer(TransformerMixin):
    
    def transform(self, X, **transform_params):
        # vectorize text
        return [nlp(text).vector for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

In [12]:
# Create pipeline using Bag of Words
pipe = Pipeline([('vectorizer', doc_vectorizer()),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 <__main__.doc_vectorizer object at 0x0000028A0C84AC70>),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [13]:
# Predicting with a test dataset
predicted = pipe.predict(X_test)

# Model Accuracy
print("Logistic Regression Accuracy:", metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:", metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:", metrics.recall_score(y_test, predicted))

Logistic Regression Accuracy: 0.9322751322751323
Logistic Regression Precision: 0.9330453563714903
Logistic Regression Recall: 0.9976905311778291


In [14]:
print('Confusion matrix:')
pd.DataFrame(metrics.confusion_matrix(y_test, predicted))

Confusion matrix:


Unnamed: 0,0,1
0,17,62
1,2,864
