In [1]:
!pip install -U spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install pyspellchecker

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install bs4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
# Import Libraries
import pandas as pd
import numpy as np
from pathlib import Path
import textwrap as tw
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
import joblib
import re
from bs4 import BeautifulSoup
from scipy.sparse import hstack
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Token
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from collections import Counter
from xgboost import XGBClassifier
from sklearn import metrics
from spellchecker import SpellChecker

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
data_folder = Path('/content/drive/MyDrive/NLP')

In [7]:
!python -m spacy download en_core_web_sm

2022-09-15 14:47:52.281029: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 231 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [8]:
nlp = spacy.load('en_core_web_sm')

In [9]:
spam_file = data_folder / 'spam.csv'

In [10]:
# creating Dataframe
spam = pd.read_csv(spam_file, index_col=0,encoding = 'ISO-8859-1')

In [11]:
print(f'Shape of data set is : {spam.shape}')

Shape of data set is : (5572, 4)


In [12]:
spam.head()

Unnamed: 0_level_0,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
v1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ham,"Go until jurong point, crazy.. Available only ...",,,
ham,Ok lar... Joking wif u oni...,,,
spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
ham,U dun say so early hor... U c already then say...,,,
ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [13]:
spam.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5572 entries, ham to ham
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v2          5572 non-null   object
 1   Unnamed: 2  50 non-null     object
 2   Unnamed: 3  12 non-null     object
 3   Unnamed: 4  6 non-null      object
dtypes: object(4)
memory usage: 217.7+ KB


In [14]:
spam.drop(columns = ['Unnamed: 2','Unnamed: 3', 'Unnamed: 4'], inplace=True)
spam.reset_index(inplace=True)
spam.rename(columns={'v1':'label', 'v2':'message'},inplace=True)

In [15]:
spam.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [16]:
# Checking for missing values
spam.isna().sum()

label      0
message    0
dtype: int64

In [17]:
# Checking distribution of class labels for train dataset
spam['label'].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: label, dtype: float64

In [18]:
# Checking distribution of class labels for train dataset
spam['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [19]:
save_model_folder = Path('/content/drive/MyDrive/NLP/saved_models')

In [20]:
#Using accuracy as the metric is not optimal to making the best prediction especially due to the fact that our data set is imbalanced

#To achieve an optimal result we would like to maximize two components:

#(1) The "True Positive Rate" aka Sensitivity aka Recall. Given by: TP/(TP+FN)

#(2) The Precision - How many of the positive predictions, are in fact correct. Given by: TP/(TP+FP)

#To obtain a balance between both we use F Beta Measure which is given by: (2 x Precision x Recall)/(Precision+Recall)
#F2-measure puts more attention on increasing reacall and minimizing false negatives which is critical for our problem statement given that we would not want to miss any actual spam messages.

In [21]:
# Creating a scorer for F2 score so that we can given an emphases on the minority class predictions i.e higher recall
from sklearn.metrics import fbeta_score, make_scorer
f2score = make_scorer(fbeta_score, beta=2)
f2score

make_scorer(fbeta_score, beta=2)

In [22]:
spam['label'] = spam['label'].map({'spam':1, 'ham':0}).astype(int)

In [23]:
spam.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


**FINAL** **PIPELINE**

**Reasoning behind choosing Final Pipeline**

In File 1, We created three different models:

Model 1 : # Data Preprocessing + Sparse Embeddings (TF-IDF) + ML Model pipeline

Model 2: #Featurization (TF-IDF) + Feature Engineering + ML Model pipeline

Model 3: #Feature Engineering + ML Model pipeline 

Now we had to choose amongst these for the final pipeline. In this case we want a model which is better at generalising the result. 

Hence we consider the model with the Best Cross Validation Score as our final model. 

In our case Model 1 even though shows overfitting had the highest cross validation score of 0.65. Hence I have choosen Model 1 as the final pipeline.

In [24]:
# Sample 40% of data
spam_smaller = spam.sample(frac=0.4, replace=True, random_state=1)

In [25]:
X = spam_smaller['message'].values
y = spam_smaller['label'].values
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=0)
print(f'X_train: {X_train.shape} y_train: {y_train.shape}')
print(f'X_test: {X_test.shape} y_test: {y_test.shape}')

X_train: (1671,) y_train: (1671,)
X_test: (558,) y_test: (558,)


In [26]:
# Defining Custom Classes

In [27]:
class SpacyPreprocessor(BaseEstimator, TransformerMixin):
    np.random.seed(0)
    def __init__(self, lammetize=True, lower=True, remove_stop=True, 
                 remove_punct=True, remove_email=True, remove_url=True, remove_num=False, stemming = False,
                 add_user_mention_prefix=True, remove_hashtag_prefix=False):
        self.remove_stop = remove_stop
        self.remove_punct = remove_punct
        self.remove_num = remove_num
        self.remove_url = remove_url
        self.remove_email = remove_email
        self.lammetize = lammetize
        self.lower = lower
        self.stemming = stemming
        self.add_user_mention_prefix = add_user_mention_prefix
        self.remove_hashtag_prefix = remove_hashtag_prefix

 # helpfer functions for basic cleaning 

    def basic_clean(self, text):
        
        '''
        This fuction removes HTML tags from text
        '''
        if (bool(BeautifulSoup(text, "html.parser").find())==True):         
            soup = BeautifulSoup(text, "html.parser")
            text = soup.get_text()
        else:
            pass
        return re.sub(r'[\n\r]',' ', text) 

    # helper function for pre-processing with spacy and Porter Stemmer
    
    def spacy_preprocessor(self,texts):

        final_result = []
        nlp = spacy.load('en_core_web_sm', disable=['parser','ner'])
        
        ## Add @ as a prefix so that we can separate the word from its token
        prefixes = list(nlp.Defaults.prefixes)

        if self.add_user_mention_prefix:
            prefixes += ['@']

        ## Remove # as a prefix so that we can keep hashtags and words together
        if self.remove_hashtag_prefix:
            prefixes.remove(r'#')

        prefix_regex = spacy.util.compile_prefix_regex(prefixes)
        nlp.tokenizer.prefix_search = prefix_regex.search

        matcher = Matcher(nlp.vocab)
        if self.remove_stop:
            matcher.add("stop_words", [[{"is_stop" : True}]])
        if self.remove_punct:
            matcher.add("punctuation",[ [{"is_punct": True}]])
        if self.remove_num:
            matcher.add("numbers", [[{"like_num": True}]])
        if self.remove_url:
            matcher.add("urls", [[{"like_url": True}]])
        if self.remove_email:
            matcher.add("emails", [[{"like_email": True}]])
            
        Token.set_extension('is_remove', default=False, force=True)

        cleaned_text = []
        for doc in nlp.pipe(texts,batch_size= 500,disable=['parser','ner'], n_process = 3):
            matches = matcher(doc)
            for _, start, end in matches:
                for token in doc[start:end]:
                    token._.is_remove =True
                    
            if self.lammetize:              
                text = ' '.join(token.lemma_ for token in doc if (token._.is_remove==False))
            elif self.stemming:
                text = ' '.join(PorterStemmer().stem(token.text) for token in doc if (token._.is_remove==False))
            else:
                text = ' '.join(token.text for token in doc if (token._.is_remove==False))
                                   
            if self.lower:
                text=text.lower()
            cleaned_text.append(text)
        return cleaned_text

    def fit(self, X,y=None):
        return self

    def transform(self, X, y=None):
        try:
            if str(type(X)) not in ["<class 'list'>","<class 'numpy.ndarray'>"]:
                raise Exception('Expected list or numpy array got {}'.format(type(X)))
            x_clean = [self.basic_clean(text) for text in X]
            x_clean_final = self.spacy_preprocessor(x_clean)
            return x_clean_final
        except Exception as error:
            print('An exception occured: ' + repr(error))

In [28]:
class FeatureEngineering(BaseEstimator, TransformerMixin):
    np.random.seed(0)
    nlp = spacy.load('en_core_web_sm', disable=['parser'])
    spell = SpellChecker()
    def __init__(self, word_count=False, char_count=False, char_count_wo_space=False, 
                 avg_word_length=False, digit_count=False, noun_count= True, propernoun_count=True, 
                 verb_count=True, aux_count= True, adj_count= True, ner_count= True, misspelled_count=True):
        self.word_count = word_count
        self.char_count = char_count
        self.char_count_wo_space = char_count_wo_space
        self.avg_word_length = avg_word_length
        self.digit_count = digit_count
        self.noun_count = noun_count
        self.propernoun_count = propernoun_count
        self.verb_count = verb_count
        self.aux_count = aux_count
        self.adj_count = adj_count
        self.ner_count = ner_count
        self.misspelled_count= misspelled_count
  
    def fit(self, X,y=None):
        return self

    #Useful functions

    def wordCount(self,text):
        return len(text.split())

    def charCount(self,text):
        return len(text)

    def charCountWithoutSpace(self,text):
        count = 0
        for word in text.split():
            count += len(word)
        return count

    def avgWordLength(self,text):
        word_length = 0
        for token in text.split():
            word_length += len(token)
        word_count = len(text.split())
        if word_count == 0:
            return 0
        else:
            return word_length/word_count

    def digitCount(self,text):
        count = 0
        for i in text:
            if i.isdigit():
                count += 1
        return count

    

    def nouncount(self, text):
      doc = nlp(text)
      noun_tokens = [token.text for token in doc if(token.pos_== 'NOUN')]        
      return len(noun_tokens)

    def propernouncount(self, text):
      doc = nlp(text)
      prnoun_tokens = [token.text for token in doc if(token.pos_== 'PROPN')]       
      return len(prnoun_tokens)
    
    def verbcount(self, text):
      doc = nlp(text)
      verb_tokens = [token.text for token in doc if(token.pos_== 'VERB')]         
      return len(verb_tokens)
    
    def auxcount(self, text):
      doc = nlp(text)
      aux_tokens = [token.text for token in doc if(token.pos_== 'AUX')]        
      return len(aux_tokens)

    def adjcount(self, text):
      doc = nlp(text)
      adj_tokens = [token.text for token in doc if(token.pos_== 'ADJ')]        
      return len(adj_tokens)

    def nercount(self, text):
      doc = nlp(text)
      ner = [entity.text for entity in doc.ents]       
      return len(ner)

    def misspelledcount(self,text):
      doc = nlp(text)
      tokens = [token.text for token in doc]
      misspelled_tokens = SpellChecker().unknown(tokens)       
      return len(misspelled_tokens)

    def transform(self, X,y=None):
        try:
            if str(type(X)) not in ["<class 'list'>","<class 'numpy.ndarray'>"]:
                raise Exception('Expected list or numpy array got {}'.format(type(X)))
            final_result = []
            for index,item in enumerate(X):
                res = []
                if self.word_count:
                    res.append(self.wordCount(item))
                if self.char_count:
                    res.append(self.charCount(item))
                if self.char_count_wo_space:
                    res.append(self.charCountWithoutSpace(item))
                if self.avg_word_length:
                    res.append(self.avgWordLength(item))
                if self.digit_count:
                    res.append(self.digitCount(item))
                if self.noun_count:
                    res.append(self.nouncount(item))
                if self.propernoun_count:
                    res.append(self.propernouncount(item))
                if self.verb_count:
                    res.append(self.verbcount(item))
                if self.aux_count:
                    res.append(self.auxcount(item))
                if self.adj_count:
                    res.append(self.adjcount(item))
                if self.ner_count:
                    res.append(self.nercount(item))
                if self.misspelled_count:
                    res.append(self.misspelledcount(item))
                final_result.append(res)
            return np.array(final_result)
        except Exception as error:
            print('An exception occured: ' + repr(error))


In [29]:
class DenseTransformer(TransformerMixin):

  def fit(self, X, y=None, **fit_params):
      return self

  def transform(self, X, y=None, **fit_params):
      return X.todense()

In [30]:
def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate 2 plots: the test and training learning curve, the training
    samples vs fit times curve.

    Parameters
    ----------
    estimator : estimator instance
        An estimator instance implementing `fit` and `predict` methods which
        will be cloned for each validation.

    title : str
        Title for the chart.

    X : array-like of shape (n_samples, n_features)
        Training vector, where ``n_samples`` is the number of samples and
        ``n_features`` is the number of features.

    y : array-like of shape (n_samples) or (n_samples, n_features)
        Target relative to ``X`` for classification or regression;
        None for unsupervised learning.

    axes : array-like of shape (3,), default=None
        Axes to use for plotting the curves.

    ylim : tuple of shape (2,), default=None
        Defines minimum and maximum y-values plotted, e.g. (ymin, ymax).

    cv : int, cross-validation generator or an iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

          - None, to use the default 5-fold cross-validation,
          - integer, to specify the number of folds.
          - :term:`CV splitter`,
          - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : int or None, default=None
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    train_sizes : array-like of shape (n_ticks,)
        Relative or absolute numbers of training examples that will be used to
        generate the learning curve. If the ``dtype`` is float, it is regarded
        as a fraction of the maximum size of the training set (that is
        determined by the selected validation method), i.e. it has to be within
        (0, 1]. Otherwise it is interpreted as absolute sizes of the training
        sets. Note that for classification the number of samples usually have
        to be big enough to contain at least one sample from each class.
        (default: np.linspace(0.1, 1.0, 5))
    """
    if axes is None:
        _, axes = plt.subplots(1, 2, figsize=(10, 5))

    axes[0].set_title(title)
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")
    axes[0].set_ylabel("Score")

    train_sizes, train_scores, test_scores, fit_times, _ = \
        learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                       train_sizes=train_sizes,
                       return_times=True,
                       random_state=123)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve
    axes[0].grid()
    axes[0].fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
    axes[0].fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1,
                         color="g")
    axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training score")
    axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    axes[0].legend(loc="best")

    # Plot n_samples vs fit_times
    axes[1].grid()
    axes[1].plot(train_sizes, fit_times_mean, 'o-')
    axes[1].fill_between(train_sizes, fit_times_mean - fit_times_std,
                         fit_times_mean + fit_times_std, alpha=0.1)
    axes[1].set_xlabel("Training examples")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title("Scalability of the model")

    return plt

In [31]:
# count examples in each class
counter = Counter(spam_smaller['label'])
counter


Counter({0: 1956, 1: 273})

In [32]:
# estimate scale_pos_weight value
estimate = counter[0] / counter[1]

In [33]:
Classifier_1 = Pipeline([('preprocessor', SpacyPreprocessor(remove_stop=False, remove_email=False, remove_url=False )),
                  ('vectorizer', TfidfVectorizer(analyzer='word', token_pattern=r"[\S]+")),
                  ('classifier', XGBClassifier(scale_pos_weight=estimate))
                 ])

In [34]:
param_grid_classifier_1 = {'preprocessor__lammetize' : [True, False],
                'vectorizer__max_features': [100, 500, None],
                'vectorizer__max_df': [0.2, 0.8, 1],
                'vectorizer__min_df': [0.01,0.5, 1]
                }

In [35]:
# Using Gridserach to  fine tune hyperparameters using cross validation
# As we have imbalanced data set, we will use scoring method of f2score.
grid_classifier_1 = GridSearchCV(estimator=Classifier_1, param_grid=param_grid_classifier_1, cv = 2, scoring= f2score, n_jobs= 1, verbose = 4)

In [36]:
# Fit the model on training data
grid_classifier_1.fit(X_train, y_train)

Fitting 2 folds for each of 54 candidates, totalling 108 fits
[CV 1/2] END preprocessor__lammetize=True, vectorizer__max_df=0.2, vectorizer__max_features=100, vectorizer__min_df=0.01;, score=0.807 total time=   6.9s
[CV 2/2] END preprocessor__lammetize=True, vectorizer__max_df=0.2, vectorizer__max_features=100, vectorizer__min_df=0.01;, score=0.830 total time=   6.5s
[CV 1/2] END preprocessor__lammetize=True, vectorizer__max_df=0.2, vectorizer__max_features=100, vectorizer__min_df=0.5;, score=nan total time=   3.7s
[CV 2/2] END preprocessor__lammetize=True, vectorizer__max_df=0.2, vectorizer__max_features=100, vectorizer__min_df=0.5;, score=nan total time=   3.6s
[CV 1/2] END preprocessor__lammetize=True, vectorizer__max_df=0.2, vectorizer__max_features=100, vectorizer__min_df=1;, score=0.789 total time=   7.6s
[CV 2/2] END preprocessor__lammetize=True, vectorizer__max_df=0.2, vectorizer__max_features=100, vectorizer__min_df=1;, score=0.827 total time=   7.5s
[CV 1/2] END preprocessor_

48 fits failed out of a total of 108.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
36 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py", line 355, in _fit
    **fit_params_steps[name],
  File "/usr/local/lib/python3.7/dist-packages/joblib/memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-package

GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('preprocessor',
                                        SpacyPreprocessor(remove_email=False,
                                                          remove_stop=False,
                                                          remove_url=False)),
                                       ('vectorizer',
                                        TfidfVectorizer(token_pattern='[\\S]+')),
                                       ('classifier',
                                        XGBClassifier(scale_pos_weight=7.164835164835165))]),
             n_jobs=1,
             param_grid={'preprocessor__lammetize': [True, False],
                         'vectorizer__max_df': [0.2, 0.8, 1],
                         'vectorizer__max_features': [100, 500, None],
                         'vectorizer__min_df': [0.01, 0.5, 1]},
             scoring=make_scorer(fbeta_score, beta=2), verbose=4)

In [38]:
print("Best cross-validation score: {:.2f}".format(grid_classifier_1.best_score_))
print("\nBest parameters: ", grid_classifier_1.best_params_)
print("\nBest Estimator: ", grid_classifier_1.best_estimator_)

Best cross-validation score: 0.85

Best parameters:  {'preprocessor__lammetize': True, 'vectorizer__max_df': 0.8, 'vectorizer__max_features': 100, 'vectorizer__min_df': 0.01}

Best Estimator:  Pipeline(steps=[('preprocessor',
                 SpacyPreprocessor(remove_email=False, remove_stop=False,
                                   remove_url=False)),
                ('vectorizer',
                 TfidfVectorizer(max_df=0.8, max_features=100, min_df=0.01,
                                 token_pattern='[\\S]+')),
                ('classifier',
                 XGBClassifier(scale_pos_weight=7.164835164835165))])


In [37]:
print('Train score: {:.4f}'.format(grid_classifier_1.score(X_train, y_train)))
print('Test score: {:.4f}'.format(grid_classifier_1.score(X_test, y_test)))

Train score: 0.9535
Test score: 0.8651


In [40]:
# predicted values for Test data set
y_test_pred = grid_classifier_1.predict(X_test)

In [41]:
print('\nTest set classification report:\n\n',classification_report(y_test, y_test_pred ))


Test set classification report:

               precision    recall  f1-score   support

           0       0.98      0.98      0.98       490
           1       0.86      0.87      0.86        68

    accuracy                           0.97       558
   macro avg       0.92      0.92      0.92       558
weighted avg       0.97      0.97      0.97       558

