**I have created this notebook to explain the topics of the class once more.**

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import style
style.use('Solarize_Light2')
%matplotlib inline

In [2]:
# Load the SMS data set; the first CSV column is used as the row index.
df = pd.read_csv('SMSSpamCollection.csv', index_col=0)

In [35]:
# Encode the labels as integers: ham -> 0, spam -> 1.
# The dtype guard keeps this cell idempotent: calling `.map` a second time on
# the already-encoded 0/1 values would silently turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['target']):
    df['target'] = df['target'].map({'ham': 0, 'spam': 1})

In [36]:
# Show the first rows to check the encoded `target` column next to the text.
df.head()

Unnamed: 0,target,sms
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...\r\n
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

We want to transform our text using TF-IDF and then use a classifier to predict whether a message is spam.

**1. Manual way (not using Pipeline)**

In [38]:
# The message texts are the features; the 0/1 `target` column is the label.
X, y = df['sms'].values, df['target'].values

# Hold out 15% of the messages for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)

In [39]:
# Unigram TF-IDF vectorizer that strips accents and drops English stop words.
tfidf = TfidfVectorizer(
    strip_accents='unicode',
    ngram_range=(1, 1),
    stop_words='english',
)

In [40]:
# Fit the vectorizer on the training messages only, so nothing from the test
# split is used while learning the vocabulary and IDF weights.
tfidf.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [41]:
# Vectorize both splits with the vectorizer that was fitted on the training set.
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [42]:
# Baseline classifier: logistic regression with default settings, trained on
# the TF-IDF features of the training split.
lr = LogisticRegression()
lr.fit(X_train_tfidf, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [43]:
# Evaluate the logistic regression on the held-out test split.
test_predictions = lr.predict(X_test_tfidf)
print(classification_report(y_test, test_predictions))

             precision    recall  f1-score   support

          0       0.96      1.00      0.98       712
          1       0.98      0.77      0.86       125

avg / total       0.96      0.96      0.96       837



The result is not bad, but we want a better recall on spam, so let's try Naive Bayes (NB).

In [44]:
# Second candidate: multinomial Naive Bayes on the same TF-IDF features.
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [45]:
# Evaluate Naive Bayes on the same held-out test split for a direct comparison.
test_predictions = nb.predict(X_test_tfidf)
print(classification_report(y_test, test_predictions))

             precision    recall  f1-score   support

          0       0.97      1.00      0.98       712
          1       1.00      0.82      0.90       125

avg / total       0.97      0.97      0.97       837



NB actually has better recall and precision for spam than LogisticRegression.

Now we want to make a composite model that does TF-IDF and then NB or LogisticRegression.<br>
We can use Pipeline to build one model out of other models.<br>
sklearn's Pipeline takes *transformers* for all its steps except the last one; the last one should be a model that *predicts*.<br>
**Transformers** can be feature extractors like TF-IDF, dimensionality reduction models like PCA or SVD, or scalers like StandardScaler, MinMaxScaler, etc. (TSNE only offers `fit_transform`, not `transform`, so it cannot be used as an intermediate Pipeline step.)<br>
The last step should be a model that predicts: a **classifier/regressor** like LogisticRegression, KNN, Decision Trees, etc. (a clustering model with a `predict` method, such as KMeans, also works).

Now we will use TF-IDF and LogisticRegression for our Pipeline because I want to show you how to grid-search over the parameters of both models at once *(NB has only the smoothing parameter `alpha` to tune, so it makes a less interesting example)*.

Defining the steps, each step is a *Tuple* in the format of ('name', model)

In [46]:
# Fresh, unfitted estimators for the pipeline.
tfidf = TfidfVectorizer(strip_accents='unicode')
# Pin the solver explicitly: the grid search below tries penalty='l1', which
# the 'lbfgs' default of newer scikit-learn releases does not support.
# 'liblinear' handles both 'l1' and 'l2' and is the solver shown in the
# recorded outputs of this notebook.
lr = LogisticRegression(solver='liblinear')

# Each step is a (name, estimator) tuple; the names become the prefixes of the
# grid-search parameter keys ('tfvect__...', 'logi__...').
tf_lr = [('tfvect', tfidf), ('logi', lr)]

my_pipeline = Pipeline(steps=tf_lr)

For the parameters of the grid search, we need to pass the name of the step in the Pipeline followed by two *underscores* '__' and then the name of the parameter.

In [49]:
# Keys follow the '<step name>__<parameter name>' convention.
# NOTE: this grid has 3 * 2 * 2 * 4 * 2 * 2 * 4 * 2 = 1536 candidates, i.e.
# 4608 fits with cv=3, so a full run takes a while. The recorded output of the
# fit cell reports a single candidate, so it appears to come from a reduced
# grid rather than from this one.
params = {'tfvect__ngram_range': [(1, 1), (2, 2), (1, 2)],
          'tfvect__stop_words': [None, 'english'],
          'tfvect__min_df':[1, 5],
          'tfvect__max_features':[None, 100, 1000, 10000],
          'tfvect__use_idf':[True, False],
          'tfvect__sublinear_tf':[True, False],
          'logi__C':[0.1, 1.0, 10.0, 100.0],
          'logi__penalty':['l1', 'l2']}

# Candidates are ranked by ROC AUC over 3 folds. n_jobs=-1 spreads the
# independent fits over all available CPU cores instead of running them
# one after another.
gs_tf_lr = GridSearchCV(estimator=my_pipeline, param_grid=params, scoring='roc_auc',
                        cv=3, verbose=1, n_jobs=-1)

In [50]:
# Run the search on the raw training texts; the pipeline vectorizes them itself.
gs_tf_lr.fit(X_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.7s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfvect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'tfvect__ngram_range': [(1, 1)], 'tfvect__stop_words': ['english'], 'tfvect__min_df': [1], 'tfvect__max_features': [None], 'tfvect__use_idf': [True], 'tfvect__sublinear_tf': [True], 'logi__C': [10.0], 'logi__penalty': ['l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [51]:
# Parameter combination that achieved the best cross-validated score.
gs_tf_lr.best_params_

{'logi__C': 10.0,
 'logi__penalty': 'l2',
 'tfvect__max_features': None,
 'tfvect__min_df': 1,
 'tfvect__ngram_range': (1, 1),
 'tfvect__stop_words': 'english',
 'tfvect__sublinear_tf': True,
 'tfvect__use_idf': True}

In [52]:
# Best cross-validated score; the search was configured with scoring='roc_auc'.
gs_tf_lr.best_score_

0.9922145882114418

In [53]:
# Take the best pipeline found by the search and evaluate it on the held-out
# test split. It receives raw text because it contains its own vectorizer step.
clf = gs_tf_lr.best_estimator_
test_predictions = clf.predict(X_test)
print(classification_report(y_test, test_predictions))

             precision    recall  f1-score   support

          0       0.98      1.00      0.99       712
          1       0.98      0.90      0.94       125

avg / total       0.98      0.98      0.98       837



In [54]:
# The best estimator is the whole Pipeline, not just the final classifier.
type(clf)

sklearn.pipeline.Pipeline

You can save models with sklearn and load them later

In [55]:
# `sklearn.externals.joblib` was deprecated and later removed from scikit-learn;
# joblib is a standalone package that scikit-learn itself depends on.
# Fall back to the old location only for old scikit-learn installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [56]:
# Persist the whole pipeline (vectorizer + classifier) to a single file.
joblib.dump(clf, 'SMS_spam_detection_model.pkl')

['SMS_spam_detection_model.pkl']

You can later on load the model and do prediction

In [57]:
# Load the saved pipeline back from disk.
# NOTE: joblib files are pickle-based and can execute arbitrary code when
# loaded, so only load model files that come from a trusted source.
model = joblib.load('SMS_spam_detection_model.pkl')

In [58]:
# Example message to classify with the loaded model.
text = "Hey, Where are you?"

0 means not spam, and 1 means spam

In [59]:
# The text is wrapped in a list so that a one-element batch is passed to predict.
model.predict([text])

array([0], dtype=int64)