In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import nltk as nltk
import nltk.corpus  
import re
import matplotlib.pyplot as plt

from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier

import sys
import time

# Setting up our Data

In [3]:
df = pd.read_csv("Food_Inspections.csv")

In [4]:
df = df.dropna()

In [5]:
df.head()

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location
33,2360021,I LOVE SUSHI,I LOVE SUSHI,2594986.0,Restaurant,Risk 1 (High),233 N MICHIGAN AVE,CHICAGO,IL,60601.0,02/06/2020,Canvass,Fail,10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...,41.886567,-87.624385,"(-87.62438467059714, 41.886567370886944)"
45,2359627,BABA'S VILLAGE,BABA'S VILLAGE,2684586.0,Restaurant,Risk 1 (High),100 W RANDOLPH ST,CHICAGO,IL,60601.0,01/30/2020,Canvass,Pass w/ Conditions,2. CITY OF CHICAGO FOOD SERVICE SANITATION CER...,41.884586,-87.63101,"(-87.63101044588599, 41.88458626715456)"
61,2356815,SUBWAY,SUBWAY,2516813.0,Restaurant,Risk 1 (High),4036 N NARRAGANSETT,CHICAGO,IL,60634.0,01/16/2020,Canvass,Pass,"55. PHYSICAL FACILITIES INSTALLED, MAINTAINED ...",41.954073,-87.786743,"(-87.78674347222507, 41.95407306311155)"
66,2356510,HAILEY'S HOAGIES,HAILEY'S HOAGIES,2583295.0,Restaurant,Risk 1 (High),1055 W 63RD ST,CHICAGO,IL,60621.0,01/10/2020,Canvass,Fail,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",41.779537,-87.651917,"(-87.65191733069446, 41.779536944269374)"
70,2356434,SIMPLY THALIA,SIMPLY THALIA,2059952.0,Restaurant,Risk 1 (High),108 N STATE ST,CHICAGO,IL,60602.0,01/09/2020,Canvass,Fail,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",41.883423,-87.628022,"(-87.62802165207536, 41.88342263701488)"


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 153313 entries, 33 to 211347
Data columns (total 17 columns):
Inspection ID      153313 non-null int64
DBA Name           153313 non-null object
AKA Name           153313 non-null object
License #          153313 non-null float64
Facility Type      153313 non-null object
Risk               153313 non-null object
Address            153313 non-null object
City               153313 non-null object
State              153313 non-null object
Zip                153313 non-null float64
Inspection Date    153313 non-null object
Inspection Type    153313 non-null object
Results            153313 non-null object
Violations         153313 non-null object
Latitude           153313 non-null float64
Longitude          153313 non-null float64
Location           153313 non-null object
dtypes: float64(4), int64(1), object(12)
memory usage: 21.1+ MB


In [7]:
df.shape

(153313, 17)

In [8]:
df_violations = df[['Violations', 'Results']]

In [9]:
df_violations.shape

(153313, 2)

In [10]:
df_violations.head()

Unnamed: 0,Violations,Results
33,10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...,Fail
45,2. CITY OF CHICAGO FOOD SERVICE SANITATION CER...,Pass w/ Conditions
61,"55. PHYSICAL FACILITIES INSTALLED, MAINTAINED ...",Pass
66,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",Fail
70,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",Fail


In [11]:
df_violations['Results'].value_counts()

Pass                  84663
Fail                  37035
Pass w/ Conditions    31039
No Entry                485
Not Ready                62
Out of Business          29
Name: Results, dtype: int64

In [12]:
def comments_extraction(violation_comments):
    only_comments = ""
    if type(violation_comments) == str:
        violation_comments = violation_comments.split(' | ')
        for v in violation_comments:
            v = v.split('Comments:')
            if len(v) == 2:
                only_comments += v[1]
    return only_comments

In [13]:
df_violations["comments"] = df_violations["Violations"].apply(comments_extraction)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [14]:
df_violations.head()

Unnamed: 0,Violations,Results,comments
33,10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...,Fail,"FOUND NO HANDWASHING SIGNAGE AT HAND SINK, IN..."
45,2. CITY OF CHICAGO FOOD SERVICE SANITATION CER...,Pass w/ Conditions,THE PERSON IN CHARGE DOES NOT HAVE A CITY OF ...
61,"55. PHYSICAL FACILITIES INSTALLED, MAINTAINED ...",Pass,INSTRUCTED TO DETAIL CLEAN AND MAINTAIN FLOOR...
66,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",Fail,PIC DOESN'T HAVE A FOOD MANAGERS CERTIFICATE....
70,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",Fail,NO DESIGNATED PERSON IN CHARGE. OBSERVED NO F...


In [15]:
df_violations['flag'] = df_violations['Results'].map({'Fail':0, 'Pass':1, 'Out of Business':2,'Pass w/ Conditions':3,'No Entry':4,'Not Ready':5})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [16]:
df_violations.head()

Unnamed: 0,Violations,Results,comments,flag
33,10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...,Fail,"FOUND NO HANDWASHING SIGNAGE AT HAND SINK, IN...",0
45,2. CITY OF CHICAGO FOOD SERVICE SANITATION CER...,Pass w/ Conditions,THE PERSON IN CHARGE DOES NOT HAVE A CITY OF ...,3
61,"55. PHYSICAL FACILITIES INSTALLED, MAINTAINED ...",Pass,INSTRUCTED TO DETAIL CLEAN AND MAINTAIN FLOOR...,1
66,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",Fail,PIC DOESN'T HAVE A FOOD MANAGERS CERTIFICATE....,0
70,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",Fail,NO DESIGNATED PERSON IN CHARGE. OBSERVED NO F...,0


In [17]:
df_final = df_violations[['comments','flag']]

In [18]:
df_final = df_final.loc[df_final['flag'].isin([1,0])]

In [19]:
df_final.head()

Unnamed: 0,comments,flag
33,"FOUND NO HANDWASHING SIGNAGE AT HAND SINK, IN...",0
61,INSTRUCTED TO DETAIL CLEAN AND MAINTAIN FLOOR...,1
66,PIC DOESN'T HAVE A FOOD MANAGERS CERTIFICATE....,0
70,NO DESIGNATED PERSON IN CHARGE. OBSERVED NO F...,0
71,OBSERVED NO WRITTEN EMPLOYEE HEALTH POLICY ON...,0


# Processing the text from the Comments

In [20]:
# defining our predictor and response variable
X = df_final.comments
y = df_final.flag
print(X.shape)
print(y.shape)

(121698,)
(121698,)


In [21]:
# Splitting our data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(91273,)
(91273,)
(30425,)
(30425,)


# CountVectorizer

In [22]:
# Transforming/vectorizing train data using CountVectorizer
countvectorizer = CountVectorizer(stop_words = 'english',min_df=2,max_df=0.5)
X_train_cv = countvectorizer.fit_transform(X_train)
X_train_cv.shape

(91273, 14436)

In [23]:
# Creating a document term matrix for the train data using CountVectorizer
X_train_cv_dtm = pd.DataFrame(X_train_cv.toarray(), columns=countvectorizer.get_feature_names())
X_train_cv_dtm

Unnamed: 0,00,000,0005a,001,002,003,0030,00367,004,005,...,zipcode,ziploc,ziplock,zone,zucchini,½ed,½f,½o,½time,½ï
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91268,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
91269,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
91270,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
91271,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Creating a document term matrix for the test data CountVectorizer
X_test_cv = countvectorizer.transform(X_test)
X_test_cv_dtm = pd.DataFrame(X_test_cv.toarray(), columns=countvectorizer.get_feature_names())
X_test_cv_dtm

Unnamed: 0,00,000,0005a,001,002,003,0030,00367,004,005,...,zipcode,ziploc,ziplock,zone,zucchini,½ed,½f,½o,½time,½ï
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30420,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30421,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30422,0,0,0,0,0,0,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
30423,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# TF-IDF Vectorizer

In [25]:
# Transforming/Vectorizing the train data using TF-IDF Vectorizer
tfidfvectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidfvectorizer.fit_transform(X_train)
X_train_tfidf.shape

(91273, 32822)

In [26]:
# Creating the Document term matrix for the train data using TF-IDF vectorizer
X_train_tfidf_dtm = pd.DataFrame(X_train_tfidf.toarray(), columns=tfidfvectorizer.get_feature_names())
X_train_tfidf_dtm

Unnamed: 0,00,000,000058831,0001,0005a,0008,000c,000lbs,001,002,...,zumba,zumex,zurich,½c,½ed,½f,½o,½s,½time,½ï
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
91269,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
91270,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
91271,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Creating the Document term matrix for the test data using TF-IDF vectorizer
X_test_tfidf = tfidfvectorizer.transform(X_test)
X_test_tfidf_dtm = pd.DataFrame(X_test_tfidf.toarray(), columns=tfidfvectorizer.get_feature_names())
X_test_tfidf_dtm

Unnamed: 0,00,000,000058831,0001,0005a,0008,000c,000lbs,001,002,...,zumba,zumex,zurich,½c,½ed,½f,½o,½s,½time,½ï
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30420,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30421,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30422,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30423,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Modelling : Logistic Regression with CountVectorizer

In [28]:
# Creating an instance of the logistic regression model
logreg = LogisticRegression()

In [29]:
# fitting our logistic regression model on the CountVectorized train data
%time logreg.fit(X_train_cv_dtm, y_train)



CPU times: user 52.1 s, sys: 17.1 s, total: 1min 9s
Wall time: 45.3 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [30]:
# predicting the classes on the CountVectorized  test data using the logistic regression model
y_pred_lr_test_cv = logreg.predict(X_test_cv_dtm)

In [31]:
# predicting the probabilities on the CountVectorized test data using the logistic regression model
y_pred_lr_test_cv_proba = logreg.predict_proba(X_test_cv_dtm)

In [32]:
# Calculating accuracy of the logistic regression model on the CountVectorized test data
accuracy_lr_test_cv = metrics.accuracy_score(y_test, y_pred_lr_test_cv)
print(accuracy_lr_test_cv)

0.9763023829087921


In [33]:
# Calculating entire classification report for the CountVectorized test data of the logistic regression model
print(classification_report(y_test, y_pred_lr_test_cv))

              precision    recall  f1-score   support

           0       0.98      0.94      0.96      9348
           1       0.97      0.99      0.98     21077

    accuracy                           0.98     30425
   macro avg       0.98      0.97      0.97     30425
weighted avg       0.98      0.98      0.98     30425



In [34]:
# ROC_AUC score for the CountVectorized test data of the logistic regression model
roc_auc_score(y_test, y_pred_lr_test_cv_proba[:,1])

0.9896924695843423

In [36]:
# predicting the classes on the CountVectorized train data using the logistic regression model
y_pred_lr_train_cv = logreg.predict(X_train_cv_dtm)

In [37]:
# predicting the probabilities on the train data using the CountVectorizer using the logistic regression model
y_pred_lr_train_cv_proba = logreg.predict_proba(X_train_cv_dtm)

In [38]:
# Calculating accuracy of the logistic regression model on the CountVectorized train data
accuracy_lr_train_cv = metrics.accuracy_score(y_train, y_pred_lr_train_cv)
print(accuracy_lr_train_cv)

0.9849024355504914


In [39]:
# Calculating entire classification report for the CountVectorized train data of the logistic regression model
print(classification_report(y_train, y_pred_lr_train_cv))

              precision    recall  f1-score   support

           0       0.99      0.96      0.97     27687
           1       0.98      1.00      0.99     63586

    accuracy                           0.98     91273
   macro avg       0.99      0.98      0.98     91273
weighted avg       0.98      0.98      0.98     91273



In [40]:
# ROC_AUC score for the CountVectorized train data of the logistic regression model
roc_auc_score(y_train, y_pred_lr_train_cv_proba[:,1])

0.9959457478732379

# Logistic Regression with TF-IDF Vectorizer

In [42]:
# fitting our logistic regression model on the TF-IDF vectorized train data
%time logreg.fit(X_train_tfidf_dtm, y_train)



CPU times: user 18.1 s, sys: 18.1 s, total: 36.1 s
Wall time: 40.9 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [43]:
# predicting on the classes of TF-IDF vectorized test data  using the logistic regression model
y_pred_lr_test_tfidf = logreg.predict(X_test_tfidf_dtm)

In [44]:
# predicting the probabilities on the TF-IDF vectorized test data  using the logistic regression model
y_pred_lr_test_tfidf_proba = logreg.predict_proba(X_test_tfidf_dtm)

In [45]:
# Calculating accuracy of the logistic regression model on the TF-IDF vectorized test data
accuracy_lr_test_tfidf = metrics.accuracy_score(y_test, y_pred_lr_test_tfidf)
print(accuracy_lr_test_tfidf)

0.9732456861133936


In [46]:
# Calculating entire classification report for the TF-IDF Vectorized test data of the logistic regression model
print(classification_report(y_test, y_pred_lr_test_tfidf))

              precision    recall  f1-score   support

           0       0.98      0.93      0.96      9348
           1       0.97      0.99      0.98     21077

    accuracy                           0.97     30425
   macro avg       0.98      0.96      0.97     30425
weighted avg       0.97      0.97      0.97     30425



In [47]:
# ROC_AUC score for the TF-IDF Vectorized test data of the logistic regression model
roc_auc_score(y_test, y_pred_lr_test_tfidf_proba[:,1])

0.9890151844362102

In [49]:
# predicting the classes on the TF-IDF Vectorized train data using the logistic regression model
y_pred_lr_train_tfidf = logreg.predict(X_train_tfidf_dtm)

In [50]:
# predicting the probabilities on the TF-IDF vectorized train data  using the logistic regression model
y_pred_lr_train_tfidf_proba = logreg.predict_proba(X_train_tfidf_dtm)

In [51]:
# Calculating accuracy of the logistic regression model on the TF-IDF vectorized train data
accuracy_lr_train_tfidf = metrics.accuracy_score(y_train, y_pred_lr_train_tfidf)
print(accuracy_lr_train_tfidf)

0.9763895127803403


In [52]:
# Calculating entire classification report for the TF-IDF Vectorized train data of the logistic regression model
print(classification_report(y_train, y_pred_lr_train_tfidf))

              precision    recall  f1-score   support

           0       0.99      0.93      0.96     27687
           1       0.97      0.99      0.98     63586

    accuracy                           0.98     91273
   macro avg       0.98      0.96      0.97     91273
weighted avg       0.98      0.98      0.98     91273



In [53]:
# ROC_AUC score for the TF-IDF Vectorized train data of the logistic regression model
roc_auc_score(y_test, y_pred_lr_test_tfidf_proba[:,1])

0.9890151844362102

The classification metrics between the train and the test data are pretty much the same which means that there is no overfitting.

# Random Forest with CountVectorizer

In [54]:
# Creating an instance of the Random forest model
rf = RandomForestClassifier()


In [55]:
# fitting our Random forest model on the CountVectorized train data
%time rf.fit(X_train_cv_dtm, y_train)



CPU times: user 2min 53s, sys: 10.6 s, total: 3min 4s
Wall time: 3min 13s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [56]:
# predicting the classes on the CountVectorized test data using the Random forest model
y_pred_rf_test_cv = rf.predict(X_test_cv_dtm)

In [57]:
# predicting the probabilities on the CountVectorized test data using the Random forest model
y_pred_rf_test_cv_proba = rf.predict_proba(X_test_cv_dtm)

In [58]:
# Calculating accuracy of the Random forest model on the CountVectorized test data
accuracy_rf_test_cv = metrics.accuracy_score(y_test, y_pred_rf_test_cv)
print(accuracy_rf_test_cv)

0.9748562037797863


In [59]:
# Calculating entire classification report for the CountVectorized test data of the Random forest model
print(classification_report(y_test, y_pred_rf_test_cv))

              precision    recall  f1-score   support

           0       0.97      0.94      0.96      9348
           1       0.98      0.99      0.98     21077

    accuracy                           0.97     30425
   macro avg       0.97      0.97      0.97     30425
weighted avg       0.97      0.97      0.97     30425



In [60]:
# ROC_AUC score for the CountVectorized test data of the Random forest model
roc_auc_score(y_test, y_pred_rf_test_cv_proba[:,1])

0.986265727197192

In [62]:
# predicting on the CountVectorized train data using the Random forest model
y_pred_rf_train_cv = rf.predict(X_train_cv_dtm)

In [66]:
# predicting the probabilities on the CountVectorized test data using the Random forest model
y_pred_rf_train_cv_proba = rf.predict_proba(X_train_cv_dtm)

In [63]:
# Calculating accuracy of the Random forest model on the CountVectorized train data
accuracy_rf_train_cv = metrics.accuracy_score(y_train, y_pred_rf_train_cv)
print(accuracy_rf_train_cv)

0.9968117625146539


In [64]:
# Calculating entire classification report for the CountVectorized train data of the Random forest model
print(classification_report(y_train, y_pred_rf_train_cv))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99     27687
           1       1.00      1.00      1.00     63586

    accuracy                           1.00     91273
   macro avg       1.00      1.00      1.00     91273
weighted avg       1.00      1.00      1.00     91273



In [67]:
# ROC_AUC score for the CountVectorized train data of the Random forest model
roc_auc_score(y_train, y_pred_rf_train_cv_proba[:,1])

0.9997961366872847

# Random Forest with TF-IDF Vectorizer


In [68]:
# fitting our logistic regression model on the TF-IDF vectorized train data
%time rf.fit(X_train_tfidf_dtm, y_train)

CPU times: user 4min 18s, sys: 57.2 s, total: 5min 15s
Wall time: 6min 9s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [69]:
# predicting classes on the TF-IDF vectorized test data  using the Random forest model
y_pred_rf_test_tfidf = rf.predict(X_test_tfidf_dtm)

In [70]:
# predicting the probabilities on the TF-IDF vectorized test data  using the Random forest model
y_pred_rf_test_tfidf_proba = rf.predict_proba(X_test_tfidf_dtm)

In [71]:
# Calculating accuracy of the random forest model on the TF-IDF vectorized test data
accuracy_rf_test_tfidf = metrics.accuracy_score(y_test, y_pred_rf_test_tfidf)
print(accuracy_rf_test_tfidf)

0.9736400986031224


In [72]:
# Calculating entire classification report for the TF-IDF Vectorized test data of the Random forest model
print(classification_report(y_test, y_pred_rf_test_tfidf))

              precision    recall  f1-score   support

           0       0.98      0.94      0.96      9348
           1       0.97      0.99      0.98     21077

    accuracy                           0.97     30425
   macro avg       0.97      0.96      0.97     30425
weighted avg       0.97      0.97      0.97     30425



In [73]:
# ROC_AUC score for the TF-IDF Vectorized test data of the Random forest model
roc_auc_score(y_test, y_pred_rf_test_tfidf_proba[:,1])

0.9859789123358007

In [74]:
# predicting classes on the TF-IDF Vectorized train data using the Random forest model
y_pred_rf_train_tfidf = rf.predict(X_train_tfidf_dtm)

In [75]:
# predicting the probabilities on the TF-IDF vectorized train data  using the Random forest model
y_pred_rf_train_tfidf_proba = rf.predict_proba(X_train_tfidf_dtm)

In [76]:
# Calculating accuracy of the Random forest model on the TF-IDF vectorized train data
accuracy_rf_train_tfidf = metrics.accuracy_score(y_train, y_pred_rf_train_tfidf)
print(accuracy_rf_train_tfidf)

0.9969541923679511


In [77]:
# Calculating entire classification report for the TF-IDF Vectorized train data of the Random forest model
print(classification_report(y_train, y_pred_rf_train_tfidf))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99     27687
           1       1.00      1.00      1.00     63586

    accuracy                           1.00     91273
   macro avg       1.00      1.00      1.00     91273
weighted avg       1.00      1.00      1.00     91273



In [78]:
# ROC_AUC score for the TF-IDF Vectorized train data of the Random forest model
roc_auc_score(y_train, y_pred_rf_train_tfidf_proba[:,1])

0.9997129708049969

The accuracy score, ROC_AUC scores, classification metrics for both the classes in terms of recall, precision , F1 score and Area under the curve are pretty much the same for both logistic regression model and random forest model.

Random forest model is known to be more robust and it is expected to perform well even if we feed more data to it. The text pre-processing technique or the vectorizer that i would like to go with is TF-IDF vectorizer because it takes less time to compute and also it took only 32822 features or columns to deliver good results compared to 2016861 features used by the count vectorizer . This helped because it prevents the kernel from crashing on the jupyter python notebook. My kernel kept crashing with count vectorizer because it happened to use too much numeric features or columns. But after removing the ngrams parameter from the count vectorizer , the number of features taken into consideration came down and then the kernel became relatively stable without crashing any further.

The accuracy and the ROC AUC scores of count vectorizer are relatively little better compared to TF-IDF vectorizer but I would like to still go with TF-IDF vectorizer because it is less draining on the memory and computation resources. 

Logistic regressionn wih the TF-IDF vectorizer had the best timing as it took less time compared to others.
For this particular use , the combination of logistic regression and TF-IDF had the best overall performance .

So I would like to go with logistic regression with TF-IDF because they take less amount of time with less resources with overall good performance.
