In [1]:
from sklearn.naive_bayes import *
from sklearn.dummy import *
from sklearn.ensemble import *
from sklearn.neighbors import *
from sklearn.tree import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.calibration import *
from sklearn.linear_model import *
from sklearn.multiclass import *
from sklearn.svm import *
from sklearn.metrics import f1_score, confusion_matrix
import pandas as pd
import os
from sklearn.model_selection import train_test_split, cross_val_score

# Imported explicitly: `np` is used in this cell and in later cells, and the
# wildcard imports above are not guaranteed to put it in the namespace.
import numpy as np

# Seed NumPy's global random state once, up front, so that a top-to-bottom run
# is repeatable for anything that draws from it.
np.random.seed(3)

In [2]:
def analyze_models(classifiers, vectorizers, train_data, test_data):
    """Fit and score every classifier/vectorizer pairing on a train/test split.

    Each vectorizer is fitted once on ``train_data.features`` and used to
    transform both splits. Every classifier is then fitted on each vectorized
    training matrix and scored on both splits. One human-readable line is
    printed per pairing and one summary row is collected.

    Parameters
    ----------
    classifiers : iterable
        Estimators exposing ``fit(X, y)``, ``predict(X)`` and ``score(X, y)``.
        The same instance is refitted for every vectorizer.
    vectorizers : iterable
        Text vectorizers exposing ``fit(docs)`` and ``transform(docs)``.
    train_data, test_data : object
        Anything with a ``features`` attribute (raw documents) and a ``label``
        attribute (targets), e.g. a DataFrame with those two columns.

    Returns
    -------
    list of str
        One ``' , '``-separated row per pairing, in classifier-major order:
        ``<Classifier>() , <Vectorizer>() , <train score> , <test score> ,
        <train F1> , <test F1>``.
    """
    # Vectorizing does not depend on the classifier, so do it once per
    # vectorizer up front rather than once per (classifier, vectorizer) pair.
    vectorized_splits = []
    for vectorizer in vectorizers:
        fitted_vectorizer = vectorizer.fit(train_data.features)
        vectorized_splits.append((
            vectorizer.__class__.__name__,
            fitted_vectorizer.transform(train_data.features),
            fitted_vectorizer.transform(test_data.features),
        ))

    results = []
    for classifier in classifiers:
        classifier_name = classifier.__class__.__name__
        for vectorizer_name, train_matrix, test_matrix in vectorized_splits:
            # train
            trained_classifier = classifier.fit(train_matrix, train_data.label)
            train_score = trained_classifier.score(train_matrix, train_data.label)
            # f1_score takes (y_true, y_pred) in that order.
            train_f1_score = f1_score(train_data.label,
                                      trained_classifier.predict(train_matrix))

            # score
            test_score = trained_classifier.score(test_matrix, test_data.label)
            test_f1_score = f1_score(test_data.label,
                                     trained_classifier.predict(test_matrix))

            report = (classifier_name + ' with ' + vectorizer_name
                      + ' Has Scores: ' + ' Train : ' + str(train_score)
                      + ' Test : ' + str(test_score)
                      + ' and F1 Scores : ' + ' Train : ' + str(train_f1_score)
                      + ' Test : ' + str(test_f1_score) + '\n')

            # Row layout is relied on by the code that later splits it on commas.
            results.append(' , '.join([
                classifier_name + '()',
                vectorizer_name + '()',
                str(train_score),
                str(test_score),
                str(train_f1_score),
                str(test_f1_score),
            ]))
            print(report)
    return results

In [3]:
# Read the raw SMS corpus and add a numeric `label` column derived from the
# textual RESULT column (ham -> 0, spam -> 1), then preview the first rows.
sms_csv_path = r"D:\Narendra\AIGenie_Capstone_ALL\sms.csv"
label_codes = {'ham': 0, 'spam': 1}

sms_data = pd.read_csv(sms_csv_path, encoding='latin1')
sms_data['label'] = sms_data['RESULT'].replace(label_codes)
sms_data.head(5)

Unnamed: 0,No,RESULT,SMS,label
0,1,ham,"Go until jurong point, crazy.. Available only ...",0.0
1,2,ham,Ok lar... Joking wif u oni...,0.0
2,3,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1.0
3,4,ham,U dun say so early hor... U c already then say...,0.0
4,5,ham,"Nah I don't think he goes to usf, he lives aro...",0.0


In [4]:
# Give the four columns working names (assigned positionally).
sms_data.columns = ['No', 'result', 'features', 'label']

# Rows with at least one missing value are set aside as `test_data`;
# fully populated rows form `train_data`.
row_has_missing = sms_data.isna().any(axis=1)
test_data = sms_data[row_has_missing]
train_data = sms_data[~row_has_missing]

In [5]:
# Sanity check: (rows, columns) of the fully populated rows vs. the rows that have a missing value.
train_data.shape, test_data.shape

((5540, 4), (32, 4))

In [6]:
# Hold out 20% of the complete rows, shuffled and stratified on the label column.
# Both resulting frames carry the `features` and `label` columns.
labelled_rows = train_data[['features', 'label']]
train_x, test_x = train_test_split(
    labelled_rows,
    test_size=0.2,
    shuffle=True,
    stratify=train_data['label'],
)

In [7]:
# Estimators to compare; every one is paired with every vectorizer below.
candidate_classifiers = [
    BernoulliNB(),
    RandomForestClassifier(n_estimators=100, n_jobs=-1),
    AdaBoostClassifier(),
    BaggingClassifier(),
    ExtraTreesClassifier(),
    GradientBoostingClassifier(),
    DecisionTreeClassifier(),
    CalibratedClassifierCV(),
    DummyClassifier(),
    PassiveAggressiveClassifier(),
    RidgeClassifier(),
    RidgeClassifierCV(),
    SGDClassifier(),
    OneVsRestClassifier(SVC(kernel='linear')),
    OneVsRestClassifier(LogisticRegression()),
    KNeighborsClassifier(),
]

# Text representations to try.
candidate_vectorizers = [
    CountVectorizer(),
    TfidfVectorizer(),
    HashingVectorizer(),
]

# Fit and score all pairings; the result is a list of comma-separated summary rows.
models_summary = analyze_models(
    candidate_classifiers,
    candidate_vectorizers,
    train_x,
    test_x,
)

BernoulliNB with CountVectorizer Has Scores:  Train : 0.986687725631769 Test : 0.9828519855595668 Train :  and F1 Scores :  Train : 0.9478337754199824 Test : 0.9314079422382672
BernoulliNB with TfidfVectorizer Has Scores:  Train : 0.986687725631769 Test : 0.9828519855595668 Train :  and F1 Scores :  Train : 0.9478337754199824 Test : 0.9314079422382672
BernoulliNB with HashingVectorizer Has Scores:  Train : 0.8664259927797834 Test : 0.8664259927797834 Train :  and F1 Scores :  Train : 0.0 Test : 0.0
RandomForestClassifier with CountVectorizer Has Scores:  Train : 1.0 Test : 0.983754512635379 Train :  and F1 Scores :  Train : 1.0 Test : 0.935251798561151
RandomForestClassifier with TfidfVectorizer Has Scores:  Train : 1.0 Test : 0.9783393501805054 Train :  and F1 Scores :  Train : 1.0 Test : 0.911764705882353
RandomForestClassifier with HashingVectorizer Has Scores:  Train : 1.0 Test : 0.9747292418772563 Train :  and F1 Scores :  Train : 1.0 Test : 0.8955223880597014
AdaBoostClassifier w



DummyClassifier with CountVectorizer Has Scores:  Train : 0.7736913357400722 Test : 0.7671480144404332 Train :  and F1 Scores :  Train : 0.12595744680851065 Test : 0.13523131672597866
DummyClassifier with TfidfVectorizer Has Scores:  Train : 0.7680505415162455 Test : 0.7788808664259927 Train :  and F1 Scores :  Train : 0.12294372294372295 Test : 0.13058419243986255
DummyClassifier with HashingVectorizer Has Scores:  Train : 0.7673736462093863 Test : 0.7671480144404332 Train :  and F1 Scores :  Train : 0.14117647058823526 Test : 0.14545454545454545
PassiveAggressiveClassifier with CountVectorizer Has Scores:  Train : 1.0 Test : 0.9864620938628159 Train :  and F1 Scores :  Train : 1.0 Test : 0.9477351916376306
PassiveAggressiveClassifier with TfidfVectorizer Has Scores:  Train : 1.0 Test : 0.9891696750902527 Train :  and F1 Scores :  Train : 1.0 Test : 0.9591836734693877
PassiveAggressiveClassifier with HashingVectorizer Has Scores:  Train : 1.0 Test : 0.9855595667870036 Train :  and F1 

In [8]:
# Each row from analyze_models is a ' , '-separated string. Split it into
# fields and strip the padding around the commas so values are clean.
# NOTE: `models_summary` is rebound from the list of rows to the DataFrame,
# so this cell expects to run once, right after analyze_models is called.
summary_columns = ['classifier', 'vectorizer', 'train_score',
                   'test_score', 'train_f1_score', 'test_f1_score']
score_columns = summary_columns[2:]

model_summary = [[field.strip() for field in row.split(',')] for row in models_summary]
models_summary = pd.DataFrame(model_summary, columns=summary_columns)

# Scores arrive as text; cast them so sorting and comparisons are numeric
# rather than lexicographic.
models_summary[score_columns] = models_summary[score_columns].astype(float)

In [9]:
# Rank every classifier/vectorizer pairing by test F1, then by train F1, in descending order.
models_summary.sort_values(['test_f1_score', 'train_f1_score'],  ascending = False)

Unnamed: 0,classifier,vectorizer,train_score,test_score,train_f1_score,test_f1_score
37,SGDClassifier(),TfidfVectorizer(),0.9990974729241876,0.9909747292418772,0.9966159052453468,0.9657534246575342
22,CalibratedClassifierCV(),TfidfVectorizer(),0.999548736462094,0.990072202166065,0.9983136593591906,0.9624573378839592
40,OneVsRestClassifier(),TfidfVectorizer(),0.9963898916967509,0.990072202166065,0.9863247863247864,0.9621993127147768
28,PassiveAggressiveClassifier(),TfidfVectorizer(),1.0,0.9891696750902528,1.0,0.9591836734693876
21,CalibratedClassifierCV(),CountVectorizer(),1.0,0.9882671480144404,1.0,0.9547038327526132
36,SGDClassifier(),CountVectorizer(),0.9993231046931408,0.9882671480144404,0.9974640743871512,0.9547038327526132
39,OneVsRestClassifier(),CountVectorizer(),0.9997743682310468,0.9882671480144404,0.9991546914623838,0.9543859649122808
12,ExtraTreesClassifier(),CountVectorizer(),1.0,0.9882671480144404,1.0,0.9540636042402826
23,CalibratedClassifierCV(),HashingVectorizer(),0.9984205776173284,0.9873646209386282,0.9940728196443692,0.952054794520548
34,RidgeClassifierCV(),TfidfVectorizer(),1.0,0.9873646209386282,1.0,0.951048951048951


In [10]:
# Learn the TF-IDF vocabulary and vectorize the training messages in one pass
# (fit_transform) instead of tokenizing the same corpus once to fit and again
# to transform. `vectorize_text` stays bound to the fitted vectorizer so it can
# be applied to other splits later.
vectorize_text = TfidfVectorizer(encoding = 'latin1')
train_vectorized_text = vectorize_text.fit_transform(train_x.features)

In [11]:
# 10-fold cross-validation of a fresh PassiveAggressiveClassifier on the vectorized
# training split; the cell displays the array of per-fold scores.
cross_val_score(PassiveAggressiveClassifier(), train_vectorized_text, train_x.label, cv=10)

array([0.97972973, 0.98873874, 0.97968397, 0.97968397, 0.98419865,
       0.98871332, 0.98194131, 0.98871332, 0.98419865, 0.97968397])

In [12]:
# Fit the final classifier on the whole vectorized training split.
clf = PassiveAggressiveClassifier().fit(train_vectorized_text, train_x.label)

In [13]:
# Vectorize the held-out split with the vectorizer fitted on the training split (transform only, no refit).
test_vectorizer_text = vectorize_text.transform(test_x.features)

In [14]:
# Predicted label distribution on the held-out split: unique predicted values and how often each occurs.
np.unique(clf.predict(test_vectorizer_text), return_counts = True)

(array([0., 1.]), array([962, 146], dtype=int64))

In [15]:
# True label distribution on the held-out split: unique label values and how often each occurs.
# NOTE(review): similar class totals can hide offsetting errors; a per-class
# comparison (e.g. a confusion matrix) would show actual agreement.
np.unique(test_x.label, return_counts = True)

(array([0., 1.]), array([960, 148], dtype=int64))