# SMS SPAM/HAM CLASSIFIER 

## Import required packages

In [1]:
from sklearn.naive_bayes import *
from sklearn.dummy import *
from sklearn.ensemble import *
from sklearn.neighbors import *
from sklearn.tree import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.calibration import *
from sklearn.linear_model import *
from sklearn.multiclass import *
from sklearn.svm import *
from sklearn.metrics import f1_score, confusion_matrix
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
import re
import os
import warnings
warnings.filterwarnings('ignore')

np.random.seed(3)
pd.set_option('display.max_columns', None) #to view all the columns data from a pandas dataframe

## Defining the functions 

### Function to analyse classifiers

In [2]:
def analyze_models(classifiers, vectorizers, train_data, test_data):
    """
    This function analyses the performance of given combination of classifier and vectorizer
    Input:
        classifiers - list of classifiers used for analysis
        vectorizers - list of vectorizers used for analysis
        train_data - training data
        test_data - Testing data
        
    Output:
        results - contains the classifier, vectorizer along with their scores on test and train data
    """
    results = []
    for classifier in classifiers:
      for vectorizer in vectorizers:
        string = ''
        string += classifier.__class__.__name__ + ' with ' + vectorizer.__class__.__name__

        # train
        vectorize_text = vectorizer.fit(train_data.features)
        
        train_vectorized_text = vectorize_text.transform(train_data.features)
        
        trained_classifier = classifier.fit(train_vectorized_text, train_data.label)
        
        train_score = trained_classifier.score(train_vectorized_text, train_data.label)
        train_f1_score = f1_score(trained_classifier.predict(train_vectorized_text), train_data.label)
        

        # test
        test_vectorized_text = vectorize_text.transform(test_data.features)
        
        test_score = trained_classifier.score(test_vectorized_text, test_data.label)
        test_f1_score = f1_score(trained_classifier.predict(test_vectorized_text), test_data.label)
        
        string += ' Has Scores: ' + ' Train : ' + str(train_score) + ' Test : ' + str(test_score) \
                    + ' Train : ' + ' and F1 Scores : ' + ' Train : ' + str(train_f1_score) + ' Test : ' \
                    + str(test_f1_score) + '\n'
        
        results.append([classifier, vectorizer, str(train_score), str(test_score), str(train_f1_score), 
                       str(test_f1_score)])
        print(string)
        
    return results



### Function to Preprocess the data

In [3]:
def preprocess(data):
    """
    This function preprocesses the given data
    
    preprocess - 
                    converting the data to lower case
                    removing special characters other than dollar($)
                    to represent amounts as different categories
    Input:
        data - the data to be preprocessed
    
    Output:
        numpy array containing the preprocessed data
    
    """
    preprocessed_data = data.str.lower()
    preprocessed_data = preprocessed_data.map(lambda document: re.split('[^a-zA-Z0-9\'$]', document))
    preprocessed_data = preprocessed_data.map(lambda document: list(filter(lambda word : len(word)>1, document)))
    preprocessed_data = preprocessed_data.map(lambda document: ' '.join(document))
    
    return preprocessed_data.values


### Function to vectorize the data

In [4]:
def vectorize(data, train = True, tfidf_vectorizer = None):
    """
    This function fits the vectorizer with train data and transforms the test data using given vectorizer
    
    Input:
        data - data to be used to fit the vectorizer or to be transformed
    
    Output:
        A python dictionary with the transformed feature data as 'features' and fitted vectorizer as 'vectorizer'
    
    
    If train = True : 
        fits the vectorizer with train data
    else : 
        transforms the test data using given vectorizer
        

    """
    if train:
        
        #initialising Vectorizer
        tfidf = TfidfVectorizer()
        
        #Fitting the vectorizers with train data
        tfidf_vectorizer = tfidf.fit(data)
        tfidf_vetorized_text = tfidf_vectorizer.transform(data)
        
    else:
        
        #Transforming the Test data
        tfidf_vetorized_text = tfidf_vectorizer.transform(data)
        
    return {'features' : tfidf_vetorized_text, 
            'tfidf_vectorizer' : tfidf_vectorizer}


### Function to train the stacking classifier model

In [5]:
def train_model(features, label, stacking_models, cv = 10):
    """
    This function trains the stacking classifier with given estimators, data and returns the trained estimator 
    
    Input:
        features - feature data for the model
        label - label data for the model
        stacking_models - estimators to be used for stacking
        cv - cross validation for stacking classifier
        
    Ouput:
        A python Dictionary containing the trained model as 'model' and the stacking classifier as 'classifier'
    """
    st_classifier = StackingClassifier( estimators = stacking_models, final_estimator = SVC(kernel='rbf', random_state = 3), 
                                   n_jobs = -1, cv = cv, stack_method = 'predict')
    print("Trainging with ", list(map(lambda x:x[0], stacking_models)), sep = '\n')
    trained_model = st_classifier.fit(features, label)
    print("Training Score : ", trained_model.score(features, label))
    return {'model' : trained_model, 'classifier': st_classifier}


## Reading the SMS data

In [6]:
sms_data = pd.read_csv(r"D:\Narendra\AIGenie_Capstone_ALL\sms.csv", encoding = 'latin1') # Update the file path
sms_data['label'] = sms_data['RESULT'].replace({'ham':0, 'spam':1}) # Converting the result feature into integer labels
sms_data.head(5)

Unnamed: 0,No,RESULT,SMS,label
0,1,ham,"Go until jurong point, crazy.. Available only ...",0.0
1,2,ham,Ok lar... Joking wif u oni...,0.0
2,3,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1.0
3,4,ham,U dun say so early hor... U c already then say...,0.0
4,5,ham,"Nah I don't think he goes to usf, he lives aro...",0.0


In [7]:
sms_data.columns = ['No', 'Result', 'sms', 'label'] # Renaming the feature names of SMS data

## Separating the train and test data

Here the data contains both train and test data in the same file. So, we need to separate the data into train and test data.
Data with label is used for training and without label is used for prediction.

In [8]:
#Filtering data from pandas dataframe
test_data = sms_data[sms_data.isna().sum(axis = 1) > 0] 
train_data = sms_data[sms_data.isna().sum(axis = 1) == 0]

## Preprocessing the train and test data

In [9]:
train_data['features'] = preprocess(train_data.loc[:, 'sms'])
test_data['features'] = preprocess(test_data.loc[:, 'sms'])
train_data.shape, test_data.shape

((5540, 5), (32, 5))

## Splitting the Train data into train and validation sets

In [10]:
train_x, val_x = train_test_split(train_data[['features', 'label']], test_size = 0.2, 
                                    shuffle = True, stratify = train_data['label'])

## Analysing the Classifier and Vectorizer models

In [11]:
models_summary = analyze_models(
    [
        BernoulliNB(),
        RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state = 3),
        AdaBoostClassifier(random_state = 3),
        BaggingClassifier(random_state = 3, n_jobs = -1),
        ExtraTreesClassifier(random_state = 3, n_jobs = -1),
        GradientBoostingClassifier(random_state = 3),
        DecisionTreeClassifier(random_state = 3),
        CalibratedClassifierCV(),
        PassiveAggressiveClassifier(random_state = 3, n_jobs = -1),
        RidgeClassifier(random_state = 3),
        RidgeClassifierCV(),
        SGDClassifier(loss = 'modified_huber', random_state = 3, n_jobs = -1),
        OneVsRestClassifier(SVC(kernel='linear', random_state = 3, probability = True)),
        OneVsRestClassifier(LogisticRegression(random_state = 3, n_jobs = -1)),
        KNeighborsClassifier()
    ],
    [
        CountVectorizer(),
        TfidfVectorizer(),
        HashingVectorizer()
    ],
    train_x,
    val_x
)

BernoulliNB with CountVectorizer Has Scores:  Train : 0.986687725631769 Test : 0.9828519855595668 Train :  and F1 Scores :  Train : 0.9478337754199824 Test : 0.9314079422382672

BernoulliNB with TfidfVectorizer Has Scores:  Train : 0.986687725631769 Test : 0.9828519855595668 Train :  and F1 Scores :  Train : 0.9478337754199824 Test : 0.9314079422382672

BernoulliNB with HashingVectorizer Has Scores:  Train : 0.8664259927797834 Test : 0.8664259927797834 Train :  and F1 Scores :  Train : 0.0 Test : 0.0

RandomForestClassifier with CountVectorizer Has Scores:  Train : 1.0 Test : 0.9846570397111913 Train :  and F1 Scores :  Train : 1.0 Test : 0.9390681003584229

RandomForestClassifier with TfidfVectorizer Has Scores:  Train : 1.0 Test : 0.9846570397111913 Train :  and F1 Scores :  Train : 1.0 Test : 0.9390681003584229

RandomForestClassifier with HashingVectorizer Has Scores:  Train : 1.0 Test : 0.9774368231046932 Train :  and F1 Scores :  Train : 1.0 Test : 0.9077490774907748

AdaBoostCla

### checking the models performance by storing them into a pandas DataFrame

In [12]:
models_summary = pd.DataFrame(models_summary, columns = ['classifier', 'vectorizer', 'train_score', 
                                                        'test_score', 'train_f1_score', 'test_f1_score'])
models_summary.sort_values(['test_f1_score', 'train_f1_score'],  ascending = False)

Unnamed: 0,classifier,vectorizer,train_score,test_score,train_f1_score,test_f1_score
25,"PassiveAggressiveClassifier(C=1.0, average=Fal...","TfidfVectorizer(analyzer='word', binary=False,...",1.0,0.990072202166065,1.0,0.9624573378839592
22,"CalibratedClassifierCV(base_estimator=None, cv...","TfidfVectorizer(analyzer='word', binary=False,...",0.999548736462094,0.990072202166065,0.9983136593591906,0.9624573378839592
37,"OneVsRestClassifier(estimator=SVC(C=1.0, break...","TfidfVectorizer(analyzer='word', binary=False,...",0.9963898916967509,0.990072202166065,0.9863247863247864,0.9621993127147768
34,"SGDClassifier(alpha=0.0001, average=False, cla...","TfidfVectorizer(analyzer='word', binary=False,...",1.0,0.990072202166065,1.0,0.9619377162629756
21,"CalibratedClassifierCV(base_estimator=None, cv...","CountVectorizer(analyzer='word', binary=False,...",1.0,0.9882671480144404,1.0,0.9547038327526132
36,"OneVsRestClassifier(estimator=SVC(C=1.0, break...","CountVectorizer(analyzer='word', binary=False,...",0.9997743682310468,0.9882671480144404,0.9991546914623838,0.9543859649122808
23,"CalibratedClassifierCV(base_estimator=None, cv...","HashingVectorizer(alternate_sign=True, analyze...",0.9984205776173284,0.9873646209386282,0.9940728196443692,0.952054794520548
12,"(ExtraTreeClassifier(ccp_alpha=0.0, class_weig...","CountVectorizer(analyzer='word', binary=False,...",1.0,0.9873646209386282,1.0,0.950354609929078
13,"(ExtraTreeClassifier(ccp_alpha=0.0, class_weig...","TfidfVectorizer(analyzer='word', binary=False,...",1.0,0.9873646209386282,1.0,0.950354609929078
24,"PassiveAggressiveClassifier(C=1.0, average=Fal...","CountVectorizer(analyzer='word', binary=False,...",1.0,0.986462093862816,1.0,0.9477351916376306


## Reading the top 5 models to be used for Stacking

From the results we can see that the TFIDF Vectorizer is performing well when compared to others. So, will use TFIDF Vectorizer for vectorizing the data and will take 5 unique classifiers from above results to be used for stacking

In [13]:
top_models = models_summary.sort_values(['test_f1_score', 'train_f1_score'],  ascending = False) # Storing the sorted data

stacking_models = []
for model in top_models.values:
    """
    generating the stacking models with data stored in tuples with name and classifier object
    
    Will take the unique classifiers from top performing models
    """
    
    model_name = model[0].__class__.__name__.replace('(', '').replace(')', '')
    current_stack = [x[0][0] for x in zip(stacking_models)]
    
    if model_name in current_stack:
        #checking whether the model is in the stacking list or not
        pass
    
    else:
        stacking_models.append((model_name, model[0]))
        
    if len(stacking_models)>=5:
        
        #reading only 5 models
        break

print("Top 5 models : ", sep = '\n')
stacking_models

Top 5 models : 


[('PassiveAggressiveClassifier',
  PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
                              early_stopping=False, fit_intercept=True,
                              loss='hinge', max_iter=1000, n_iter_no_change=5,
                              n_jobs=-1, random_state=3, shuffle=True, tol=0.001,
                              validation_fraction=0.1, verbose=0,
                              warm_start=False)),
 ('CalibratedClassifierCV',
  CalibratedClassifierCV(base_estimator=None, cv=None, method='sigmoid')),
 ('OneVsRestClassifier',
  OneVsRestClassifier(estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                                    class_weight=None, coef0=0.0,
                                    decision_function_shape='ovr', degree=3,
                                    gamma='scale', kernel='linear', max_iter=-1,
                                    probability=True, random_state=3,
                                    shrinking=Tr

## Transforming the data with vectorizer and training the model

In [14]:
vectorized_ouput = vectorize(train_x.features)

trained_tfidf_vectorizer = vectorized_ouput['tfidf_vectorizer']

model_trained = train_model(vectorized_ouput['features'], train_x.label, stacking_models)

Trainging with 
['PassiveAggressiveClassifier', 'CalibratedClassifierCV', 'OneVsRestClassifier', 'SGDClassifier', 'ExtraTreesClassifier']
Training Score :  0.9995487364620939


## Checking Model performance on Test Data

In [15]:
vectorized_ouput = vectorize(val_x.features, train = False, tfidf_vectorizer = trained_tfidf_vectorizer)
print("Test Score : ", f1_score(model_trained['model'].predict(vectorized_ouput['features']), val_x.label))

Test Score :  0.9657534246575342


We can see that the stacked model and the top model given almost same performance

In [16]:
print("Confusion Matrix : ", sep = '\n')
confusion_matrix(model_trained['model'].predict(vectorized_ouput['features']), val_x.label)

Confusion Matrix : 


array([[957,   7],
       [  3, 141]], dtype=int64)

## Predicting the test data

In [17]:
#Vectorize Test data
test_vectorized_ouput = vectorize(test_data.features, train = False, tfidf_vectorizer = trained_tfidf_vectorizer)
#Predicting on Test data
test_data['predicted'] = model_trained['model'].predict(test_vectorized_ouput['features']).astype(int)

## Visualizing the predicted results

In [18]:
test_data['Result'] = test_data['predicted'].replace(1, 'spam').replace(0, 'ham')
test_data

Unnamed: 0,No,Result,sms,label,features,predicted
95,96,spam,Your free ringtone is waiting to be collected....,,your free ringtone is waiting to be collected ...,1
124,125,ham,I am going to sao mu today. Will be done only ...,,am going to sao mu today will be done only at 12,0
347,348,ham,Dis is yijue. I jus saw ur mail. In case huimi...,,dis is yijue jus saw ur mail in case huiming h...,0
567,568,ham,Oooh bed ridden ey? What are YOU thinking of?,,oooh bed ridden ey what are you thinking of,0
598,599,spam,You have an important customer service announc...,,you have an important customer service announc...,1
733,734,ham,Lol you won't feel bad when I use her money to...,,lol you won't feel bad when use her money to t...,0
940,941,ham,Better. Made up for Friday and stuffed myself ...,,better made up for friday and stuffed myself l...,0
1099,1100,ham,NO GIFTS!! You trying to get me to throw mysel...,,no gifts you trying to get me to throw myself ...,0
1154,1155,spam,1000's of girls many local 2 u who r virgins 2...,,1000's of girls many local who virgins this re...,1
1349,1350,ham,"Nothing much, chillin at home. Any super bowl ...",,nothing much chillin at home any super bowl plan,0


## Storing predicted results into a file

In [19]:
test_data[['No', 'Result']].to_csv('submission_v1.csv', header = True, index = False)