**Hapuarachchi H.D.I.C.
 IT20617264**

Connecting to drive

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive so that files
# stored on Drive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd drive/MyDrive

/content/drive/MyDrive


Importing dependencies

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
# Load the labelled messages from spam.csv (resolved relative to the current
# working directory) and preview the first rows.
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Replace every missing cell with an empty string, then preview the result.
mail_data = df.fillna('')
mail_data.head()

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Dataset dimensions as (rows, columns).
mail_data.shape

(5572, 2)

In [8]:
# Summarise column names, non-null counts and dtypes to confirm that the data
# types are as expected.
mail_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Label      5572 non-null   object
 1   EmailText  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [9]:
# Count the missing values remaining in each column.
mail_data.isna().sum()

Label        0
EmailText    0
dtype: int64

In [10]:
# Class distribution: number of messages carrying each label.
mail_data['Label'].value_counts()

ham     4825
spam     747
Name: Label, dtype: int64

Label Encoding

In [11]:
# Encode the labels numerically: ham -> 1, spam -> 0.
#
# The encoded column is assigned back to `df` explicitly. Calling
# `replace(..., inplace=True)` on the selected column is a chained in-place
# mutation, which pandas has deprecated and which no longer updates `df`
# once Copy-on-Write is enabled.
#
# Values that are already encoded pass through unchanged, so re-running this
# cell is harmless; `astype('int64')` fails loudly on any unexpected label
# instead of silently keeping a string in the target column.
label_codes = {'ham': 1, 'spam': 0}
df['Label'] = (
    df['Label']
    .map(lambda label: label_codes.get(label, label))
    .astype('int64')
)
df.head()

Unnamed: 0,Label,EmailText
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
# Separate the independent variable (raw message text) from the dependent
# variable (encoded label). NumPy arrays are used for the downstream
# scikit-learn calls.
x = df['EmailText'].to_numpy()
y = df['Label'].to_numpy()
print(y)


[1 1 0 ... 1 1 1]


Splitting data into training and testing

In [13]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

# Report the size of every partition.
# NOTE(review): the captions say "transactions" although the rows are
# messages; the wording is left unchanged here.
for _caption, _part in (
    ("Number transactions X_train dataset: ", X_train),
    ("Number transactions y_train dataset: ", y_train),
    ("Number transactions X_test dataset: ", X_test),
    ("Number transactions y_test dataset: ", y_test),
):
    print(_caption, _part.shape)


Number transactions X_train dataset:  (4457,)
Number transactions y_train dataset:  (4457,)
Number transactions X_test dataset:  (1115,)
Number transactions y_test dataset:  (1115,)


Data preprocessing with CountVectorizer()

In [41]:
# Convert the raw message text into token-count features.
count_vector=CountVectorizer()

# The vocabulary is learned from the training messages only and then reused,
# unchanged, to encode the test messages.
x_train = count_vector.fit_transform(X_train)
x_test = count_vector.transform(X_test)

Dealing with class imbalance in the training data

In [42]:
# print("Before OverSampling, counts of label '1': {}".format(sum(y_train==1)))
# print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train==0)))

# Balance the classes by oversampling the training features with SMOTE.
# Only the training split is resampled; the test split is left as it is.
sm = SMOTE(random_state=0)
X_train_res, y_train_res = sm.fit_resample(x_train, y_train.ravel())

# Shapes of the resampled feature matrix and label vector.
print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

# Per-class counts after resampling.
for _caption, _label in (
    ("After OverSampling, counts of label '1': {}", 1),
    ("After OverSampling, counts of label '0': {}", 0),
):
    print(_caption.format((y_train_res == _label).sum()))

After OverSampling, the shape of train_X: (7752, 7619)
After OverSampling, the shape of train_y: (7752,) 

After OverSampling, counts of label '1': 3876
After OverSampling, counts of label '0': 3876


Validation set approach

In [43]:
# Baseline classifier: a support vector machine with a sigmoid kernel and
# C=1, before any hyperparameter search.
model=SVC(C=1, kernel='sigmoid')

In [44]:
# Train the baseline model on the SMOTE-resampled training set.
model.fit(X_train_res,y_train_res)

SVC(C=1, kernel='sigmoid')

In [45]:
# Predict labels for the held-out test messages and display them.
y_pred = model.predict(x_test)
y_pred

array([1, 1, 1, ..., 1, 1, 0])

In [46]:
# Accuracy of the baseline model on the held-out test set.
# scikit-learn metrics take (y_true, y_pred). The arguments were previously
# swapped, which happens to give the same value for accuracy but disagrees
# with the confusion_matrix / classification_report calls that follow and
# would give wrong results if copied to an asymmetric metric.
accuracy_score(y_test, y_pred)

0.8762331838565023

In [35]:
# Confusion matrix for the baseline model: rows are the true labels and
# columns the predicted labels, ordered 0 (spam) then 1 (ham).
confusion_matrix(y_test, y_pred)

array([[142,  24],
       [114, 835]])

In [47]:
# Per-class precision, recall and F1 for the baseline model on the test set.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.55      0.86      0.67       166
           1       0.97      0.88      0.92       949

    accuracy                           0.88      1115
   macro avg       0.76      0.87      0.80      1115
weighted avg       0.91      0.88      0.89      1115



In [48]:
# Hand-written sample messages used as a quick sanity check of the trained
# model. The strings are model inputs and are kept verbatim.
input_mail = [
"Hey, you have won a car !!!!. Conrgratzz",
"Dear applicant, Your CV has been recieved. Best regards",
"You have received $1000000 to your account",
"Join with our whatsapp group",
"Kindly check the previous email. Kind Regards"
]


In [49]:
# Encode the sample messages with the vocabulary fitted on the training data.
# NOTE(review): this rebinds `input_mail` from a list of strings to a feature
# matrix, so re-running this cell without first re-running the cell that
# defines the list feeds the matrix back into transform().
input_mail = count_vector.transform(input_mail)

In [50]:
# Predict a class for each sample message (1 = ham, 0 = spam).
model.predict(input_mail)

# expected output 
# ['spam', 'ham', 'spam', 'spam', 'ham']
# [ 0, 1, 0, 0, 1]

array([1, 1, 1, 0, 1])

**Hyperparameter Tuning**

K-fold cross-validation

In [None]:
# from sklearn.model_selection import KFold
#creating a KFold object with 5 splits 
# folds = KFold(n_splits = 5, shuffle = True, random_state = 0)

#instantiating a model with cost=1
# model = SVC(C = 1)

In [None]:
#computing the cross-validation scores 

# cv_results = cross_val_score(model, X_train_res,y_train_res, cv = folds, scoring = 'accuracy') 

In [None]:
# print 5 accuracies obtained from the 5 folds
# print(cv_results)
# print("mean accuracy = {}".format(cv_results.mean()))

Cross-validation and grid search

In [25]:
from sklearn.model_selection import GridSearchCV

# Candidate values for C, gamma and kernel. The search covers the full cross
# product: 3 C values x 5 gamma values x 3 kernels = 45 combinations.
params = {
    'C': [0.1, 1, 10],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf', 'linear', 'sigmoid'],
}

# Exhaustive grid search scored by accuracy with 5-fold cross-validation.
# refit=True retrains the best combination on all of the data given to fit(),
# n_jobs=-1 runs the fits in parallel on every available core, and verbose=3
# logs progress for each fit.
model_cv = GridSearchCV(
    estimator=SVC(),
    param_grid=params,
    scoring='accuracy',
    cv=5,
    refit=True,
    return_train_score=True,
    n_jobs=-1,
    verbose=3,
)

In [26]:
# Run the grid search on the SMOTE-resampled training set
# (45 candidates x 5 folds = 225 fits).
model_cv.fit(X_train_res, y_train_res) 

Fitting 5 folds for each of 45 candidates, totalling 225 fits


GridSearchCV(cv=5, estimator=SVC(), n_jobs=-1,
             param_grid={'C': [0.1, 1, 10],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf', 'linear', 'sigmoid']},
             return_train_score=True, scoring='accuracy', verbose=3)

In [43]:
# Pull the winning configuration out of the grid search.
# best_score_ is the mean cross-validated accuracy of the best candidate,
# measured on folds of the resampled training data. It is NOT an accuracy on
# the held-out test set, so the messages below no longer call it a "test"
# accuracy.
best_score = model_cv.best_score_
best_C = model_cv.best_params_['C']
best_gamma = model_cv.best_params_['gamma']
best_kernel = model_cv.best_params_['kernel']

print(" The highest mean cross-validation accuracy is {0}".format(best_score))
print(" Best parameters: C = {0}, gamma = {1}, kernel = {2}".format(best_C, best_gamma, best_kernel))

 The highest test accuracy is 0.9543383873047565 at C = 1
 The highest test accuracy is 0.9543383873047565 at gamma = 1
 The highest test accuracy is 0.9543383873047565 at kernel = linear


In [44]:
# Build an SVC from the best hyperparameters found by the grid search and
# train it on the SMOTE-resampled training set.
model = SVC(kernel=best_kernel, C=best_C, gamma=best_gamma)
model.fit(X_train_res, y_train_res)

# Predict labels for the held-out test messages and display them.
y_pred = model.predict(x_test)
y_pred

array([1, 0, 1, ..., 1, 1, 1])

In [49]:
# Mean accuracy of the tuned model on the held-out test set.
print(model.score(x_test,y_test))

0.9219730941704036


In [30]:
# Confusion matrix for the tuned model: rows are the true labels and columns
# the predicted labels, ordered 0 (spam) then 1 (ham).
confusion_matrix(y_test, y_pred)

array([[142,  24],
       [ 63, 886]])

In [31]:
# Per-class precision, recall and F1 for the tuned model on the test set.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.69      0.86      0.77       166
           1       0.97      0.93      0.95       949

    accuracy                           0.92      1115
   macro avg       0.83      0.89      0.86      1115
weighted avg       0.93      0.92      0.93      1115



In [32]:
# Hand-written sample messages for a quick sanity check of the tuned model.
input_mail = [
    "Hey, you have won a car !!!!. Conrgratzz",
    "Dear applicant, Your CV has been recieved. Best regards",
    "You have received $1000000 to your account",
    "Join with our whatsapp group",
    "Kindly check the previous email. Kind Regards",
]

# Encode the text with the vocabulary fitted on the training data.
input_data_features = count_vector.transform(input_mail)

# Predict a class for every message and show the raw numeric labels.
prediction = model.predict(input_data_features)
print(prediction)

# Translate each numeric label into a readable verdict
# (1 means ham, any other value is reported as spam).
for label in prediction:
    print('Ham mail' if label == 1 else 'Spam mail')

#expected outputs
#['spam', 'ham', 'spam', 'spam', 'ham']

[0 1 0 0 1]
Spam mail
Ham mail
Spam mail
Spam mail
Ham mail
