#### Nevetha N G
#### MDS202128

In [1]:
import sklearn
import numpy as np
import pandas as pd
import sklearn.metrics as met
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix

In [2]:
df1=pd.read_csv("training_data.csv")
df2=pd.read_csv("test_data.csv")

In [3]:
X_train=df1["X_train"]
y_train=df1["y_train"]
X_test=df2["X_test"]
y_test=df2["y_test"]

In [4]:
#fitting and transforming X_train using a Count Vectorizer with default parameters
vect = CountVectorizer().fit(X_train)
X_train_vectorized = vect.transform(X_train)

### 1) Multinomial Naive Bayes Classifier Model

In [5]:
#fitting a multinomial Naive Bayes Classifier Model with smoothing alpha=0.1
model1 = sklearn.naive_bayes.MultinomialNB(alpha=0.1)
model_fit = model1.fit(X_train_vectorized, y_train)
predictions1 = model1.predict(vect.transform(X_test))

In evaluating the model’s performance, we can generate some predictions then look at the confusion matrix and AUC-ROC score to evaluate performance on the test dataset.

In [6]:
#confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test, predictions1).ravel()
print(pd.DataFrame(confusion_matrix(y_test, predictions1),
             columns=['Predicted Spam', "Predicted Ham"], index=['Actual Spam', 'Actual Ham']))
print(f'\nTrue Positives: {tp}')
print(f'False Positives: {fp}')
print(f'True Negatives: {tn}')
print(f'False Negatives: {fn}')


print(f'True Positive Rate: { (tp / (tp + fn))}')
print(f'Specificity: { (tn / (tn + fp))}')
print(f'False Positive Rate: { (fp / (fp + tn))}')

             Predicted Spam  Predicted Ham
Actual Spam            1181              6
Actual Ham               11            196

True Positives: 196
False Positives: 6
True Negatives: 1181
False Negatives: 11
True Positive Rate: 0.9468599033816425
Specificity: 0.9949452401010952
False Positive Rate: 0.005054759898904802


In [7]:
#making predictions & looking at AUC score
aucscore = roc_auc_score(y_test, predictions1)
aucscore

0.9709025717413688

In [8]:
acc = met.accuracy_score(y_test, predictions1)
prec = met.precision_score(y_test, predictions1)
rec = met.recall_score(y_test, predictions1)
f1 = met.f1_score(y_test, predictions1)

model1_results = pd.DataFrame([['NB Classifier', acc, prec, rec, f1]],
               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
model1_results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,NB Classifier,0.987805,0.970297,0.94686,0.958435


### 2 Support Vector Classifier Model

In [9]:
#defining an additional function
def add_feature(X, feature_to_add):
    """
    Returns sparse feature matrix with added feature.
    feature_to_add can also be a list of features.
    """
    from scipy.sparse import csr_matrix, hstack
    return hstack([X, csr_matrix(feature_to_add).T], 'csr')

In [10]:
#fit and transfor x_train and X_test
vectorizer = TfidfVectorizer(min_df=5)
'''The goal of using tfidf is to scale down the impact of tokens that occur very frequently in a given corpus and 
that are hence empirically less informative than features that occur in a small fraction of the training corpus.'''

X_train_transformed = vectorizer.fit_transform(X_train)
X_train_transformed_with_length = add_feature(X_train_transformed, X_train.str.len())

X_test_transformed = vectorizer.transform(X_test)
X_test_transformed_with_length = add_feature(X_test_transformed, X_test.str.len())

In [11]:
# SVM creation
clf = SVC(C=10000)
model2=clf.fit(X_train_transformed_with_length, y_train)
predictions2 = clf.predict(X_test_transformed_with_length)

In evaluating the model’s performance, we can generate some predictions then look at the confusion matrix and AUC-ROC score to evaluate performance on the test dataset.

In [12]:
#confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test, predictions2).ravel()
print(pd.DataFrame(confusion_matrix(y_test, predictions2),
             columns=['Predicted Spam', "Predicted Ham"], index=['Actual Spam', 'Actual Ham']))
print(f'\nTrue Positives: {tp}')
print(f'False Positives: {fp}')
print(f'True Negatives: {tn}')
print(f'False Negatives: {fn}')


print(f'True Positive Rate: { (tp / (tp + fn))}')
print(f'Specificity: { (tn / (tn + fp))}')
print(f'False Positive Rate: { (fp / (fp + tn))}')

             Predicted Spam  Predicted Ham
Actual Spam            1184              3
Actual Ham               13            194

True Positives: 194
False Positives: 3
True Negatives: 1184
False Negatives: 13
True Positive Rate: 0.9371980676328503
Specificity: 0.9974726200505476
False Positive Rate: 0.002527379949452401


In [13]:
roc_auc_score(y_test, predictions2)

0.9673353438416991

In [14]:
acc = met.accuracy_score(y_test, predictions2)
prec = met.precision_score(y_test, predictions2)
rec = met.recall_score(y_test, predictions2)
f1 = met.f1_score(y_test, predictions2)

model2_results = pd.DataFrame([['Support Vector Classifier', acc, prec, rec, f1]],
               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
model2_results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Support Vector Classifier,0.988522,0.984772,0.937198,0.960396


### 3 Random Forest Model

In [15]:
model3 = RandomForestClassifier(random_state = 1,  n_estimators = 200,
                                    max_depth = 10, criterion = 'gini')
model3.fit(X_train_vectorized, y_train)
predictions3 = model3.predict(vect.transform(X_test))

In evaluating the model’s performance, we can generate some predictions then look at the confusion matrix and AUC-ROC score to evaluate performance on the test dataset.

In [16]:
#confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test, predictions3).ravel()
print(pd.DataFrame(confusion_matrix(y_test, predictions3),
             columns=['Predicted Spam', "Predicted Ham"], index=['Actual Spam', 'Actual Ham']))
print(f'\nTrue Positives: {tp}')
print(f'False Positives: {fp}')
print(f'True Negatives: {tn}')
print(f'False Negatives: {fn}')


print(f'True Positive Rate: { (tp / (tp + fn))}')
print(f'Specificity: { (tn / (tn + fp))}')
print(f'False Positive Rate: { (fp / (fp + tn))}')

             Predicted Spam  Predicted Ham
Actual Spam            1187              0
Actual Ham              162             45

True Positives: 45
False Positives: 0
True Negatives: 1187
False Negatives: 162
True Positive Rate: 0.21739130434782608
Specificity: 1.0
False Positive Rate: 0.0


In [17]:
roc_auc_score(y_test, predictions3)

0.6086956521739131

In [18]:
acc = met.accuracy_score(y_test, predictions3)
prec = met.precision_score(y_test, predictions3)
rec = met.recall_score(y_test, predictions3)
f1 = met.f1_score(y_test, predictions3)

model_results3 = pd.DataFrame([['Random Forest Gini (n=200)', acc, prec, rec, f1]],
               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

model_results3

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Random Forest Gini (n=200),0.883788,1.0,0.217391,0.357143


After training and testing these 3 models, it’s time to compare them. Comparing them based on AUC scores and Accuracy, we can see that The Naive Bayes Classifier had the highest scores, with the SVC model being marginally behind. 