In [349]:
# Importing Libraries 

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import re

# Functions 

def add_feature(X, feature_to_add):
    """
    Append extra feature column(s) to the right of a sparse matrix.

    Parameters
    ----------
    X : sparse matrix with one row per sample.
    feature_to_add : a single sequence holding one value per sample, or a
        list of such sequences (each one becomes its own column).

    Returns
    -------
    A CSR matrix made of the columns of ``X`` followed by the new column(s).
    """
    from scipy.sparse import csr_matrix, hstack

    # csr_matrix() lays each feature out as a row; transposing turns every
    # feature into a column so it can be stacked next to X.
    new_columns = csr_matrix(feature_to_add).transpose()
    return hstack([X, new_columns], format='csr')

### Importing and splitting data

In [350]:
# Read the corpus and recode the 'target' column as 1 where it equals 'spam', 0 otherwise.
spam_data = pd.read_csv('spam.csv').assign(
    target=lambda frame: np.where(frame['target'] == 'spam', 1, 0)
)
# NOTE(review): this preview is not the last expression of the cell, so the
# notebook never displays it.
spam_data.head(10)
# Hold out a test split (train_test_split default size); random_state=0 keeps it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    spam_data['text'],
    spam_data['target'],
    random_state=0,
)

### Data Exploration

In [351]:
# What percentage of the documents in spam_data are spam?
# Count the rows flagged as spam (target == 1) and scale by the total row count.
ratio_spam = int((spam_data['target'] == 1).sum()) * 100 / spam_data.shape[0]
print('{:3.2f}% of the texts are SPAM texts.'.format(ratio_spam))

# What is the longest token in the vocabulary?
# CountVectorizer converts a collection of text documents to a matrix of token counts.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X_train)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is the supported accessor for the fitted vocabulary.
# Pairing each token with its length lets max() pick the longest one
# (ties are broken by the lexicographically largest token).
my_list = [(len(w), w) for w in vectorizer.get_feature_names_out()]
print(f'The longest word in the data is : {max(my_list)[1]}\n')

# What is the average length of documents (number of characters) for not spam and spam documents?
# Work on a copy that carries the character count of every message.
df_spam = spam_data.assign(lenght=spam_data['text'].str.len())
avgspam = df_spam.loc[df_spam['target'] == 1, 'lenght'].mean()
avgnspam = df_spam.loc[df_spam['target'] == 0, 'lenght'].mean()
print('The avg lenght of documents for spam documents: {:.2f}'.format(avgspam))
print('The avg lenght of documents for not spam documents: {:.2f}\n'.format(avgnspam))

# What is the average number of digits per document for not spam and spam documents?

df_spam1 = spam_data.copy()
# Raw string: '\d' in a plain literal is an invalid escape sequence and
# triggers a SyntaxWarning on recent Python versions.
# .str.len() (instead of map(len)) also tolerates missing texts, which yield
# NaN rather than raising a TypeError.
df_spam1['digits'] = df_spam1['text'].str.findall(r'\d').str.len()
avgdigspam = df_spam1[df_spam1.target == 1].digits.mean()
avgdignspam = df_spam1[df_spam1.target == 0].digits.mean()
print('The average number of digits per document for  spam: {:.2f}'.format(avgdigspam))
print('The average number of digits per document for not spam : {:.2f}\n'.format(avgdignspam))

# What is the average number of non-word characters per document for not spam and spam documents?
# A non-word character is anything matched by the regex class \W.
df_spam2 = spam_data.assign(
    nn_word=spam_data['text'].str.findall(r'\W').str.len()
)
avgnwspam = df_spam2.loc[df_spam2.target == 1, 'nn_word'].mean()
avgnwnspam = df_spam2.loc[df_spam2.target == 0, 'nn_word'].mean()
print('The average number of non-word characters per document for spam: {:.2f}'.format(avgnwspam))
print('The average number of non-word characters per document for not spam : {:.2f}\n'.format(avgnwnspam))

# What 20 features have the smallest tf-idf and what 20 have the largest tf-idf?
# TfidfVectorizer converts the documents to a matrix of TF-IDF weights
# (token counts scaled down for tokens that occur in many documents).
vec = TfidfVectorizer().fit(X_train)
X_trainVectf = vec.transform(X_train)
# get_feature_names() was removed in scikit-learn 1.2.
feature_names = np.asarray(vec.get_feature_names_out())

# Highest tf-idf reached by each feature over all training documents.
# The reduction runs on the sparse matrix; only the resulting single row is
# densified, never the whole document-term matrix.
max_tfidf = X_trainVectf.max(axis=0).toarray().ravel()
sorted_tfidf_index = max_tfidf.argsort()

# Index values and names with the same positions so they always stay paired.
smallest_positions = sorted_tfidf_index[:20]
# [-20:] keeps the very last (largest) entry; the former [-21:-1] slice
# silently dropped the feature with the highest tf-idf.
largest_positions = sorted_tfidf_index[-20:]

smallest_tf_idfs = pd.Series(max_tfidf[smallest_positions], index=feature_names[smallest_positions])
# Sort by name first, then stable-sort by value so ties stay alphabetical.
smallest_tf_idfs = smallest_tf_idfs.sort_index().sort_values(kind='mergesort')

largest_tf_idfs = pd.Series(max_tfidf[largest_positions], index=feature_names[largest_positions])
# Same two-step sort, descending on both keys.
largest_tf_idfs = largest_tf_idfs.sort_index(ascending=False).sort_values(ascending=False, kind='mergesort')

print('The 20 words with SMALLEST tfidf :\n{}\n'.format(smallest_tf_idfs))
print('The 20 words with LARGEST tfidf : \n{}\n\n'.format(largest_tf_idfs))

13.41% of the texts are SPAM texts.
The longest word in the data is : com1win150ppmx3age16subscription

The avg lenght of documents for spam documents: 138.87
The avg lenght of documents for not spam documents: 71.02

The average number of digits per document for  spam: 15.76
The average number of digits per document for not spam : 0.30

The average number of non-word characters per document for spam: 29.04
The average number of non-word characters per document for not spam : 17.29

The 20 words with SMALLEST tfidf :
aaniye          0.074475
athletic        0.074475
chef            0.074475
companion       0.074475
courageous      0.074475
dependable      0.074475
determined      0.074475
exterminator    0.074475
healer          0.074475
listener        0.074475
organizer       0.074475
pest            0.074475
psychiatrist    0.074475
psychologist    0.074475
pudunga         0.074475
stylist         0.074475
sympathetic     0.074475
venaam          0.074475
diwali          0.091250
mo

### Modelling

In [352]:
# Modelling with CountVectorizer and MultinomialNB
# Token counts (min_df=3) feed a Multinomial Naive Bayes classifier (alpha=0.1).
vec = CountVectorizer(min_df=3).fit(X_train)
X_trainVectorized = vec.transform(X_train)
clfMNB = MultinomialNB(alpha=0.1).fit(X_trainVectorized, y_train)
# The AUC below is computed from the hard class labels returned by predict().
y_predtest = clfMNB.predict(vec.transform(X_test))
score1 = roc_auc_score(y_test, y_predtest)
print('The AUC score using CountVectorizer and Multinomial Naive Bayes is {:.2f}.\n'.format(score1))

# Modelling with TfidfVectorizer and MultinomialNB
# Same classifier settings as the count-based model, but on TF-IDF weights (min_df=3).

vec = TfidfVectorizer(min_df=3)
X_trainVectTf = vec.fit_transform(X_train)
ClfMNB = MultinomialNB(alpha=0.1).fit(X_trainVectTf, y_train)
# Score the hard class labels predicted for the held-out documents.
ypred = ClfMNB.predict(vec.transform(X_test))
score2 = roc_auc_score(y_test, ypred)
print('The AUC score using TfidfVectorizer and Multinomial Naive Bayes is {:.2f}.\n'.format(score2))

# Modelling with TfidfVectorizer, adding one feature (length of document) and SVM
# TF-IDF weights (min_df=5) are extended with one extra column holding the
# character count of each document.

vec = TfidfVectorizer(min_df=5)
X_traintransformed = vec.fit_transform(X_train)
X_traintrans_add = add_feature(X_traintransformed, X_train.str.len())
# The test split goes through the same vectorizer and gets the same extra column.
Xtesttransformedadd = add_feature(vec.transform(X_test), X_test.str.len())

ClfSVC = SVC(C=10000).fit(X_traintrans_add, y_train)
ypred = ClfSVC.predict(Xtesttransformedadd)
score3 = roc_auc_score(y_test, ypred)
print('The AUC score using TfidfVectorizer, adding one feature and SVM is {:.2f}.\n'.format(score3))

# Modelling with TfidfVectorizer, adding 2 features (length of document & number of digits per document) and LogisticRegression
# Word 1- to 3-grams (min_df=5) weighted by TF-IDF, plus two extra columns.
vec = TfidfVectorizer(min_df=5, ngram_range=(1, 3))
Xtraintrans = vec.fit_transform(X_train)
Xtesttrans = vec.transform(X_test)

# Extra column 1: character count of the document.
Xtraintrans_add1 = add_feature(Xtraintrans, X_train.str.len())
Xtesttrans_add1 = add_feature(Xtesttrans, X_test.str.len())

# Extra column 2: how many digits the document contains.
Xtraintrans_add2 = add_feature(Xtraintrans_add1, X_train.str.findall(r'\d').str.len())
Xtesttrans_add2 = add_feature(Xtesttrans_add1, X_test.str.findall(r'\d').str.len())

ClfLR = LogisticRegression(C=100, max_iter=1000).fit(Xtraintrans_add2, y_train)
ypred = ClfLR.predict(Xtesttrans_add2)
score4 = roc_auc_score(y_test, ypred)
print('The AUC score using TfidfVectorizer, adding two features and Logistic Regression is {:.2f}.\n'.format(score4))

# Modelling with CountVectorizer, adding 3 features and LogisticRegression
# Character n-grams of 2 to 5 characters (analyzer='char_wb', min_df=5) plus
# three extra columns: document length, digit count and non-word character count.

vect = CountVectorizer(min_df=5, ngram_range=(2, 5), analyzer='char_wb').fit(X_train)
Xtraintrans = vect.transform(X_train)
Xtesttrans = vect.transform(X_test)

Xtraintrans_add1 = add_feature(Xtraintrans, X_train.str.len())
Xtraintrans_add2 = add_feature(Xtraintrans_add1, X_train.str.findall(r'\d').str.len())
Xtraintrans_add3 = add_feature(Xtraintrans_add2, X_train.str.findall(r'\W').str.len())

ClfLR = LogisticRegression(C=100, max_iter=1000).fit(Xtraintrans_add3, y_train)

# The test split receives the same three columns, in the same order.
Xtesttrans_add1 = add_feature(Xtesttrans, X_test.str.len())
Xtesttrans_add2 = add_feature(Xtesttrans_add1, X_test.str.findall(r'\d').str.len())
Xtesttrans_add3 = add_feature(Xtesttrans_add2, X_test.str.findall(r'\W').str.len())

ypred = ClfLR.predict(Xtesttrans_add3)
score = roc_auc_score(y_test, ypred)

# Names must follow the column order of the training matrix: vocabulary first,
# then the three appended columns.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns an array, so it is converted to a list before concatenation.
feature_names = np.array(
    list(vect.get_feature_names_out())
    + ['length_of_doc', 'digit_count', 'non_word_char_count']
)
# Rank the coefficients once (ascending) and reuse the ordering for both ends.
coef_order = ClfLR.coef_[0].argsort()
smallest_coef = feature_names[coef_order[:10]]
largest_coef = feature_names[coef_order[:-11:-1]]

# Report this model's own score; the previous version printed score4, i.e.
# the result of the two-feature TF-IDF model.
print('The AUC score using CountVectorizer, adding three features and Logistic Regression is {:.2f}'.format(score))
print('   => The 10 largest coefficients from this model : {}'.format(largest_coef))
print('   => The 10 smallest coefficients from this model :{}'.format(smallest_coef))


The AUC score using CountVectorizer and Multinomial Naive Bayes is 0.97.

The AUC score using TfidfVectorizer and Multinomial Naive Bayes is 0.94.

The AUC score using TfidfVectorizer, adding one feature and SVM is 0.97.

The AUC score using TfidfVectorizer, adding two features and Logistic Regression is 0.97.

The AUC score using CountVectorizer, adding three features and Logistic Regression is 0.97
   => The 10 largest coefficients from this model : ['digit_count' 'ia' ' r' 'xt' 'ne' 'co' ' ba' ' x' 'ian ' '46']
   => The 10 smallest coefficients from this model :[' i' 'ca' '..' '. ' 'pe' ' go' ' m' 'if' 'us' 'go']
