# SPAM or HAM?

Text message data is explored, and models are trained to predict whether a message is spam.

In [1]:
import pandas as pd
import numpy as np

# Load the labelled text messages from the CSV file in the working directory.
spam_data = pd.read_csv('spam.csv')

# Encode the label column numerically: 1 where the label is 'spam', 0 otherwise.
spam_data['target'] = (spam_data['target'] == 'spam').astype(int)

# Preview the first ten rows.
spam_data.head(10)

Unnamed: 0,text,target
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
5,FreeMsg Hey there darling it's been 3 week's n...,1
6,Even my brother is not like to speak with me. ...,0
7,As per your request 'Melle Melle (Oru Minnamin...,0
8,WINNER!! As a valued network customer you have...,1
9,Had your mobile 11 months or more? U R entitle...,1


In [2]:
from sklearn.model_selection import train_test_split

# Split message texts and labels into train and test sets using
# train_test_split's default proportions; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    spam_data['text'],
    spam_data['target'],
    random_state=0,
)

### How Much SPAM?

In [3]:
# Share of all messages (before the train/test split) labelled as spam, in percent.
total = len(spam_data)
spam = spam_data['target'].eq(1).sum()
spam_percent = (spam / total) * 100
spam_percent


13.406317300789663

In [4]:
# Fits a Count Vectorizer with default parameters on the training data `X_train`.
# Finds the longest token in the learned vocabulary.


from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer().fit(X_train)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` when it exists and fall back on older releases
# so the cell runs on either version.
get_names = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names
feature_list = list(get_names())

# `max` returns the first of several equally long tokens, in feature-list order.
longest = max(feature_list, key=len)
longest


'com1win150ppmx3age16subscription'

In [5]:
# Transforms the training data `X_train` with the already-fitted Count Vectorizer `vect`.
# Fits a multinomial Naive Bayes classifier model with smoothing `alpha=0.1`.
# Finds the area under the curve (AUC) score using the transformed test data.


from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score

# Document-term matrices for both splits, built from the same fitted vocabulary.
X_train_vectorized = vect.transform(X_train)
X_test_vectorized = vect.transform(X_test)

model = MultinomialNB(alpha=0.1).fit(X_train_vectorized, y_train)

# NOTE(review): the AUC below is computed from hard 0/1 class predictions,
# not from predicted probabilities.
predictions = model.predict(X_test_vectorized)

roc_auc_score(y_test, predictions)


0.97208121827411165

In [8]:
# Fits and transforms the training data `X_train` using a Tfidf Vectorizer with default parameters.
# Determines which 10 features have the smallest tf-idf and which 10 have the largest tf-idf,
# where a feature's tf-idf is its maximum value over all training documents.
# Displays a tuple of two series `(smallest tf-idfs series, largest tf-idfs series)` sorted by tf-idf value;
# ties keep the order in which the features appear in the series index.


from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer().fit(X_train)
X_train_vectorized = vect.transform(X_train)

# Max tf-idf for each feature, i.e. the max of each column in X_train_vectorized.
# `.max(0)` yields a 1 x n_features sparse matrix; `.toarray()[0]` flattens it
# to a 1d array with length = number of features.
values = X_train_vectorized.max(0).toarray()[0]

# Feature names from the fitted vectorizer. `get_feature_names()` was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer `get_feature_names_out()` when it
# exists and fall back on older releases.
get_names = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names
index = list(get_names())

# Series of max tf-idf values indexed by feature name.
features_series = pd.Series(values, index=index)

features_series.nsmallest(10), features_series.nlargest(10)


(aaniye          0.074475
 athletic        0.074475
 chef            0.074475
 companion       0.074475
 courageous      0.074475
 dependable      0.074475
 determined      0.074475
 exterminator    0.074475
 healer          0.074475
 listener        0.074475
 dtype: float64, 146tf150p    1.0
 645          1.0
 anything     1.0
 anytime      1.0
 beerage      1.0
 done         1.0
 er           1.0
 havent       1.0
 home         1.0
 lei          1.0
 dtype: float64)

In [9]:
# Fits a Tfidf Vectorizer on the training data `X_train`, ignoring terms that have a
# document frequency < 3, and transforms both the training and the test data with it.
# Fits a multinomial Naive Bayes classifier model with smoothing `alpha=0.1`.
# Computes the AUC score using the transformed test data.


vect = TfidfVectorizer(min_df=3)
vect.fit(X_train)

X_train_vectorized = vect.transform(X_train)
X_test_vectorized = vect.transform(X_test)

model = MultinomialNB(alpha=0.1).fit(X_train_vectorized, y_train)

# NOTE(review): the AUC is computed from hard 0/1 class predictions.
predictions = model.predict(X_test_vectorized)

roc_auc_score(y_test, predictions)


0.94162436548223349

In [10]:
# Finds the average document length (number of characters) for not-spam and spam documents.
# Displays a tuple (average length not spam, average length spam).


# Character count of every message, computed once and then split by label.
doc_lengths = spam_data['text'].str.len()

spam_len = doc_lengths[spam_data['target'] == 1].mean()
not_spam_len = doc_lengths[spam_data['target'] == 0].mean()

(not_spam_len, spam_len)

(71.02362694300518, 138.8661311914324)

In [11]:
def add_feature(X, feature_to_add):
    """
    Append one or more feature columns to a feature matrix.

    X is the existing feature matrix. feature_to_add is either a single
    feature (one value per row of X) or a list of such features; each
    feature becomes one new column on the right-hand side of X.

    Returns the combined matrix in CSR sparse format.
    """
    from scipy import sparse

    # A single feature is read as one row (several features as several rows),
    # so transpose to get one column per feature before stacking.
    extra_columns = sparse.csr_matrix(feature_to_add).transpose()
    return sparse.hstack([X, extra_columns], format='csr')

In [12]:
# Fits a Tfidf Vectorizer on the training data `X_train`, ignoring terms that have
# a document frequency < 5, and transforms both the training and the test data with it.
# Uses an additional feature, the length of document (number of characters).
# Fits a Support Vector Classification model with regularization `C=10000`.
# Computes the AUC score using the transformed test data.


from sklearn.svm import SVC

vect = TfidfVectorizer(min_df=5)
vect.fit(X_train)

# Append the character count of each document as one extra column after the tf-idf columns.
X_train_vectorized = add_feature(vect.transform(X_train), X_train.str.len())
X_test_vectorized = add_feature(vect.transform(X_test), X_test.str.len())

model = SVC(C=10000).fit(X_train_vectorized, y_train)

# NOTE(review): the AUC is computed from hard 0/1 class predictions.
predictions = model.predict(X_test_vectorized)
AUC_SVC = roc_auc_score(y_test, predictions)

AUC_SVC
    

0.95813668234215565

In [13]:
# Finds the average number of digits per document for not-spam and spam documents.
# Displays a tuple (average # digits not spam, average # digits spam).

# The pattern is a raw string: `\d` inside a plain string literal is an invalid
# escape sequence, which recent Python versions warn about.
spam_digit = spam_data.loc[(spam_data['target'] == 1), 'text'].str.count(r'\d').mean()
not_spam_digit = spam_data.loc[(spam_data['target'] == 0), 'text'].str.count(r'\d').mean()

(not_spam_digit, spam_digit)
    

(0.2992746113989637, 15.759036144578314)

In [14]:
# Fits and transforms the training data `X_train` using a Tfidf Vectorizer ignoring terms that have
# a document frequency < 5 and using word n-grams from n=1 to n=3.
# Adds the following features, in this column order after the tf-idf columns:
#   1. length of document (number of characters)
#   2. number of digits per document
# Fits a Logistic Regression model with regularization `C=100`.
# Computes the AUC score using the transformed test data.



from sklearn.linear_model import LogisticRegression

vect = TfidfVectorizer(min_df=5, ngram_range=(1,3)).fit(X_train)
X_train_vectorized = vect.transform(X_train)
X_test_vectorized = vect.transform(X_test)

# Regex patterns are raw strings: `\d` inside a plain string literal is an invalid
# escape sequence, which recent Python versions warn about.
X_train_vectorized = add_feature(X_train_vectorized, X_train.str.len())
X_train_vectorized = add_feature(X_train_vectorized, X_train.str.count(r'\d'))

# The test matrix gets the same extra columns in the same order as the training matrix.
X_test_vectorized = add_feature(X_test_vectorized, X_test.str.len())
X_test_vectorized = add_feature(X_test_vectorized, X_test.str.count(r'\d'))

model = LogisticRegression(C=100)
model.fit(X_train_vectorized, y_train)

# NOTE(review): the AUC is computed from hard 0/1 class predictions.
predictions = model.predict(X_test_vectorized)

AUC_LR = roc_auc_score(y_test, predictions)
AUC_LR


0.96533283533945646

In [15]:
# Finds the average number of non-word characters (anything other than a letter, digit or underscore)
# per document for not-spam and spam documents.
# Displays a tuple (average # non-word characters not spam, average # non-word characters spam).


# The pattern is a raw string: `\W` inside a plain string literal is an invalid
# escape sequence, which recent Python versions warn about.
spam_non_word = spam_data.loc[(spam_data['target'] == 1), 'text'].str.count(r'\W').mean()
not_spam_non_word = spam_data.loc[(spam_data['target'] == 0), 'text'].str.count(r'\W').mean()

(not_spam_non_word, spam_non_word)


(17.29181347150259, 29.041499330655956)

In [16]:
# Fits and transforms the training data X_train using a Count Vectorizer ignoring terms that have
# a document frequency < 5 and using character n-grams from n=2 to n=5.

# Character n-grams are requested with `analyzer='char_wb'`, which creates character n-grams only
# from text inside word boundaries to make the model more robust to spelling mistakes.

# Uses the following additional features, in this column order after the n-gram columns:
#   1. length of document (number of characters)
#   2. number of digits per document
#   3. number of non-word characters (anything other than a letter, digit or underscore)

# Fits a Logistic Regression model with regularization C=100.
# Computes the AUC score using the transformed test data.

# Finds the 10 smallest and 10 largest coefficients from the model.
# The list of 10 smallest coefficients is sorted smallest first,
# the list of 10 largest coefficients is sorted largest first.

# Displays a tuple `(AUC score as a float, smallest coefs list, largest coefs list)`.



vect = CountVectorizer(min_df=5, ngram_range=(2,5), analyzer='char_wb').fit(X_train)
X_train_vectorized = vect.transform(X_train)
X_test_vectorized = vect.transform(X_test)

# Regex patterns are raw strings: `\d` and `\W` inside plain string literals are
# invalid escape sequences, which recent Python versions warn about.
X_train_vectorized = add_feature(X_train_vectorized, X_train.str.len())
X_train_vectorized = add_feature(X_train_vectorized, X_train.str.count(r'\d'))
X_train_vectorized = add_feature(X_train_vectorized, X_train.str.count(r'\W'))

# The test matrix gets the same extra columns in the same order as the training matrix.
X_test_vectorized = add_feature(X_test_vectorized, X_test.str.len())
X_test_vectorized = add_feature(X_test_vectorized, X_test.str.count(r'\d'))
X_test_vectorized = add_feature(X_test_vectorized, X_test.str.count(r'\W'))

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# `get_feature_names_out()` when it exists. Converting to a list keeps `+` as list
# concatenation, because the newer method returns an array.
get_names = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names
# The appended names must follow the order in which the extra columns were added above.
feature_names = np.array(list(get_names()) + ['length_of_doc', 'digit_count', 'non_word_char_count'])

model = LogisticRegression(C=100)
model.fit(X_train_vectorized, y_train)

# NOTE(review): the AUC is computed from hard 0/1 class predictions.
predictions = model.predict(X_test_vectorized)
AUC = roc_auc_score(y_test, predictions)

# Column indices ordered from the smallest to the largest coefficient.
sorted_coef_index = model.coef_[0].argsort()

smallest_coefs = feature_names[sorted_coef_index[:10]]
# Walk backwards from the end to take the 10 largest, largest first.
largest_coefs = feature_names[sorted_coef_index[:-11:-1]]

(AUC, list(smallest_coefs), list(largest_coefs))

(0.97885931107074342,
 ['. ', '..', '? ', ' i', ' y', ' go', ':)', ' h', 'go', ' m'],
 ['digit_count', 'ne', 'ia', 'co', 'xt', ' ch', 'mob', ' x', 'ww', 'ar'])