In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score

spam_data = pd.read_csv('spam.csv')


spam_data['target'] = np.where(spam_data['target']=='spam',1,0)
spam_data.head(10)

Unnamed: 0,text,target
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
5,FreeMsg Hey there darling it's been 3 week's n...,1
6,Even my brother is not like to speak with me. ...,0
7,As per your request 'Melle Melle (Oru Minnamin...,0
8,WINNER!! As a valued network customer you have...,1
9,Had your mobile 11 months or more? U R entitle...,1


In [3]:



X_train, X_test, y_train, y_test = train_test_split(spam_data['text'], 
                                                    spam_data['target'], 
                                                    random_state=0)
                                                    

Defining the neccessary functions needed 

In [4]:
def add_feature(X, feature_to_add):
    """
    Returns sparse feature matrix with added feature.
    feature_to_add can also be a list of features.
    """
    # I think this function takes a list of features(words) and then adds them to the columns to be used for the vectorization and matrix thing
    from scipy.sparse import csr_matrix, hstack
    return hstack([X, csr_matrix(feature_to_add).T], 'csr')

def non_word_char_counter(doc):
    '''Counts the number of non  characters in a string'''
    count=len(re.findall('\W', doc))
    return count
def digit_counter(doc):
    '''
    checks to see if a character is a number.
    If true adds one to the counter or else it continues the loop
    '''
    count=0
    for char in doc:
        if char.isnumeric():
            count+=1
        else:continue
    return count       

In [5]:
#Converting the training and testing series to dataframes so the other features can be added for further processing
x_train_df= X_train.to_frame()
x_test_df=X_test.to_frame()

In [6]:
 #Creating the new columns
#length_of_doc
x_train_df['length_of_doc']= x_train_df['text'].apply(lambda x:len(x))
x_test_df['length_of_doc']= x_test_df['text'].apply(lambda x:len(x))  

#Non character counter
x_train_df['non_word_char_count']= x_train_df['text'].apply(non_word_char_counter)
x_test_df['non_word_char_count']= x_test_df['text'].apply(non_word_char_counter)  

#Number counter
x_train_df['digit_count']= x_train_df['text'].apply(digit_counter)
x_test_df['digit_count']= x_test_df['text'].apply(digit_counter)  

#Vectorization and transformation
vect= CountVectorizer(analyzer='char_wb',ngram_range=(2,5)).fit(x_train_df['text'])
x_train_transform= vect.transform(x_train_df['text'])
x_test_transform= vect.transform(x_test_df['text'])

#Adding up the features
x_train_fin= add_feature(x_train_transform,[x_train_df['length_of_doc'],x_train_df['non_word_char_count'],x_train_df['digit_count']])
x_test_fin= add_feature(x_test_transform,[x_test_df['length_of_doc'],x_test_df['non_word_char_count'],x_test_df['digit_count']])


In [9]:
#Fitting the transformed features to a Logistics Regression model
logreg= LogisticRegression(C=100,solver='liblinear',random_state=0).fit(x_train_fin,y_train)
predictions= logreg.decision_function(x_test_fin)
auc_score= roc_auc_score(y_test, predictions)
print('This is the Area under the curve score {}. This shows the model is a good fit and will be great at predicting whether or not an email is a spam'.format(auc_score))

This is the Area under the curve score 0.9951912466258084. This shows the model is a good fit and will be great at predicting whether or not an email is a spam


In [10]:
 #Getting the features plus the newly added columns
features= vect.get_feature_names_out() 
tot_feats= np.append(features,np.array(['length_of_doc', 'digit_count', 'non_word_char_count']))
coeff= logreg.coef_
#Sorting the coefficients by with their indicies
sort_coef= coeff[0].argsort()

small_features= tot_feats[sort_coef[:10]]
large_features= tot_feats[sort_coef[-10:]]

print('Words with the least weight in predicting whether or not the email is a spam {}'.format(small_features))
print('words with the biggest weight in predicting whether or not the email is a spam {}'. format(large_features))

Words with the least weight in predicting whether or not the email is a spam ['. ' '..' '? ' ' i' ' y' ' go' ':)' ' h' ' ok' 'he']
words with the biggest weight in predicting whether or not the email is a spam [' x' ' ch' ' a ' 'ar' 'ww' 'mob' 'xt' 'co' 'ne' 'non_word_char_count']
