In [1]:
import re

import pandas as pd
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv('data/ling_spam_classification.csv')
df

Unnamed: 0,subject,message,label
0,job posting - apple-iss research center,content - length : 3386 apple-iss research cen...,0
1,,"lang classification grimes , joseph e . and ba...",0
2,query : letter frequencies for text identifica...,i am posting this inquiry for sergei atamas ( ...,0
3,risk,a colleague and i are researching the differin...,0
4,request book information,earlier this morning i was on the phone with a...,0
...,...,...,...
2888,love your profile - ysuolvpv,hello thanks for stopping by ! ! we have taken...,1
2889,you have been asked to join kiddin,"the list owner of : "" kiddin "" has invited you...",1
2890,anglicization of composers ' names,"judging from the return post , i must have sou...",0
2891,"re : 6 . 797 , comparative method : n - ary co...",gotcha ! there are two separate fallacies in t...,0


In [3]:
df.isnull().sum()

subject    62
message     0
label       0
dtype: int64

In [4]:
df = df.drop('subject',axis=1)
df['message'] = df['message'].str.lower()
df

Unnamed: 0,message,label
0,content - length : 3386 apple-iss research cen...,0
1,"lang classification grimes , joseph e . and ba...",0
2,i am posting this inquiry for sergei atamas ( ...,0
3,a colleague and i are researching the differin...,0
4,earlier this morning i was on the phone with a...,0
...,...,...
2888,hello thanks for stopping by ! ! we have taken...,1
2889,"the list owner of : "" kiddin "" has invited you...",1
2890,"judging from the return post , i must have sou...",0
2891,gotcha ! there are two separate fallacies in t...,0


In [5]:
def decontact(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [6]:
def process_string(stringg):
    stringg = decontact(stringg)

    # REPLACING NEW LINES BY 'WHITE SPACE'
    stringg = re.sub(r'\n', " ", stringg)
    # REPLACING NUMBERS
    stringg = re.sub(r'\d+(\.\d+)?', 'numbers', stringg)
    # REPLACING EMAIL IDs BY 'MAILID'
    stringg = re.sub(r'^.+@[^\.].*\.[a-z]{2,}$','MailID', stringg)
    # REPLACING URLs  BY 'Links'
    stringg = re.sub(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','Links', stringg)
    # REPLACING CURRENCY SIGNS BY 'MONEY'
    stringg = re.sub(r'£|\$', 'Money', stringg)
    # REPLACING LARGE WHITE SPACE BY SINGLE WHITE SPACE
    stringg = re.sub(r'\s+', ' ', stringg)
    # REPLACING LEADING AND TRAILING WHITE SPACE BY SINGLE WHITE SPACE
    stringg = re.sub(r'^\s+|\s+?$', '', stringg)
    # REPLACING CONTACT NUMBERS
    stringg = re.sub(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','contact number', stringg)
    # REPLACING SPECIAL CHARACTERS  BY WHITE SPACE 
    stringg = re.sub(r"[^a-zA-Z0-9]+", " ", stringg)

    return stringg

In [7]:
df['message'] = df['message'].apply(process_string)
df

Unnamed: 0,message,label
0,content length numbers apple iss research cent...,0
1,lang classification grimes joseph e and barbar...,0
2,i am posting this inquiry for sergei atamas sa...,0
3,a colleague and i are researching the differin...,0
4,earlier this morning i was on the phone with a...,0
...,...,...
2888,hello thanks for stopping by we have taken man...,1
2889,the list owner of kiddin has invited you to jo...,1
2890,judging from the return post i must have sound...,0
2891,gotcha there are two separate fallacies in the...,0


In [8]:
# removing stopwords 
stop = stopwords.words('english')
df['clean_message'] = df['message'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df['length'] = df['clean_message'].apply(len)
df

Unnamed: 0,message,label,clean_message,length
0,content length numbers apple iss research cent...,0,content length numbers apple iss research cent...,2071
1,lang classification grimes joseph e and barbar...,0,lang classification grimes joseph e barbara f ...,1490
2,i am posting this inquiry for sergei atamas sa...,0,posting inquiry sergei atamas satamas umabnet ...,1102
3,a colleague and i are researching the differin...,0,colleague researching differing degrees risk p...,208
4,earlier this morning i was on the phone with a...,0,earlier morning phone friend mine living south...,629
...,...,...,...,...
2888,hello thanks for stopping by we have taken man...,1,hello thanks stopping taken many new pics made...,132
2889,the list owner of kiddin has invited you to jo...,1,list owner kiddin invited join mailing list li...,1225
2890,judging from the return post i must have sound...,0,judging return post must sounded like kind sel...,639
2891,gotcha there are two separate fallacies in the...,0,gotcha two separate fallacies argument n ary c...,1933


In [9]:
X = df.clean_message
y = df.label
X.shape, y.shape

((2893,), (2893,))

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=225)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2314,), (579,), (2314,), (579,))

In [11]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer

In [12]:
naive_bayes_model = MultinomialNB()
naive_bayes_model

In [13]:
model_pipeline = Pipeline([('vectorizer', tfidf_vectorizer), ('classifier', naive_bayes_model)])
model_pipeline

In [14]:
model_pipeline.fit(X_train, y_train)

In [15]:
y_pred = model_pipeline.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,

In [16]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       1.00      0.86      0.93       559
           1       0.20      1.00      0.34        20

    accuracy                           0.87       579
   macro avg       0.60      0.93      0.63       579
weighted avg       0.97      0.87      0.90       579

