In [1]:
# importing all the libraries required
import string
from functools import lru_cache

import nltk
#nltk.download('punkt')
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split

In [2]:
#1.################################################################################################################
# load the spam/ham csv file into a dataframe
df = pd.read_csv(filepath_or_buffer='spam_ham_dataset.csv')
def tok_words(text):
    '''
    word tokenization

    splits `text` with nltk.word_tokenize and returns the tokens
    joined back into one string, separated by single spaces
    '''
    return ' '.join(nltk.word_tokenize(text))
# tokenise every message in the text column
df['text'] = df['text'].apply(tok_words)
# for easy demonstration show the result for the row labelled 0
print(df.loc[0, 'text'])

Subject : enron methanol ; meter # : 988291 this is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary flow data provided by daren } . please override pop ' s daily volume { presently zero } to reflect daily activity you can obtain from gas control . this change is needed asap for economics purposes .


In [3]:
#2.################################################################################################################
@lru_cache(maxsize=1)
def _english_stopwords():
    '''
    load the English stopword list once and keep it as a frozenset
    '''
    return frozenset(stopwords.words('english'))


def remove_sw_pun(mess):                                    # removing stopwords and punctuation from the text
    '''
    1.remove every character that appears in string.punctuation
    2.split the remaining text on whitespace
    3.drop the words found in the English stopword list
      (exact match, so the comparison is case-sensitive)
    4.return list of clean words
    '''
    nopunc = ''.join(c for c in mess if c not in string.punctuation)
    # The stopword list is loaded once and cached; the original re-read it
    # and scanned it linearly for every single word of every message.
    stop_set = _english_stopwords()
    return [word for word in nopunc.split() if word not in stop_set]
# replace every message by its list of cleaned words
df['text'] = df['text'].apply(remove_sw_pun)
# for easy demonstration show the result for the row labelled 0
print(df.loc[0, 'text'])

['Subject', 'enron', 'methanol', 'meter', '988291', 'follow', 'note', 'gave', 'monday', '4', '3', '00', 'preliminary', 'flow', 'data', 'provided', 'daren', 'please', 'override', 'pop', 'daily', 'volume', 'presently', 'zero', 'reflect', 'daily', 'activity', 'obtain', 'gas', 'control', 'change', 'needed', 'asap', 'economics', 'purposes']


In [4]:
#3.################################################################################################################
# one shared PorterStemmer instance, used by stemming_words
stemmer = PorterStemmer()
def stemming_words(g):
    '''
    stem every word of the sequence `g` with the module-level stemmer
    and return the stemmed words as a new list, in the same order
    '''
    return [stemmer.stem(word) for word in g]
# stem the word list of every message
df['text'] = df['text'].apply(stemming_words)
# for easy demonstration show the result for the row labelled 0
print(df.loc[0, 'text'])

['subject', 'enron', 'methanol', 'meter', '988291', 'follow', 'note', 'gave', 'monday', '4', '3', '00', 'preliminari', 'flow', 'data', 'provid', 'daren', 'pleas', 'overrid', 'pop', 'daili', 'volum', 'present', 'zero', 'reflect', 'daili', 'activ', 'obtain', 'ga', 'control', 'chang', 'need', 'asap', 'econom', 'purpos']


In [5]:
def list_to_sent(my_list):
    '''
    join a list of words back into one sentence, separated by single spaces
    '''
    return ' '.join(my_list)
# turn every word list back into a single string
df['text'] = df['text'].apply(list_to_sent)
# for easy demonstration show the result for the row labelled 0
print(df.loc[0, 'text'])

subject enron methanol meter 988291 follow note gave monday 4 3 00 preliminari flow data provid daren pleas overrid pop daili volum present zero reflect daili activ obtain ga control chang need asap econom purpos


In [6]:
#4.################################################################################################################
# copy the label column into a new column y
df['y'] = df['label']
# for easy demonstration show the top two rows of the dataframe
df.head(n=2)

Unnamed: 0.1,Unnamed: 0,label,text,label_num,y
0,605,ham,subject enron methanol meter 988291 follow not...,0,ham
1,2349,ham,subject hpl nom januari 9 2001 see attach file...,0,ham


In [7]:
#5.################################################################################################################
# splitting the data into train and test data (20% test, stratified on y, fixed seed)
text_train, text_test, y_train, y_test = train_test_split(
    df['text'],
    df['y'],
    test_size=0.2,
    stratify=df['y'],
    random_state=17,
)
# printing the top 3 values of each train and test part
print(text_train.head(3))
print(text_test.head(3))
print(y_train.head(3))
print(y_test.head(3))

3322    subject sean well resum interest forward brend...
3975    subject hpl nom may 24 2001 see attach file hp...
4214    subject anxieti valium xanax pay valium xanax ...
Name: text, dtype: object
375     subject request applic report list follow earl...
1936    subject fw feb 01 invoic daren purchas beaumon...
4487    subject hey tuscani selfish avert fitch canto ...
Name: text, dtype: object
3322     ham
3975     ham
4214    spam
Name: y, dtype: object
375      ham
1936     ham
4487    spam
Name: y, dtype: object


In [8]:
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer  # importing necessary libraries 
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,f1_score, accuracy_score
from sklearn.pipeline import Pipeline

In [9]:
# Bag-of-words counts; remove_sw_pun is the analyzer, so it is called on every document.
cv = CountVectorizer(analyzer=remove_sw_pun)
# fit_transform learns the vocabulary and encodes the train data in a single pass;
# fit() followed by transform() ran the slow analyzer over the train data twice.
bow_cv = cv.fit_transform(text_train)
bow_cv_text = cv.transform(text_test)                        # transforming test data with the train vocabulary

In [10]:
# TfidfTransformer re-weights the bag-of-words counts by term freq / inverse document freq
itf = TfidfTransformer()
itf.fit(bow_cv)                                           # weights are fitted on the train counts only
bow_itf = itf.transform(bow_cv)                           # transforming train data
bow_itf_test = itf.transform(bow_cv_text)                 # transforming test data

In [11]:
# logistic regression fitted on the tf-idf train features and the train labels
lr = LogisticRegression()
lr.fit(bow_itf, y_train)
# predictions of the model on the tf-idf test features
predictions = lr.predict(bow_itf_test)

In [12]:
# classification report: precision, recall, f1-score per class plus accuracy
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

         ham       1.00      0.99      0.99       735
        spam       0.97      1.00      0.99       300

    accuracy                           0.99      1035
   macro avg       0.99      0.99      0.99      1035
weighted avg       0.99      0.99      0.99      1035



In [13]:
###################################################################################################################

In [14]:
# this is the alternate method using pipeline:
# bag-of-words counts -> tf-idf weighting -> logistic regression
pipeline = Pipeline(steps=[
    ('bow', CountVectorizer(analyzer=remove_sw_pun)),
    ('tftdf', TfidfTransformer()),
    ('classifier', LogisticRegression()),
])

In [15]:
# fitting the whole pipeline on the train data
pipeline.fit(X=text_train, y=y_train)

In [16]:
# predicting the labels of the test data with the fitted pipeline
predictions = pipeline.predict(X=text_test)

In [17]:
# showing F1-score and accuracy value for Logistic regression
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

         ham       1.00      0.99      0.99       735
        spam       0.97      1.00      0.99       300

    accuracy                           0.99      1035
   macro avg       0.99      0.99      0.99      1035
weighted avg       0.99      0.99      0.99      1035



In [18]:
# alternate way to get the F1-scores and the accuracy: the f1_score and accuracy_score functions
f1s = f1_score(y_true=y_test, y_pred=predictions, pos_label='spam')
f1h = f1_score(y_true=y_test, y_pred=predictions, pos_label='ham')
f1t = f1_score(y_true=y_test, y_pred=predictions, average=None)   # one score per class
acs = accuracy_score(y_true=y_test, y_pred=predictions)

print('The f1-score of the model on spam is:', f1s)
print('The f1-score of the model on ham is:', f1h)
print('The f1-score of the model is:', f1t)
print('The accuracy of the model is', acs)

The f1-score of the model on spam is: 0.985172981878089
The f1-score of the model on ham is: 0.9938482570061518
The f1-score of the model is: [0.99384826 0.98517298]
The accuracy of the model is 0.991304347826087


In [19]:
###################################################################################################################

In [20]:
# cv = CountVectorizer(analyzer=remove_sw_pun).fit(text_train) # using CountVectorizer to find word value from bagofwords
# bow_cv = cv.transform(text_train)                            # fitting and transforming train data
# bow_cv_text = cv.transform(text_test)  

In [21]:
# lr = LogisticRegression().fit(bow_cv,y_train)            # creating logistic regr. object and fitting X_train,y_train
# predictions =lr.predict(bow_cv_text)                     # predictions of mode on test data(X_test)

In [22]:
# print(classification_report(y_test,predictions)) 

In [23]:
# same pipeline without the tf-idf step: raw bag-of-words counts -> logistic regression
pipeline = Pipeline(steps=[
    ('bow', CountVectorizer(analyzer=remove_sw_pun)),
    ('classifier', LogisticRegression()),
])

In [24]:
# fitting the whole pipeline on the train data
pipeline.fit(X=text_train, y=y_train)

In [25]:
# predicting the labels of the test data with the fitted pipeline
predictions = pipeline.predict(X=text_test)
# showing F1-score and accuracy value for Logistic regression
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

         ham       0.99      0.98      0.98       735
        spam       0.95      0.98      0.96       300

    accuracy                           0.98      1035
   macro avg       0.97      0.98      0.97      1035
weighted avg       0.98      0.98      0.98      1035



In [26]:
# alternate way to get the F1-scores and the accuracy: the f1_score and accuracy_score functions
f1s = f1_score(y_true=y_test, y_pred=predictions, pos_label='spam')
f1h = f1_score(y_true=y_test, y_pred=predictions, pos_label='ham')
f1t = f1_score(y_true=y_test, y_pred=predictions, average=None)   # one score per class
acs = accuracy_score(y_true=y_test, y_pred=predictions)

print('The f1-score of the model on spam is:', f1s)
print('The f1-score of the model on ham is:', f1h)
print('The f1-score of the model is:', f1t)
print('The accuracy of the model is', acs)

The f1-score of the model on spam is: 0.9638157894736843
The f1-score of the model on ham is: 0.9849521203830368
The f1-score of the model is: [0.98495212 0.96381579]
The accuracy of the model is 0.978743961352657


In [27]:
###################################################################################################################

In [28]:
tf = CountVectorizer(analyzer=remove_sw_pun).fit(text_train) # using CountVectorizer to find word value from bagofwords
# Transform with the vectorizer fitted in this cell (tf); the original used the
# stale `cv` object left over from an earlier cell and never used `tf`.
bow_cv = tf.transform(text_train)                            # transforming train data
bow_cv_text = tf.transform(text_test)                        # transforming test data

In [29]:
#Your code goes here

In [30]:
from sklearn.feature_extraction.text import CountVectorizer    # importing necessary libraries
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

In [31]:
# creating pipeline with the sequence of steps to perform on the text:
# bag-of-words counts -> tf-idf weighting -> multinomial naive Bayes
pipeline = Pipeline(steps=[
    ('bow', CountVectorizer(analyzer=remove_sw_pun)),
    ('tftdf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
])

In [32]:
# fitting the whole pipeline on the train data
pipeline.fit(X=text_train, y=y_train)

In [33]:
# predicting the labels of the test data with the fitted pipeline
predictions = pipeline.predict(X=text_test)

In [34]:
# showing F1-score and accuracy value for Multinomial NaiveBayes model
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

         ham       0.88      1.00      0.94       735
        spam       1.00      0.67      0.80       300

    accuracy                           0.91      1035
   macro avg       0.94      0.84      0.87      1035
weighted avg       0.92      0.91      0.90      1035



In [35]:
# alternate way to get the F1-scores and the accuracy: the f1_score and accuracy_score functions
f1s = f1_score(y_true=y_test, y_pred=predictions, pos_label='spam')
f1h = f1_score(y_true=y_test, y_pred=predictions, pos_label='ham')
f1t = f1_score(y_true=y_test, y_pred=predictions, average=None)   # one score per class
acs = accuracy_score(y_true=y_test, y_pred=predictions)

print('The f1-score of the model on spam is:', f1s)
print('The f1-score of the model on ham is:', f1h)
print('The f1-score of the model is:', f1t)
print('The accuracy of the model is', acs)

The f1-score of the model on spam is: 0.8047808764940239
The f1-score of the model on ham is: 0.9375
The f1-score of the model is: [0.9375     0.80478088]
The accuracy of the model is 0.9053140096618357
