In [49]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [50]:
# Load the raw mail dataset; the path is relative to the notebook's working directory.
mail_df = pd.read_csv('mail_data.csv')

In [51]:
# Text preview of the raw frame (pandas elides the middle rows of a long frame).
print(mail_df)

     Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [52]:
# Keep every non-null cell as-is and substitute an empty string for missing
# ones, producing a new frame and leaving `mail_df` untouched.
data = mail_df.where(mail_df.notna(), other='')

In [53]:
# Text preview of the frame after missing cells were replaced with ''.
print(data)

     Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [54]:
# Split the null-cleaned frame (`data`) rather than the raw `mail_df`:
# CountVectorizer rejects NaN documents, so missing messages must already have
# been replaced with '' before the text reaches the vectorizer.
# 80/20 split; the fixed random_state keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data['Message'],
    data['Category'],
    test_size=0.2,
    random_state=42,
)

In [55]:
# Inspect the training messages; the index keeps the original row labels.
print(X_train)

1978    Reply to win £100 weekly! Where will the 2006 ...
3989    Hello. Sort of out in town already. That . So ...
3935     How come guoyang go n tell her? Then u told her?
4078    Hey sathya till now we dint meet not even a si...
4086    Orange brings you ringtones from all time Char...
                              ...                        
3772    Hi, wlcome back, did wonder if you got eaten b...
5191                               Sorry, I'll call later
5226        Prabha..i'm soryda..realy..frm heart i'm sory
5390                           Nt joking seriously i told
860               Did he just say somebody is named tampa
Name: Message, Length: 4457, dtype: object


In [56]:
# Inspect the held-out test messages.
print(X_test)

3245    Squeeeeeze!! This is christmas hug.. If u lik ...
944     And also I've sorta blown him off a couple tim...
1044    Mmm thats better now i got a roast down me! i...
2484        Mm have some kanji dont eat anything heavy ok
812     So there's a ring that comes with the guys cos...
                              ...                        
4264    Den only weekdays got special price... Haiz......
2439         I not busy juz dun wan 2 go so early.. Hee..
5556    Yes i have. So that's why u texted. Pshew...mi...
4205    How are you enjoying this semester? Take care ...
4293                                                G.W.R
Name: Message, Length: 1115, dtype: object


In [57]:
# Inspect the test labels; they carry the same row labels as X_test.
print(y_test)

3245    ham
944     ham
1044    ham
2484    ham
812     ham
       ... 
4264    ham
2439    ham
5556    ham
4205    ham
4293    ham
Name: Category, Length: 1115, dtype: object


In [58]:
# Convert text data into numerical features using CountVectorizer
# (constructed with its default settings; the vocabulary is learned later, at fit time).
vectorizer = CountVectorizer()

In [59]:
# Learn the vocabulary from the training messages only, then encode the test
# messages with that same vocabulary so no test-set information leaks into fitting.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [60]:
# Sparse-matrix text form: each line is (row, vocabulary column) followed by the stored count.
print(X_train_vectorized)

  (0, 5687)	1
  (0, 6888)	3
  (0, 7474)	1
  (0, 258)	1
  (0, 7396)	1
  (0, 7437)	1
  (0, 7471)	1
  (0, 6773)	1
  (0, 354)	1
  (0, 2805)	1
  (0, 7555)	1
  (0, 2106)	1
  (0, 1271)	1
  (0, 3364)	1
  (0, 5980)	1
  (0, 6460)	1
  (0, 694)	1
  (0, 2568)	1
  (0, 5999)	1
  (1, 7471)	1
  (1, 3369)	1
  (1, 6304)	1
  (1, 4854)	1
  (1, 4976)	1
  (1, 3603)	1
  :	:
  (4452, 7535)	1
  (4452, 2502)	1
  (4452, 7519)	1
  (4453, 1606)	1
  (4453, 6303)	1
  (4453, 4129)	1
  (4453, 3997)	1
  (4454, 2984)	1
  (4454, 3351)	1
  (4454, 5584)	1
  (4454, 5321)	1
  (4454, 6309)	1
  (4454, 6308)	1
  (4455, 6904)	1
  (4455, 4812)	1
  (4455, 3812)	1
  (4455, 5996)	1
  (4456, 3712)	1
  (4456, 3841)	1
  (4456, 2294)	1
  (4456, 3339)	1
  (4456, 5895)	1
  (4456, 6666)	1
  (4456, 6268)	1
  (4456, 4660)	1


In [61]:
# Same sparse text form for the test set; columns refer to the vocabulary fitted on X_train.
print(X_test_vectorized)

  (0, 1196)	1
  (0, 1805)	1
  (0, 2122)	1
  (0, 2231)	1
  (0, 3094)	1
  (0, 3325)	1
  (0, 3505)	2
  (0, 3566)	2
  (0, 3712)	1
  (0, 4080)	1
  (0, 4226)	1
  (0, 4373)	1
  (0, 4639)	1
  (0, 4779)	1
  (0, 5111)	1
  (0, 6252)	1
  (0, 6811)	1
  (1, 927)	1
  (1, 961)	1
  (1, 1407)	1
  (1, 2031)	1
  (1, 2910)	1
  (1, 3404)	2
  (1, 3554)	1
  (1, 4169)	1
  :	:
  (1111, 2472)	1
  (1111, 2489)	1
  (1111, 3140)	1
  (1111, 3360)	1
  (1111, 3845)	1
  (1111, 4797)	1
  (1111, 6252)	1
  (1111, 7327)	1
  (1112, 3330)	1
  (1112, 4484)	1
  (1112, 4606)	1
  (1112, 6252)	2
  (1112, 6750)	1
  (1112, 6770)	1
  (1112, 7456)	1
  (1112, 7646)	1
  (1112, 7662)	1
  (1113, 1051)	1
  (1113, 1520)	1
  (1113, 1654)	1
  (1113, 3487)	1
  (1113, 5976)	1
  (1113, 6650)	1
  (1113, 6811)	1
  (1113, 7662)	1


In [62]:
# Train a Logistic Regression classifier (default hyperparameters) on the
# token-count features and the training labels.
clf = LogisticRegression()
clf.fit(X_train_vectorized, y_train)

In [63]:
# Predict a label for every held-out message.
predictions = clf.predict(X_test_vectorized)


In [64]:
# Score the held-out predictions. Keyword arguments spell out which series is
# the ground truth, since swapping the two would transpose the confusion matrix.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=predictions)
classification_rep = classification_report(y_true=y_test, y_pred=predictions)

# Emit the three summaries in a single call; the newline separator yields the
# same text as printing each one on its own.
print(
    f'Accuracy: {accuracy}',
    f'Confusion Matrix:\n{conf_matrix}',
    f'Classification Report:\n{classification_rep}',
    sep='\n',
)


Accuracy: 0.9865470852017937
Confusion Matrix:
[[966   0]
 [ 15 134]]
Classification Report:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       1.00      0.90      0.95       149

    accuracy                           0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115

