In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split 
from sklearn import svm


In [3]:
df = pd.read_csv('mail_data.csv')


In [4]:
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [11]:
df.isnull().sum()  

Category    0
Message     0
dtype: int64

In [12]:
df["Category"].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [13]:
# On remarque qu'on a beaucoup plus de valeurs ham et pas enormement de valeurs de spam , ce qui pourrai biaisé notre résultat ,
# Avec le sous-echantillonnage on peut diminué le nombre de lignes ou la catégorie ham est presente pour avoir un nombre de ligne égale

In [18]:

ham_message = df[df['Category'] == 'ham']
spam_message = df[df['Category'] == 'spam']


In [17]:
ham_message.value_counts

<bound method DataFrame.value_counts of      Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
6         ham  Even my brother is not like to speak with me. ...
...       ...                                                ...
5565      ham                                       Huh y lei...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[4825 rows x 2 columns]>

In [20]:
spam_message.value_counts()

Category  Message                                                                                                                                                           
spam      Please call our customer service representative on FREEPHONE 0808 145 4742 between 9am-11pm as you have WON a guaranteed £1000 cash or £5000 prize!                   4
          #ERROR!                                                                                                                                                               3
          Loan for any purpose £500 - £75,000. Homeowners + Tenants welcome. Have you been previously refused? We can still help. Call Free 0800 1956669 or text back 'help'    3
          Camera - You are awarded a SiPix Digital Camera! call 09061221066 fromm landline. Delivery within 28 days.                                                            3
          December only! Had your mobile 11mths+? You are entitled to update to the latest colour camera mobile for

In [25]:
print(df.shape)

(5572, 2)


In [34]:
spam_message.shape[0]

(747, 2)

In [24]:
ham_message.shape[0]

4825

In [32]:
undersampled_ham = ham_message.sample(frac=(spam_message.shape[0]/ham_message.shape[0]) ,random_state=1 )

In [33]:
undersampled_ham.shape

(747, 2)

In [36]:
new_df = pd.concat([undersampled_ham ,spam_message])

In [37]:
new_df

Unnamed: 0,Category,Message
2535,ham,Ok enjoy . R u there in home.
1213,ham,"Yo, the game almost over? Want to go to walmar..."
522,ham,Shall i come to get pickle
5398,ham,Hi. Hope you had a good day. Have a better night.
700,ham,K..u also dont msg or reply to his msg..
...,...,...
5537,spam,Want explicit SEX in 30 secs? Ring 02073162414...
5540,spam,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
5547,spam,Had your contract mobile 11 Mnths? Latest Moto...
5566,spam,REMINDER FROM O2: To get 2.50 pounds free call...


In [43]:
new_df['Category'].value_counts()

Category
ham     747
spam    747
Name: count, dtype: int64

In [46]:
new_df["Category"]= new_df['Category'].apply(lambda x : 0 if x == "ham" else 1)

In [47]:
new_df

Unnamed: 0,Category,Message
2535,0,Ok enjoy . R u there in home.
1213,0,"Yo, the game almost over? Want to go to walmar..."
522,0,Shall i come to get pickle
5398,0,Hi. Hope you had a good day. Have a better night.
700,0,K..u also dont msg or reply to his msg..
...,...,...
5537,1,Want explicit SEX in 30 secs? Ring 02073162414...
5540,1,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
5547,1,Had your contract mobile 11 Mnths? Latest Moto...
5566,1,REMINDER FROM O2: To get 2.50 pounds free call...


In [48]:
balanced_df = new_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [49]:
balanced_df

Unnamed: 0,Category,Message
0,1,"URGENT, IMPORTANT INFORMATION FOR O2 USER. TOD..."
1,1,Panasonic & BluetoothHdset FREE. Nokia FREE. M...
2,1,Do you want a new Video handset? 750 any time ...
3,1,Hi if ur lookin 4 saucy daytime fun wiv busty ...
4,1,09066362231 URGENT! Your mobile No 07xxxxxxxxx...
...,...,...
1489,1,December only! Had your mobile 11mths+? You ar...
1490,1,Loans for any purpose even if you have Bad Cre...
1491,1,You have an important customer service announc...
1492,1,URGENT! Your Mobile number has been awarded wi...


In [50]:
X = balanced_df['Message']
y = balanced_df["Category"]

In [51]:
X_train , X_test , y_train , y_test = train_test_split(X,y , test_size=0.25 , random_state=42)

In [52]:
X_train.shape

(1120,)

In [53]:
y_train.shape

(1120,)

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer

In [56]:
vect = TfidfVectorizer(min_df=1 , stop_words='english' , lowercase=True)

In [57]:
X_train = vect.fit_transform(X_train)

In [63]:
X_train[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [64]:
X_test = vect.transform(X_test)

In [65]:
X_test.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [66]:
from sklearn.linear_model import LogisticRegression

In [67]:
lr = LogisticRegression()

In [68]:
lr.fit(X_train , y_train)

In [69]:
from sklearn.metrics import accuracy_score

In [71]:
lr_pred = lr.predict(X_test)


In [72]:
accuracy = accuracy_score(y_test , lr_pred)

In [73]:
accuracy

0.9491978609625669