# Spam mail detection by Logistic Regression

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
import os
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer # convert text to feature vector
from sklearn.metrics import accuracy_score

In [25]:
os.listdir()

['.ipynb_checkpoints', 'mail_data.csv', 'Spam Mail Detector.ipynb']

In [3]:
# Load the labelled messages; the relative path means the CSV must sit in the
# notebook's working directory.
df = pd.read_csv('mail_data.csv')

In [4]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [6]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [7]:
# The dataset is imbalanced: 4825 ham messages vs. 747 spam messages.

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [9]:
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [11]:
# Partition the messages by label so the two classes can be balanced separately.
df_spam = df.loc[df['Category'] == 'spam']
df_ham = df.loc[df['Category'] == 'ham']

In [12]:
df_spam.shape

(747, 2)

In [13]:
df_ham.shape

(4825, 2)

In [14]:
# Down-sample ham to the number of spam rows so both classes are equally represented.
# A fixed random_state makes the sampled subset (and every score computed from it)
# reproducible across kernel restarts.
df_ham = df_ham.sample(n=df_spam.shape[0], random_state=3)

In [15]:
df_ham.shape

(747, 2)

In [16]:
# Stack the down-sampled ham rows on top of the spam rows (row-wise concatenation).
df_balanced = pd.concat(
    [df_ham, df_spam],
    axis='index',
)

In [17]:
df_balanced.shape

(1494, 2)

### Label Encoding

In [31]:
# Encode the labels as integers: spam -> 0, ham -> 1.
# Only the Category column is rewritten; a DataFrame-wide replace would also
# convert any Message whose entire text is exactly 'spam' or 'ham' into an
# integer, which the text vectorizer cannot process. Building a new frame
# (instead of inplace=True) keeps the cell safe to re-run.
df_balanced = df_balanced.assign(
    Category=df_balanced['Category'].replace(['spam', 'ham'], [0, 1])
)

In [32]:
df_balanced.head()

Unnamed: 0,Category,Message
3870,1,No let me do the math. Your not good at it.
4764,1,Prepare to be pleasured :)
1433,1,Thanks for ve lovely wisheds. You rock
289,1,"My life Means a lot to me, Not because I love ..."
5014,1,"Uncle G, just checking up on you. Do have a re..."


In [33]:
df_balanced.Category.value_counts()

1    747
0    747
Name: Category, dtype: int64

In [19]:
# df_balanced now has equal class counts: 747 ham and 747 spam messages.

### Splitting dataset into train and test datasets 

In [34]:
# Features are the raw message texts; targets are the encoded labels.
x, y = df_balanced['Message'], df_balanced['Category']

In [54]:
# Hold out 20% of the balanced messages for testing. Stratifying on y keeps the
# class ratio the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
    stratify=y,
)

In [55]:
y_train.value_counts()

1    598
0    597
Name: Category, dtype: int64

### Feature Extraction 

In [56]:
# TF-IDF vectorizer: lower-cases the text, drops English stop words and keeps
# every term that occurs in at least one document (min_df=1).
FE = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

In [57]:
# Learn the vocabulary and IDF weights from the training messages only, then
# reuse the fitted vectorizer to encode the test messages.
X_train_features = FE.fit_transform(X_train)
X_test_features = FE.transform(X_test)

# Make sure both label vectors are integer-typed before fitting the model.
y_train, y_test = (labels.astype('int') for labels in (y_train, y_test))

In [63]:
print(X_test_features[0])

  (0, 2199)	0.5699312333174855
  (0, 1756)	0.5661360111438491
  (0, 1718)	0.5955404320240073


### Train the model

In [66]:
# Logistic-regression classifier created with scikit-learn's default hyperparameters.
model = LogisticRegression()

In [67]:
# Fit the classifier on the TF-IDF features of the training messages.
model.fit(X_train_features , y_train)

In [82]:
# Mean accuracy on the training data, kept for comparison with the test accuracy.
train_score = model.score(X_train_features,y_train)

In [78]:
# Predicted labels for the held-out test messages.
y_predicted = model.predict(X_test_features)

In [80]:
 test_score = accuracy_score(y_test , y_predicted)

In [81]:
 test_score

0.959866220735786

## Predictive System

In [110]:
def prediction(email=None, vectorizer=None, classifier=None):
    """Classify a single email and print the predicted label.

    The label encoding used by this notebook is 0 = spam, 1 = ham.

    Parameters
    ----------
    email : str, optional
        Text of the email to classify. When omitted, the text is read
        interactively with ``input``.
    vectorizer : optional
        Object with a ``transform`` method that turns a list of documents into
        features. Defaults to the notebook-level fitted vectorizer ``FE``.
    classifier : optional
        Object with a ``predict`` method. Defaults to the notebook-level
        trained ``model``.

    Returns
    -------
    None
        The prediction is printed, matching the original behaviour.
    """
    if email is None:
        email = input(' enter the email')
    if vectorizer is None:
        vectorizer = FE
    if classifier is None:
        classifier = model

    # The vectorizer expects an iterable of documents. Wrapping the text in a
    # one-element list keeps the whole email as a single document; list(email)
    # would split it into individual characters and yield one prediction per
    # character.
    email_f = vectorizer.transform([email])
    output = classifier.predict(email_f)
    print(output)