# Email-Spam Detection using Logistic Regression 

In [40]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score


## Importing Data

In [41]:
# Read the raw spam dataset into a DataFrame. latin1 is passed explicitly
# (presumably because the file is not valid UTF-8 -- TODO confirm).
data = pd.read_csv(
    filepath_or_buffer="spam.csv",
    encoding="latin1",
)
# Bare last expression so the notebook renders the frame as a table.
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


## Cleaning and reformatting the Data

In [42]:
# Clean the raw frame: drop the three unnamed columns, give the remaining two
# columns descriptive names, and encode the label as 1 (spam) / 0 (ham).
#
# The transforms return a new frame instead of mutating in place, and the drop
# tolerates already-removed columns, so re-running this cell is a no-op rather
# than a KeyError.
LABEL_CODES = {"spam": 1, "ham": 0}

data = (
    data
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")
    .rename(columns={"v1": "label", "v2": "feature"})
)

# Encode only while the labels are still raw strings; mapping an already
# encoded column a second time would turn every value into NaN.
if not data["label"].isin(list(LABEL_CODES.values())).all():
    data["label"] = data["label"].map(LABEL_CODES)

# Series.map yields NaN for any value missing from LABEL_CODES; fail loudly
# here instead of passing NaN labels on to the model.
assert data["label"].notna().all(), "label column contains values other than 'spam'/'ham'"

data.head()

Unnamed: 0,label,feature
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


## Splitting the data into Training and Testing Data

In [43]:
# Hold out 10% of the messages for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['feature'],
    data['label'],
    test_size=0.1,
    random_state=1234,
)

# Preview the first rows of each partition.
for heading, partition in (
    ("X_train:\n", X_train),
    ("X_test: \n", X_test),
    ("y_train \n", y_train),
    ("y_test: \n", y_test),
):
    print(heading, partition.head())

X_train:
 2893                     K...k...yesterday i was in cbe .
2092    Oh, my love, it's soooo good to hear from you....
1293     Happy birthday... May all ur dreams come true...
253     Ups which is 3days also, and the shipping comp...
2434    Uncle boye. I need movies oh. Guide me. Plus y...
Name: feature, dtype: object
X_test: 
 1537    All sounds good. Fingers . Makes it difficult ...
963     Yo chad which gymnastics class do you wanna ta...
4421              MMM ... Fuck .... Merry Christmas to me
46          Didn't you get hep b immunisation in nigeria.
581        Ok anyway no need to change with what you said
Name: feature, dtype: object
y_train 
 2893    0
2092    0
1293    0
253     0
2434    0
Name: label, dtype: int64
y_test: 
 1537    0
963     0
4421    0
46      0
581     0
Name: label, dtype: int64


## Text vectorization using TF-IDF

In [44]:
# Turn each message into a TF-IDF weighted bag-of-words vector. English stop
# words are removed and the vocabulary is capped at 5000 terms.
vectorizer = TfidfVectorizer(stop_words = "english", max_features = 5000)

# Fit on the training split only; the test split is transformed with the
# already-fitted vocabulary so nothing from the test messages leaks into it.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Summarise the fitted vocabulary instead of printing the whole term -> index
# dict, which would flood the cell output with thousands of entries.
vocabulary_terms = sorted(vectorizer.vocabulary_)
print(f"Vocabulary size: {len(vocabulary_terms)}")
print(f"Train matrix shape: {X_train_vectorized.shape}")
print(f"Test matrix shape: {X_test_vectorized.shape}")
print("Sample terms:", vocabulary_terms[:10])





## Training Logistic Regression model

In [45]:
# Logistic regression classifier with scikit-learn's default settings,
# trained on the TF-IDF vectors of the training messages.
model = LogisticRegression()
# Kept as the last expression so the notebook displays the fitted estimator.
model.fit(X=X_train_vectorized, y=y_train)

## Predicting and finding Accuracy

In [48]:
# Predict labels for the held-out messages and compare them with the truth.
y_pred = model.predict(X_test_vectorized)

# accuracy_score returns the fraction of correct predictions; it is scaled
# to a percentage for display only.
acc = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}".format(acc * 100))

Accuracy: 97.13
