In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Location of the mail dataset CSV.
# NOTE(review): absolute, machine-specific path - adjust before running on another machine.
DATA_PATH = "/Users/nanda/Downloads/KDE Shared/mail_data.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows of the freshly loaded frame (Category and Message columns).
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
## Replace missing values with empty strings
# Keep every non-null cell as-is; cells where the mask is False become ''.
is_present = df.notna()
df = df.where(is_present, other='')

In [6]:
# Preview the frame again after the null replacement.
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Summary statistics per column (count, unique, top, freq), useful for checking the class balance.
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [8]:
# Encode the labels numerically: ham -> 0, spam -> 1.
# Reassign the result instead of using inplace=True: the non-inplace form is the
# idiomatic pandas style and keeps the step an explicit "new frame from old frame".
label_codes = {"ham": 0, "spam": 1}
df = df.replace({"Category": label_codes})

In [10]:
# Preview the frame after label encoding; Category now holds 0 (ham) / 1 (spam).
df.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
# Features are the raw message text; the target is the Category column.
X, y = df["Message"], df["Category"]

In [13]:
# Hold out 20% of the messages for evaluation. Stratifying on y preserves the
# class proportions in both splits; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=3,
)

## Feature Extraction

In [14]:
## Transform the text data into feature vectors that can be used as input to the logistic regression model

In [15]:
# TF-IDF vectorizer: keep every term (min_df=1), drop English stop words,
# and lowercase the text before tokenizing.
# `lowercase` must be a real boolean. The string "True" only worked because any
# non-empty string is truthy, and newer scikit-learn releases validate parameter
# types at fit time and reject it.
feature_extraction = TfidfVectorizer(min_df=1, stop_words="english", lowercase=True)

In [16]:
# Fit the vectorizer on the training messages only and convert them to TF-IDF features.
X_train_features = feature_extraction.fit_transform(X_train)

In [17]:
# Inspect the resulting sparse matrix (shape and number of stored elements).
X_train_features

<4457x7485 sparse matrix of type '<class 'numpy.float64'>'
	with 34822 stored elements in Compressed Sparse Row format>

In [19]:
# Vectorize the test messages with the already-fitted vectorizer (transform only, no refit).
X_test_features = feature_extraction.transform(X_test)

In [20]:
## Cast the train and test labels to an integer dtype
y_train, y_test = (labels.astype("int") for labels in (y_train, y_test))

In [21]:
## Instantiate a logistic regression classifier with default hyperparameters
model = LogisticRegression()

In [24]:
## Fit the logistic regression model on the TF-IDF training features and their labels
model.fit(X_train_features,y_train)

LogisticRegression()

In [25]:
## Prediction on training data
prediction_train_data = model.predict(X_train_features)

In [26]:
# Training accuracy. accuracy_score's signature is (y_true, y_pred), so the true
# labels go first. Accuracy is symmetric, so the value is unchanged, but the
# conventional order avoids mistakes if this is swapped for an asymmetric metric.
accuracy_train_data = accuracy_score(y_train, prediction_train_data)
accuracy_train_data

0.9681400044873233

In [28]:
# Predict on the held-out test features and score against the true test labels.
prediction_test_data = model.predict(X_test_features)
# accuracy_score's signature is (y_true, y_pred): true labels first.
accuracy_test_data = accuracy_score(y_test, prediction_test_data)
accuracy_test_data

0.9641255605381166

## Building a predictive system

In [42]:
# Classify one raw message: wrap it in a list because the vectorizer expects an
# iterable of documents.
input_mail = ["I only haf msn. It's yijue@hotmail.com"]

# Reuse the vectorizer fitted on the training data (transform only), then predict.
extracted_mail = feature_extraction.transform(input_mail)
prediction = model.predict(extracted_mail)
prediction

array([0])

In [43]:
# Translate the predicted label into a readable verdict: 1 means spam, anything else ham.
print("This is a spam mail" if prediction[0] == 1 else "This is a ham mail")

This is a ham mail
