In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Read the mail dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='mail_data.csv')

# (rows, columns) of the loaded frame
df.shape

(5572, 2)

In [3]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Number of missing values in each column
df.isna().sum()

Category    0
Message     0
dtype: int64

In [5]:
# there are no null values in the dataset

In [6]:
# Label encoding: spam mail -> 0, ham mail -> 1.
#
# A single .loc[row_mask, column] assignment is used instead of chained
# indexing (df['Category'][mask] = value). The chained form assigns through an
# intermediate object, raises SettingWithCopyWarning, and does not update `df`
# at all when pandas copy-on-write is enabled.
df.loc[df['Category'] == 'spam', 'Category'] = 0
df.loc[df['Category'] == 'ham', 'Category'] = 1

In [8]:
# Features are the raw message text; the target is the encoded category
X, y = df['Message'], df['Category']

In [13]:
# Split into training and test sets: 20% of the rows are held out for testing,
# with a fixed random_state so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)

In [14]:
# transforming text data of X to feature vectors

from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
# TF-IDF vectorizer: lowercases text, removes English stop words, and keeps
# every term that appears in at least one document (min_df=1)
doc = TfidfVectorizer(lowercase=True, min_df=1, stop_words='english')

# Fit on the training messages only and encode them in the same step,
# then encode the test messages with the fitted vectorizer
X_train_features = doc.fit_transform(X_train)
X_test_features = doc.transform(X_test)

In [17]:
# Inspect the dtype of the training labels
y_train.dtype

dtype('O')

In [18]:
# Cast the label Series of both splits to integer dtype
y_train, y_test = y_train.astype(int), y_test.astype(int)

In [20]:
# Look at the feature vector of the first training message
X_train_features[0]

<1x7431 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

## Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression

In [23]:
# Logistic-regression classifier with default settings,
# fitted on the TF-IDF features of the training messages
logreg = LogisticRegression()
logreg.fit(X=X_train_features, y=y_train)

LogisticRegression()

In [24]:
# Predict on the training features and compare against the training labels
y_pred_train = logreg.predict(X_train_features)
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_pred_train)
print(f"accuracy on training data: {train_accuracy}")

accuracy on training data: 0.9670181736594121


In [26]:
# Predict on the held-out test features and compare against the test labels
y_pred_test = logreg.predict(X_test_features)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_test)
print(f"accuracy on test data: {test_accuracy}")

accuracy on test data: 0.9659192825112107


In [27]:
# Logistic Regression reaches about 96.7% accuracy on the training data and about 96.6% on the test data