In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split as tt
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings

warnings.filterwarnings("ignore")

In [14]:
# Load the labelled messages and turn every missing cell into an empty string.
df = pd.read_csv("mail_data.csv")
df = df.where(df.notna(), '')

# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [21]:
words = stopwords.words('english')
# Frozen-set copy of the stop words, built once: membership tests are O(1)
# instead of a linear scan of the list for every token of every message.
# Rebuild it if `words` is modified afterwards.
_STOP_WORDS = frozenset(words)

# Stemming
port_stem = PorterStemmer()

# Every character that is not an ASCII letter is treated as a separator.
_NON_LETTERS = re.compile('[^a-zA-Z]')


def stemming(content):
    """Normalise one message into a space-separated string of stemmed tokens.

    Steps, in order: replace every non-letter character with a space,
    lowercase the text, split on whitespace, drop English stop words
    (checked on the lowercased, unstemmed token) and Porter-stem the rest.

    Parameters
    ----------
    content : str
        Raw message text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces; an empty
        string when no token survives.
    """
    tokens = _NON_LETTERS.sub(' ', content).lower().split()
    return ' '.join(
        port_stem.stem(token) for token in tokens if token not in _STOP_WORDS
    )


# Replace each raw message with its cleaned, stemmed form.
# NOTE(review): this overwrites the "Message" column, so running the cell a
# second time stems text that has already been stemmed.
df["Message"] = df["Message"].apply(stemming)


# Category Mapping: spam -> 1, ham -> 0
category_codes = {"spam": 1, "ham": 0}
df.replace({"Category": category_codes}, inplace=True)


# Handling Biased Data (disabled): undersample ham down to the spam count.
# spam = df[df["Category"]==1]
# ham = df[df["Category"]==0]
# ham = ham.sample(n=spam.shape[0])
# df = pd.concat([spam,ham],axis=0)

# Vectorizing: TF-IDF features from the messages, integer labels from Category.
# NOTE(review): the vectorizer is fitted on every row, before any train/test
# split, so its vocabulary and IDF weights are computed from all messages.
vectorizer = TfidfVectorizer(min_df=1, lowercase=True, stop_words='english')
m = vectorizer.fit_transform(df["Message"])
n = df["Category"].astype('int64')
print(m.shape)

(5572, 6141)


In [22]:
# Split features and labels into train and test parts: 80% of the rows go to
# training, and the fixed random_state makes the shuffle repeatable.
m_train, m_test, n_train, n_test = tt(
    m,
    n,
    train_size=0.8,
    random_state=0,
)

In [23]:
# Train a logistic-regression classifier on the training split
# (fit() returns the estimator itself, so the call can be chained).
clf = LogisticRegression(random_state=0).fit(m_train, n_train)

# Predicted labels for the training rows and for the held-out rows.
n_train_pred, n_test_pred = clf.predict(m_train), clf.predict(m_test)

In [24]:
# Fraction of correctly predicted labels on each split.
train_acc = accuracy_score(n_train, n_train_pred)
test_acc = accuracy_score(n_test, n_test_pred)

# Report both scores, training first.
for label, score in (("Training Accuracy:", train_acc), ("Testing Accuracy:", test_acc)):
    print(label, score)

Training Accuracy: 0.9715054969710568
Testing Accuracy: 0.95695067264574
