# Mounting the google drive

In [1]:
# Mount Google Drive so the project data stored there is reachable from this runtime

from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


# Importing useful libraries and data reading

In [3]:
# Importing useful libraries

import numpy as np # For numerical computations
import pandas as pd # For dataframe related tasks

# For data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# To suppress unnecessary warnings
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression # For model training
from sklearn.model_selection import train_test_split # For train/test splitting
from sklearn import metrics # For model evaluation
from sklearn.feature_extraction.text import TfidfVectorizer

import pickle # For model saving

In [4]:
# Folder on the mounted Google Drive that holds all of the project data.
# The path is absolute (rooted at the '/content/drive' mount point) so that
# reading the data does not depend on the notebook's current working directory.

data_folder_path = '/content/drive/MyDrive/AI_project_data/'

In [5]:
# Load the mail dataset from the data folder and preview its first rows

mail_csv_path = data_folder_path + 'mail_data.csv'
df = pd.read_csv(mail_csv_path)
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# EDA

In [6]:
# Count the missing values in each column

df.isnull().sum()

Category    0
Message     0
dtype: int64

In [7]:
# Dataset dimensions as (number of rows, number of columns)

n_rows, n_cols = df.shape
(n_rows, n_cols)

(5572, 2)

# Data preprocessing

In [10]:
# Separate the message text (feature) from the category label (target)

X, y = df['Message'], df['Category']

In [11]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the split reproducible

split_options = dict(test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [12]:
# Turn the raw message text into numerical TF-IDF features

tfidf_options = {
    'min_df': 1,
    'stop_words': 'english',
    'lowercase': True,
}
feature_extractor = TfidfVectorizer(**tfidf_options)

# The extractor is fitted on the training messages only; the test messages
# are transformed with the already-fitted extractor.
X_train_features = feature_extractor.fit_transform(X_train)
X_test_features = feature_extractor.transform(X_test)

# Model training

In [13]:
# With the training and test features ready, model training can start.
# First instantiate the classifier; no arguments are passed, so the
# library's default settings are used.

lr = LogisticRegression()

In [14]:
# Fit the classifier on the TF-IDF features of the training messages

lr.fit(X=X_train_features, y=y_train)

LogisticRegression()

# Model Evaluation

In [15]:
# Accuracy on the data the model was trained on

pred = lr.predict(X_train_features)
accuracy = metrics.accuracy_score(y_true=y_train, y_pred=pred)
accuracy

0.9681400044873233

In [16]:
# Accuracy on the held-out test set

pred = lr.predict(X_test_features)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=pred)
accuracy

0.9704035874439462

# Saving the model

In [18]:
# Saving the trained model together with the fitted feature extractor

import os

file_name = 'models/spam_mail_prediction.h5'
vectorizer_file_name = 'models/spam_mail_vectorizer.pkl'

# Create the output folder first: open(..., 'wb') does not create missing
# directories and would raise FileNotFoundError on a fresh runtime.
os.makedirs(os.path.dirname(file_name), exist_ok=True)

# NOTE: despite the '.h5' extension this file is a pickle, not HDF5; it must be
# read back with pickle.load (and only from a trusted source). The name is kept
# unchanged so existing consumers of the file keep working.
with open(file_name, 'wb') as f:
  pickle.dump(lr, f)

# The classifier was trained on vectors produced by this fitted extractor, so
# persist it as well; without it new messages cannot be vectorized the same
# way at inference time.
with open(vectorizer_file_name, 'wb') as f:
  pickle.dump(feature_extractor, f)