In [1]:
# import the required libraries
# importing numpy for changing the data into numpy arrays
import numpy as np
# import pandas to work with dataframes
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [4]:
# Read the SMS dataset from CSV, decoding the file as latin1
csv_path = "/content/spam.csv"
data = pd.read_csv(csv_path, encoding="latin1")
# Preview the first five rows
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
# Count how many messages carry each label in column v1 ("ham" means not spam)
label_counts = data["v1"].value_counts()
label_counts

ham     4825
spam     747
Name: v1, dtype: int64

In [7]:
# Keep only the rows whose label (column v1) is "spam"
is_spam = data["v1"] == "spam"
spam_data = data[is_spam]
# Show (rows, columns) of the spam subset
spam_data.shape




(747, 5)

In [9]:
# Keep only the rows whose label (column v1) is "ham", i.e. the not-spam messages
is_ham = data["v1"] == "ham"
not_spam = data[is_ham]
not_spam.shape


(4825, 5)

In [10]:
# Downsample the ham rows to the number of spam rows so both classes end up the same size.
# A fixed random_state makes the drawn subset (and every result computed from it)
# repeatable across kernel restarts; 42 is the same seed the train/test split uses.
not_spam_downsampled = not_spam.sample(n=spam_data.shape[0], random_state=42)
not_spam_downsampled.shape

(747, 5)

In [11]:
# Stack the downsampled ham rows and the spam rows into one class-balanced frame
balanced_parts = [not_spam_downsampled, spam_data]
balanced_data = pd.concat(balanced_parts)
balanced_data.shape

(1494, 5)

In [13]:
# Label counts after balancing
balanced_counts = balanced_data["v1"].value_counts()
balanced_counts

ham     747
spam    747
Name: v1, dtype: int64

In [14]:
# Add a numeric target column: 1 where v1 is 'spam', 0 for every other value.
# The comparison runs vectorized over the whole column instead of calling a
# Python lambda once per row; the explicit int64 cast keeps the 0/1 integer encoding.
balanced_data['spam'] = balanced_data['v1'].eq('spam').astype('int64')
# Show five randomly drawn rows to check the encoding
balanced_data.sample(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,spam
693,ham,Will purchase d stuff today and mail to you. D...,,,,0
4340,ham,ÌÏ all write or wat..,,,,0
787,spam,Ever thought about living a good life with a p...,,,,1
5141,spam,FREE for 1st week! No1 Nokia tone 4 ur mobile ...,,,,1
4357,spam,Our dating service has been asked 2 contact U ...,,,,1


In [16]:
# Features (X) are the message text in v2; the target (y) is the 0/1 spam column
X, y = balanced_data["v2"], balanced_data["spam"]

In [17]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Build a TF-IDF vectorizer limited to at most 1000 features
tfidf_vectorizer = TfidfVectorizer(
    max_features=1000,
)

In [19]:
# Fit the vectorizer on the training text only and vectorize it,
# then reuse that fitted vectorizer to vectorize the test text
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=X_test)

In [20]:
# Create a LogisticRegression model; no arguments are passed, so the library defaults apply
model = LogisticRegression()

In [21]:
# Train the classifier on the TF-IDF training matrix and its labels
model.fit(X=X_train_tfidf, y=y_train)

In [22]:
# Predict labels for the held-out test messages
y_pred = model.predict(X_test_tfidf)

# Accuracy is the fraction of test predictions that equal the true labels
is_correct = y_pred == y_test
accuracy = np.mean(is_correct)
# Report the accuracy as a fraction and as a percentage
print("Accuracy: ", accuracy)
print(accuracy * 100, "%")

Accuracy:  0.9331103678929766
93.31103678929766 %
