In [23]:
# Third-party data libraries plus the stdlib warnings module.
import pandas as pd
import numpy as np
import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation and convergence
# warnings raised by the libraries used below — consider narrowing it.
warnings.filterwarnings("ignore")

In [24]:
# Load the message corpus from a CSV; the path is relative, so it is resolved
# against the kernel's current working directory.
raw_mail_data = pd.read_csv("merged_output.csv")

In [25]:
# Preview the first rows to sanity-check the load.
raw_mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [26]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

In [27]:
# Count missing values per column in the raw data.
raw_mail_data.isnull().sum()

Category    0
Message     0
dtype: int64

In [28]:
# Build the working frame: keep every non-null cell and substitute an empty
# string wherever the raw data has a missing value.
df = raw_mail_data.where(raw_mail_data.notna(), other='')

In [29]:
# Re-check the per-column null counts on the working frame.
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [30]:
# Dimensions of the working frame as (rows, columns).
df.shape

(10743, 2)

In [31]:
# Label encoding: convert the Category strings to integers (spam -> 0, ham -> 1)

In [32]:
# Encode the string labels as integers: spam -> 0, ham -> 1.
# The mapping reads from the untouched raw frame rather than from df itself,
# so re-running this cell yields the same result instead of mapping the
# already-encoded integers to NaN.
CATEGORY_CODES = {'spam': 0, 'ham': 1}
encoded_category = raw_mail_data['Category'].map(CATEGORY_CODES)

# Series.map turns any label that is missing from the dict into NaN; fail
# loudly here rather than letting NaN labels leak into training.
unmapped_rows = encoded_category.isna()
if unmapped_rows.any():
    unexpected = sorted(raw_mail_data.loc[unmapped_rows, 'Category'].astype(str).unique())
    raise ValueError(f"Unexpected Category values: {unexpected}")

df['Category'] = encoded_category

In [33]:
# Preview the frame after label encoding; Category should now hold integers.
df.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [34]:
# Alternative label encoding using .loc with a boolean mask

In [35]:
# Boolean-mask encoding: rows labelled 'spam' become 0, rows labelled 'ham'
# become 1. When Category already holds integers, neither mask matches any row
# and both assignments leave the frame unchanged.
spam_rows = df['Category'] == 'spam'
df.loc[spam_rows, 'Category'] = 0
ham_rows = df['Category'] == 'ham'
df.loc[ham_rows, 'Category'] = 1

In [36]:
# Preview the frame again after the .loc-based encoding step.
df.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [37]:
# Separate the model input (message text) from the target (encoded label).
X, Y = df['Message'], df['Category']

In [38]:
# Display the message Series (pandas truncates the listing).
X

0        Go until jurong point, crazy.. Available only ...
1                            Ok lar... Joking wif u oni...
2        Free entry in 2 a wkly comp to win FA Cup fina...
3        U dun say so early hor... U c already then say...
4        Nah I don't think he goes to usf, he lives aro...
                               ...                        
10738    Subject: put the 10 on the ft\r\nthe transport...
10739    Subject: 3 / 4 / 2000 and following noms\r\nhp...
10740    Subject: calpine daily gas nomination\r\n>\r\n...
10741    Subject: industrial worksheets for august 2000...
10742    Subject: important online banking alert\r\ndea...
Name: Message, Length: 10743, dtype: object

In [39]:
# Display the label Series (pandas truncates the listing).
Y

0        1
1        1
2        0
3        1
4        1
        ..
10738    1
10739    1
10740    1
10741    1
10742    0
Name: Category, Length: 10743, dtype: int64

In [40]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [41]:
# Number of training messages.
X_train.shape

(8594,)

In [42]:
# Number of training labels; should match X_train.shape.
y_train.shape

(8594,)

In [43]:
# Number of held-out test messages.
X_test.shape

(2149,)

In [44]:
# --- TF-IDF feature extraction ---
# Text is lower-cased and English stop words are dropped; min_df=1 keeps every
# term that occurs in at least one document.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    min_df=1,
)

# Learn the vocabulary and IDF weights from the training messages only, then
# encode them. Messages are cast to str before vectorizing.
train_messages = X_train.astype(str)
X_train_features = feature_extraction.fit_transform(train_messages)

# Encode the held-out messages with the vocabulary fitted above (no refit).
test_messages = X_test.astype(str)
X_test_features = feature_extraction.transform(test_messages)

print("X_train_features created successfully.")
print("Shape:", X_train_features.shape)

X_train_features created successfully.
Shape: (8594, 46029)


In [45]:
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Randomly duplicate minority-class rows of the training features until the
# minority/majority ratio reaches 0.8; the seed makes the draw repeatable.
ros = RandomOverSampler(random_state=42, sampling_strategy=0.8)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train_features, y_train)

# Report the per-class counts after resampling.
resampled_counts = Counter(y_train_resampled)
print("Resampled training data shape %s" % resampled_counts)


Resampled training data shape Counter({1: 6812, 0: 5449})


In [46]:
# Summarise column dtypes, non-null counts and memory usage of the working frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10743 entries, 0 to 10742
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  10743 non-null  int64 
 1   Message   10743 non-null  object
dtypes: int64(1), object(1)
memory usage: 168.0+ KB


In [47]:
# Make sure both label vectors carry an integer dtype before scoring.
y_train, y_test = y_train.astype("int"), y_test.astype("int")

In [48]:
# Display the training messages; the index shows the shuffled original row labels.
X_train

456       Si si. I think ill go make those oreo truffles.
5653    Subject: 21 st changes\r\n- - - - - - - - - - ...
239     New Theory: Argument wins d SITUATION, but los...
2862    I am not at all happy with what you saying or ...
6954    Subject: guadalupe\r\ni rolled 740208 , 740209...
                              ...                        
5734              Subject: holiday e - cards\r\ngbhzivjwl
5191                               Sorry, I'll call later
5390                           Nt joking seriously i told
860               Did he just say somebody is named tampa
7270    Subject: top profiie bouncing sharply off its ...
Name: Message, Length: 8594, dtype: object

In [49]:
# Print the sparse TF-IDF training matrix (summary header plus stored coordinates/values).
print(X_train_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 302376 stored elements and shape (8594, 46029)>
  Coords	Values
  (0, 38062)	0.7152897231442159
  (0, 41119)	0.19518859360109334
  (0, 23153)	0.29093827629132507
  (0, 27495)	0.18249400110289402
  (0, 31252)	0.4076229163346854
  (0, 42063)	0.4076229163346854
  (1, 39807)	0.09023189753848629
  (1, 1371)	0.22474696873039632
  (1, 39246)	0.27053748557089363
  (1, 11178)	0.251500933459646
  (1, 19474)	0.08986835271191018
  (1, 6162)	0.2591923222679649
  (1, 11473)	0.26821415543952754
  (1, 13230)	0.18794480459265514
  (1, 17357)	0.23243103082283664
  (1, 57)	0.1918369757367814
  (1, 1298)	0.2069023485968049
  (1, 1300)	0.15262126201960205
  (1, 486)	0.17049860628172203
  (1, 2248)	0.14226410404455372
  (1, 42041)	0.16713134516429712
  (1, 8442)	0.17579075452944187
  (1, 35488)	0.15721408180221128
  (1, 12306)	0.0851618524370083
  (1, 2127)	0.136484457841334
  :	:
  (8593, 14660)	0.03814565722405589
  (8593, 28117)	0.114436971672

In [50]:
# Print the sparse TF-IDF test matrix (summary header plus stored coordinates/values).
print(X_test_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 71020 stored elements and shape (2149, 46029)>
  Coords	Values
  (0, 77)	0.03402596510297785
  (0, 486)	0.028102601128941945
  (0, 545)	0.07471348975561151
  (0, 597)	0.06335565887589419
  (0, 736)	0.05927775407190669
  (0, 1055)	0.037017007631499586
  (0, 1300)	0.12577975104516773
  (0, 1304)	0.09330000745455501
  (0, 1419)	0.039513424426754803
  (0, 1567)	0.036936116444558534
  (0, 1703)	0.06681978427251371
  (0, 1755)	0.03973338994537083
  (0, 2135)	0.07279114195348532
  (0, 2642)	0.046645482940745726
  (0, 2740)	0.03766377619626428
  (0, 2741)	0.04101200574650111
  (0, 3079)	0.053487768313450236
  (0, 3168)	0.06989248474051261
  (0, 3531)	0.07471348975561151
  (0, 3976)	0.07471348975561151
  (0, 4027)	0.14942697951122302
  (0, 4028)	0.07471348975561151
  (0, 5011)	0.15835237344610584
  (0, 7927)	0.23157548134446443
  (0, 10884)	0.026685697007455073
  :	:
  (2145, 44840)	0.13411970679894006
  (2146, 7720)	0.43035596437187

In [51]:
from sklearn.linear_model import LogisticRegression

# Train a logistic regression with default hyper-parameters on the
# oversampled training set. The fit call is left as the last expression so the
# fitted estimator is displayed as the cell output.
model = LogisticRegression()
model.fit(X=X_train_resampled, y=y_train_resampled)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [52]:
# Score the fitted model on the original (non-resampled) training features.
prediction_train_data = model.predict(X_train_features)
accuracy_train_data = accuracy_score(
    y_true=y_train,
    y_pred=prediction_train_data,
)

In [53]:
# Report the training-set accuracy (label spelling corrected from "Accuarcy").
print("Accuracy on train data: ", accuracy_train_data)

Accuarcy on train data:  0.9904584593902723


In [54]:
# Score the fitted model on the held-out test features.
prediction_test_data = model.predict(X_test_features)
accuracy_test_data = accuracy_score(
    y_true=y_test,
    y_pred=prediction_test_data,
)

In [55]:
# Report the test-set accuracy (label spelling corrected from "Accuarcy").
print("Accuracy on test data: ", accuracy_test_data)

Accuarcy on test data:  0.9711493718008376


In [56]:
import pickle
import os

# Target directory: one level above the current working directory, in 'models'.
# This assumes the notebook is running from inside the 'data' directory.
MODEL_DIR = os.path.join(os.pardir, 'models')

# Create the directory if it doesn't exist (e.g., api/models)
os.makedirs(MODEL_DIR, exist_ok=True)

# Persist the fitted classifier and the fitted vectorizer side by side.
# Each file is opened in a `with` block so the handle is flushed and closed
# even if pickling raises, instead of being left open for the kernel session.
# NOTE: unpickling can execute arbitrary code — only load these files from a
# trusted location.
with open(os.path.join(MODEL_DIR, "logistic_regression.pkl"), "wb") as model_file:
    pickle.dump(model, model_file)
with open(os.path.join(MODEL_DIR, "feature_extraction.pkl"), "wb") as vectorizer_file:
    pickle.dump(feature_extraction, vectorizer_file)

print(f"Model and Feature Extractor saved successfully to the: {os.path.abspath(MODEL_DIR)}")

Model and Feature Extractor saved successfully to the: c:\Users\DELL\Cooking\!isSpam\api\models
