# Importing Dependencies

In [1]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Data collection 

In [2]:
email_data = pd.read_csv("combined_data.csv")

In [3]:
email_data.head()

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...


In [4]:
email_data.shape

(83448, 2)

In [5]:
email_data.isnull().sum()

label    0
text     0
dtype: int64

In [6]:
email_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83448 entries, 0 to 83447
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   83448 non-null  int64 
 1   text    83448 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.3+ MB


In [7]:
# replace the null value with the null string 
mail_data = email_data.where((pd.notnull(email_data)),'')

In [8]:
mail_data.head()

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...


# Encoding 

In [9]:
# mail_data.loc[mail_data["label"]=='spam', "Catrgory",] = 0

# Separating the Features and Label

In [10]:
x = mail_data["text"]
y = mail_data["label"]

In [11]:
x

0        ounce feather bowl hummingbird opec moment ala...
1        wulvob get your medircations online qnb ikud v...
2         computer connection from cnn com wednesday es...
3        university degree obtain a prosperous future m...
4        thanks for all your answers guys i know i shou...
                               ...                        
83443    hi given a date how do i get the last date of ...
83444    now you can order software on cd or download i...
83445    dear valued member canadianpharmacy provides a...
83446    subscribe change profile contact us long term ...
83447    get the most out of life ! viagra has helped m...
Name: text, Length: 83448, dtype: object

In [12]:
y

0        1
1        1
2        0
3        1
4        0
        ..
83443    0
83444    1
83445    1
83446    0
83447    1
Name: label, Length: 83448, dtype: int64

# Train Test split

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.2)

In [14]:
# Display shapes of training and testing datasets
print("Training Features Shape:", x_train.shape)  
print("Testing Features Shape:", x_test.shape)  

print("Training Labels Shape:", y_train.shape)  
print("Testing Labels Shape:", y_test.shape)

Training Features Shape: (66758,)
Testing Features Shape: (16690,)
Training Labels Shape: (66758,)
Testing Labels Shape: (16690,)


# Feature Extraction

In [15]:
# print(type(y))

In [17]:
# transform the test data to feature/text vector for ML model 
feature_extraction = TfidfVectorizer(min_df=1, stop_words="english", lowercase=True)

x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test) # because we fit and train our model on the training data 

y_train = y_train.astype("int")
y_test = y_test.astype("int")


In [21]:
 print(x_train_features)

# Training the Model 

In [19]:
lr_model = LogisticRegression()

In [22]:
lr_model.fit(x_train_features, y_train)

# Evaluation of Trained Model

In [31]:
# prediction on training data 
train_data_pred = lr_model.predict(x_train_features)
train_data_accuracy = accuracy_score(train_data_pred, y_train)
# print the valuue
print("Accuracy of Training data : ", train_data_accuracy*100)

Accuracy of Training data :  98.93495910602475


In [32]:
# prediction on Testing data 
test_data_pred = lr_model.predict(x_test_features)
test_data_accuracy = accuracy_score(test_data_pred, y_test)
# print the valuue
print("Accuracy of Testing data : ",test_data_accuracy*100)

Accuracy of Testing data :  98.6099460754943


In [29]:
print(lr_model.score(x_train_features,y_train)*100, lr_model.score(x_test_features, y_test)*100)

98.93495910602475 98.6099460754943


# Save Model 

In [33]:
import joblib
filename = "LR Mail Spam Prediction.pkl"
# save 
joblib.dump(lr_model, filename)

['LR Mail Spam Prediction.pkl']

# Load Model

In [36]:
loaded_model = joblib.load(filename)

print("Model loaded successfully!")

Model loaded successfully!


# Predictive Model

In [46]:
# 0,  hi list when loading library rgl i get error rgl glx extension missing on server i have mesa and xgl installed but xgl info sais its an highly experimental code does this mean i shouldn't bother for a while with opengl or are there alternatives any help appreciated thanx herry r escapenumber escapenumber escapenumber on xescapenumber escapenumber opensuse escapenumber escapenumber dr alexander herr herry spatial and statistical analyst csiro sustainable ecosystems davies laboratory university drive spielas qld escapenumber private mail bag aitkenvale qld escapenumber phone www escapenumber escapenumber escapenumber escapenumber escapenumber fax home http herry ausbats org au webadmin abs http ausbats org au sustainable ecosystems http www cse csiro au r help stat math ethz ch mailing list https stat ethz ch mailman listinfo r help please do read the posting guide http www r project org posting guide html and provide commented minimal self contained reproducible code 

In [52]:
input_mail = ["hi list when loading library rgl i get error rgl glx extension missing on server i have mesa and xgl installed but xgl info sais its an highly experimental code does this mean i shouldn't bother for a while with opengl or are there alternatives any help appreciated thanx herry r escapenumber escapenumber escapenumber on xescapenumber escapenumber opensuse escapenumber escapenumber dr alexander herr herry spatial and statistical analyst csiro sustainable ecosystems davies laboratory university drive spielas qld escapenumber private mail bag aitkenvale qld escapenumber phone www escapenumber escapenumber escapenumber escapenumber escapenumber fax home http herry ausbats org au webadmin abs http ausbats org au sustainable ecosystems http www cse csiro au r help stat math ethz ch mailing list https stat ethz ch mailman listinfo r help please do read the posting guide http www r project org posting guide html and provide commented minimal self contained reproducible code"]
# onvert text to features vectors 
input_data_feature = feature_extraction.transform(input_mail)

# now making prediction 
prediction = loaded_model.predict(input_data_feature)

if (prediction[0] == 1):
    print("Ham Mail")
else:
    print("Spam Mail")

Spam Mail


In [53]:
print(prediction)

[0]


In [54]:
# like example of prediction 
my_list = [1,2,3] 
print(my_list[0])

1
