In [15]:
#import all libraries

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline


In [16]:
# Read the spam dataset from spam.csv (uploaded to the Colab working directory);
# the file is decoded as latin1
data = pd.read_csv(filepath_or_buffer="spam.csv", encoding="latin1")
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [17]:
# Keep only the two columns that are needed: v1 and v2
data = data.loc[:, ["v1", "v2"]]
data

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [18]:
# Show every row that has at least one missing value (an empty result means no nulls)
data.loc[data.isna().any(axis="columns")]

Unnamed: 0,v1,v2


In [19]:
# Rename the raw columns and encode the labels numerically (ham -> 0, spam -> 1)

data = data.rename(columns={"v1": "label", "v2": "message"})

# Strip whitespace and lowercase first so variants such as " Ham" still match the mapping
data['label'] = data['label'].str.strip().str.lower().map({'ham': 0, 'spam': 1})

# Series.map() silently turns any value missing from the mapping into NaN; fail here
# with a clear message instead of letting NaN labels reach the classifier.
if data['label'].isna().any():
    raise ValueError("Unexpected label value: only 'ham' and 'spam' are supported")
data

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [20]:
# 80-20 split by row position: the first 80% of rows train the model,
# the remaining rows are held out for testing (no shuffling is applied)
train_size = int(0.8 * len(data))
train_data, test_data = data.iloc[:train_size], data.iloc[train_size:]

# Message text is the feature, the numeric label is the target
X_train, y_train = train_data['message'], train_data['label']
X_test, y_test = test_data['message'], test_data['label']


In [21]:
# (rows, columns) of the training partition
train_data.shape

(4457, 2)

In [22]:
# (rows, columns) of the held-out test partition
test_data.shape

(1115, 2)

In [23]:
# Two-stage pipeline: TF-IDF vectorisation of the text, then logistic regression
pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("clf", LogisticRegression()),
    ]
)


In [24]:
# Fit on the training messages, then predict labels for the held-out messages
# (fit() returns the fitted pipeline, so the calls can be chained)
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

In [25]:
# Evaluate on the held-out set. scikit-learn metrics take (y_true, y_pred) in that
# order; both calls below now use it consistently.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy is : ",accuracy)
# Per-class precision / recall / F1 for the two labels
print(classification_report(y_test, y_pred))

Accuracy is :  0.9730941704035875
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       970
           1       0.99      0.80      0.89       145

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [26]:
# Persist the fitted pipeline (vectorizer + classifier) to disk so it can be
# reloaded later with joblib.load() without retraining.
# joblib is imported in the import cell at the top of the notebook.
joblib.dump(pipeline,'spam_pipeline.pkl')

['spam_pipeline.pkl']