
# Importing the data

In [4]:
import numpy as np
import pandas as pd

# Specify the encoding as 'latin1' (ISO-8859-1) which is a common encoding for CSV files
data = pd.read_csv('C:\\Users\\dell\\OneDrive\\Documents\\Titanic data for data science\\spam.csv', encoding='latin1')
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


# Dropping the columns that we don't need

In [5]:
columns_to_drop = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"]
data.drop(columns=columns_to_drop, inplace=True)

In [6]:
data

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


# Data Analysis

In [7]:
data.describe()

Unnamed: 0,v1,v2
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [8]:
data.isnull().sum()

v1    0
v2    0
dtype: int64

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


# Data Processing

In [10]:
data_encoded = pd.get_dummies(data, columns=['v1'])
df=data_encoded

In [11]:
df

Unnamed: 0,v2,v1_ham,v1_spam
0,"Go until jurong point, crazy.. Available only ...",True,False
1,Ok lar... Joking wif u oni...,True,False
2,Free entry in 2 a wkly comp to win FA Cup fina...,False,True
3,U dun say so early hor... U c already then say...,True,False
4,"Nah I don't think he goes to usf, he lives aro...",True,False
...,...,...,...
5567,This is the 2nd time we have tried 2 contact u...,False,True
5568,Will Ì_ b going to esplanade fr home?,True,False
5569,"Pity, * was in mood for that. So...any other s...",True,False
5570,The guy did some bitching but I acted like i'd...,True,False


In [12]:
df.drop(columns=['v1_ham'], inplace=True)


In [50]:
df

Unnamed: 0,Mail,Spam
0,"Go until jurong point, crazy.. Available only ...",False
1,Ok lar... Joking wif u oni...,False
2,Free entry in 2 a wkly comp to win FA Cup fina...,True
3,U dun say so early hor... U c already then say...,False
4,"Nah I don't think he goes to usf, he lives aro...",False
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,True
5568,Will Ì_ b going to esplanade fr home?,False
5569,"Pity, * was in mood for that. So...any other s...",False
5570,The guy did some bitching but I acted like i'd...,False


In [14]:
df = df.rename(columns={'v1_spam': 'Spam','v2':'Mail'})
df.head()

Unnamed: 0,Mail,Spam
0,"Go until jurong point, crazy.. Available only ...",False
1,Ok lar... Joking wif u oni...,False
2,Free entry in 2 a wkly comp to win FA Cup fina...,True
3,U dun say so early hor... U c already then say...,False
4,"Nah I don't think he goes to usf, he lives aro...",False


# Feature Extraction and  Model Traning

In [15]:
from sklearn.model_selection import train_test_split
X=df['Mail']
Y=df['Spam']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
Y_train

1978    False
3989     True
3935    False
4078    False
4086     True
        ...  
3772    False
5191    False
5226    False
5390    False
860     False
Name: Spam, Length: 4457, dtype: bool

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
fe=TfidfVectorizer(min_df=1,stop_words='english',lowercase=True)

In [28]:
X_train_fe=fe.fit_transform(X_train)
X_test_fe=fe.transform(X_test)

In [34]:
from sklearn.linear_model import LinearRegression

le = LinearRegression()

le.fit(X_train_fe, Y_train)


# Making Predictions

In [37]:
emails = [
    'Free entry in 2 a wkly comp to win FA Cup',
    'Go until jurong point, crazy'
]
mail=fe.transform(emails)

predict=le.predict(mail)

for i, predict in enumerate(predict):
    if predict == 0:
        print(f"Email {i+1}: Ham")
    else:
        print(f"Email {i+1}: Spam")


Email 1: Spam
Email 2: Spam


In [38]:
emails = [
    'we have contact with the mafia',
    'Nah I don\'t think he goes to usf, he lives around here though'
]

mail=fe.transform(emails)

predict=le.predict(mail)

for i, predict in enumerate(predict):
    if predict == 0:
        print(f"Email {i+1}: Ham")
    else:
        print(f"Email {i+1}: Spam")



Email 1: Spam
Email 2: Spam


# Accuracy of the Model

In [36]:
le.score(X_train_fe, Y_train)

0.9999826613156025