In [50]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords


In [51]:
df = pd.read_csv('spam.csv',encoding='ISO-8859-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [52]:
df.sample(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
3870,ham,Yeah my usual guy's out of town but there're d...,,,
1195,spam,You have 1 new voicemail. Please call 08719181503,,,
1131,ham,"Sorry, I'll call later",,,
1651,ham,I wan but too early lei... Me outside now wun ...,,,
1648,ham,If u laugh really loud.. If u talk spontaneous...,,,


In [53]:
df.shape

(5572, 5)

In [6]:
# 1. Data cleaning

In [54]:
# Call the info() method to get information about the dataset
df.info

<bound method DataFrame.info of         v1                                                 v2 Unnamed: 2  \
0      ham  Go until jurong point, crazy.. Available only ...        NaN   
1      ham                      Ok lar... Joking wif u oni...        NaN   
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3      ham  U dun say so early hor... U c already then say...        NaN   
4      ham  Nah I don't think he goes to usf, he lives aro...        NaN   
...    ...                                                ...        ...   
5567  spam  This is the 2nd time we have tried 2 contact u...        NaN   
5568   ham              Will Ì_ b going to esplanade fr home?        NaN   
5569   ham  Pity, * was in mood for that. So...any other s...        NaN   
5570   ham  The guy did some bitching but I acted like i'd...        NaN   
5571   ham                         Rofl. Its true to its name        NaN   

     Unnamed: 3 Unnamed: 4  
0           NaN        NaN

In [55]:
# Remove unnecessary columns
df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)

In [56]:
df.sample(5)

Unnamed: 0,v1,v2
4490,ham,"My friend, she's studying at warwick, we've pl..."
406,ham,All was well until slightly disastrous class t...
3858,spam,Win the newest åÒHarry Potter and the Order of...
2760,ham,I dont thnk its a wrong calling between us
3207,ham,Oops my phone died and I didn't even know. Yea...


In [57]:
# Rename columns
df = df.rename(columns={'v2': 'text', 'v1': 'label'})
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [58]:
df = df[['text', 'label']]
df.head()

Unnamed: 0,text,label
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [59]:
# Check for missing values in data
missing_values = df.isnull().sum()

# Print the number of missing values for each column
print(missing_values)

text     0
label    0
dtype: int64


In [25]:
# Check for duplicate values in the DataFrame
duplicates = df.duplicated().sum()

In [60]:
duplicates

403

In [26]:
# Remove the duplicate rows and keep the first occurrence of each unique row
df.drop_duplicates(keep='first', inplace=True)

In [61]:
df.duplicated().sum()

403

In [20]:
df.shape

(5169, 2)

In [21]:
# 2.Preprocessing the dataset

In [62]:
# check for null values
df.isnull().sum()

text     0
label    0
dtype: int64

In [63]:
nltk.download('stopwords')
# Set of English stopwords
STOPW = set(stopwords.words('english'))  
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters and replace them with a space
    text = re.sub(r'[^0-9a-zA-Z]', ' ', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    # Remove stopwords from the text
    text = " ".join(word for word in text.split() if word not in STOPW)
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Norhan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [64]:
df['clean_text'] = [clean_text(text) for text in df['text']]
df.head()


Unnamed: 0,text,label,clean_text
0,"Go until jurong point, crazy.. Available only ...",ham,go jurong point crazy available bugis n great ...
1,Ok lar... Joking wif u oni...,ham,ok lar joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,U dun say so early hor... U c already then say...,ham,u dun say early hor u c already say
4,"Nah I don't think he goes to usf, he lives aro...",ham,nah think goes usf lives around though


In [65]:
#Input 
X = df['clean_text']
y = df['label']

In [66]:
#ModelTraining
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression

def classify(model, X, y):
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, shuffle=True, stratify=y)
    
    # Create pipeline
    pipeline_model = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', model)
    ])
    
    # Fit the pipeline on the training data
    pipeline_model.fit(X_train, y_train)
    
    # Calculate accuracy on the testing data
    accuracy = pipeline_model.score(X_test, y_test)
    print('Accuracy:', accuracy * 100)
    
    # Make predictions on the testing data
    y_pred = pipeline_model.predict(X_test)
    # Print classification report
    print(classification_report(y_test, y_pred))

In [67]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
classify(model, X, y)

Accuracy: 96.69777458722182
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1206
        spam       1.00      0.75      0.86       187

    accuracy                           0.97      1393
   macro avg       0.98      0.88      0.92      1393
weighted avg       0.97      0.97      0.96      1393



In [68]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
classify(model, X, y)

Accuracy: 96.8413496051687
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1206
        spam       0.99      0.77      0.87       187

    accuracy                           0.97      1393
   macro avg       0.98      0.88      0.92      1393
weighted avg       0.97      0.97      0.97      1393

