In [2]:
import pandas as pd
import string

In [3]:

df = pd.read_csv('spam.csv', encoding='ISO-8859-1')

df["v2"] = df["v2"].astype(str)

df.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'], axis=1,inplace=True)
df.rename(columns = {'v1':'Status','v2':'Text'}, inplace=True)


def remove_punctuations(text):
    for char in string.punctuation:
        text = text.replace(char, '')
    return text

df['Text'] = df['Text'].apply(remove_punctuations) 
df['Text'] = df['Text'].str.lower()


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Status  5572 non-null   object
 1   Text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
df.head()

Unnamed: 0,Status,Text
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...


In [6]:
df['Status'].replace({'ham': 0,'spam':1}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Status'].replace({'ham': 0,'spam':1}, inplace=True)
  df['Status'].replace({'ham': 0,'spam':1}, inplace=True)


In [7]:
df.head()

Unnamed: 0,Status,Text
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor u c already then say
4,0,nah i dont think he goes to usf he lives aroun...


In [8]:
df = df.drop_duplicates()
df.groupby('Status').describe()

Unnamed: 0_level_0,Text,Text,Text,Text
Unnamed: 0_level_1,count,unique,top,freq
Status,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,4499,4499,go until jurong point crazy available only in ...,1
1,643,643,free entry in 2 a wkly comp to win fa cup fina...,1


In [9]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pith1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
df['Text Token'] = df['Text'].apply(word_tokenize)


In [11]:
df.head()

Unnamed: 0,Status,Text,Text Token
0,0,go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o..."
1,0,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,1,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,0,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t..."
4,0,nah i dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l..."


In [12]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

lemmatizer = WordNetLemmatizer()
df['Text Lemmatize'] = df['Text Token'].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pith1\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\pith1\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [13]:
df.head()

Unnamed: 0,Status,Text,Text Token,Text Lemmatize
0,0,go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, until, jurong, point, crazy, available, o..."
1,0,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,1,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,0,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, so, early, hor, u, c, already, t..."
4,0,nah i dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, i, dont, think, he, go, to, usf, he, lif..."


In [14]:
df['Text'] = df['Text Lemmatize'].apply(lambda token: ' '.join(token) )
df.head()


Unnamed: 0,Status,Text,Text Token,Text Lemmatize
0,0,go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, until, jurong, point, crazy, available, o..."
1,0,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,1,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,0,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, so, early, hor, u, c, already, t..."
4,0,nah i dont think he go to usf he life around h...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, i, dont, think, he, go, to, usf, he, lif..."


In [15]:
df.drop(['Text Token','Text Lemmatize'],axis=1,inplace=True)

In [16]:
df.head()

Unnamed: 0,Status,Text
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor u c already then say
4,0,nah i dont think he go to usf he life around h...


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report


X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Status'], test_size=0.2, random_state=42)

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

adaboost = AdaBoostClassifier(n_estimators=50, random_state=42)
adaboost.fit(X_train_tfidf, y_train)

y_pred = adaboost.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Accuracy: 0.9650145772594753

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       902
           1       0.95      0.76      0.84       127

    accuracy                           0.97      1029
   macro avg       0.96      0.88      0.91      1029
weighted avg       0.96      0.97      0.96      1029



In [18]:
while True:
    user_input = input("Enter text (type 'exit' to quit): ")

    if user_input.lower() == 'exit':
        print("Exiting...")
        break

    user_input_tfidf = tfidf_vectorizer.transform([user_input])

    predicted_label = adaboost.predict(user_input_tfidf)

    if predicted_label == 1:
        print("Spam")
    else:
        print("Not Spam")

Not Spam
Not Spam
Not Spam
Not Spam
Exiting...


Dear valued customer, our company is delighted to offer you a special discount on our latest products. Please visit our website for more information.
