In [None]:
import numpy as np
import pandas as pd

In [None]:
import os

import pandas as pd

# Location of the SMS spam CSV. Set the SPAM_CSV_PATH environment variable to
# point at your own copy; when it is unset the original local path is used.
DATA_PATH = os.environ.get("SPAM_CSV_PATH", r"C:\Users\kostu\Downloads\spam_test.csv")

# Option 1: Specify a different encoding like ISO-8859-1 (Latin-1)
df = pd.read_csv(DATA_PATH, encoding='ISO-8859-1')

# Option 2: Try Windows encoding (Windows-1252)
#df = pd.read_csv(DATA_PATH, encoding='Windows-1252')

# Option 3: Ignore bad characters.
# NOTE: read_csv has no `errors` argument; the keyword is `encoding_errors`
# (available in pandas >= 1.3).
#df = pd.read_csv(DATA_PATH, encoding='utf-8', encoding_errors='ignore')


In [None]:
# Peek at five randomly chosen rows (no random_state is passed, so the rows
# shown differ from run to run).
df.sample(5)

In [None]:
# (rows, columns) of the frame as loaded.
df.shape

In [1]:
### Process Flow of the project
# 1. Data cleaning
# 2. EDA
# 3. Text Preprocessing
# 4. Model building

## 1. Data Cleaning

In [None]:
df.info()

In [None]:
# drop last 3 cols
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)

In [None]:
df.sample(5)

In [None]:
# renaming the cols
df.rename(columns={'v1':'target','v2':'text'},inplace=True)
df.sample(5)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoder used in the next cell to turn the string labels in `target` into integers.
encoder = LabelEncoder()

In [None]:
# Replace the label strings with integer codes. Later cells treat 0 as ham
# and 1 as spam.
df['target'] = encoder.fit_transform(df['target'])

In [None]:
df.head()

In [None]:
# missing values
df.isnull().sum()

In [None]:
# check for duplicate values
df.duplicated().sum()

In [None]:
# remove duplicates
# Keeps the first occurrence of each duplicated row. The index is not reset,
# so surviving rows keep their original labels (a later cell looks up label 10).
df = df.drop_duplicates(keep='first')

In [None]:
# Re-check after de-duplication.
df.duplicated().sum()

In [None]:
# Shape after de-duplication.
df.shape

## 2. EDA

In [None]:
df.head()

In [None]:
df['target'].value_counts()

In [None]:
import matplotlib.pyplot as plt
plt.pie(df['target'].value_counts(), labels=['ham','spam'],autopct="%0.2f")
plt.show()

In [None]:
# Data is imbalanced

In [None]:
import nltk
from nltk.corpus import stopwords
 
nltk.download('stopwords')

In [None]:
!pip install nltk

In [None]:
nltk.download('punkt')

In [None]:
df['num_characters'] = df['text'].apply(len)

In [None]:
df.head()

In [None]:
# num of words
df['num_words'] = df['text'].apply(lambda x:len(nltk.word_tokenize(x)))

In [None]:
df.head()

In [None]:
df['num_sentences'] = df['text'].apply(lambda x:len(nltk.sent_tokenize(x))) 
#here this converts text into list and then we count it

In [None]:
df.head()

In [None]:
df[['num_characters','num_words','num_sentences']].describe()

In [None]:
# ham
df[df['target'] == 0][['num_characters','num_words','num_sentences']].describe()

In [None]:
#spam
df[df['target'] == 1][['num_characters','num_words','num_sentences']].describe()

In [None]:
import seaborn as sns

In [None]:
plt.figure(figsize=(12,6))
sns.histplot(df[df['target'] == 0]['num_characters'])
sns.histplot(df[df['target'] == 1]['num_characters'],color='red')

In [None]:
plt.figure(figsize=(12,6))
sns.histplot(df[df['target'] == 0]['num_words'])
sns.histplot(df[df['target'] == 1]['num_words'],color='red')

In [None]:
sns.pairplot(df,hue='target')

In [None]:
sns.heatmap(df[['num_characters','num_words','num_sentences']].corr(),annot=True)

## 3. Text Preprocessing
- Lower case
- Tokenization
- Removing non-alphanumeric tokens
- Removing stop words
- Stemming
 

In [None]:
import string
from nltk.stem import PorterStemmer
 
# Single shared stemmer instance; transform_text calls ps.stem() on each kept token.
ps = PorterStemmer()

In [None]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def transform_text(text):
    """Normalize one message into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lower-case the text;
      2. tokenize it with ``nltk.word_tokenize``;
      3. keep only tokens for which ``str.isalnum()`` is true;
      4. drop English stop words;
      5. stem each remaining token with the Porter stemmer ``ps``.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces ("" if none survive).
    """
    # The original called stopwords.words('english') for every token, which
    # re-loaded the list and scanned it linearly each time. A cached set gives
    # the same membership answers at a fraction of the cost.
    stop_words = _english_stopwords()

    tokens = nltk.word_tokenize(text.lower())

    # The former `not in string.punctuation` test is omitted: a token that
    # passes isalnum() contains no punctuation characters, so that test could
    # never reject anything.
    kept = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]

    return " ".join(ps.stem(tok) for tok in kept)

In [None]:
transform_text("I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.")

In [None]:
df['text'][10]

In [None]:
ps.stem('loving')

In [None]:
df['transformed_text'] = df['text'].apply(transform_text)

In [None]:
df.head()

In [None]:
from wordcloud import WordCloud
wc = WordCloud(width=500,height=500,min_font_size=10,background_color='white')

In [None]:
spam_wc = wc.generate(df[df['target'] == 1]['transformed_text'].str.cat(sep=" "))

In [None]:
plt.figure(figsize=(15,6))
plt.imshow(spam_wc)

In [None]:
ham_wc = wc.generate(df[df['target'] == 0]['transformed_text'].str.cat(sep=" "))

In [None]:
plt.figure(figsize=(15,6))
plt.imshow(ham_wc)

In [None]:
df.head()

In [None]:
spam_corpus = []
for msg in df[df['target'] == 1]['transformed_text'].tolist():
    for word in msg.split():
        spam_corpus.append(word)

In [None]:
len(spam_corpus)

In [None]:
from collections import Counter
##sns.barplot(pd.DataFrame(Counter(spam_corpus).most_common(30))[0])
sns.barplot(pd.DataFrame(Counter(spam_corpus).most_common(30))[1])
plt.xticks(rotation='vertical')
plt.show()

In [None]:
ham_corpus = []
for msg in df[df['target'] == 0]['transformed_text'].tolist():
    for word in msg.split():
        ham_corpus.append(word)

In [None]:
len(ham_corpus)

In [None]:
print(Counter(ham_corpus).most_common(20))

In [None]:
from collections import Counter
sns.barplot(pd.DataFrame(Counter(ham_corpus).most_common(30))[1])
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Text Vectorization
# using Bag of Words
df.head()

In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.ensemble import BaggingClassifier
# from sklearn.ensemble import ExtraTreesClassifier
# from sklearn.ensemble import GradientBoostingClassifier
# from xgboost import XGBClassifier