In [None]:
# Import libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import string
import nltk
nltk.download('stopwords')

In [2]:
# Load the spam/ham dataset into a DataFrame; the relative path is resolved
# against the notebook's current working directory.
df = pd.read_csv("spam_ham_dataset.csv")

In [None]:
# Display the loaded frame (the notebook renders a truncated preview for large frames).
df

In [None]:
# Column names, dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics, transposed so each described column becomes one row.
df.describe().transpose()

In [None]:
# Number of rows per textual class label.
df['label'].value_counts()

In [None]:
# Count rows that are exact duplicates (across all columns) of an earlier row.
df.duplicated().sum()

In [None]:
# Apply seaborn's white-grid theme to this and all later figures
sns.set(style="whitegrid")

# Histogram of the numeric class code, coloured by the textual label ('Dark2' palette)
ax = sns.histplot(
    data=df,
    x="label_num",
    hue="label",
    palette="Dark2",
    edgecolor="black",
    linewidth=1.5,
)

# Annotate through the Axes that histplot drew on
ax.set_title("Distribution of Label Numbers", fontsize=16, fontweight='bold')
ax.set_xlabel("Label Numbers", fontsize=14)
ax.set_ylabel("Counting", fontsize=14)

plt.show()

In [None]:
# Rows per numeric class code.
df["label_num"].value_counts()

In [None]:
# Down-sample class 0: keep its first 1499 rows (in current row order) and drop the rest in place.
# NOTE(review): 1499 presumably equals the size of the other class so the two end up balanced —
# confirm against the value_counts() output.
class_zero_index = df.index[df["label_num"] == 0]
df.drop(index=class_zero_index[1499:], inplace=True)
df

In [None]:
# White-grid seaborn theme for the figure
sns.set(style="whitegrid")

# Same class histogram as before, redrawn from the current contents of df
hist_style = dict(palette="Dark2", edgecolor="black", linewidth=1.5)
ax = sns.histplot(data=df, x="label_num", hue="label", **hist_style)

ax.set_title("Distribution of Label Numbers", fontsize=16, fontweight='bold')
ax.set_xlabel("Label Numbers", fontsize=14)
ax.set_ylabel("Counting", fontsize=14)

plt.show()

In [12]:
# Drop the "Unnamed: 0" column (presumably the CSV's saved row index — confirm) and the
# textual label; the numeric label_num column is what the model cells use as the target.
# errors="ignore" keeps this cell re-runnable: without it a second execution raises
# KeyError because the columns are already gone.
df.drop(columns=["Unnamed: 0", "label"], inplace=True, errors="ignore")

In [None]:
# Preview the first five rows of the current frame.
df.head()

In [None]:
# Rows per numeric class code in the current frame.
df["label_num"].value_counts()

In [None]:
# Pie chart of the class balance.
# Counts are ordered by class code (sort_index) and the wedge labels are derived from that
# same index. value_counts() alone orders by frequency, so a hard-coded label list can be
# attached to the wrong wedge whenever the classes have equal counts.
class_counts = df['label_num'].value_counts().sort_index()

# assumes 0 = ham and 1 = spam, as the previous hard-coded ['ham', 'spam'] order implied — TODO confirm
class_names = {0: 'ham', 1: 'spam'}
labels = [class_names.get(code, str(code)) for code in class_counts.index]
sizes = class_counts.values
colors = ['#FF9999', '#66B3FF', '#99FF99', '#FFCC99', '#FFD700']  # Custom wedge colors; extras are unused

plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%')
plt.axis('equal')  # Ensures the pie chart is a circle
plt.show()

In [20]:
# Create the function to preprocess the text

# Built once when this cell runs instead of on every call: the stop words live in a
# frozenset (constant-time membership test) and a single stemmer instance is reused.
_STOP_WORDS = frozenset(stopwords.words('english'))
_STEMMER = SnowballStemmer('english')
# Translation table that deletes every character in string.punctuation
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise one document for vectorisation.

    Steps, in order: delete ASCII punctuation, split on whitespace, discard
    English stop words (compared case-insensitively), stem the remaining words
    with the Snowball English stemmer, and re-join them with single spaces.

    Punctuation is removed before the stop-word check, so a stop word that
    contains an apostrophe is compared in its stripped form.

    Args:
        text: The raw document as a string.

    Returns:
        The cleaned document as a single space-separated string.
    """
    words = text.translate(_PUNCTUATION_TABLE).split()
    return ' '.join(
        _STEMMER.stem(word) for word in words if word.lower() not in _STOP_WORDS
    )

# Run the text cleaner over every document and store the result in a new column.
df["clean_text"] = df["text"].map(clean_text)

In [21]:
# TF-IDF features for the whole cleaned corpus, plus the numeric target.
# Note: X is fitted on every row of df, so its vocabulary and IDF weights
# reflect all documents, not just a training subset.
tfidf_vectorizer = TfidfVectorizer()
corpus = df["clean_text"]
X = tfidf_vectorizer.fit_transform(corpus)
y = df["label_num"]

In [22]:
# Split the cleaned text first, then fit TF-IDF on the training rows only, so the
# vocabulary and IDF weights used for training never see the evaluation rows.
# Size, seed and stratification are unchanged (80/20, random_state=42, stratified on y).
text_train, text_test, y_train, y_test = train_test_split(
    df["clean_text"], y, test_size=0.2, random_state=42, stratify=y
)

tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)

# The evaluation cell refers to the test matrix as X_text; keep that name bound.
X_text = X_test

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression candidate with library-default hyperparameters.
lg = LogisticRegression()

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours candidate that votes over the 7 closest training samples.
knn=KNeighborsClassifier(n_neighbors=7)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision-tree candidate. Seeded so repeated runs report the same scores;
# 42 matches the seed already used for the train/test split.
dtc = DecisionTreeClassifier(random_state=42)

In [None]:

# NOTE(review): this repeats the KNN construction from an earlier cell with identical
# arguments; it only rebinds `knn` to a fresh, equivalent estimator and is redundant.
knn=KNeighborsClassifier(n_neighbors=7)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random-forest candidate. Seeded so repeated runs report the same scores;
# 42 matches the seed already used for the train/test split.
rfc = RandomForestClassifier(random_state=42)

In [None]:
from xgboost import XGBClassifier
# Gradient-boosted-tree candidate with library-default hyperparameters.
xgb=XGBClassifier()

In [None]:
# Candidate classifiers, keyed by the display name used in the reports and the chart
models = dict([
    ("Naive Bayes", MultinomialNB()),
    ("SVM", SVC()),
    ("LogisticRegression", lg),
    ("KNN", knn),
    ("Decision Tree", dtc),
    ("Random Forest", rfc),
    ("XGBoost", xgb),
])

# Accuracy of every model (name -> score), plus a running record of the top scorer
best_results = {}
best_model_name, best_accuracy = None, 0.0

for name, model in models.items():
    # Fit on the training split
    model.fit(X_train, y_train)
    # Predict the held-out split
    y_pred = model.predict(X_text)

    # Score the predictions and remember the result under the model's name
    accuracy = accuracy_score(y_test, y_pred)
    best_results[name] = accuracy

    # Per-model report: headline accuracy, separator, per-class metrics
    print(f"{name} Accuracy: {accuracy:.2f}")
    print("-----------------------------------------------------")
    print(classification_report(y_test, y_pred))

    # Only a strictly higher score replaces the current leader, so ties keep the earlier model
    if accuracy > best_accuracy:
        best_model_name, best_accuracy = name, accuracy

# Final summary: the top-scoring model
print("\nBest Model:")
print(f"{best_model_name} with Accuracy: {best_accuracy:.2f}")

In [None]:
# Visualize the models' accuracy as a bar chart

# One colour per bar
colors = ['#1f77b4', '#ff7f0e','#FF9999', '#66B3FF', '#99FF99', '#FFCC99', '#FFD700']

model_names = list(best_results.keys())
accuracies = list(best_results.values())

# Derive the y-range from the scores instead of hard-coding (0.9, 1.0): a model scoring
# below 0.9 would otherwise have no visible bar. The floor stays at 0.9 when every score
# is comfortably above it, and never goes below 0.
lowest = min(accuracies, default=0.9)
y_floor = max(0.0, min(0.9, lowest - 0.05))
y_span = 1.0 - y_floor

plt.figure(figsize=(12, 6))
bars = plt.bar(model_names, accuracies, color=colors)
plt.title('Model Comparison', fontsize=16, fontweight='bold')
plt.xlabel('Model', fontsize=14)
plt.ylabel('Accuracy', fontsize=14)
# Leave headroom above 1.0 so value labels on near-perfect bars stay inside the axes
plt.ylim(y_floor, 1.0 + 0.1 * y_span)

# Add value labels just above each bar; the gap scales with the visible range
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, height + 0.01 * y_span, f'{height:.2f}',
             ha='center', va='bottom', fontsize=12, fontweight='bold')

plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()