In [None]:
import kagglehub

# Fetch the SMS Spam Collection dataset through kagglehub. `path` holds whatever
# dataset_download returns and is printed below as the location of the files.
path = kagglehub.dataset_download("uciml/sms-spam-collection-dataset")

print("Path to dataset files:", path)

In [None]:
import pandas as pd
import os

# Folder path
# Use the directory reported by kagglehub.dataset_download (`path`, set in the
# download cell) instead of a machine-specific absolute path, so the notebook
# runs on any machine and user account.
dataset_dir = path
dataset_path = os.path.join(dataset_dir, "spam.csv")

# Load the CSV
# encoding='latin-1' avoids special char issues when decoding the file
data = pd.read_csv(dataset_path, encoding='latin-1')

# Keep only the label and message columns; the file may also carry extra
# columns such as 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'.
# .copy() makes `data` an independent frame, so the column assignments in later
# cells do not act on a slice of the raw frame (SettingWithCopyWarning).
data = data[['v1', 'v2']].copy()
data.columns = ['label', 'message']  # rename columns

# Check the first rows
data.head()

In [None]:
# Column dtypes and non-null counts, to spot missing values early
data.info()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoder that will turn the text labels into integer class ids
encoder = LabelEncoder()

In [None]:
# Replace the text labels with integer ids, overwriting the column in place.
# Later cells treat 0 as ham and 1 as spam.
# NOTE(review): re-running this cell refits the encoder on the already-encoded
# integers, so encoder.classes_ no longer holds the original strings.
data["label"] = encoder.fit_transform(data["label"])

In [None]:
# Preview the frame after label encoding
data.head()

In [None]:
# Number of rows that repeat an earlier row exactly
data.duplicated().sum()

In [None]:
# Keep the first occurrence of each duplicated row and rebind `data` to the result
data = data.drop_duplicates(keep = "first")

In [None]:
# Re-check after de-duplication; this should now report 0
data.duplicated().sum()

In [None]:
# (rows, columns) remaining after de-duplication
data.shape

In [None]:
# EDA: number of messages per encoded label, to see how imbalanced the classes are
data["label"].value_counts()

In [None]:
import matplotlib.pyplot as plt
# Class balance as a pie chart.
# value_counts() orders wedges by frequency, not by label id, so take the wedge
# names from the counted index via the fitted encoder instead of assuming that
# "ham" always comes first.
label_counts = data["label"].value_counts()
wedge_names = encoder.inverse_transform(label_counts.index.to_numpy())
plt.pie(label_counts, labels = wedge_names, autopct = "%0.2f")
plt.show()

In [None]:
import nltk

In [None]:
# word_tokenize / sent_tokenize need the Punkt sentence models. Newer NLTK
# releases load them from the "punkt_tab" package, older ones from "punkt",
# so request both to keep the tokenizing cells working across versions.
for punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(punkt_resource)

In [None]:
# Length of every message in characters, stored as a new feature column
message_lengths = data['message'].map(len)
data["num_characters"] = message_lengths

In [None]:
# Preview the frame with the new num_characters column
data.head()

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Token count per message, as produced by nltk's word_tokenize
data["num_words"] = data['message'].apply(lambda x: len(word_tokenize(x)))
# Preview the first rows only rather than rendering the entire frame
data.head()

In [None]:
from nltk.tokenize import sent_tokenize

# Sentence count per message, as produced by nltk's sent_tokenize
data["num_sentences"] = data['message'].apply(lambda x: len(sent_tokenize(x)))
# Preview the first rows only rather than rendering the entire frame
data.head()

In [None]:
# Summary statistics for the numeric columns over all messages
data.describe()

In [None]:
# Summary statistics restricted to ham messages (label == 0)
data[data["label"] == 0].describe()

In [None]:
# Summary statistics restricted to spam messages (label == 1)
data[data["label"] == 1].describe()

In [None]:
import seaborn as sns

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Overlay the character-length distributions of the two classes.
# Each histogram carries a label and the figure gets a legend and title, so the
# plot can be read without looking at the code (0 = ham, 1 = spam).
plt.figure(figsize = (12,8))
sns.histplot(data[data["label"] == 0]["num_characters"], kde=False, label="ham")
sns.histplot(data[data["label"] == 1]["num_characters"], color="red", kde=False, label="spam")
plt.title("Message length in characters, by class")
plt.legend()

plt.show()


In [None]:
# Pairwise relationships between the columns seaborn can plot, coloured by label
sns.pairplot(data,hue = "label")

In [None]:

# Correlation matrix over the numeric columns only (text columns are excluded)
numeric_columns = data.select_dtypes(include=["number"])
corr = numeric_columns.corr()

# Annotated heatmap of that matrix on an explicitly created axes
_, heatmap_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", ax=heatmap_ax)
plt.show()


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize


# Initialize stemmer and stopwords
# The stopword corpus is a separate NLTK download from the tokenizer models.
# Fetch it here so stopwords.words() does not raise LookupError on a machine
# that has never downloaded it.
nltk.download("stopwords", quiet=True)
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """
    Turn one message into a list of stemmed keywords.

    The text is lowercased and split with nltk's word_tokenize. A token is
    kept only when it is purely alphanumeric (str.isalnum) and is not in the
    module-level `stop_words` set. Every kept token is then passed through
    the module-level PorterStemmer `stemmer`.

    Returns the stemmed tokens as a list, in their order of appearance.
    """
    stemmed_tokens = []
    for word in word_tokenize(text.lower()):
        # Drop punctuation and anything containing a non-alphanumeric character
        if not word.isalnum():
            continue
        # Drop stopwords
        if word in stop_words:
            continue
        stemmed_tokens.append(stemmer.stem(word))
    return stemmed_tokens

# Quick sanity check of the preprocessing on a sample sentence
preprocess_text("Hi how are you Nitish")

In [None]:
# Run the preprocessing over every message; each cell of the new column holds
# the list of stemmed tokens returned by preprocess_text
data["transformed_text"] = data['message'].apply(preprocess_text)
# Preview the first rows only rather than rendering the entire frame
data.head()

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Token lists of every message with label == 1
texts = data.loc[data["label"] == 1, "transformed_text"]

# One space-joined string per message, then all messages joined into one string
per_message_text = [" ".join(tokens) for tokens in texts if isinstance(tokens, list)]
spam_text = " ".join(per_message_text)

# Build the word cloud from that text
cloud_options = dict(width=400, height=400, background_color='white')
spam_wc = WordCloud(**cloud_options).generate(spam_text)

# Render it without axes
plt.figure(figsize=(15, 7))
plt.imshow(spam_wc, interpolation='bilinear')
plt.axis("off")
plt.title("WordCloud for label = 1")
plt.show()


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Filter label == 0
texts = data[data["label"] == 0]["transformed_text"]

# Join list of tokens into a single string
ham_text = " ".join([" ".join(tokens) for tokens in texts if isinstance(tokens, list)])

# Generate WordCloud from the ham text.
# This used to be built from spam_text, so the "label = 0" figure was a
# second copy of the spam cloud.
ham_wc = WordCloud(width=400, height=400, background_color='white').generate(ham_text)

# Display the ham cloud (previously spam_wc was shown here)
plt.figure(figsize=(15, 7))
plt.imshow(ham_wc, interpolation='bilinear')
plt.axis("off")
plt.title("WordCloud for label = 0")
plt.show()


In [None]:
import matplotlib.pyplot as plt

def get_top_words(df, label, top_n=30):
    """
    Return the most frequent tokens among the rows of `df` with the given label.

    Reads the "label" and "transformed_text" columns. Entries of
    "transformed_text" that are not lists are ignored.

    Returns a list of (token, count) tuples sorted by descending count and
    truncated to `top_n` items. Tokens with equal counts keep the order in
    which they were first seen.
    """
    token_lists = df.loc[df["label"] == label, "transformed_text"]
    frequencies = {}
    for entry in token_lists:
        if not isinstance(entry, list):
            continue
        for token in entry:
            frequencies[token] = frequencies.get(token, 0) + 1
    ranked = sorted(frequencies.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

# Most frequent tokens per class (top_n defaults to 30)
top_spam = get_top_words(data, label=1)
top_ham = get_top_words(data, label=0)

# Split the (word, count) pairs into parallel tuples for plotting
spam_words, spam_counts = zip(*top_spam)
ham_words, ham_counts = zip(*top_ham)

# Two bar charts side by side: spam on the left, ham on the right
plt.figure(figsize=(20, 8))
panels = [
    (1, spam_words, spam_counts, 'red', "Top 30 Words in Spam Messages"),
    (2, ham_words, ham_counts, 'green', "Top 30 Words in Ham Messages"),
]
for position, words, counts, bar_color, heading in panels:
    plt.subplot(1, 2, position)
    plt.bar(words, counts, color=bar_color)
    plt.xticks(rotation=90)  # vertical tick labels so the words do not overlap
    plt.title(heading)

plt.tight_layout()
plt.show()


In [None]:
# Import libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score



In [None]:
# 1. Prepare data
# The vectorizer expects raw strings, so glue each token list back together
X = data['transformed_text'].map(' '.join)
y = data['label']

# 2. Bag-of-words counts for every message
# NOTE(review): the vectorizer is fit on all rows here, before the train/test
# split in the next cell, so its vocabulary also contains test-only words.
vectorizer = CountVectorizer()
X_vect = vectorizer.fit_transform(X)

# Dense copy of the sparse matrix, used for GaussianNB (needs dense input)
X_dense = X_vect.toarray()



In [None]:
# 3. Train-test split
# Split the sparse matrix, its dense copy and the labels in ONE call so all
# three share the same row permutation by construction. Two separate calls
# stay aligned only as long as both use the same random_state and row count.
X_train, X_test, X_train_dense, X_test_dense, y_train, y_test = train_test_split(
    X_vect, X_dense, y, test_size=0.2, random_state=42
)

# 4. Initialize models
models = {
    "MultinomialNB": MultinomialNB(),
    "BernoulliNB": BernoulliNB(),
    "GaussianNB": GaussianNB()
}

# 5. Train and evaluate
for name, model in models.items():
    print(f"\n--- {name} ---")

    # Use dense input for GaussianNB; the other models take the sparse matrix
    if name == "GaussianNB":
        model.fit(X_train_dense, y_train)
        y_pred = model.predict(X_test_dense)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    # Metrics (precision is computed for the positive class, label 1)
    acc = accuracy_score(y_test, y_pred)
    cf = confusion_matrix(y_test, y_pred)
    prec = precision_score(y_test, y_pred)

    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print("Confusion Matrix:")
    print(cf)


In [None]:
# Import libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score

# 1. Prepare data
# The vectorizer expects raw strings, so glue each token list back together
X = data['transformed_text'].apply(lambda x: ' '.join(x))
y = data['label']

# 2. Train-test split on the raw text, BEFORE vectorizing.
# Fitting TF-IDF on all rows would let the test messages shape the vocabulary
# and IDF weights (data leakage) and make the reported metrics optimistic.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3. Convert text to TF-IDF feature vectors: fit on train, only transform test
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Dense copies for GaussianNB, derived from the already-split matrices so they
# cannot drift out of step with y_train / y_test
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# 4. Initialize models
models = {
    "MultinomialNB": MultinomialNB(),
    "BernoulliNB": BernoulliNB(),
    "GaussianNB": GaussianNB()
}

# 5. Train and evaluate
for name, model in models.items():
    print(f"\n--- {name} ---")

    # Use dense input for GaussianNB; the other models take the sparse matrix
    if name == "GaussianNB":
        model.fit(X_train_dense, y_train)
        y_pred = model.predict(X_test_dense)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    # Metrics (precision is computed for the positive class, label 1)
    acc = accuracy_score(y_test, y_pred)
    cf = confusion_matrix(y_test, y_pred)
    prec = precision_score(y_test, y_pred)

    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print("Confusion Matrix:")
    print(cf)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report

# ------------- Prepare data ----------------
# One space-separated string per message, plus the encoded labels
X = data['transformed_text'].map(' '.join)
y = data['label']

# Hold out 20% of the messages; the split happens on raw text so the
# vectorizer below only ever learns from the training part
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Unigram + bigram TF-IDF capped at 5000 features: fit on train, reuse on test
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Dense copies for models that need dense input
X_train_dense, X_test_dense = (
    sparse_matrix.toarray() for sparse_matrix in (X_train_tfidf, X_test_tfidf)
)

# ------------- Define models ----------------
# Display name -> unfitted estimator. The name becomes the "Model" entry of the
# results table, and "GaussianNB" is the key the training loop checks to decide
# whether to feed dense arrays. random_state is pinned where it is passed so
# those estimators give repeatable results.
models = {
    "MultinomialNB": MultinomialNB(),
    "BernoulliNB": BernoulliNB(),
    "GaussianNB": GaussianNB(),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "ExtraTrees": ExtraTreesClassifier(n_estimators=100, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=500),
    "SVC": SVC(kernel='linear', probability=True),
    "KNeighbors": KNeighborsClassifier(),
    "DecisionTree": DecisionTreeClassifier(random_state=42)
}

# ------------- Train & evaluate ----------------
# One record per model: its name plus accuracy and precision on the test split
results = []

for name, model in models.items():
    # GaussianNB gets the dense arrays; every other model the sparse TF-IDF matrices
    needs_dense = name == "GaussianNB"
    train_features = X_train_dense if needs_dense else X_train_tfidf
    test_features = X_test_dense if needs_dense else X_test_tfidf

    model.fit(train_features, y_train)
    y_pred = model.predict(test_features)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    results.append({"Model": name, "Accuracy": acc, "Precision": prec})

# Tabulate the per-model metrics collected by the training loop
import pandas as pd
results_df = pd.DataFrame(results)

# Five models with the highest precision, best first
ranked_by_precision = results_df.sort_values(by="Precision", ascending=False)
top5_precision = ranked_by_precision.head(5)
print("Top 5 Models Based on Precision:\n", top5_precision)

# Full confusion matrix and per-class report for each of those five.
# The models are already fitted, so this only re-runs prediction.
for name in top5_precision['Model']:
    print(f"\n--- {name} ---")
    test_features = X_test_dense if name == "GaussianNB" else X_test_tfidf
    y_pred = models[name].predict(test_features)
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))


In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report

# (name, estimator) pairs taking part in the vote. Both tree ensembles share
# the same size and seed.
forest_options = dict(n_estimators=200, random_state=42)
top_models = [
    ('MultinomialNB', MultinomialNB()),
    ('BernoulliNB', BernoulliNB()),
    ('ExtraTrees', ExtraTreesClassifier(**forest_options)),
    ('RandomForest', RandomForestClassifier(**forest_options)),
]

# Soft-voting ensemble over those estimators, trained on the TF-IDF training matrix
ensemble = VotingClassifier(estimators=top_models, voting='soft')
ensemble.fit(X_train_tfidf, y_train)

# Score the held-out messages
y_pred = ensemble.predict(X_test_tfidf)

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
cf = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Report every metric under its heading
print("Ensemble Model Metrics:")
for heading, value in (
    ("Accuracy:", acc),
    ("Precision:", prec),
    ("Confusion Matrix:\n", cf),
    ("Classification Report:\n", report),
):
    print(heading, value)


In [None]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.ensemble import ExtraTreesClassifier

# First-level estimators as (name, estimator) pairs. The forests share one
# size and seed, which the meta-model below reuses.
forest_options = dict(n_estimators=200, random_state=42)
base_models = [
    ('MultinomialNB', MultinomialNB()),
    ('BernoulliNB', BernoulliNB()),
    ('ExtraTrees', ExtraTreesClassifier(**forest_options)),
    ('RandomForest', RandomForestClassifier(**forest_options)),
]

# Second-level learner: a random forest
meta_model = RandomForestClassifier(**forest_options)

# Stacking with 5-fold CV; passthrough=True hands the original features to the
# meta-model alongside the base-model predictions
stacking_ensemble_rf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    passthrough=True,
)
stacking_ensemble_rf.fit(X_train_tfidf, y_train)

# Score the held-out messages
y_pred_stack_rf = stacking_ensemble_rf.predict(X_test_tfidf)

acc_stack_rf = accuracy_score(y_test, y_pred_stack_rf)
prec_stack_rf = precision_score(y_test, y_pred_stack_rf)
cf_stack_rf = confusion_matrix(y_test, y_pred_stack_rf)
report_stack_rf = classification_report(y_test, y_pred_stack_rf)

# Report every metric under its heading
print("Stacking Ensemble with RandomForest as Meta-Model:")
for heading, value in (
    ("Accuracy:", acc_stack_rf),
    ("Precision:", prec_stack_rf),
    ("Confusion Matrix:\n", cf_stack_rf),
    ("Classification Report:\n", report_stack_rf),
):
    print(heading, value)
