 # Task 2: Stratified K-Fold for Imbalanced Data

In [9]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# 1. Load Data — the file has no headers, so add your own after reading
df = pd.read_csv(
    r"C:\Users\SMART TECH\Desktop\AppliedNLPMaterial-master\SMSSpamCollection",
    sep='\t', 
    encoding='latin-1',
    header=None  # Important: no header in file
)

# Rename the columns manually
df.columns = ['label', 'text']

# Convert labels to binary: ham = 0, spam = 1
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# 2. Feature Extraction
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text'])
y = df['label'].values

# Just to confirm it works
print("Shape of X:", X.shape)
print("First 5 labels:", y[:5])
df.head()


Shape of X: (5572, 8749)
First 5 labels: [0 0 1 0 0]


Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."



 # 2. Apply Stratified K-Fold (K=5)

In [13]:
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import numpy as np


In [15]:
# 5-Fold Stratified Cross-Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold = 1
accuracies = []

print("\n=== Stratified K-Fold ===")


=== Stratified K-Fold ===


In [17]:
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Print class distribution in this fold
    unique, counts = np.unique(y_test, return_counts=True)
    print(f"\nFold {fold} - Class Distribution:", dict(zip(unique, counts)))

    # Train model
    clf = MultinomialNB()
    clf.fit(X_train, y_train)

    # Evaluate
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    accuracies.append(acc)
    print(f"Fold {fold} Accuracy: {acc:.4f}")
    fold += 1

print(f"\nAverage Accuracy over 5 folds: {np.mean(accuracies):.4f}")


Fold 1 - Class Distribution: {0: 965, 1: 150}
Fold 1 Accuracy: 0.9659

Fold 2 - Class Distribution: {0: 965, 1: 150}
Fold 2 Accuracy: 0.9605

Fold 3 - Class Distribution: {0: 965, 1: 149}
Fold 3 Accuracy: 0.9551

Fold 4 - Class Distribution: {0: 965, 1: 149}
Fold 4 Accuracy: 0.9578

Fold 5 - Class Distribution: {0: 965, 1: 149}
Fold 5 Accuracy: 0.9623

Average Accuracy over 5 folds: 0.9603


 # Task 3: Compare Train-Test Split vs K-Fold

In [20]:
from sklearn.model_selection import train_test_split, cross_val_score

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

clf_split = MultinomialNB()
clf_split.fit(X_train, y_train)
y_pred_split = clf_split.predict(X_test)
split_accuracy = accuracy_score(y_test, y_pred_split)

print("\n=== Train-Test Split ===")
print(f"Accuracy (80/20 split): {split_accuracy:.4f}")

# Cross-Validation (5-Fold)
clf_cv = MultinomialNB()
cv_scores = cross_val_score(clf_cv, X, y, cv=5, scoring='accuracy')

print("\n=== 5-Fold Cross-Validation ===")
print("Fold Accuracies:", cv_scores)
print(f"Mean CV Accuracy: {np.mean(cv_scores):.4f}")



=== Train-Test Split ===
Accuracy (80/20 split): 0.9605

=== 5-Fold Cross-Validation ===
Fold Accuracies: [0.96502242 0.95426009 0.95780969 0.9551167  0.96229803]
Mean CV Accuracy: 0.9589
