In [2]:
import pandas as pd

# Load the dataset
file_path = "updated_sms.csv"
df = pd.read_csv(file_path, encoding="utf-8")  # Using UTF-8 encoding

# Display basic info
df.info()

# Show first few rows
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6064 entries, 0 to 6063
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    6064 non-null   object
 1   message  6059 non-null   object
dtypes: object(2)
memory usage: 94.9+ KB


Unnamed: 0,label,message
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in a wkly comp to win fa cup final...
3,0,u dun say so early hor u c already then say
4,0,nah i dont think he goes to usf he lives aroun...


In [3]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download stopwords if not available
nltk.download("stopwords")

# Initialize stemmer
ps = PorterStemmer()

# Define preprocessing function
def preprocess_text(text):
    if not isinstance(text, str):  # Ensure text is valid
        return ""
    
    text = text.lower()  # Convert to lowercase
    text = ''.join([char for char in text if char not in string.punctuation])  # Remove punctuation
    words = text.split()
    words = [word for word in words if word not in stopwords.words('english')]  # Remove stopwords
    words = [ps.stem(word) for word in words if len(word) > 2]  # Apply stemming & remove short words
    
    return ' '.join(words) if words else "empty"  # Ensure non-empty output

# Apply preprocessing
df["message"] = df["message"].apply(preprocess_text)

# Show results
df.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/krishnam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,label,message
0,0,jurong point crazi avail bugi great world buff...
1,0,lar joke wif oni
2,1,free entri wkli comp win cup final tkt may tex...
3,0,dun say earli hor alreadi say
4,0,nah dont think goe usf live around though


In [4]:
# Define a custom stopwords list (common English stopwords)
custom_stopwords = set([
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", 
    "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", 
    "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", 
    "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", 
    "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", 
    "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", 
    "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", 
    "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", 
    "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", 
    "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", 
    "don", "should", "now"
])

# Update preprocessing function to use custom stopwords
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    
    text = text.lower()  # Convert to lowercase
    text = ''.join([char for char in text if char not in string.punctuation])  # Remove punctuation
    words = text.split()
    words = [word for word in words if word not in custom_stopwords]  # Remove stopwords
    words = [ps.stem(word) for word in words if len(word) > 2]  # Apply stemming & remove short words
    
    return ' '.join(words) if words else "empty"

# Apply preprocessing again
df["message"] = df["message"].apply(preprocess_text)

# Show processed text
df.head()


Unnamed: 0,label,message
0,0,jurong point crazi avail bugi great world buff...
1,0,lar joke wif oni
2,1,free entri wkli comp win cup final tkt may tex...
3,0,dun say earli hor alreadi say
4,0,nah dont think goe usf live around though


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Transform messages into numerical format
X = vectorizer.fit_transform(df["message"])

# Extract labels
y = df["label"]

# Check shape of transformed data
X.shape, y.shape


((6064, 6961), (6064,))

In [6]:
from sklearn.model_selection import train_test_split

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Check distribution of labels in train & test sets
y_train.value_counts(normalize=True), y_test.value_counts(normalize=True)


(label
 0      0.795712
 1      0.203875
 ham    0.000412
 Name: proportion, dtype: float64,
 label
 0    0.795548
 1    0.204452
 Name: proportion, dtype: float64)

In [8]:
# Ensure y_test and y_pred are integers
y_test = y_test.astype(int)
y_pred = y_pred.astype(int)

# Recalculate performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label=1)
recall = recall_score(y_test, y_pred, pos_label=1)
f1 = f1_score(y_test, y_pred, pos_label=1)
conf_matrix = confusion_matrix(y_test, y_pred)

# Display results
accuracy, precision, recall, f1, conf_matrix


(0.9678483099752679,
 0.9771689497716894,
 0.8629032258064516,
 0.9164882226980728,
 array([[960,   5],
        [ 34, 214]]))

In [12]:
# Ensure labels are integers
y_test = y_test.astype(int)

# Loop through each alpha value and retrain
nb_results = {}
for alpha in alpha_values:
    nb_model = MultinomialNB(alpha=alpha)
    nb_model.fit(X_train, y_train)

    # Predict and convert to integers
    y_pred = nb_model.predict(X_test).astype(int)

    # Compute evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, pos_label=1)
    recall = recall_score(y_test, y_pred, pos_label=1)
    f1 = f1_score(y_test, y_pred, pos_label=1)

    # Store results
    nb_results[alpha] = (accuracy, precision, recall, f1)

# Convert results to DataFrame
nb_results_df = pd.DataFrame(nb_results, index=["Accuracy", "Precision", "Recall", "F1"]).T
nb_results_df


Unnamed: 0,Accuracy,Precision,Recall,F1
0.1,0.972795,0.928287,0.939516,0.933868
0.5,0.974444,0.965665,0.907258,0.935551
1.0,0.967848,0.977169,0.862903,0.916488
2.0,0.945589,0.989247,0.741935,0.847926
5.0,0.892828,1.0,0.475806,0.644809


In [19]:
# Reinitialize and fit LabelEncoder on the entire dataset (y)
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)  # Fit on full labels

# Split the dataset again to keep consistency
y_train_encoded, y_test_encoded = train_test_split(y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# Train Random Forest again with the corrected labels
rf_model.fit(X_train, y_train_encoded)
y_pred_rf = rf_model.predict(X_test)

# Recalculate performance metrics using "weighted" average to handle class imbalance better
accuracy_rf = accuracy_score(y_test_encoded, y_pred_rf)
precision_rf = precision_score(y_test_encoded, y_pred_rf, average="weighted")
recall_rf = recall_score(y_test_encoded, y_pred_rf, average="weighted")
f1_rf = f1_score(y_test_encoded, y_pred_rf, average="weighted")

# Display results
accuracy_rf, precision_rf, recall_rf, f1_rf




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


(0.9604286892003298,
 0.9610461806864413,
 0.9604286892003298,
 0.9599885737346161)

In [21]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Define a stratified K-Fold to balance class distribution
cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)  # Reduce splits to 2

# Define parameter grid for tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

# Perform Grid Search with balanced CV
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=cv, scoring='f1_weighted', n_jobs=-1)
grid_search.fit(X_train, y_train_encoded)

# Print best parameters
print("Best Hyperparameters:", grid_search.best_params_)


Best Hyperparameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}


In [22]:
# Train optimized Random Forest model
optimized_rf = RandomForestClassifier(
    max_depth=None,
    min_samples_split=5,
    n_estimators=200,
    random_state=42,
    class_weight="balanced"
)

optimized_rf.fit(X_train, y_train_encoded)

# Make predictions
y_pred_optimized = optimized_rf.predict(X_test)

# Evaluate performance
accuracy_opt = accuracy_score(y_test_encoded, y_pred_optimized)
precision_opt = precision_score(y_test_encoded, y_pred_optimized, average="weighted")
recall_opt = recall_score(y_test_encoded, y_pred_optimized, average="weighted")
f1_opt = f1_score(y_test_encoded, y_pred_optimized, average="weighted")

# Display final results
print("Final Optimized Random Forest Performance:")
print("✅ Accuracy:", accuracy_opt)
print("✅ Precision:", precision_opt)
print("✅ Recall:", recall_opt)
print("✅ F1 Score:", f1_opt)


Final Optimized Random Forest Performance:
✅ Accuracy: 0.9744435284418796
✅ Precision: 0.9755211161901838
✅ Recall: 0.9744435284418796
✅ F1 Score: 0.9743774343100848


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [23]:
import joblib
joblib.dump(optimized_rf, "optimized_sms_spam_model.pkl")
print("Model saved as optimized_sms_spam_model.pkl")


Model saved as optimized_sms_spam_model.pkl


In [26]:
# Find misclassified indices
misclassified_idx = (y_test_encoded != y_pred_optimized)

# Extract test set indices
test_indices = y_test.index  # Get the correct test set indices

# Extract misclassified messages
misclassified_messages = df.loc[test_indices[misclassified_idx], "message"]

# Display misclassified messages
print("Misclassified Messages:\n", misclassified_messages)


Misclassified Messages:
 5615          win allexpensespaid trip vega enter contest
731     email alertfrom jeri stewart kbsubject lowcost...
751     realiz year well thousand old ladi run around ...
2863                          adult content video shortli
3530    xma new year eve ticket sale club day till thu...
5752                  packag await clearanc pay fee relea
5811                           packag ship track deliveri
2699                                            lost help
3981                                            ringtonek
1154    girl mani local virgin readi fil everi sexual ...
3360                        sorri miss call let talk time
4598                                  full heat appli oil
2823    romcapspam everyon around respond well presenc...
5468    httptm widelivecomindex wmlidadafirsttrueåác r...
1458    clair havin borin time alon wanna cum nite cha...
5737               packag wait post offic pay fee collect
4425                       updat face book stat

In [27]:
import joblib

# Save the trained model
joblib.dump(optimized_rf, "optimized_spam_classifier.pkl")

# Save the TF-IDF vectorizer too
joblib.dump(vectorizer, "optimized tfidf_vectorizer.pkl")

print("✅ Model and vectorizer saved successfully!")


✅ Model and vectorizer saved successfully!
