In [None]:
import pandas as pd

# Read the raw CSV, keep only the 'Category' and 'Message' columns, and
# give them the shorter names used by the rest of the notebook
# ('Category' -> 'label', 'Message' -> 'text').
df = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .loc[:, ['Category', 'Message']]
    .rename(columns={'Category': 'label', 'Message': 'text'})
)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# --- Label encoding ---------------------------------------------------------
# LabelEncoder maps each distinct class name to an integer code, assigned in
# sorted order of the class names (per the original note: ham = 0, spam = 1).
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])  # spam = 1, ham = 0

# --- Train / test split -----------------------------------------------------
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# --- TF-IDF features --------------------------------------------------------
# English stop words are removed and any term occurring in more than 70% of
# the training documents is ignored. The vocabulary and IDF weights are
# learned from the training split only and then re-used to project the test
# split, so no test-set information leaks into the features.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.7,
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Logistic Regression with default hyperparameters: fit on the TF-IDF
# training matrix, then predict labels for the held-out test matrix.
lr_model = LogisticRegression().fit(X_train_tfidf, y_train)
lr_pred = lr_model.predict(X_test_tfidf)

# Header and per-class precision/recall/F1 report, then overall accuracy.
print("Logistic Regression:", classification_report(y_test, lr_pred), sep="\n")
print("Accuracy:", accuracy_score(y_test, lr_pred))


Logistic Regression:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       1.00      0.76      0.86       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115

Accuracy: 0.967713004484305


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest baseline: 100 trees, seeded so the result is reproducible.
# Fit on the TF-IDF training matrix, then predict the held-out test matrix.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train_tfidf, y_train)
rf_pred = rf_model.predict(X_test_tfidf)

# Header and per-class precision/recall/F1 report, then overall accuracy.
print("\nRandom Forest Classifier:", classification_report(y_test, rf_pred), sep="\n")
print("Accuracy:", accuracy_score(y_test, rf_pred))



Random Forest Classifier:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.84      0.91       149

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Accuracy: 0.97847533632287


In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with 50 boosting rounds, seeded so the result is reproducible.
# Fit on the TF-IDF training matrix, then predict the held-out test matrix.
ab_model = AdaBoostClassifier(
    n_estimators=50,
    random_state=42,
).fit(X_train_tfidf, y_train)
ab_pred = ab_model.predict(X_test_tfidf)

# Header and per-class precision/recall/F1 report, then overall accuracy.
print("\nAdaBoost Classifier:", classification_report(y_test, ab_pred), sep="\n")
print("Accuracy:", accuracy_score(y_test, ab_pred))



AdaBoost Classifier:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       966
           1       0.95      0.83      0.89       149

    accuracy                           0.97      1115
   macro avg       0.96      0.91      0.94      1115
weighted avg       0.97      0.97      0.97      1115

Accuracy: 0.9721973094170404


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with k = 5: fit on the TF-IDF training matrix, then
# predict labels for the held-out test matrix.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train_tfidf, y_train)
knn_pred = knn_model.predict(X_test_tfidf)

# Header and per-class precision/recall/F1 report, then overall accuracy.
print("\nK-Nearest Neighbors:", classification_report(y_test, knn_pred), sep="\n")
print("Accuracy:", accuracy_score(y_test, knn_pred))



K-Nearest Neighbors:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95       966
           1       1.00      0.35      0.52       149

    accuracy                           0.91      1115
   macro avg       0.95      0.67      0.73      1115
weighted avg       0.92      0.91      0.89      1115

Accuracy: 0.9130044843049328


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter search space for the random forest.
#
# NOTE: 'auto' is intentionally absent from max_features. For
# RandomForestClassifier it was only an alias for 'sqrt'; it was deprecated in
# scikit-learn 1.1 and removed in 1.3. Keeping it either raised a FutureWarning
# and refit a duplicate of every 'sqrt' candidate (wasting a third of the
# search), or, on current releases, made those candidates fail parameter
# validation. Dropping it leaves the effective search space unchanged.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 20, 30, None],
    'criterion': ['gini', 'entropy']
}

# Exhaustive search over every combination in param_grid using 5-fold
# cross-validation on the training split only. No `scoring` is given, so
# candidates are ranked by the estimator's default score (mean accuracy).
# n_jobs=-1 runs the fits on all available cores; verbose=2 logs each fit.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(X_train_tfidf, y_train)

# Best parameters and best model. With the default refit=True,
# best_estimator_ has already been refit on the full training split using the
# winning parameter combination.
print("Best parameters found: ", grid_search.best_params_)
best_rf_model = grid_search.best_estimator_

# Evaluate the best model on the held-out test split, which played no part
# in the cross-validated search above.
best_rf_pred = best_rf_model.predict(X_test_tfidf)
print("\nTuned Random Forest Classifier:")
print(classification_report(y_test, best_rf_pred))
print("Accuracy:", accuracy_score(y_test, best_rf_pred))


Fitting 5 folds for each of 72 candidates, totalling 360 fits


  warn(


Best parameters found:  {'criterion': 'entropy', 'max_depth': None, 'max_features': 'auto', 'n_estimators': 200}

Tuned Random Forest Classifier:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.84      0.91       149

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Accuracy: 0.97847533632287
