In [1]:
# Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

**Load the Dataset**

In [2]:
# Path of the raw CSV; being relative, it is resolved against the current working directory.
file_path = 'spam.csv'

# The file is decoded as latin-1 instead of pandas' default UTF-8.
data = pd.read_csv(
    filepath_or_buffer=file_path,
    encoding='latin-1',
)

**Clean the Dataset**

In [3]:
# Keep only the label/text columns under descriptive names, then discard
# rows with a missing value in either column.
# dropna() is chained instead of being applied with inplace=True, so the cell
# is a single expression that rebuilds data_cleaned from `data` and never
# mutates a frame after it has been created.
data_cleaned = (
    data[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
    .dropna()
)

**Preprocess the Labels**

In [4]:
# Convert labels to binary values (ham: 0, spam: 1).
# Series.map turns every value that is missing from the dict into NaN, so the
# identity entries (0 -> 0, 1 -> 1) are what make this cell safe to re-run:
# without them a second execution would map the already-encoded labels to NaN.
label_mapping = {'ham': 0, 'spam': 1, 0: 0, 1: 1}
encoded_labels = data_cleaned['label'].map(label_mapping)

# Fail fast, naming the offending values, rather than handing NaN targets to
# the train/test split and the classifier further down.
unmapped_mask = encoded_labels.isna()
if unmapped_mask.any():
    unexpected = data_cleaned.loc[unmapped_mask, 'label'].unique().tolist()
    raise ValueError(f"Unexpected label values (expected 'ham' or 'spam'): {unexpected}")

data_cleaned['label'] = encoded_labels


**Split the Dataset into Training and Testing Sets**

In [5]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data_cleaned['message'],
    data_cleaned['label'],
    random_state=42,
    test_size=0.2,
)

**Transform the Text Data Using TF-IDF**

In [6]:
# English stop words are dropped before the TF-IDF weights are computed.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Vocabulary and IDF weights are learned from the training messages only;
# the test messages are merely projected onto that fitted vocabulary.
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=X_test)

**Train a Logistic Regression Classifier with Hyperparameter Tuning**

In [7]:
# Candidate values for C, LogisticRegression's inverse regularization strength.
param_grid = {'C': [0.1, 1, 10, 100]}

# Every candidate is scored with 5-fold cross-validation on the training features.
grid_search = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X_train_tfidf, y_train)

**Use the Best Model from Grid Search**

In [8]:
# Keep a direct handle on the estimator the grid search selected.
best_model = grid_search.best_estimator_

**Predict the Labels on the Test Set**

In [9]:
# Predicted class (0 = ham, 1 = spam) for each held-out message.
y_pred = best_model.predict(X=X_test_tfidf)

**Evaluate the Model**

In [10]:
# Score the held-out predictions: per-class precision/recall/F1 plus overall accuracy.
classification_rep = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Report the selected hyperparameters first, then the two metrics.
for heading, value in (
    ("Best Parameters:", grid_search.best_params_),
    ("Accuracy:", accuracy),
    ("Classification Report:\n", classification_rep),
):
    print(heading, value)


Best Parameters: {'C': 100}
Accuracy: 0.97847533632287
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.97      0.87      0.92       150

    accuracy                           0.98      1115
   macro avg       0.97      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

