In [8]:
"""The model performs the following:
1.Load and clean the dataset.
2.Convert text into numbers using TfidfVectorizer.
3.Split data into training and testing sets.
4.Train a logistic regression model.
5.Evaluate model performance using accuracy and confusion matrix.
6.Tune the model using GridSearchCV to find the best settings.
7.Evaluate the improved model with the best settings."""



import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [9]:
# Loading dataset
df = pd.read_csv("/kaggle/input/task01/Task_1.csv")

# Handle inappropriate values
df.dropna(inplace=True)

# Check for class imbalance
class_distribution = df['labels'].value_counts()
print(class_distribution)

labels
ham     14475
spam     2241
Name: count, dtype: int64


 class imbalance is when there can be more spam messages than ham (or vice versa).

In [10]:
# Vectorizer and transforming text data
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['text'])
y = df['labels']

# Model training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
"""train_test_split Splits the data into training and testing sets. We use 80% for training and 20% for testing."""

model = LogisticRegression(class_weight='balanced',solver='saga', max_iter=2000)
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy with saga solver: {accuracy}')


Accuracy with saga solver: 0.9877392344497608


accuracy score calculates the accuracy by comparing the predicted labels (y_pred) with the actual labels (y_test)

In [11]:
# Create a confusion matrix
confusion_mat = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", confusion_mat)

# Grid search for hyperparameter tuning
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}  
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best model from GridSearchCV
best_model = grid_search.best_estimator_

# Predict with the best model
y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
print("Best Accuracy:", accuracy_best)

Confusion Matrix:
 [[2878   15]
 [  26  425]]
Best Accuracy: 0.9919258373205742


In confusion matrix a11 depicts-true positives, a12-true negatives, a21-false positives, a22- false negatives.