<a href="https://colab.research.google.com/github/Headyhadia/Machine-Learning-Practice/blob/main/EmailSpamClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **TASK 1: Email Spam Classification (Text-Based ML Task)**

# 1. Import Libraries

In [None]:
# Import libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# 2. Load and Clean Dataset

In [None]:
# 2. Load and Clean Dataset

# Load the dataset from the notebook's working directory.
try:
    raw_df = pd.read_csv('spam.csv', encoding='latin-1')
except FileNotFoundError as exc:
    # Fail loudly here: printing and continuing would only surface later as a
    # confusing NameError (or silently reuse a stale `df` from a previous run).
    raise FileNotFoundError(
        "Error: 'spam.csv' not found. Please ensure the file is in the notebook directory."
    ) from exc

# Remove unnecessary columns and rename useful ones.
# errors='ignore' keeps this working when the placeholder columns are absent;
# the rename below still requires exactly two remaining columns.
df = raw_df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
df.columns = ['label', 'message']

# Encode 'spam' as 1 and 'ham' as 0
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})

# .map() turns any label other than 'ham'/'spam' into NaN; catch that here
# rather than letting it reach model training.
unmapped = df['label_num'].isna()
if unmapped.any():
    raise ValueError(
        f"Unexpected label values: {sorted(df.loc[unmapped, 'label'].astype(str).unique())}"
    )

# Display basic info
print(f"Dataset Shape: {df.shape}")
print(df.head())
print("\nClass Distribution:\n", df['label'].value_counts())

Dataset Shape: (5572, 3)
  label                                            message  label_num
0   ham  Go until jurong point, crazy.. Available only ...          0
1   ham                      Ok lar... Joking wif u oni...          0
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...          1
3   ham  U dun say so early hor... U c already then say...          0
4   ham  Nah I don't think he goes to usf, he lives aro...          0

Class Distribution:
 label
ham     4825
spam     747
Name: count, dtype: int64


# 3. Feature and Target Definition

In [None]:
# 3. Feature and Target Definition

# The message text is the model input (X); the numeric label is the target (y).
X, y = df['message'], df['label_num']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 4. Vectorization (TF-IDF)

In [None]:
# 4. Vectorization (TF-IDF)

# Build the TF-IDF vectorizer: lowercase the text and drop English stop words.
tfidf = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
)

# Learn the vocabulary and IDF weights from the training messages only ...
X_train_tfidf = tfidf.fit_transform(X_train)
# ... then reuse that fitted vectorizer on the held-out messages.
X_test_tfidf = tfidf.transform(X_test)

# 5. Model Training and Evaluation

In [None]:
# 5. Model Training and Evaluation

# Fit a Multinomial Naive Bayes classifier on the TF-IDF training features
# (fit() returns the fitted estimator, so construction and training chain).
model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict labels for the held-out test messages.
y_pred = model.predict(X_test_tfidf)

# Overall accuracy, followed by the per-class precision/recall/F1 report.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("\nDetailed Classification Report:\n")
report = classification_report(y_test, y_pred, target_names=['Ham', 'Spam'])
print(report)

Model Accuracy: 96.68%

Detailed Classification Report:

              precision    recall  f1-score   support

         Ham       0.96      1.00      0.98       965
        Spam       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115



# 6. Sample Predictions


In [None]:
# 6. Sample Predictions

# Human-readable names for the encoded classes, and how many rows to preview.
label_names = {0: 'Ham', 1: 'Spam'}
n_preview = 5

# Take the first few test messages with their true and predicted labels.
preview_messages = X_test.iloc[:n_preview].values
preview_actual = y_test.iloc[:n_preview].map(label_names).values
preview_predicted = pd.Series(y_pred[:n_preview]).map(label_names).values

# Side-by-side comparison table of actual vs. predicted labels.
results = pd.DataFrame({
    'Actual Message': preview_messages,
    'Actual Label': preview_actual,
    'Predicted Label': preview_predicted,
})

results

Unnamed: 0,Actual Message,Actual Label,Predicted Label
0,"Funny fact Nobody teaches volcanoes 2 erupt, t...",Ham,Ham
1,I sent my scores to sophas and i had to do sec...,Ham,Ham
2,We know someone who you know that fancies you....,Spam,Ham
3,Only if you promise your getting out as SOON a...,Ham,Ham
4,Congratulations ur awarded either å£500 of CD ...,Spam,Spam
