# Step 1: Load and Explore the Data

In [2]:
import pandas as pd

# Load the dataset
df = pd.read_csv('email.csv')  # Adjust filename if different

# Display basic info
print(df.info())
print(df.head())

# Check class distribution
print("\nClass distribution:")
print(df['Category'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5573 entries, 0 to 5572
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5573 non-null   object
 1   Message   5573 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None
  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...

Class distribution:
Category
ham               4825
spam               747
{"mode":"full"       1
Name: count, dtype: int64


# Step 2: Data Preprocessing

In [3]:
print(df['Category'].unique())

['ham' 'spam' '{"mode":"full"']


In [4]:
df['Category'] = df['Category'].str.strip().str.lower()

In [5]:
df = df[df['Category'].isin(['ham', 'spam'])]


In [6]:
print(df['Category'].unique())

['ham' 'spam']


In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize
    words = nltk.word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    # Stemming
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    
    return ' '.join(words)

# Apply preprocessing
df['processed_text'] = df['Message'].apply(preprocess_text)

# Convert labels to numerical values (0 for ham, 1 for spam)
df['label_num'] = df['Category'].map({'ham': 0, 'spam': 1})

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
print(df['label_num'].isnull().sum()) 

0


In [9]:
print(df['Category'].value_counts())

Category
ham     4825
spam     747
Name: count, dtype: int64


# Step 3: Feature Extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')

X = vectorizer.fit_transform(df['processed_text']).toarray()
# Convert labels to numerical values (0 for ham, 1 for spam)
y = df['label_num']




# Step 4: Train-Test Split

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Step 5: Model Training

In [15]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

model.fit(X_train, y_train)


# Step 6: Model Evaluation

In [16]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9721973094170404

Confusion Matrix:
[[966   0]
 [ 31 118]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       1.00      0.79      0.88       149

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.93      1115
weighted avg       0.97      0.97      0.97      1115



# Step 7: Export Model (Bonus 1)

In [17]:
import joblib

joblib.dump(model, 'spam_classifier.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

example

In [21]:
sample_spam = "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 8"
features = vectorizer.transform([sample_spam])
print(model.predict(features))  # output [1]


[1]
