In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score
import pickle
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from imblearn.over_sampling import SMOTE

In [None]:
# Load data
# Read the raw CSV into a DataFrame; the source path and its text encoding
# are named so they are easy to spot and adjust.
csv_path = '/content/spam.csv'
csv_encoding = 'latin-1'
data = pd.read_csv(csv_path, encoding=csv_encoding)

In [None]:
# Print column names to verify
# Inspect the header so the positional renaming done in the next step can be
# checked against the actual file layout.
column_index = data.columns
print(column_index)

In [None]:
# Adjust renaming based on actual column names
# The first column is renamed to "label" and the second to "message",
# whatever their original headers were; all other columns are dropped.
data = data.rename(columns={data.columns[0]: "label", data.columns[1]: "message"})
# .copy() detaches the two-column frame from the frame it was selected from,
# so the column assignments made in later cells write to an independent
# DataFrame and do not trigger pandas' SettingWithCopyWarning.
data = data[["label", "message"]].copy()

In [None]:
# Map labels to binary values
label_codes = {'ham': 0, 'spam': 1}

# Re-running this cell must not destroy the labels: pushing an already
# encoded 0/1 column through the dict would turn every value into NaN.
# Only encode when the column is not yet made up purely of the target codes.
if not data['label'].isin(list(label_codes.values())).all():
    encoded = data['label'].map(label_codes)

    # Anything that is not a key of label_codes maps to NaN. Fail here, with
    # the offending values in the message, rather than much later inside the
    # resampling / training steps.
    unknown = data.loc[encoded.isna(), 'label'].unique()
    if len(unknown) > 0:
        raise ValueError(f"Unexpected label values: {list(unknown)!r}")

    data['label'] = encoded.astype('int64')

In [None]:
# Preprocessing
nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# Build the stop-word lookup once, after the corpus has been downloaded.
# Calling stopwords.words('english') inside the per-token filter would
# rebuild the word list for every single token and then scan it linearly;
# a frozenset gives the same membership answers with a constant-time lookup.
STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalise one message into a space-separated string of lemmas.

    Steps, in order:
      1. replace every non-word character (regex ``\\W``) with a space,
      2. lower-case the text,
      3. collapse runs of whitespace into a single space,
      4. drop tokens found in NLTK's English stop-word list,
      5. lemmatize the remaining tokens with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        The raw message. ``None`` or a float NaN (how pandas represents a
        missing cell) is treated as an empty message.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` when nothing
        survives the filtering or the input is missing.
    """
    # A missing cell would otherwise make re.sub raise TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = re.sub(r'\W', ' ', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    kept_lemmas = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in STOP_WORDS
    ]
    return ' '.join(kept_lemmas)


data['processed_message'] = data['message'].apply(preprocess_text)

In [None]:
# TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2), stop_words='english')
X = tfidf.fit_transform(data['processed_message']).toarray()
y = data['label']

In [None]:
# Balance dataset using SMOTE
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

In [None]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Gradient Boosting Model
# Hyper-parameters are gathered in one mapping so they can be read and tuned
# in a single place before the classifier is built and fitted.
boosting_params = {
    'n_estimators': 200,
    'learning_rate': 0.1,
    'max_depth': 5,
    'random_state': 42,
}
model = GradientBoostingClassifier(**boosting_params)
model.fit(X_train, y_train)

In [None]:
# Evaluate model
# Predict on the held-out features, then score the predictions against the
# held-out labels with both summary metrics and a per-class report.
y_pred = model.predict(X_test)
accuracy, f1 = (scorer(y_test, y_pred) for scorer in (accuracy_score, f1_score))
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Save model and vectorizer
# The fitted vectorizer is stored next to the model because new messages must
# be transformed with the same vocabulary the model was trained on.
# Context managers guarantee each file is flushed and closed as soon as the
# dump finishes (or fails), instead of leaving the handle open until garbage
# collection.
with open('spam.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)