In [1]:
# Combined imports
import pandas as pd
import regex as re
from nltk import download
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from pickle import dump

# Load and preprocess data
total_data = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv")
total_data["is_spam"] = total_data["is_spam"].astype(int)
total_data.drop_duplicates(inplace=True)
total_data.reset_index(drop=True, inplace=True)

# Preprocessing function
def preprocess_text(text):
    text = re.sub(r'[^a-z ]', " ", text)  # Remove non-letter characters
    text = re.sub(r'\\s+', " ", text.lower())  # Remove extra spaces and lowercase
    return text.split()

# Apply preprocessing
total_data["url"] = total_data["url"].apply(preprocess_text)

# Lemmatization and stopwords deletion
download("wordnet")
download("stopwords")
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))  # Use a set for faster lookup

def lemmatize_text(words):
    return [lemmatizer.lemmatize(word) for word in words if word not in stop_words and len(word) > 3 and word != "http"]

total_data["url"] = total_data["url"].apply(lemmatize_text)

# Feature extraction with TF-IDF
tokens_list = [" ".join(tokens) for tokens in total_data["url"]]
vectorizer = TfidfVectorizer(max_features=5000, max_df=0.8, min_df=5)
X = vectorizer.fit_transform(tokens_list).toarray()
y = total_data["is_spam"]

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = SVC(kernel="linear", random_state=42)
model.fit(X_train, y_train)

# Model prediction and evaluation
y_pred = model.predict(X_test)
initial_accuracy = accuracy_score(y_test, y_pred)
print(f"Initial Model Accuracy: {initial_accuracy}")

# Hyperparameter Optimization
hyperparams = {
    "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "degree": [1, 2, 3, 4, 5],
    "gamma": ["scale", "auto"],
    "random_state": [42],
}

# Initialize hyperparameter search
grid = GridSearchCV(model, hyperparams, scoring="accuracy", cv=5)
grid.fit(X_train, y_train)

# Training the model with optimized hyperparameters
opt_model = SVC(**grid.best_params_)
opt_model.fit(X_train, y_train)

# Predict and evaluate with the optimized model
opt_y_pred = opt_model.predict(X_test)
optimized_accuracy = accuracy_score(y_test, opt_y_pred)
print(f"Optimized Model Accuracy: {optimized_accuracy}")

# Model saving (Path needs to be verified)
# dump(opt_model, open("/models/optimized_svm_classifier.sav", "wb"))


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Initial Model Accuracy: 0.9514767932489452
Optimized Model Accuracy: 0.959915611814346
