# Explore here

In [6]:
import pandas as pd

# Load the dataset from the provided URL
url = 'https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv'
df = pd.read_csv(url)

# Check the first few rows of the dataset
df.head()


Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,True
1,https://www.hvper.com/,True
2,https://briefingday.com/m/v4n3i4f3,True
3,https://briefingday.com/n/20200618/m#commentform,False
4,https://briefingday.com/fan,True


In [10]:
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

# Download stopwords from NLTK
nltk.download('stopwords')
stop_words = stopwords.words('english')

# Function to preprocess URLs
def preprocess_url(url):
    # Convert to lowercase and remove special characters
    url = re.sub(r'\W+', ' ', url.lower())
    # Tokenize and remove stopwords
    tokens = [word for word in url.split() if word not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing to the 'url' column
df['processed_url'] = df['url'].apply(preprocess_url)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ignaciovelutini/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
from sklearn.model_selection import train_test_split

# Split the dataset, using 'is_spam' as the label
X_train, X_test, y_train, y_test = train_test_split(df['processed_url'], df['is_spam'], test_size=0.2, random_state=42)


In [12]:
# Convert URLs into numerical features using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [13]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Create and train the SVM model
model = SVC()
model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Initial model accuracy: {accuracy}')


Initial model accuracy: 0.9466666666666667


In [14]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'poly', 'rbf']
}

# Perform grid search
grid = GridSearchCV(SVC(), param_grid)
grid.fit(X_train_tfidf, y_train)

# Get the best parameters
print(f"Best hyperparameters: {grid.best_params_}")


Best hyperparameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}


In [15]:
# Train the optimized model with the best parameters
opt_model = SVC(C=1000, degree=1, gamma="auto", kernel="poly", random_state=42)
opt_model.fit(X_train_tfidf, y_train)

# Make predictions with the optimized model
y_pred_opt = opt_model.predict(X_test_tfidf)

# Evaluate the optimized model
accuracy_opt = accuracy_score(y_test, y_pred_opt)
print(f'Optimized model accuracy: {accuracy_opt}')


Optimized model accuracy: 0.8966666666666666


In [16]:
import os
import joblib

# Create the '../models' directory if it doesn't exist
if not os.path.exists('../models'):
    os.makedirs('../models')

# Save the optimized model to the '../models' folder
joblib.dump(opt_model, '../models/svm_spam_detector_model.pkl')


['../models/svm_spam_detector_model.pkl']