# Explore here

In [None]:
import pandas as pd
df = pd.read_csv('url_spam.csv')
df.head()


Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,True
1,https://www.hvper.com/,True
2,https://briefingday.com/m/v4n3i4f3,True
3,https://briefingday.com/n/20200618/m#commentform,False
4,https://briefingday.com/fan,True


In [11]:
print(df['is_spam'].value_counts())

is_spam
False    2303
True      696
Name: count, dtype: int64


In [None]:


import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_url(url):
    
    tokens = re.split(r'\W+', url)
    
    
    tokens = [token.lower() for token in tokens if token != '']
    
    
    tokens = [t for t in tokens if t not in stop_words]
    
    
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    
    
    return " ".join(tokens)

df['processed_url'] = df['url'].apply(preprocess_url)
print(df[['url','processed_url']].head())


[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data] Downloading package omw-1.4 to /home/vscode/nltk_data...


                                                 url  \
0  https://briefingday.us8.list-manage.com/unsubs...   
1                             https://www.hvper.com/   
2                 https://briefingday.com/m/v4n3i4f3   
3   https://briefingday.com/n/20200618/m#commentform   
4                        https://briefingday.com/fan   

                                      processed_url  
0  http briefingday us8 list manage com unsubscribe  
1                                http www hvper com  
2                     http briefingday com v4n3i4f3  
3       http briefingday com n 20200618 commentform  
4                          http briefingday com fan  


In [None]:

from sklearn.model_selection import train_test_split

X = df['url']
y = df['is_spam']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report


pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC())
])


pipeline.fit(X_train, y_train)


y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

       False       0.96      0.96      0.96       461
        True       0.86      0.87      0.87       139

    accuracy                           0.94       600
   macro avg       0.91      0.91      0.91       600
weighted avg       0.94      0.94      0.94       600



In [None]:


from sklearn.model_selection import GridSearchCV

param_grid = {
    'tfidf__ngram_range': [(1,1), (1,2)],
    'tfidf__max_df': [0.75, 1.0],
    'svm__C': [0.1, 1, 10],
    'svm__kernel': ['linear', 'rbf'],
    'svm__gamma': ['scale', 'auto']
}

grid = GridSearchCV(pipeline, param_grid, cv=3, verbose=2, n_jobs=-1)
grid.fit(X_train, y_train)

print("Mejores parámetros:", grid.best_params_)


y_pred_best = grid.predict(X_test)
print(classification_report(y_test, y_pred_best))


Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] END svm__C=0.1, svm__gamma=scale, svm__kernel=linear, tfidf__max_df=0.75, tfidf__ngram_range=(1, 1); total time=   0.3s
[CV] END svm__C=0.1, svm__gamma=scale, svm__kernel=linear, tfidf__max_df=0.75, tfidf__ngram_range=(1, 1); total time=   0.3s
[CV] END svm__C=0.1, svm__gamma=scale, svm__kernel=linear, tfidf__max_df=0.75, tfidf__ngram_range=(1, 1); total time=   0.3s
[CV] END svm__C=0.1, svm__gamma=scale, svm__kernel=linear, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2); total time=   0.5s
[CV] END svm__C=0.1, svm__gamma=scale, svm__kernel=linear, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2); total time=   0.5s
[CV] END svm__C=0.1, svm__gamma=scale, svm__kernel=linear, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2); total time=   0.5s
[CV] END svm__C=0.1, svm__gamma=scale, svm__kernel=linear, tfidf__max_df=1.0, tfidf__ngram_range=(1, 1); total time=   0.4s
[CV] END svm__C=0.1, svm__gamma=scale, svm__kernel=linear, tfidf

In [None]:


import joblib

joblib.dump(grid.best_estimator_, "url_spam_svm_model.joblib")



['url_spam_svm_model.joblib']