# Sistema de detección de enlaces spam

#### 0.- Instalación de los requerimientos

In [216]:
import os
import subprocess
import sys

# Install the project's pinned dependencies into the interpreter that runs
# this kernel (sys.executable), but only when the requirements file is present.
req_file = "/workspaces/Tutorial-de-Proyecto-de-NLP/requirements.txt"

if not os.path.exists(req_file):
    print(f"Archivo {req_file} no encontrado.")
else:
    pip_install_cmd = [sys.executable, "-m", "pip", "install", "-r", req_file]
    # check_call raises CalledProcessError if pip exits with a non-zero status.
    subprocess.check_call(pip_install_cmd)

Defaulting to user installation because normal site-packages is not writeable


#### 1.- Importación de librerías

In [217]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import re
import nltk
from nltk import download
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
import joblib
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from pickle import dump


#### Paso 2: Preprocesa los enlaces

In [218]:
# Step 1: load the dataset from the remote CSV and keep a local raw copy.
url = "https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv"
raw_copy_path = "/workspaces/Tutorial-de-Proyecto-de-NLP/data/raw/url_spam.csv"

df = pd.read_csv(url)
# index=False keeps the row index out of the saved file.
df.to_csv(raw_copy_path, index=False)
df.head()

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,True
1,https://www.hvper.com/,True
2,https://briefingday.com/m/v4n3i4f3,True
3,https://briefingday.com/n/20200618/m#commentform,False
4,https://briefingday.com/fan,True


In [219]:
# Encode the label as an integer: truthy values become 1, falsy values 0.
df["is_spam"] = df["is_spam"].map(bool).astype(int)
df.head()

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,1
1,https://www.hvper.com/,1
2,https://briefingday.com/m/v4n3i4f3,1
3,https://briefingday.com/n/20200618/m#commentform,0
4,https://briefingday.com/fan,1


In [220]:
# Remove exact duplicate rows, then renumber the index from zero
# (drop=True discards the old index instead of keeping it as a column).
df = df.drop_duplicates().reset_index(drop = True)
df.shape

(2369, 2)

In [221]:
# Report how many rows fall in each class to expose the class balance.
spam_rows = df[df["is_spam"] == 1]
non_spam_rows = df[df["is_spam"] == 0]

print(f"Spam: {len(spam_rows)}")
print(f"No spam: {len(non_spam_rows)}")

Spam: 244
No spam: 2125


In [222]:
def texto_limpio(texto):
    """Split a URL (or any string) into lowercase alphabetic tokens.

    Steps:
      1. Lowercase the text, so uppercase letters are normalised instead of
         being discarded by the character filter below.
      2. Replace every character that is not ``a-z`` or a space with a space
         (digits, punctuation, ``/``, ``.``, ``:`` ...).
      3. Drop single-letter tokens that are surrounded by whitespace.
      4. Split on whitespace.

    Args:
        texto: The raw string to clean.

    Returns:
        A list of lowercase alphabetic tokens; empty if nothing survives.

    Note:
        Step 3 uses non-overlapping matches, so in a run of adjacent
        single-letter tokens only every other one is removed, and a
        single-letter token at the very start of the string is kept.
    """
    # Must happen before the a-z filter; otherwise "Example" becomes "xample".
    texto = texto.lower()
    texto = re.sub(r'[^a-z ]', " ", texto)
    texto = re.sub(r'\s+[a-z]\s+', " ", texto)
    # str.split() with no argument already collapses whitespace runs and
    # ignores leading/trailing whitespace.
    return texto.split()

# Replace each raw URL string with its list of cleaned tokens.
# NOTE: this overwrites the "url" column, and texto_limpio expects strings,
# so reload df before running this cell a second time.
df["url"] = df["url"].map(texto_limpio)
df.head()

Unnamed: 0,url,is_spam
0,"[https, briefingday, us, list, manage, com, un...",1
1,"[https, www, hvper, com]",1
2,"[https, briefingday, com, v, i]",1
3,"[https, briefingday, com, m, commentform]",0
4,"[https, briefingday, com, fan]",1


In [223]:
# Fetch the NLTK corpora used below: the stop-word lists and WordNet.
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def lemmatize_text(words, lemmatizer = lemmatizer):
    """Lemmatize a list of tokens and keep only the informative ones.

    Args:
        words: Iterable of string tokens.
        lemmatizer: Object exposing ``lemmatize(word)``; defaults to the
            module-level ``WordNetLemmatizer`` instance.

    Returns:
        The lemmas, in their original order, that are not in the English
        ``stop_words`` set and are longer than three characters.
    """
    lemmas = (lemmatizer.lemmatize(word) for word in words)
    return [
        lemma
        for lemma in lemmas
        if lemma not in stop_words and len(lemma) > 3
    ]

# Replace each token list with its lemmatized, filtered version.
df["url"] = df["url"].map(lemmatize_text)
df.head()

[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,url,is_spam
0,"[http, briefingday, list, manage, unsubscribe]",1
1,"[http, hvper]",1
2,"[http, briefingday]",1
3,"[http, briefingday, commentform]",0
4,"[http, briefingday]",1


In [224]:
# Rebuild one space-separated document per URL from its token list.
tokens_list = [" ".join(tokens) for tokens in df["url"]]

# TF-IDF features: vocabulary capped at 5000 terms; per the scikit-learn
# docs, max_df=0.8 drops terms present in more than 80% of the documents and
# min_df=5 drops terms present in fewer than 5 documents.
# NOTE(review): the vectorizer is fitted on the whole corpus here, before the
# train/test split done in a later cell, so the held-out rows influence the
# vocabulary and IDF weights. Consider fitting on the training rows only.
vectorizer = TfidfVectorizer(max_features = 5000, max_df = 0.8, min_df = 5)
tfidf_matrix = vectorizer.fit_transform(tokens_list)
X = tfidf_matrix.toarray()
y = df["is_spam"]

X[:6]


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(6, 538))

In [225]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
# NOTE(review): no `stratify` argument is passed, so the spam / non-spam ratio
# may differ between the two subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
)

#### Paso 3: Construye un SVM

In [226]:
# Baseline: linear-kernel SVM trained on the training split
# (fit() returns the fitted estimator itself).
model = SVC(kernel = "linear", random_state = 42).fit(X_train, y_train)

# Accuracy of the baseline on the held-out split.
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.9514767932489452

#### Paso 4: Optimiza el modelo anterior

In [227]:
# Author's note: I had some trouble with this part (I'm getting the hang of
# it), so I had to look at the reference solutions.

# Value lists shared by both sub-grids.
c_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
gamma_values = ["scale", "auto"]

# Two sub-grids because `degree` is only searched for the polynomial kernel.
param_grid = [
    {
        "C": c_values,
        "kernel": ["poly"],
        "degree": [1, 2, 3, 4, 5],
        "gamma": gamma_values,
    },
    {
        "C": c_values,
        "kernel": ["linear", "rbf", "sigmoid"],
        "gamma": gamma_values,
    },
]

# Exhaustive search with 5-fold cross-validation; verbose=1 prints progress.
grid = GridSearchCV(SVC(), param_grid, cv=5, verbose=1)
grid


In [228]:
# Run the cross-validated search on the training split only.
grid.fit(X_train, y_train)

# Report the winning hyper-parameter combination and its mean CV score.
search_summary = (
    ("Best parameters:", grid.best_params_),
    ("Best score:", grid.best_score_),
)
for label, value in search_summary:
    print(label, value)

Fitting 5 folds for each of 112 candidates, totalling 560 fits
Best parameters: {'C': 1000, 'degree': 1, 'gamma': 'auto', 'kernel': 'poly'}
Best score: 0.929287598944591


In [229]:
# Build the tuned classifier from the hyper-parameters selected by the grid
# search instead of hard-coded copies, so this cell cannot drift out of sync
# with the search result when the data or the grid changes.
# random_state stays pinned to 42.
opt_model = SVC(**grid.best_params_, random_state = 42)
opt_model.fit(X_train, y_train)

# Accuracy of the tuned model on the held-out split.
y_pred = opt_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.9620253164556962

In [230]:

# Persist the tuned classifier (opt_model, the result of Paso 4) as the
# finished model. The `with` block guarantees the file handle is flushed and
# closed even if pickling fails.
# NOTE(review): the fitted TF-IDF `vectorizer` is not saved anywhere in this
# notebook; the pickled model alone cannot score raw URLs without it.
with open("/workspaces/Tutorial-de-Proyecto-de-NLP/models/modelo_acabado.sav", "wb") as model_file:
    dump(opt_model, model_file)