In [72]:
import re
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, confusion_matrix
from sklearn.metrics import f1_score  #ytrue, ypred
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

# NLP toolkits
import nltk

# Download the NLTK resources used by this notebook (stop words, tokenizer
# models, POS tagger, WordNet).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt_tab')

from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

# Module-level list of English stop words; clean_text filters against it.
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [73]:
def grid_SVC(X_train, y_train, performance_metric='f1', resultsGrid=False):
    """Tune an SVC classifier with a cross-validated exhaustive grid search.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed unchanged to ``GridSearchCV.fit``.
    performance_metric : default 'f1'
        Forwarded as the ``scoring`` argument of ``GridSearchCV``.
    resultsGrid : default False
        When it compares equal to ``True``, the full ``cv_results_`` of the
        search is returned instead of the best model.

    Returns
    -------
    ``cv_results_`` of the fitted search if ``resultsGrid == True``; otherwise
    ``best_estimator_``, the SVC that scored best during the search.
    """
    # Hyperparameter search space.
    search_space = {
        'C': np.linspace(0.000001, 1000, 10),  # regularization parameter, 10 evenly spaced values
        'kernel': ['poly', 'rbf', 'linear', 'sigmoid'],
        'gamma': ['scale', 'auto'],
    }

    # Cross-validation: stratified 5-fold, a single repeat, fixed seed.
    folds = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)

    searcher = GridSearchCV(
        estimator=SVC(),
        param_grid=search_space,
        n_jobs=-1,
        cv=folds,
        scoring=performance_metric,
        error_score='raise',
    )
    fitted = searcher.fit(X_train, y_train)

    return fitted.cv_results_ if resultsGrid == True else fitted.best_estimator_


def lemmatize_text(text):
    """Lemmatize every token of ``text`` and return them joined by single spaces.

    The text is split with ``nltk.word_tokenize`` and each token is passed to
    the module-level ``lemmatizer`` object, which must be defined before this
    function is first called.
    """
    return ' '.join(lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text))


def clean_text(string):
    """Turn a raw URL into a space-separated string of lemmatized tokens.

    Steps: lowercase, drop the ``http://`` / ``https://`` scheme, replace the
    separators ``- / . # &`` with spaces, drop a standalone ``www`` token,
    remove every word found in the module-level ``stop_words`` list, and
    finally lemmatize the remaining words with ``lemmatize_text``.

    Parameters
    ----------
    string : str
        Raw URL (or any text) to normalize.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    string = string.lower()

    # Match the scheme only when it is followed by "://", so that "http"
    # inside an ordinary token (e.g. the host "httpbin") is preserved.
    string = re.sub(r"\bhttps?://", ' ', string)

    # URL separators become token boundaries.
    string = re.sub(r"[-/.#&]", ' ', string)

    # Remove "www" only when it stands alone as a token, not when three
    # consecutive w's happen to occur inside a longer word.
    string = re.sub(r"\bwww\b", ' ', string)

    # split() already discards leading/trailing/repeated whitespace.
    kept_words = [word for word in string.split() if word not in stop_words]

    # Reduce every remaining word to its base form.
    return lemmatize_text(' '.join(kept_words))

In [74]:
# Location of the URL-spam dataset (CSV) that is loaded with pandas below.
url = "https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv"

In [75]:
# Read the CSV at `url` into a DataFrame.
df  = pd.read_csv(url)

In [76]:
# Preview the first five rows of the dataset.
df.head(5)

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,True
1,https://www.hvper.com/,True
2,https://briefingday.com/m/v4n3i4f3,True
3,https://briefingday.com/n/20200618/m#commentform,False
4,https://briefingday.com/fan,True


In [77]:
# List the column labels of the loaded frame.
df.columns

Index(['url', 'is_spam'], dtype='object')

In [78]:
# Draw 30 URLs (with replacement, as before) to eyeball the cleaning step.
# A fixed random_state makes the displayed sample reproducible across runs, and
# Series.sample does not depend on the index being exactly 0..n-1.
samples = df['url'].sample(n=30, replace=True, random_state=123).tolist()

In [79]:
lemmatizer = WordNetLemmatizer()  # lemmatizer instance used by lemmatize_text

# Custom stop words can be added here. Only words not already present are
# appended, so re-running this cell does not keep growing the shared list.
stop_words.extend([word for word in ('of', 'yet') if word not in stop_words])

# Show the cleaned version of every sampled URL.
list(map(clean_text, samples))

['onezero medium com inside social medium cult convinces young people give everything f3878fbec632',
 'newstatesman com politics health 2020 06 dark side wellness industry',
 'venturebeat com 2020 06 30 ibm us ai enhance old wimbledon tennis footage digital era',
 'reuters com article u usa boogaloo facebook ban account linked anti government u boogaloo movement iduskbn2413jc',
 'thehustle co lululemon self care product lifestyle brand',
 'cnn com 2020 06 29 tech lululemon mirror fitness startup acquisition index html',
 'theguardian com technology 2020 jun 30 third advertiser may boycott facebook hate speech revolt',
 'reuters com article u facebook ad boycott exclusive exclusive facebook ad boycott campaign go global organizer say iduskbn23z0o4',
 'vice com en_us article 5dz94x uber acquisition jump bikeshare destroyed thousand bike',
 'cbsnews com news supreme court allows federal death penalty justice department execution',
 'readtheplaque com',
 'skift com 2020 06 22 hospitality d

In [80]:
# Features are the raw URL strings; the target is the is_spam column.
X = df['url']
y = df['is_spam']

In [81]:
# Hold out 30% of the rows for testing; shuffled split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=123
)

In [82]:
# Inspect the raw training URLs before cleaning.
X_train

Unnamed: 0,url
1099,https://www.reuters.com/article/us-usa-trump-o...
1448,https://www.eventbrite.com/e/big-friendship-bo...
2327,https://news.rice.edu/2020/06/29/laser-welded-...
1412,https://creativemornings.com/companies/sdco-pa...
1224,https://www.nytimes.com/2020/06/25/world/afric...
...,...
1147,https://en.wikipedia.org/wiki/Tim_O%27Brien_(a...
2154,https://www.washingtonpost.com/privacy-policy/...
1766,https://www.cnbc.com/2020/06/26/amazon-buys-se...
1122,https://www.amazon.com/Rivers-Tides-Andy-Golds...


In [83]:
# Clean the data: run clean_text over every URL of both splits.
X_train = X_train.apply(clean_text)
X_test = X_test.apply(clean_text)

In [84]:
# Inspect the training URLs after clean_text has been applied.
X_train

Unnamed: 0,url
1099,reuters com article u usa trump obamacare trum...
1448,eventbrite com e big friendship book launch ti...
2327,news rice edu 2020 06 29 laser welded sugar sw...
1412,creativemornings com company sdco partner
1224,nytimes com 2020 06 25 world africa ebola cong...
...,...
1147,en wikipedia org wiki tim_o % 27brien_ ( author )
2154,washingtonpost com privacy policy 2011 11 18 g...
1766,cnbc com 2020 06 26 amazon buy self driving te...
1122,amazon com river tide andy goldsworthy dp b001...


In [85]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the bag-of-words vocabulary on the training text only, then encode both
# splits with that same vocabulary.
vect = CountVectorizer()
vect.fit(X_train)
#vect = TfidfVectorizer().fit(X_train)
X_train, X_test = vect.transform(X_train), vect.transform(X_test)

In [86]:
# Encode the training labels as integers: 1 where the label is True, else 0.
y_train = np.asarray(y_train == True).astype(int)

In [87]:
# Check the encoded training labels.
y_train

array([0, 1, 0, ..., 0, 0, 0])

In [88]:
# Grid-search an SVC on the vectorized training data. With the default
# arguments grid_SVC scores by 'f1' and returns the best estimator.
best_ml = grid_SVC(X_train, y_train)

In [89]:
# Predict labels for the vectorized test split with the selected model.
preds = best_ml.predict(X_test)

In [90]:
# Per-class precision / recall / F1 on the test split.
# NOTE(review): only y_train was converted to 0/1 above; y_test is still the
# original is_spam column, while the model was trained on 0/1 labels.
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

       False       0.97      0.96      0.97       682
        True       0.87      0.92      0.89       218

    accuracy                           0.95       900
   macro avg       0.92      0.94      0.93       900
weighted avg       0.95      0.95      0.95       900



# CONCLUSION:

### Accuracy del 95%:
- El 95% de las predicciones (spam y no spam combinadas) fueron correctas.

a. Recall/Sensibilidad del 96% (clase "no spam"):
- De las URLs que no son spam, el modelo identificó correctamente el 96%.

b. Recall/Sensibilidad del 92% (clase "spam"):
- De las URLs que son spam, el modelo identificó correctamente el 92%.