# Intro to NPL

In [3]:
import pandas as pd

total_data = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv")
total_data

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,True
1,https://www.hvper.com/,True
2,https://briefingday.com/m/v4n3i4f3,True
3,https://briefingday.com/n/20200618/m#commentform,False
4,https://briefingday.com/fan,True
...,...,...
2994,https://www.smartcitiesworld.net/news/news/dee...,False
2995,https://www.youtube.com/watch,True
2996,https://techcrunch.com/2019/07/04/an-optimisti...,False
2997,https://www.technologyreview.com/2019/12/20/13...,False


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Tokenización, eliminación de stopwords y lematización
def preprocess_text(text):
    # Tokenización
    tokens = word_tokenize(text)
    # Eliminación de stopwords y puntuaciones
    stop_words = set(stopwords.words('english'))
    tokens = [word.lower() for word in tokens if word.isalnum() and word.lower() not in stop_words]
    # Lematización
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

# Aplicar preprocesamiento a cada URL en el conjunto de datos
total_data['tokens'] = total_data['url'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Convertir tokens a texto nuevamente
total_data['text'] = total_data['tokens'].apply(lambda tokens: ' '.join(tokens))

# Dividir el conjunto de datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(total_data['text'], total_data['is_spam'], test_size=0.2, random_state=42)



# Crear un vectorizador TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Inicializar y entrenar el clasificador SVM
svm_classifier = SVC()
svm_classifier.fit(X_train_tfidf, y_train)

# Realizar predicciones en el conjunto de prueba
y_pred = svm_classifier.predict(X_test_tfidf)

# Analizar resultados
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

       False       0.76      1.00      0.86       455
        True       0.00      0.00      0.00       145

    accuracy                           0.76       600
   macro avg       0.38      0.50      0.43       600
weighted avg       0.58      0.76      0.65       600



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
from sklearn.model_selection import GridSearchCV

# Definir los parámetros a optimizar
param_grid = {'C': [0.1, 1, 10, 100],
              'gamma': [1, 0.1, 0.01, 0.001],
              'kernel': ['rbf', 'linear', 'poly', 'sigmoid']}

# Inicializar el grid search
grid_search = GridSearchCV(SVC(), param_grid, cv=5, n_jobs=-1)

# Realizar la búsqueda en la cuadrícula
grid_search.fit(X_train_tfidf, y_train)

# Mostrar los mejores parámetros encontrados
print("Mejores parámetros encontrados:", grid_search.best_params_)

# Realizar predicciones con el mejor modelo
best_svm_classifier = grid_search.best_estimator_
y_pred_best = best_svm_classifier.predict(X_test_tfidf)

# Analizar resultados del mejor modelo
print(classification_report(y_test, y_pred_best))


Mejores parámetros encontrados: {'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}
              precision    recall  f1-score   support

       False       0.76      1.00      0.86       455
        True       0.00      0.00      0.00       145

    accuracy                           0.76       600
   macro avg       0.38      0.50      0.43       600
weighted avg       0.58      0.76      0.65       600



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
