# Notebook básico de NLP con LinearSVC

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split




In [2]:
data = pd.read_csv("train.csv")
data[0:5]

Unnamed: 0,label,text,id
0,0,"Hi Roy hope you are ok, Trans people are not g...",95e98db99c2
1,0,But fuckin' hell what even is biology,a87b8708e63
2,0,Whose the nice looking clergyman?,5b3cb03803f
3,1,"AIDS ARE IN YOUR WAY, SIN HAS CONSEQUENCES AND...",5b0bad2347e
4,0,to learn,332048bd188


In [3]:
#check missing values
data.isnull().sum()

label    0
text     0
id       0
dtype: int64

In [4]:
###Pre-procesado

import spacy

# Cargar el modelo de idioma inglés de spaCy
nlp = spacy.load("en_core_web_sm")

# Función para preprocesar el texto
def preprocess_text(text):
    # Tokenización y lematización
    doc = nlp(text)
    tokens_lemmatized = [token.lemma_ for token in doc]
    
    # Eliminación de stopwords y caracteres especiales
    clean_tokens = [token.lower() for token in tokens_lemmatized if not nlp.vocab[token].is_stop and token.isalpha]
    
    # Unir los tokens limpios en una cadena de texto nuevamente
    clean_text = " ".join(clean_tokens)
    
    return clean_text

# Cargar los datos
data = pd.read_csv("train.csv")

# Aplicar el preprocesamiento al texto
data['clean_text'] = data['text'].apply(preprocess_text)

# Mostrar los primeros 5 registros con el texto preprocesado
data.head()


Unnamed: 0,label,text,id,clean_text
0,0,"Hi Roy hope you are ok, Trans people are not g...",95e98db99c2,"hi roy hope ok , trans people gay . thing s ra..."
1,0,But fuckin' hell what even is biology,a87b8708e63,fuckin ' hell biology
2,0,Whose the nice looking clergyman?,5b3cb03803f,nice look clergyman ?
3,1,"AIDS ARE IN YOUR WAY, SIN HAS CONSEQUENCES AND...",5b0bad2347e,"aids way , sin consequences bad ."
4,0,to learn,332048bd188,learn


In [6]:
#División de datos
X = data['clean_text']
y = data['label']

X_train, X_test, y_train, y_test, = train_test_split(X, y, test_size=0.2, random_state=42)

#Vectorización
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
from sklearn.svm import LinearSVC # 0.75
from sklearn.linear_model import LogisticRegression  # Regresión Logística 0.75
from sklearn.ensemble import RandomForestClassifier  # Random Forest rnd 33 0.74
from sklearn.neural_network import MLPClassifier  # Redes Neuronales 0.69

from sklearn.pipeline import Pipeline

text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

text_clf.fit(X_train, y_train)

predictions = text_clf.predict(X_test)


from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[877 156]
 [258 339]]
              precision    recall  f1-score   support

           0       0.77      0.85      0.81      1033
           1       0.68      0.57      0.62       597

    accuracy                           0.75      1630
   macro avg       0.73      0.71      0.71      1630
weighted avg       0.74      0.75      0.74      1630



In [8]:
# Cargar el conjunto de datos test
test_data = pd.read_csv("test_nolabel.csv")

test_data[0:5]

Unnamed: 0,text,id
0,Well media selected them exactly because they'...,ce40fcb3a33
1,We must stand together to return this mass of ...,02ca950343c
2,http://apologeticspress.org/APContent.aspx?cat...,a59e88b4f9c
3,"The UK has universal healthcare you dickwad, t...",ad62d80af38
4,Is that comment good for clergyman Paul Kosyl?,a6b36372f05


In [9]:
# Aplicar el preprocesamiento al texto
test_data['clean_text'] = test_data['text'].apply(preprocess_text)
test_data[0:5]

Unnamed: 0,text,id,clean_text
0,Well media selected them exactly because they'...,ce40fcb3a33,"medium select exactly stupid ignorant level , ..."
1,We must stand together to return this mass of ...,02ca950343c,stand return mass people come ... need stand g...
2,http://apologeticspress.org/APContent.aspx?cat...,a59e88b4f9c,http://apologeticspress.org/apcontent.aspx?cat...
3,"The UK has universal healthcare you dickwad, t...",ad62d80af38,"uk universal healthcare dickwad , medical cost..."
4,Is that comment good for clergyman Paul Kosyl?,a6b36372f05,comment good clergyman paul kosyl ?


In [10]:
#Predicción
pred_new = text_clf.predict(test_data["clean_text"])

In [11]:
predictions_df = pd.DataFrame({'id': test_data["id"], 'label': pred_new})

predictions_df.to_csv("SVC_predictions.csv", index=False, header=True)
predictions_df

Unnamed: 0,id,label
0,ce40fcb3a33,1
1,02ca950343c,0
2,a59e88b4f9c,0
3,ad62d80af38,0
4,a6b36372f05,0
...,...,...
3488,c42316663f5,0
3489,3e1b52d2030,1
3490,fe66fe0f5cc,0
3491,3a5ca8a8fd5,0
