# NLP

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


In [2]:
# Read the training CSV and keep only its first 4000 rows.
train_csv = "train_correct.csv"
data = pd.read_csv(train_csv)[:4000]

In [3]:
import spacy
import classy_classification
nlp = spacy.load('en_core_web_md')

import re

def preprocess_text(text):
    """Lemmatize ``text`` and drop stop words, punctuation and non-alphabetic tokens.

    The text is run through the module-level spaCy pipeline ``nlp``. A token's
    lemma is kept only when its vocabulary entry is neither a stop word nor
    punctuation and the lemma is made up solely of alphabetic characters.

    Args:
        text: Raw input string.

    Returns:
        The surviving lemmas, lower-cased and joined with single spaces.
    """
    # Tokenization and lemmatization.
    doc = nlp(text)
    tokens_lemmatized = [token.lemma_ for token in doc]

    # Stop-word and special-character removal.
    clean_tokens = [
        token.lower()
        for token in tokens_lemmatized
        if not nlp.vocab[token].is_stop
        and not nlp.vocab[token].is_punct
        # isalpha must be *called*: the bare bound method is always truthy,
        # which would let digits and mixed tokens through.
        and token.isalpha()
    ]

    # Join the clean tokens back into a single string.
    clean_text = " ".join(clean_tokens)

    # URL replacement (currently disabled):
    # url_pattern = r'https?://\S+|www\.\S+'
    # clean_text = re.sub(url_pattern, 'URL', clean_text)

    return clean_text


# Data split

# Run every entry of the text column through the cleaning function,
# overwriting the column with the result.
data['clean_text'] = data['clean_text'].map(preprocess_text)

# Features are the cleaned texts; targets are the labels.
X, y = data['clean_text'], data['label']

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
data

  _torch_pytree._register_pytree_node(


Unnamed: 0,label,text,id,clean_text
0,0,"Hi Roy hope you are ok, Trans people are not g...",95e98db99c2,hi boy hope oka train people gay thing ram thr...
1,0,But fuckin' hell what even is biology,a87b8708e63,fucking hell biology
2,0,Whose the nice looking clergyman?,5b3cb03803f,nice look clergyman
3,1,"AIDS ARE IN YOUR WAY, SIN HAS CONSEQUENCES AND...",5b0bad2347e,aids way sin consequences bad
4,0,to learn,332048bd188,learn
...,...,...,...,...
3995,0,We dont have any apartments left. People now s...,f618ec39ebd,apartment leave people sleep outside sad turke...
3996,1,"Then stay put in Lebanon, Nisreena. Here in th...",7ea64e3633c,stay lebanon nisreena equality
3997,0,Wa'el Zaki,2f0c894f432,saki
3998,0,Slightly odd,9187c008564,slightly odd


In [4]:


# Label mapping from the numeric classes to the names used as dictionary keys.
label_mapping = {0: "no_hate", 1: "hate"}

# Group the training texts by their numeric label, preserving row order.
texts_by_label = {}
for etiqueta, texto in zip(y_train, X_train):
    texts_by_label.setdefault(etiqueta, []).append(texto)

# Re-key the grouped texts by label name.
data_train = {
    label_mapping[etiqueta]: textos
    for etiqueta, textos in texts_by_label.items()
}


In [None]:
from sentence_transformers import SentenceTransformer
#import spacy_sentence_bert

# paraphrase-multilingual-MiniLM-L12-v2 0.71


# Load the SentenceTransformer model.
model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

# Settings for the classification component: the labelled training texts
# and the name of the embedding model.
classy_config = {"data": data_train, "model": model_name}

# Add the classification component to a freshly loaded spaCy pipeline.
# NOTE: this rebinds the global `nlp`, which preprocess_text also reads, so
# any preprocessing run after this cell goes through this pipeline as well.
nlp = spacy.load('en_core_web_md')
nlp.add_pipe("classy_classification", config=classy_config)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt


# Realiza predicciones
#predictions = [max(nlp(text)._.cats, key=nlp(text)._.cats.get) for text in X_test]

# Make predictions: a text is labelled 1 when its 'hate' score is strictly
# greater than its 'no_hate' score, and 0 otherwise.
# (Alternative kept for reference: take the highest-scoring category name
# and convert 'hate'/'no_hate' to 1/0.)
predictions = [
    1 if scores['hate'] - scores['no_hate'] > -0.0 else 0
    for scores in (nlp(sample)._.cats for sample in X_test)
]

conf_matrix = confusion_matrix(y_test, predictions)

# Draw the confusion matrix as a heat map; both axes share the same class names.
class_names = ['No ofensivo', 'Ofensivo']
plt.figure(figsize=(3, 2))
sns.heatmap(
    conf_matrix,
    annot=True,
    cmap='Blues',
    fmt='g',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicción')
plt.ylabel('Etiqueta verdadera')
plt.title('Matriz de Confusión')
plt.show()

# Per-class precision / recall / F1 summary.
report = classification_report(y_test, predictions)
print(report)

In [None]:
# Load the test data set.
test_csv = "test_nolabel_corr.csv"
test_data = pd.read_csv(test_csv)

test_data

In [None]:
# Apply the text preprocessing to the test set, overwriting the column.
cleaned_test_text = test_data['clean_text'].map(preprocess_text)
test_data['clean_text'] = cleaned_test_text
test_data

In [None]:
#Predicción
#pred_new = [max(nlp(text)._.cats, key=nlp(text)._.cats.get) for text in test_data['clean_text']]

# Convertir las predicciones de hate/no_hate a 1/0
#pred_new = [1 if cat == 'hate' else 0 for cat in predictions]


# Label each test text 1 when its 'hate' score is strictly greater than its
# 'no_hate' score, and 0 otherwise.
pred_new = [
    1 if scores['hate'] - scores['no_hate'] > -0.0 else 0
    for scores in (nlp(sample)._.cats for sample in test_data['clean_text'])
]



In [None]:
# Pair each test id with its predicted label.
submission_columns = {'id': test_data["id"], 'label': pred_new}
predictions_df = pd.DataFrame(submission_columns)

# Write the predictions to CSV with a header row and without the index column.
predictions_df.to_csv(
    "all-MiniLM_predictions.csv",
    index=False,
    header=True,
)
predictions_df