#### Import required libraries

In [25]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Ensure you have downloaded the necessary NLTK data files
import nltk
# Fetch the NLTK resources used later on: stop-word list, Punkt tokenizer data and WordNet
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)
# Kept as the cell's last expression so its return value is still displayed
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jvazq\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jvazq\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\jvazq\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jvazq\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### Upload dataset from csv

In [26]:
# Load data/youtoxic_english_1000.csv into the `youtoxic` DataFrame.
# The path is relative, so it is resolved against the current working directory.
youtoxic = pd.read_csv('data/youtoxic_english_1000.csv')


#### Review dataset

In [27]:
# Preview the first rows to check that the CSV was parsed as expected
youtoxic.head()

Unnamed: 0,CommentId,VideoId,Text,IsToxic,IsAbusive,IsThreat,IsProvocative,IsObscene,IsHatespeech,IsRacist,IsNationalist,IsSexist,IsHomophobic,IsReligiousHate,IsRadicalism
0,Ugg2KwwX0V8-aXgCoAEC,04kJtp6pVXI,If only people would just take a step back and...,False,False,False,False,False,False,False,False,False,False,False,False
1,Ugg2s5AzSPioEXgCoAEC,04kJtp6pVXI,Law enforcement is not trained to shoot to app...,True,True,False,False,False,False,False,False,False,False,False,False
2,Ugg3dWTOxryFfHgCoAEC,04kJtp6pVXI,\nDont you reckon them 'black lives matter' ba...,True,True,False,False,True,False,False,False,False,False,False,False
3,Ugg7Gd006w1MPngCoAEC,04kJtp6pVXI,There are a very large number of people who do...,False,False,False,False,False,False,False,False,False,False,False,False
4,Ugg8FfTbbNF8IngCoAEC,04kJtp6pVXI,"The Arab dude is absolutely right, he should h...",False,False,False,False,False,False,False,False,False,False,False,False


In [28]:
# Summarize the frame: row count, column names, non-null counts and dtypes
youtoxic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   CommentId        1000 non-null   object
 1   VideoId          1000 non-null   object
 2   Text             1000 non-null   object
 3   IsToxic          1000 non-null   bool  
 4   IsAbusive        1000 non-null   bool  
 5   IsThreat         1000 non-null   bool  
 6   IsProvocative    1000 non-null   bool  
 7   IsObscene        1000 non-null   bool  
 8   IsHatespeech     1000 non-null   bool  
 9   IsRacist         1000 non-null   bool  
 10  IsNationalist    1000 non-null   bool  
 11  IsSexist         1000 non-null   bool  
 12  IsHomophobic     1000 non-null   bool  
 13  IsReligiousHate  1000 non-null   bool  
 14  IsRadicalism     1000 non-null   bool  
dtypes: bool(12), object(3)
memory usage: 35.3+ KB


#### Convertir Booleanos a Int64 por si alguna librería posterior no puede hacer la transformación intrínseca de boolean a int64

In [29]:
# Collect the names of every boolean-typed column
bool_columns = youtoxic.select_dtypes(include='bool').columns

# Cast those columns to int64 in a single astype call; all other columns keep their dtype
youtoxic = youtoxic.astype({name: 'int64' for name in bool_columns})

# Print the frame summary again to confirm the new dtypes
youtoxic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   CommentId        1000 non-null   object
 1   VideoId          1000 non-null   object
 2   Text             1000 non-null   object
 3   IsToxic          1000 non-null   int64 
 4   IsAbusive        1000 non-null   int64 
 5   IsThreat         1000 non-null   int64 
 6   IsProvocative    1000 non-null   int64 
 7   IsObscene        1000 non-null   int64 
 8   IsHatespeech     1000 non-null   int64 
 9   IsRacist         1000 non-null   int64 
 10  IsNationalist    1000 non-null   int64 
 11  IsSexist         1000 non-null   int64 
 12  IsHomophobic     1000 non-null   int64 
 13  IsReligiousHate  1000 non-null   int64 
 14  IsRadicalism     1000 non-null   int64 
dtypes: int64(12), object(3)
memory usage: 117.3+ KB


In [30]:
# Export the youtoxic dataframe to an Excel file
#youtoxic.to_excel('youtoxic.xlsx', index=False)

##### Después de analizar el fichero en Excel se tienen estas conclusiones:

* Efectivamente IsToxic es la bandera que agrupa a los diferentes tipos de clasificaciones de odio.
* No hay ningún discurso de odio que no tenga la bandera IsToxic encendida.
* No hay ninguna bandera IsToxic encendida sin que haya ninguna de las otras banderas encendidas. Esto significa que la clasificación IsToxic no existe per se y solo representa la existencia de discurso de odio en alguna de las características.

#### Porcentaje de registros en cada categoría

In [31]:
# Denominator for the percentages: number of rows in the dataset
total_rows = len(youtoxic)

# Every int64 column is treated as a 0/1 flag column
int_cols = youtoxic.select_dtypes(include=['int64']).columns

# One record per flag column: how many rows are set and what share of the dataset that is
results = [
    {
        'Column': flag,
        'Count of 1s': ones,
        'Percentage': f'{(ones / total_rows) * 100:.2f}%',
    }
    for flag, ones in youtoxic[int_cols].sum().items()
]

# Render the summary as a table
table = pd.DataFrame(results)
display(table)

Unnamed: 0,Column,Count of 1s,Percentage
0,IsToxic,462,46.20%
1,IsAbusive,353,35.30%
2,IsThreat,21,2.10%
3,IsProvocative,161,16.10%
4,IsObscene,100,10.00%
5,IsHatespeech,138,13.80%
6,IsRacist,125,12.50%
7,IsNationalist,8,0.80%
8,IsSexist,1,0.10%
9,IsHomophobic,0,0.00%


##### Nota: IsToxic es la columna bandera que indica si hay algún comentario de odio en las otras 11 categorías. No se necesita crear una columna que resuma esas 11 columnas porque esa columna ya está en el dataset.

* Los datos no nos permiten clasificar un discurso por medio de las características:
- IsHomophobic
- IsRadicalism

* Las características 
- IsSexist
- IsNationalist
- IsReligiousHate
- IsThreat 
están severamente desbalanceadas.

* También están muy desbalanceadas las características: 
- IsProvocative
- IsObscene
- IsHatespeech
- IsRacist

* La única característica donde el desbalanceo es razonable es:
- IsAbusive

* Con estas observaciones se presume que el modelo con una sola bandera IsToxic es muy generalista y que por lo tanto no es capaz de predecir con una mejor precisión (alrededor de 70%) si el mensaje es de odio o no.

* Un modelo multi-label binary classification (entiendo que con un Naive Bayes) puede ser una mejor solución en este caso, utilizando solamente las características:
- IsAbusive
- IsProvocative
- IsObscene
- IsHatespeech
- IsRacist

Por supuesto, resolviendo el problema del desbalanceo en las últimas 4 características.

Después de reconsiderarlo, se descarta la característica VideoId porque, aunque sí puede aportar información de un vídeo, la muestra es tan pequeña y es tan probable que estos vídeos no se utilicen al momento de predecir que la información que pueda aportar no resulta útil, aunque en las pruebas iniciales sí haya mejorado la precisión.

---
A partir de aquí se preparan 2 datasets: uno para entrenar un modelo multietiqueta de clasificación binaria y otro para un modelo de una sola categoría: IsToxic.

Youmultihatred:
- Text
- IsAbusive
- IsProvocative
- IsObscene
- IsHatespeech
- IsRacist

Youtoxic:
- Text
- IsToxic

In [32]:
# Columns kept for the single-label (IsToxic) version of the dataset.
# Multi-label alternative, currently disabled:

#youmultihatred = youtoxic[['Text', 'IsAbusive','IsProvocative','IsObscene','IsHatespeech', 'IsRacist']]

# .copy() makes the subset an independent frame, so the later in-place
# assignment to the 'Text' column does not trigger SettingWithCopyWarning.
youtoxic = youtoxic[['Text', 'IsToxic']].copy()


#### Preprocess dataset

1. Remover URLs
2. Remover special characters y números
3. Convertir a minúsculas
4. Remover espacios innecesarios
Tokenizar momentáneamente para:
5. Quitar Stopwords
6. Lematizar

In [33]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# No utilizaremos textblob dado que las metricas mejoraron al no utilizarlo.
# Ademas, textblob es muy lento para procesar grandes cantidades de texto.

#from textblob import TextBlob

from functools import lru_cache

# Precompiled cleaning patterns, applied in this order by preprocess_text
_URL_RE = re.compile(r'http\S+|www\S+')       # anything starting with http/https/www
_MENTION_OR_HASH_RE = re.compile(r'@\w+|#')   # whole @mentions, and the '#' sign of hashtags
_NON_LETTER_RE = re.compile(r'[^A-Za-z\s]')   # digits, punctuation, non-ASCII characters
_WHITESPACE_RE = re.compile(r'\s+')


@lru_cache(maxsize=1)
def _nlp_resources():
    """Build the English stop-word set and the lemmatizer once, on first use."""
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def preprocess_text(text):
    """Normalize one raw comment into a cleaned, space-separated string.

    Steps: strip URLs, strip @mentions and '#' signs, drop every character
    that is not an ASCII letter or whitespace, lowercase, collapse whitespace,
    tokenize, remove English stop words and lemmatize each remaining token.
    Spelling correction (TextBlob) is intentionally not applied.

    Args:
        text: Raw comment text. Any non-string value (e.g. None or NaN) is
            treated as an empty comment.

    Returns:
        The cleaned text, or '' when nothing is left after cleaning.
    """
    if not isinstance(text, str):
        return ''

    text = _URL_RE.sub('', text)
    # The previous pattern r'\@w+' only matched '@' followed by literal "w"
    # characters, so user names of @mentions were left in the text.
    text = _MENTION_OR_HASH_RE.sub('', text)
    text = _NON_LETTER_RE.sub('', text)
    text = _WHITESPACE_RE.sub(' ', text.lower()).strip()

    # Nothing left to tokenize: skip the NLTK calls altogether
    if not text:
        return ''

    stop_words, lemmatizer = _nlp_resources()
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    )


# Clean every comment: run preprocess_text over each entry of the 'Text' column
youtoxic['Text'] = youtoxic['Text'].map(preprocess_text)
#youmultihatred['Text'] = youmultihatred['Text'].map(preprocess_text)

# Show the first rows of the cleaned frame
youtoxic.head()
#youmultihatred.head()

Unnamed: 0,Text,IsToxic
0,people would take step back make case wasnt an...,0
1,law enforcement trained shoot apprehend traine...,1
2,dont reckon black life matter banner held whit...,1
3,large number people like police officer called...,0
4,arab dude absolutely right shot extra time sho...,0


#### Generate sample csv file from dataframe

In [34]:
# Export the first 20 rows of the preprocessed youtoxic dataframe to a CSV file
# (head() takes rows in their existing order; this is not a random sample)

youtoxic.head(n=20).to_csv('data/youtoxic_sample.csv', index=False)
#youmultihatred.head(n=20).to_csv('data/youmultihatred_sample.csv', index=False)

### Building the ML models

#### 2. Bert Tokenizer + PCA reducido

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from transformers import BertTokenizer, BertModel
import torch
import numpy as np
import onnx
import skl2onnx
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import convert_sklearn
from skl2onnx.algebra.onnx_ops import OnnxConcat
from sklearn.pipeline import Pipeline

# Load the pretrained tokenizer and encoder from the same checkpoint
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Function to get BERT embeddings
def get_bert_embeddings(texts, max_length=128, batch_size=None):
    """Encode texts into one BERT [CLS] embedding per text.

    Args:
        texts: A single string, or a collection of strings (pandas Series,
            NumPy array, list, tuple, ...).
        max_length: Maximum number of tokens per text; longer inputs are
            truncated.
        batch_size: Number of texts sent through the model per forward pass.
            ``None`` (default) encodes everything in a single pass, which is
            the previous behavior; set it to bound memory use on larger
            inputs. Padding is applied per batch.

    Returns:
        2-D NumPy array with one row per input text, taken from position 0
        ([CLS]) of the model's last hidden state.

    Raises:
        ValueError: If ``batch_size`` is given and is smaller than 1.
    """
    if batch_size is not None and batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None")

    # Normalize the input to a plain list of strings
    if isinstance(texts, str):
        texts = [texts]
    elif hasattr(texts, 'tolist'):
        texts = texts.tolist()
    else:
        texts = list(texts)

    if not texts:
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    step = batch_size or len(texts)
    chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), step):
            # Tokenize and encode this slice of sequences
            tokens = tokenizer(
                texts[start:start + step],
                max_length=max_length,
                truncation=True,
                padding=True,
                return_tensors='pt'
            )
            outputs = bert_model(**tokens)
            # Use [CLS] token embeddings as sequence representation
            chunks.append(outputs.last_hidden_state[:, 0, :].numpy())
    return np.concatenate(chunks, axis=0)

# Prepare data: preprocessed comment text as input, IsToxic flag as target
X = youtoxic['Text']
y = youtoxic['IsToxic']

# Split data: hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Get BERT embeddings
X_train_bert = get_bert_embeddings(X_train)
X_test_bert = get_bert_embeddings(X_test)

# 1. Standardize features (statistics are learned from the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_bert)
X_test_scaled = scaler.transform(X_test_bert)

# 2. Apply PCA for dimensionality reduction
pca = PCA(n_components=0.8)  # Keep enough components to explain 80% of the variance
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# 3. Stronger regularization in LogisticRegression
model = LogisticRegression(
    C=0.1,  # Inverse regularization strength: smaller values regularize more
    penalty='elasticnet',  # Combine L1 and L2
    solver='saga',
    l1_ratio=0.5,  # Equal mix of L1 and L2
    max_iter=1000,
    random_state=42  # saga shuffles the data; seed it so reruns give the same model
)

# 4. Add cross-validation
# NOTE(review): scaler and PCA were fitted on the whole training split before the
# folds are formed, so these fold scores are slightly optimistic.
cv_scores = cross_val_score(model, X_train_pca, y_train, cv=5)
print(f"Cross-validation scores: {cv_scores.mean():.2f} (+/- {cv_scores.std() * 2:.2f})")

# 5. Train and evaluate
model.fit(X_train_pca, y_train)
y_train_pred = model.predict(X_train_pca)
y_test_pred = model.predict(X_test_pca)

# Calculate metrics
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
# Relative drop from training to test accuracy, as a percentage of training accuracy
overfitting_percentage = ((train_accuracy - test_accuracy) / train_accuracy) * 100

print(f'Training Accuracy: {train_accuracy:.2f}')
print(f'Test Accuracy: {test_accuracy:.2f}')
print(f'Overfitting Percentage: {overfitting_percentage:.2f}%')
print(f'F1-score: {f1_score(y_test, y_test_pred):.2f}')

# joblib is otherwise only imported in a later cell of this notebook; import it
# here so this cell does not fail with NameError on a fresh kernel.
import os
import joblib

# Make sure the output directory exists before writing any artifact
os.makedirs('model_runtimes', exist_ok=True)

# Save the fitted classifier, PCA and scaler as separate files
joblib.dump(model, 'model_runtimes/best_model.pkl')
joblib.dump(pca, 'model_runtimes/pca_model.pkl')
joblib.dump(scaler, 'model_runtimes/scaler_model.pkl')

# Define the pipeline with scaler, PCA, and logistic regression.
# The steps are the already-fitted objects from above, so the pipeline is not refitted.

pipeline = Pipeline([
    ('scaler', scaler),
    ('pca', pca),
    ('logistic_regression', model)
])

# Convert the pipeline to ONNX format; the input is a float tensor with a
# variable number of rows and as many columns as the BERT embeddings
initial_type = [('float_input', FloatTensorType([None, X_train_bert.shape[1]]))]
onnx_model = convert_sklearn(pipeline, initial_types=initial_type)

# Save the ONNX model to a file
with open("model_runtimes/pipeline_model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())


Task exception was never retrieved
future: <Task finished name='Task-56' coro=<Server.serve() done, defined at c:\Users\jvazq\Data_Analysis_Tools\anaconda3\envs\airbnb_analytics\Lib\site-packages\uvicorn\server.py:67> exception=KeyboardInterrupt()>
Traceback (most recent call last):
  File "c:\Users\jvazq\Data_Analysis_Tools\anaconda3\envs\airbnb_analytics\Lib\site-packages\uvicorn\main.py", line 577, in run
    server.run()
  File "c:\Users\jvazq\Data_Analysis_Tools\anaconda3\envs\airbnb_analytics\Lib\site-packages\uvicorn\server.py", line 65, in run
    return asyncio.run(self.serve(sockets=sockets))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jvazq\Data_Analysis_Tools\anaconda3\envs\airbnb_analytics\Lib\site-packages\nest_asyncio.py", line 30, in run
    return loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jvazq\Data_Analysis_Tools\anaconda3\envs\airbnb_analytics\Lib\site-packages\nest_asyncio.py", line 92, in run_un

Cross-validation scores: 0.71 (+/- 0.07)
Training Accuracy: 0.78
Test Accuracy: 0.74
Overfitting Percentage: 4.33%
F1-score: 0.74


### 3. Building an API to consume the model

#### 3.2 FastAPI for Bert + PCA model

In [None]:
from fastapi import FastAPI, Request, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import RedirectResponse
from pydantic import BaseModel
import uvicorn
import numpy as np
import nest_asyncio
import joblib

# Patch asyncio so the server can be started from inside the notebook's running event loop
nest_asyncio.apply()

# Pretrained tokenizer and encoder, both from the same checkpoint
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Fitted artifacts written by the training cell: classifier, then PCA and scaler
best_model = joblib.load('model_runtimes/best_model.pkl')
pca = joblib.load('model_runtimes/pca_model.pkl')
scaler = joblib.load('model_runtimes/scaler_model.pkl')

def get_bert_embeddings(text, max_length=128, batch_size=None):
    """Encode text into one BERT [CLS] embedding per input string.

    This definition replaces the one from the training cell once this cell
    runs, so it accepts both a single string (as the API passes) and a
    collection of strings (as the training code passes).

    Args:
        text: A single string, or a collection of strings (pandas Series,
            NumPy array, list, tuple, ...).
        max_length: Maximum number of tokens per text; longer inputs are
            truncated.
        batch_size: Number of texts sent through the model per forward pass.
            ``None`` (default) encodes everything in a single pass.

    Returns:
        2-D NumPy array with one row per input text, taken from position 0
        ([CLS]) of the model's last hidden state.

    Raises:
        ValueError: If ``batch_size`` is given and is smaller than 1.
    """
    if batch_size is not None and batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None")

    # Normalize the input to a plain list of strings
    if isinstance(text, str):
        batch = [text]
    elif hasattr(text, 'tolist'):
        batch = text.tolist()
    else:
        batch = list(text)

    if not batch:
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    step = batch_size or len(batch)
    chunks = []
    with torch.no_grad():
        for start in range(0, len(batch), step):
            # Tokenize and encode
            tokens = tokenizer(
                batch[start:start + step],
                max_length=max_length,
                truncation=True,
                padding=True,
                return_tensors='pt'
            )
            outputs = bert_model(**tokens)
            chunks.append(outputs.last_hidden_state[:, 0, :].numpy())
    return np.concatenate(chunks, axis=0)


# Create FastAPI app
app = FastAPI(title="Optimized Toxic Comment Classifier API")

# Add CORS middleware.
# Wildcard origins must not be combined with allow_credentials=True: that would
# let any website send credentialed requests. This API defines no cookie- or
# auth-based endpoints, so credentials are disabled and the wildcards are kept.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Request body of POST /predict: the comment text to classify.
class TextRequest(BaseModel):
    text: str

# Response body of POST /predict.
class PredictionResponse(BaseModel):
    # Predicted class, cast to bool by the endpoint
    is_toxic: bool
    # Value taken from column 1 of the classifier's predict_proba output
    probability: float


@app.get("/")
async def root():
    # The bare root URL has no content of its own: forward callers to the docs page
    docs_redirect = RedirectResponse("/docs")
    return docs_redirect

@app.post("/predict", response_model=PredictionResponse)
async def predict(request: TextRequest):
    """Classify one comment as toxic or not.

    The text goes through the same steps as the training data: text
    preprocessing, BERT [CLS] embedding, scaling, PCA, logistic regression.

    Args:
        request: Request body carrying the raw comment in ``text``.

    Returns:
        Dict with ``is_toxic`` (predicted class as bool) and ``probability``
        (column 1 of ``predict_proba``).

    Raises:
        HTTPException: Status 500 with the error message if any step fails.
    """
    try:
        # The classifier was trained on preprocess_text() output (defined earlier
        # in this notebook); embedding the raw text would feed it inputs from a
        # different distribution than it was fitted on.
        cleaned_text = preprocess_text(request.text)

        # Get BERT embeddings
        embeddings = get_bert_embeddings(cleaned_text)

        # Scale features
        scaled_features = scaler.transform(embeddings)

        # Apply PCA
        pca_features = pca.transform(scaled_features)

        # Predict
        probability = best_model.predict_proba(pca_features)[0][1]
        prediction = best_model.predict(pca_features)[0]
    except Exception as e:
        # Chain the original exception so the server-side traceback keeps the root cause
        raise HTTPException(status_code=500, detail=str(e)) from e

    return {
        "is_toxic": bool(prediction),
        "probability": float(probability)
    }

'''
@app.get("/model-info")
async def model_info():
    return {
        "best_parameters": grid_search.best_params_,
        "best_score": grid_search.best_score_
    }
'''
    
@app.get("/health")
async def health():
    # Fixed payload; no model or dependency checks are performed here
    return dict(status="ok")

if __name__ == "__main__":
    # uvicorn.run() blocks until the server is shut down, so the URLs have to be
    # announced before the call; printed afterwards they only show up once the
    # server has already stopped.
    print("API is running at http://127.0.0.1:8000")
    print("Documentation available at http://127.0.0.1:8000/docs")
    uvicorn.run(app, host="127.0.0.1", port=8000)

INFO:     Started server process [5212]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)
INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [5212]


API is running at http://127.0.0.1:8000
Documentation available at http://127.0.0.1:8000/docs
