In [8]:
import sys
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score  
from sklearn.metrics import confusion_matrix
sys.path.append(os.path.abspath('..'))  # Ajusta la ruta según la ubicación de 'src'
from src.training.etl import EDAdataset
from src.utils.conexion import SQLConnection

In [9]:
# Parameters handed to SQLConnection together with the SQL file path.
# NOTE(review): presumably substituted into queries.sql — confirm in src/utils/conexion.
params = {
    "medico": "PSICOLOGÍA",
    "fechaini": "20230101",
    "fechafin": "20250504",
}
sql_path = os.path.join("..", "sql_queries", "queries.sql")

# Build the connection helper and materialise the query result as a DataFrame.
sqlconection = SQLConnection(sql_path=sql_path, params=params)
historias_clinicas_df = sqlconection.generate_dataframe()

In [10]:
# Wrap the raw query result in the project's EDAdataset helper and run its
# dataset_eda step; the result is bound to df_final.
# NOTE(review): the SettingWithCopyWarning recorded in this cell's output points
# at a `self.df[column_name] = ...` assignment, i.e. code inside a class rather
# than this cell — presumably EDAdataset; confirm and fix it there.
df_copy = EDAdataset(historias_clinicas_df)
df_final = df_copy.dataset_eda(historias_clinicas_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df[column_name] = self.df[column_name].str.capitalize()


In [12]:
# feature_engineer.py

import pandas as pd
import re
import spacy
from sklearn import preprocessing

class PreprocesadorTexto:
    """Clean, tokenize, lemmatize and label-encode columns of a DataFrame.

    The instance owns a working frame in ``self.df``; every method reads from
    and writes to that frame. ``procesar`` chains the text steps and the
    optional label encodings.
    """

    # Domain words removed in addition to spaCy's Spanish stop-word list.
    _STOPWORDS_DOMINIO = frozenset({
        "medico", "paciente", "psicologo", "psicologa",
        "psicologia", "psicoterapeuta", "psicoterapia", "refiere"})

    # Accented vowels are folded to their base letter instead of being deleted,
    # so "depresión" becomes "depresion" rather than "depresin". ñ and ü are kept.
    _TABLA_TILDES = str.maketrans("áéíóúàèìòù", "aeiouaeiou")

    def __init__(self, df, stopwords=None):
        """Copy ``df``, load the spaCy model and build the stop-word set.

        Args:
            df: DataFrame to process; it is copied, the caller's frame is not touched.
            stopwords: extra stop words (any iterable of str). ``None`` selects
                the built-in domain list, which matches the previous default.
        """
        self.df = df.copy()
        if stopwords is None:
            stopwords = self._STOPWORDS_DOMINIO
        self.nlp = spacy.load("es_core_news_lg")
        todas = self.nlp.Defaults.stop_words.union(stopwords)
        # expresiones_regulares() folds accents out of the text, so accented
        # stop words would never match; register their folded forms as well.
        self.stopwords = todas.union(p.translate(self._TABLA_TILDES) for p in todas)

    @staticmethod
    def _a_texto(serie):
        """Return ``serie`` as strings, with missing values as "" (not "nan"/"None")."""
        return serie.astype(str).mask(serie.isna(), "")

    def concatenar_columnas(self, df, columna1="subjetivo", columna2="objetivo", nueva_columna="concatenada"):
        """Replace the working frame with a copy of ``df`` plus a joined text column.

        ``nueva_columna`` holds ``columna1 + " " + columna2`` with missing values
        treated as empty text and surrounding whitespace stripped.

        Returns:
            The working frame (a copy; the ``df`` argument is left unmodified).
        """
        self.df = df.copy()
        izquierda = self._a_texto(self.df[columna1])
        derecha = self._a_texto(self.df[columna2])
        self.df[nueva_columna] = (izquierda + " " + derecha).str.strip()
        return self.df

    def expresiones_regulares(self, columna):
        """Normalize ``columna`` in place to lowercase words separated by one space.

        Steps: missing values -> "", Unicode NFC, lowercase, accent folding, then
        every run of characters outside ``a-z``, ``ñ``, ``ü`` becomes a single space.
        Replacing with a space (rather than deleting) keeps words that were
        separated only by a newline or punctuation from being glued together.
        """
        self.df[columna] = (
            self._a_texto(self.df[columna])
            .str.normalize("NFC")  # compose "o" + combining accent so the table below sees "ó"
            .str.lower()
            .str.translate(self._TABLA_TILDES)
            .str.replace(r"[^a-zñü]+", " ", regex=True)
            .str.strip()
        )

    def tokenizar(self, columna):
        """Replace each text in ``columna`` by its list of non-stop-word tokens."""
        textos = self._a_texto(self.df[columna]).tolist()
        # nlp.pipe processes the texts as a stream instead of one call per row.
        tokens_por_fila = [
            [
                token.text for token in doc
                if token.text.lower() not in self.stopwords and not token.is_punct and not token.is_space
            ]
            for doc in self.nlp.pipe(textos)
        ]
        # An explicit object Series keeps each list as a single cell value.
        self.df[columna] = pd.Series(tokens_por_fila, index=self.df.index, dtype=object)

    def lematizar(self, columna):
        """Replace each token list in ``columna`` by the lemmas of its joined text.

        Cells that are not lists become empty lists.
        """
        textos = [" ".join(x) if isinstance(x, list) else "" for x in self.df[columna]]
        lemas_por_fila = [[token.lemma_ for token in doc] for doc in self.nlp.pipe(textos)]
        self.df[columna] = pd.Series(lemas_por_fila, index=self.df.index, dtype=object)

    def label_encodering(self, columna, nueva_columna, tipo="sexo"):
        """Label-encode ``columna`` into ``nueva_columna`` and return the mapping.

        Returns:
            DataFrame with one row per class: the original label under
            ``tipo.capitalize()`` and its integer code under ``'Codigo'``.
        """
        label_encoder = preprocessing.LabelEncoder()
        self.df[nueva_columna] = label_encoder.fit_transform(self.df[columna])
        mapping_df = pd.DataFrame({
            tipo.capitalize(): label_encoder.classes_,
            'Codigo': label_encoder.transform(label_encoder.classes_)
        })
        return mapping_df

    def procesar(self, columna_texto, columna_sexo=None, columna_grupo=None):
        """Run cleaning, tokenization and lemmatization, plus optional encodings.

        Args:
            columna_texto: text column to clean, tokenize and lemmatize in place.
            columna_sexo: if given, encoded into ``"sexo_codificado"``.
            columna_grupo: if given, encoded into ``"grupo_codificado"``.

        Returns:
            ``(df, mappings)``: the working frame and a dict with the mapping
            tables under ``"sexo"`` / ``"grupo"`` for the encodings requested.
        """
        self.expresiones_regulares(columna_texto)
        self.tokenizar(columna_texto)
        self.lematizar(columna_texto)

        mappings = {}
        if columna_sexo:
            mappings["sexo"] = self.label_encodering(columna_sexo, "sexo_codificado", tipo="sexo")
        if columna_grupo:
            mappings["grupo"] = self.label_encodering(columna_grupo, "grupo_codificado", tipo="grupo")
        return self.df, mappings

ImportError: cannot import name 'registry' from 'thinc' (unknown location)

In [None]:
# Domain-specific stop words passed to the preprocessor.
stopwords = {
    "medico",
    "paciente",
    "psicologo",
    "psicologa",
    "psicologia",
    "psicoterapeuta",
    "psicoterapia",
    "refiere",
}

preprocesador = PreprocesadorTexto(df_final, stopwords=stopwords)

# NOTE(review): no visible cell creates the "concatenada" column before this
# call — confirm it already exists in df_final.
df_procesado, mapeos = preprocesador.procesar(
    columna_texto="concatenada",  # column holding the text to process
    columna_sexo="sexo",  # column holding the sex variable
    columna_grupo="grupo",  # column holding the group variable
)

In [17]:
# Read the dataset stored in historias_clinicas_procesadas.xlsx.
# NOTE(review): this rebinds df_final, and no cell in view writes this file —
# confirm where it is produced so the notebook survives Restart & Run All.
df_final = pd.read_excel("historias_clinicas_procesadas.xlsx")

# Split de datos

In [31]:
# Split the data before fitting the vectorizer.
# Features: the text column plus the encoded sex column; target: the encoded group.
X = df_final[["concatenada", "sexo_codificado"]]
y = df_final["grupo_codificado"]

# 70/30 split with a fixed seed.
# NOTE(review): the split is not stratified, and the confusion matrix further
# down shows uneven class sizes — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [32]:
# Show the (rows, columns) shape of the training features.
X_train.shape

(6866, 2)

In [33]:
# Display the training features (the rendered output is abbreviated).
X_train


Unnamed: 0,concatenada,sexo_codificado
5023,"['psiquiatra', 'ver', 'necesitar', 'medicament...",0
7585,"['refierevengo', 'tratamiento', 'año', 'enviar...",0
1598,"['dinmico', 'familiar', 'continuo', 'explicar'...",1
9162,"['expresar', 'momento', 'sentir', 'ocasión', '...",0
300,"['tener', 'problema', 'mamo', 'viernes', 'habl...",0
...,...,...
5734,"['estrado', 'depresin', 'ansiedad', 'tranquili...",0
5191,"['decir', 'cosa', 'padr', 'preocupado', 'separ...",1
5390,"['cosa', 'afectar', 'sentir', 'desbordado', 'a...",1
860,"['venir', 'situacin', 'familiar', 'venir', 'pr...",0


In [34]:
# Vectorization: fit the TF-IDF vectorizer on the training text only, then
# apply the fitted vectorizer to the test text.
vectorizer_LR = TfidfVectorizer()
X_train_vect = vectorizer_LR.fit_transform(X_train["concatenada"])
X_test_vect = vectorizer_LR.transform(X_test["concatenada"])

In [38]:
from sklearn.pipeline import Pipeline

# Build the pipeline. It has a single step, the logistic-regression
# classifier; the TF-IDF vectorizer is applied outside of it.

model_pipeline = Pipeline(steps=[
    ("classifier", LogisticRegression(random_state=42, max_iter=1000))
])

In [39]:
# Fit the pipeline on the TF-IDF training matrix and the training labels.
model_pipeline.fit(X_train_vect, y_train)

0,1,2
,steps,"[('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [40]:
# Make predictions on the vectorized test set.
y_pred = model_pipeline.predict(X_test_vect)
# NOTE(review): [:, 1] keeps only the probability column of the second class.
# The confusion matrix further down is 6x6, so this looks like a leftover from
# a binary setup — confirm whether the full probability matrix is wanted.
y_pred_proba = model_pipeline.predict_proba(X_test_vect)[:, 1]

In [42]:
# Report test-set metrics; precision, recall and F1 are weighted averages over the classes.
print("Accuracy:", accuracy_score(y_test, y_pred))
for etiqueta, metrica in (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
):
    print(f"{etiqueta}:", metrica(y_test, y_pred, average='weighted'))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.6010873258579681
Precision: 0.6199477054971955
Recall: 0.6010873258579681
F1 Score: 0.583542767746734
Confusion Matrix:
 [[304   1 118  36  20  43]
 [ 25  16  78  26   1   2]
 [ 54   0 765 104   7  17]
 [ 49   2 229 383   3   9]
 [ 72   1  64  18  98  44]
 [ 43   0  64  35   9 203]]


# Training con MLFlow

In [43]:
import mlflow

# Honour MLFLOW_TRACKING_URI when it is set, so the notebook can target another
# tracking server without edits; otherwise use the local server as before.
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", "http://127.0.0.1:5000"))

In [44]:
# Print the tracking URI MLflow reports, to confirm the setting above is in effect.
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'http://127.0.0.1:5000'


In [45]:
# List the experiments returned by the configured tracking server.
mlflow.search_experiments()

[<Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1758418427717, experiment_id='0', last_update_time=1758418427717, lifecycle_stage='active', name='Default', tags={}>]

## Prueba con autologging

In [46]:
# Select the experiment for this model family and switch autologging on.
experiment_name = "nlp_logistic_regression_v1"
mlflow.set_experiment(experiment_name)
mlflow.autolog()

with mlflow.start_run():
    # Refit and predict inside the run so the fit happens while the run is active.
    model_pipeline.fit(X_train_vect, y_train)
    y_pred = model_pipeline.predict(X_test_vect)

    mlflow.log_param("model_type", "Logistic Regression")
    # Test-set metrics, logged explicitly in one call.
    mlflow.log_metrics({
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, average='weighted'),
        "recall": recall_score(y_test, y_pred, average='weighted'),
        "f1_score": f1_score(y_test, y_pred, average='weighted'),
    })


2025/09/20 20:41:05 INFO mlflow.tracking.fluent: Experiment with name 'nlp_logistic_regression_v1' does not exist. Creating a new experiment.
2025/09/20 20:41:07 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


🏃 View run monumental-doe-781 at: http://127.0.0.1:5000/#/experiments/1/runs/ead316c92ce44272a3edb391e2805540
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


In [47]:
# train.py

class Train:
    """Split a DataFrame, vectorize its text column(s) and fit a classifier.

    Args:
        df: frame holding the feature columns and the target column.
        target_column: name of the label column.
        model: estimator used as the pipeline's ``"classifier"`` step.
        test_size: fraction of rows held out for testing.
        training_columns: feature columns kept in the split; ``None`` selects
            ``["concatenada", "sexo_codificado"]`` (the previous default).
        random_state: seed forwarded to the split (42, as before, by default).
    """

    def __init__(self, df, target_column, model, test_size=0.3, training_columns=None, random_state=42):
        self.df = df
        self.target_column = target_column
        self.test_size = test_size
        self.model = model
        # Build the default list per instance instead of sharing one mutable default.
        if training_columns is None:
            training_columns = ["concatenada", "sexo_codificado"]
        self.training_columns = list(training_columns)
        self.random_state = random_state
        # Set by vectorizer()/train() to the fitted vectorizer.
        self.vectorizer_model = None

    def train_test_split(self):
        """Return ``(X_train, X_test, y_train, y_test)`` for the configured columns."""
        X_train, X_test, y_train, y_test = train_test_split(
            self.df[self.training_columns],
            self.df[self.target_column],
            test_size=self.test_size,
            random_state=self.random_state,
        )
        return X_train, X_test, y_train, y_test

    def vectorizer(self, vectorizer_model=None, columns_to_vectorize=None):
        """Split the data, fit the vectorizer on the training text, transform both parts.

        Args:
            vectorizer_model: object with ``fit_transform``/``transform``;
                ``None`` creates a fresh ``TfidfVectorizer()`` for this call.
            columns_to_vectorize: text column name or list of names; ``None``
                selects ``["concatenada"]``. Several columns are joined with a
                space into one document per row.

        Returns:
            ``(X_train_vect, X_test_vect)``.
        """
        X_train, X_test, _, _ = self.train_test_split()
        return self._vectorize(X_train, X_test, vectorizer_model, columns_to_vectorize)

    def _vectorize(self, X_train, X_test, vectorizer_model=None, columns_to_vectorize=None):
        """Fit ``vectorizer_model`` on ``X_train`` text and transform both splits."""
        if vectorizer_model is None:
            vectorizer_model = TfidfVectorizer()
        if columns_to_vectorize is None:
            columns_to_vectorize = ["concatenada"]
        elif isinstance(columns_to_vectorize, str):
            columns_to_vectorize = [columns_to_vectorize]
        else:
            columns_to_vectorize = list(columns_to_vectorize)
        if not columns_to_vectorize:
            raise ValueError("columns_to_vectorize must name at least one column")

        self.vectorizer_model = vectorizer_model
        # The vectorizer needs one string per row. Passing a DataFrame would make
        # it iterate over the column names and produce a single "document".
        X_train_vect = vectorizer_model.fit_transform(self._as_documents(X_train, columns_to_vectorize))
        X_test_vect = vectorizer_model.transform(self._as_documents(X_test, columns_to_vectorize))
        return X_train_vect, X_test_vect

    @staticmethod
    def _as_documents(frame, columns):
        """Return a Series with one text document per row of ``frame``."""
        def as_text(column):
            # Missing values become empty text rather than the token "nan".
            return column.astype(str).mask(column.isna(), "")

        documents = as_text(frame[columns[0]])
        for name in columns[1:]:
            documents = documents + " " + as_text(frame[name])
        return documents

    def create_pipeline_train(self):
        """Return a Pipeline whose only step, ``"classifier"``, is ``self.model``."""
        pipeline = Pipeline(steps=[("classifier", self.model)])
        return pipeline

    def train(self):
        """Split once, vectorize the text, fit the pipeline and return it.

        The fitted vectorizer is kept in ``self.vectorizer_model``; the returned
        pipeline expects already-vectorized input.
        """
        # Reuse a single split for features and labels so they cannot get out
        # of step, whatever ``random_state`` is.
        X_train, X_test, y_train, _ = self.train_test_split()
        X_train_vect, _ = self._vectorize(X_train, X_test)
        pipeline = self.create_pipeline_train()
        pipeline.fit(X_train_vect, y_train)
        return pipeline