## Entrenamiento de modelo

## Importaciones

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from IPython.display import display

## Cargar dataset limpio

In [2]:
# Read the cleaned dataset from its path relative to this notebook and
# render it with the notebook's rich display.
# NOTE(review): the rendered table shows an 'Unnamed: 0' header — confirm
# whether that is just the displayed index or a saved index column in the CSV.
dataset_path = '../source/dataset_limpio.csv'
df = pd.read_csv(dataset_path)
display(df)

Unnamed: 0,condition,price,base_price,price_diff,sold_quantity,available_quantity,accepts_mercadopago,free_shipping,shipping_mode,country,state,city,neighborhood,num_pictures,num_non_mp_methods,title_length
0,0,-0.006082,-0.006082,-0.004688,-0.056156,-0.080423,0.150644,-0.176346,0.92602,0.005477,-0.140042,0.635837,1.610996,-0.442911,1.003931,1.127878
1,1,-0.005784,-0.005784,-0.004688,-0.056156,-0.080423,0.150644,-0.176346,-0.57036,0.005477,-0.140042,-1.130135,2.051044,1.459301,0.295280,0.897041
2,1,-0.006084,-0.006084,-0.004688,-0.056156,-0.080423,0.150644,-0.176346,-0.57036,0.005477,-0.140042,-1.168364,-0.374722,-0.918464,1.003931,-0.180203
3,0,-0.006024,-0.006024,-0.004688,-0.056156,-0.080423,0.150644,-0.176346,-0.57036,0.005477,-0.140042,-0.529088,0.219343,-0.442911,1.003931,0.512311
4,1,-0.006088,-0.006088,-0.004688,-0.056156,-0.080423,0.150644,-0.176346,0.92602,0.005477,-0.407615,0.867336,-0.737762,-0.442911,1.003931,-1.565230
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1,-0.006088,-0.006088,-0.004688,-0.056156,-0.080423,0.150644,-0.176346,-0.57036,0.005477,-0.407615,-0.646961,-0.737762,1.459301,0.295280,1.050932
99996,0,-0.006063,-0.006063,-0.004688,-0.056156,-0.070917,0.150644,-0.176346,0.92602,0.005477,-0.407615,-0.068215,-0.737762,0.032642,0.295280,1.127878
99997,1,-0.006022,-0.006022,-0.004688,-0.056156,-0.080423,0.150644,-0.176346,-0.57036,0.005477,-0.407615,0.145230,-0.737762,-0.918464,-1.122020,-1.796068
99998,1,-0.006085,-0.006085,-0.004688,-0.056156,-0.080423,0.150644,-0.176346,-0.57036,0.005477,-0.140042,-1.278803,-0.534239,-0.918464,0.295280,0.127581


In [3]:
# Report the dtype of the target column before it is used for training.
condition_dtype = df['condition'].dtype
print(f"Tipo de 'condition': {condition_dtype}")

Tipo de 'condition': int64


## Preparación de entrenamiento

In [4]:
# Target variable: the 'condition' column.
y = df['condition']
# Features: every remaining column of the frame.
# NOTE(review): verify the CSV does not carry a saved index column
# (e.g. 'Unnamed: 0'); if it does, it would be used as a feature here.
X = df.drop(columns=['condition'])

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Sanity-check the number of rows that ended up in each split.
n_train_rows = len(X_train)
n_test_rows = len(X_test)
print(f"Tamaño del conjunto de entrenamiento: {n_train_rows}")
print(f"Tamaño del conjunto de prueba: {n_test_rows}")

Tamaño del conjunto de entrenamiento: 80000
Tamaño del conjunto de prueba: 20000


## Logistic Regression

In [5]:
# Build the logistic regression classifier (iteration cap raised to 1000)
# and fit it on the training split.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred_lr = lr_model.predict(X_test)

# Evaluate the predictions against the true test labels.
# classification_report and confusion_matrix are already imported in the
# notebook's import cell, so they are not re-imported here.
print("Matriz de Confusión (Regresión Logística):\n", confusion_matrix(y_test, y_pred_lr))
print("\nReporte de clasificación (Regresión Logística):\n", classification_report(y_test, y_pred_lr))

Matriz de Confusión (Regresión Logística):
 [[6718 3999]
 [2080 7203]]

Reporte de clasificación (Regresión Logística):
               precision    recall  f1-score   support

           0       0.76      0.63      0.69     10717
           1       0.64      0.78      0.70      9283

    accuracy                           0.70     20000
   macro avg       0.70      0.70      0.70     20000
weighted avg       0.71      0.70      0.70     20000



## Random Forest

In [6]:
# Build a 100-tree random forest with a fixed seed and fit it on the
# training split (fit returns the estimator itself, so the calls chain).
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred_rf = rf_model.predict(X_test)

# Compute the evaluation artifacts first, then print them.
rf_confusion = confusion_matrix(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
print("Matriz de Confusión (Random Forest):\n", rf_confusion)
print("\nReporte de clasificación (Random Forest):\n", rf_report)

Matriz de Confusión (Random Forest):
 [[8449 2268]
 [1363 7920]]

Reporte de clasificación (Random Forest):
               precision    recall  f1-score   support

           0       0.86      0.79      0.82     10717
           1       0.78      0.85      0.81      9283

    accuracy                           0.82     20000
   macro avg       0.82      0.82      0.82     20000
weighted avg       0.82      0.82      0.82     20000



## XGBoost

In [7]:
# Build the XGBoost classifier with a fixed seed and fit it on the
# training split.
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred_xgb = xgb_model.predict(X_test)

# Compute the evaluation artifacts first, then print them.
xgb_confusion = confusion_matrix(y_test, y_pred_xgb)
xgb_report = classification_report(y_test, y_pred_xgb)
print("\nMatriz de Confusión (XGBoost):\n", xgb_confusion)
print("\nReporte de clasificación (XGBoost):\n", xgb_report)


Matriz de Confusión (XGBoost):
 [[8339 2378]
 [1178 8105]]

Reporte de clasificación (XGBoost):
               precision    recall  f1-score   support

           0       0.88      0.78      0.82     10717
           1       0.77      0.87      0.82      9283

    accuracy                           0.82     20000
   macro avg       0.82      0.83      0.82     20000
weighted avg       0.83      0.82      0.82     20000

