<a href="https://colab.research.google.com/github/LCaravaggio/AnalisisPredictivo/blob/master/Kaggle/2024Q2/Baseline_Kaggle_2024Q2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import userdata
import json

!mkdir ~/.kaggle
!touch ~/.kaggle/kaggle.json

api_token = {
    'username': userdata.get('KAGGLE_USER'),
    'key': userdata.get('KAGGLE_KEY')}
with open('/root/.kaggle/kaggle.json', 'w') as file:
    json.dump(api_token, file)

!chmod 600 ~/.kaggle/kaggle.json

!kaggle competitions download -c analisis-predictivo-2024-q-2

mkdir: cannot create directory ‘/root/.kaggle’: File exists
analisis-predictivo-2024-q-2.zip: Skipping, found more recently modified local copy (use --force to force download)


In [2]:
import zipfile
import os

os.listdir()

for file in os.listdir():
    if file.endswith('.zip'):
      zip_ref = zipfile.ZipFile(file, 'r')
      zip_ref.extractall()
      zip_ref.close()

In [3]:
!pip install optuna



In [4]:
import pandas as pd
import numpy as np
import optuna
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Cargar los datos de entrenamiento desde el archivo JSONLines
train_file_path = '/content/train_data.jsonlines'
df_train = pd.read_json(train_file_path, lines=True)

# Función para expandir columnas con listas o diccionarios
def expand_columns(df):
    cols_to_expand = True
    while cols_to_expand:
        cols_to_expand = False
        for col in df.columns:
            if df[col].apply(lambda x: isinstance(x, dict)).any():
                expanded_df = pd.json_normalize(df[col])
                expanded_df.columns = [f'{col}_{key}' for key in expanded_df.columns]
                df = df.drop(columns=[col]).join(expanded_df)
                cols_to_expand = True
            elif df[col].apply(lambda x: isinstance(x, list)).any():
                expanded_df = df[col].apply(pd.Series)
                expanded_df.columns = [f'{col}_{i}' for i in range(expanded_df.shape[1])]
                df = df.drop(columns=[col]).join(expanded_df)
                cols_to_expand = True
    return df

# Expande las columnas que contienen diccionarios o listas
df_train = expand_columns(df_train)

# Separar características (X) y la variable objetivo (y)
X_train = df_train.drop('condition', axis=1)
y_train = df_train['condition']

# Seleccionar las columnas numéricas y categóricas
num_features = X_train.select_dtypes(include=['int64', 'float64']).columns
cat_features = X_train.select_dtypes(include=['object', 'category']).columns

# Preprocesamiento para las columnas numéricas
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Preprocesamiento para las columnas categóricas
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Crear un preprocesador que aplique las transformaciones necesarias a las columnas correspondientes
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ])

# Función de objetivo para Optuna
def objective(trial):
    # Definir los parámetros a ajustar
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 5, 30)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)

    # Crear el modelo con los hiperparámetros propuestos
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=42
        ))
    ])

    # Entrenar el modelo
    model.fit(X_train, y_train)

    # Evaluar el modelo
    y_train_pred = model.predict(X_train)
    accuracy = accuracy_score(y_train, y_train_pred)
    return accuracy

# Crear un estudio de Optuna para maximizar la precisión
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Imprimir los mejores parámetros encontrados
print("Mejores parámetros encontrados: ", study.best_params)
print("Mejor precisión en datos de entrenamiento: ", study.best_value)

# Usar los mejores parámetros para ajustar el modelo final
best_params = study.best_params
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        min_samples_split=best_params['min_samples_split'],
        min_samples_leaf=best_params['min_samples_leaf'],
        random_state=42
    ))
])

# Entrenar el modelo con los mejores parámetros
model.fit(X_train, y_train)

# Evaluar el modelo final
y_train_pred = model.predict(X_train)
print("Accuracy en datos de entrenamiento con mejor modelo:", accuracy_score(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

# Cargar los datos de prueba desde el archivo JSONLines
test_file_path = '/content/test_data.jsonlines'
df_test = pd.read_json(test_file_path, lines=True)

# Expande las columnas que contienen diccionarios o listas en los datos de prueba
df_test = expand_columns(df_test)

# Asegurarse de que las columnas de test coincidan con las del train
# Obtener columnas que están en train pero no en test
missing_cols = set(X_train.columns) - set(df_test.columns)

# Crear columnas faltantes en df_test con valores por defecto (0)
for col in missing_cols:
    df_test[col] = 0

# Asegurarse de que las columnas de test coincidan con las del train
df_test = df_test[X_train.columns]

# Preprocesar los datos de prueba usando el preprocesador del modelo entrenado
X_test_preprocessed = model.named_steps['preprocessor'].transform(df_test)

# Hacer predicciones en los datos de prueba
predictions = model.named_steps['classifier'].predict(X_test_preprocessed)

# Convertir las predicciones a un DataFrame
df_predictions = pd.DataFrame(predictions, columns=['condition'])

# Guardar las predicciones en un archivo CSV
output_file_path = '/content/predictions.csv'
df_predictions.to_csv(output_file_path, index=False)

print("Predicciones guardadas en:", output_file_path)

[I 2024-07-11 19:30:00,256] A new study created in memory with name: no-name-2294e4c6-c417-45bd-9982-69e9ac919f98
[I 2024-07-11 19:33:02,452] Trial 0 finished with value: 0.6711857142857143 and parameters: {'n_estimators': 215, 'max_depth': 21, 'min_samples_split': 8, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.6711857142857143.
[I 2024-07-11 19:35:00,993] Trial 1 finished with value: 0.5513857142857143 and parameters: {'n_estimators': 82, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.6711857142857143.
[I 2024-07-11 19:37:31,951] Trial 2 finished with value: 0.6318285714285714 and parameters: {'n_estimators': 151, 'max_depth': 24, 'min_samples_split': 10, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.6711857142857143.
[I 2024-07-11 19:40:01,444] Trial 3 finished with value: 0.7067571428571429 and parameters: {'n_estimators': 153, 'max_depth': 25, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 3 with value

Mejores parámetros encontrados:  {'n_estimators': 206, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 1}
Mejor precisión en datos de entrenamiento:  0.7726
Accuracy en datos de entrenamiento con mejor modelo: 0.7726
              precision    recall  f1-score   support

         new       0.71      0.96      0.82     37588
        used       0.93      0.55      0.69     32412

    accuracy                           0.77     70000
   macro avg       0.82      0.76      0.76     70000
weighted avg       0.81      0.77      0.76     70000



  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[col] = 0
  df_test[co

Predicciones guardadas en: /content/predictions.csv


In [5]:
from google.colab import files

# Descargar el archivo de predicciones
files.download('/content/predictions.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>