In [1]:
import pandas as pd
import lightgbm as lgb
import os

In [2]:
# Definir los parámetros de la corrida en un diccionario
PARAM = {
    "experimento": "KA4240_python",
    "input": {
        "dataset": "/home/matias/datasets/dataset_pequeno.csv",
        "training": [202107],  # Meses de entrenamiento
        "future": [202109]  # Meses de aplicación del modelo
    },
    "finalmodel": {
        "semilla": 374063,
        "num_iterations": 559,
        "learning_rate": 0.0100746999,
        "feature_fraction": 0.5144127527,
        "min_data_in_leaf": 505,
        "num_leaves": 44,
        "max_bin": 31
    }
}

In [3]:
# Establecer el directorio de trabajo
os.chdir("/home/matias/Dropbox/ITBA/Mineria_de_Datos/dm2023b/")

In [4]:
# Cargar el dataset donde vamos a entrenar el modelo
dataset = pd.read_csv(PARAM["input"]["dataset"])

# Crear la columna clase01 (binaria)
dataset["clase01"] = dataset["clase_ternaria"].apply(lambda x: 1 if x in ["BAJA+2", "BAJA+1"] else 0)


  dataset = pd.read_csv(PARAM["input"]["dataset"])


In [5]:
# Definir las características a utilizar
campos_buenos = [col for col in dataset.columns if col not in ["clase_ternaria", "clase01"]]


In [6]:
# Establecer la columna "train" para el conjunto de entrenamiento
dataset["train"] = 0
dataset.loc[dataset["foto_mes"].isin(PARAM["input"]["training"]), "train"] = 1


In [7]:
# Crear carpetas para los resultados del experimento
os.makedirs("./exp/", exist_ok=True)
os.makedirs(f"./exp/{PARAM['experimento']}/", exist_ok=True)


In [8]:
# Establecer el directorio de trabajo del experimento
os.chdir(f"./exp/{PARAM['experimento']}/")

In [9]:
# Crear el conjunto de entrenamiento en el formato requerido por LightGBM
dtrain = lgb.Dataset(
    data=dataset.loc[dataset["train"] == 1, campos_buenos],
    label=dataset.loc[dataset["train"] == 1, "clase01"]
)

In [10]:
# Generar el modelo
parametros_modelo = {
    "objective": "binary",
    "max_bin": PARAM["finalmodel"]["max_bin"],
    "learning_rate": PARAM["finalmodel"]["learning_rate"],
    "num_iterations": PARAM["finalmodel"]["num_iterations"],
    "num_leaves": PARAM["finalmodel"]["num_leaves"],
    "min_data_in_leaf": PARAM["finalmodel"]["min_data_in_leaf"],
    "feature_fraction": PARAM["finalmodel"]["feature_fraction"],
    "seed": PARAM["finalmodel"]["semilla"]
}

modelo = lgb.train(parametros_modelo, dtrain)



[LightGBM] [Info] Number of positive: 2346, number of negative: 162336
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3436
[LightGBM] [Info] Number of data points in the train set: 164682, number of used features: 142
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014246 -> initscore=-4.236957
[LightGBM] [Info] Start training from score -4.236957


In [11]:
# Guardar la importancia de las variables
tb_importancia = pd.DataFrame({
    "Feature": campos_buenos,
    "Importance": modelo.feature_importance(importance_type="gain")
})
archivo_importancia = "impo.txt"
tb_importancia.to_csv(archivo_importancia, sep="\t", index=False)

In [12]:
# Aplicar el modelo a los datos sin clase
dapply = dataset.loc[dataset["foto_mes"].isin(PARAM["input"]["future"])]

In [13]:
# Aplicar el modelo a los nuevos datos
prediccion = modelo.predict(dapply[campos_buenos])

In [14]:
# Generar la tabla de entrega
tb_entrega = dapply[["numero_de_cliente", "foto_mes"]]
tb_entrega["prob"] = prediccion

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tb_entrega["prob"] = prediccion


In [15]:
# Guardar las probabilidades del modelo
tb_entrega.to_csv("prediccion.txt", sep="\t", index=False)

In [16]:
# Generar archivos con los mejores "envios"
cortes = list(range(8000, 12001, 500))
for envios in cortes:
    tb_entrega["Predicted"] = 0
    tb_entrega.loc[0:envios, "Predicted"] = 1
    archivo_salida = f"{PARAM['experimento']}_{envios}.csv"
    tb_entrega[["numero_de_cliente", "Predicted"]].to_csv(archivo_salida, sep=",", index=False)

print("\n\nLa generación de los archivos para Kaggle ha terminado\n")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tb_entrega["Predicted"] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tb_entrega["Predicted"] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tb_entrega["Predicted"] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instea



La generación de los archivos para Kaggle ha terminado
