In [1]:
## ensemble

In [8]:
# Start from a clean session: show the current time, drop every object in the
# workspace (hidden ones included) and force a full garbage collection.
Sys.time()
rm(list = ls(all.names = TRUE))
gc(verbose = FALSE, full = TRUE)

[1] "2025-12-01 10:46:24 UTC"

Unnamed: 0,used,(Mb),gc trigger,(Mb).1,max used,(Mb).2
Ncells,781831,41.8,1810727,96.8,1567777,83.8
Vcells,1598929,12.2,1706870909,13022.4,1545618018,11792.2


In [9]:
# Make the user's home directory the working directory for the rest of the
# notebook.
setwd("~")

In [10]:
Sys.time()
# library() stops right here if data.table is not installed; require() only
# returns FALSE, which would postpone the failure to the first fread() call.
library("data.table")

# Read the two raw competition files and stack them into one table
dataset1 <- fread("~/datasets/competencia_02_crudo.csv.gz")
dataset2 <- fread("~/datasets/competencia_03_crudo.csv.gz")

# NOTE(review): if both files contain the same (numero_de_cliente, foto_mes)
# rows, rbind() keeps the duplicates and the leads computed below would be
# wrong -- confirm the two files cover disjoint months.
dataset <- rbind(dataset1, dataset2)


# Working table: original row position, client id and a consecutive month
# index (year * 12 + month) so that adjacent months differ by exactly 1
dsimple <- dataset[, list(
  "pos" = .I,
  numero_de_cliente,
  periodo0 = as.integer(foto_mes / 100) * 12 + foto_mes %% 100
)]


# Sort by client and month so shift() sees each client's history in order
setorder(dsimple, numero_de_cliente, periodo0)

# Last and second-to-last month present in the data
periodo_ultimo <- dsimple[, max(periodo0)]
periodo_anteultimo <- periodo_ultimo - 1


# For every row, the month index of the same client's next record (periodo1)
# and of the one after that (periodo2); NA when there is no such record
dsimple[, c("periodo1", "periodo2") :=
  shift(periodo0, n = 1:2, fill = NA, type = "lead"), numero_de_cliente
]

# Default label for every row before the second-to-last month; the BAJA
# assignments below overwrite it where they apply
dsimple[periodo0 < periodo_anteultimo, clase_ternaria := "CONTINUA"]

# BAJA+1: the client has no record in the immediately following month
# (no next record at all, or a gap before it)
dsimple[periodo0 < periodo_ultimo &
  (is.na(periodo1) | periodo0 + 1 < periodo1),
  clase_ternaria := "BAJA+1"
]

# BAJA+2: the client is present next month but not in the month after that
dsimple[periodo0 < periodo_anteultimo & (periodo0 + 1 == periodo1) &
  (is.na(periodo2) | periodo0 + 2 < periodo2),
  clase_ternaria := "BAJA+2"
]

# Restore the original row order and copy the label into the main dataset
setorder(dsimple, pos)
dataset[, clase_ternaria := dsimple$clase_ternaria]

# Release the working table
rm(dsimple)
gc()
Sys.time()

[1] "2025-12-01 10:46:58 UTC"

Unnamed: 0,used,(Mb),gc trigger,(Mb).1,max used,(Mb).2
Ncells,782726,41.9,1810727,96.8,1567777,83.8
Vcells,1301417553,9929.1,2048325090,15627.5,1704858989,13007.1


[1] "2025-12-01 10:51:18 UTC"

In [11]:
# 1) Rows of the evaluation month (202107) and the real gain of contacting
#    each client: +780000 for a BAJA+2, -20000 otherwise
dfuture <- dataset[foto_mes == 202107]
dfuture[, ganancia := ifelse(clase_ternaria == "BAJA+2", 780000, -20000)]


# 2) Locate the prediction file of every experiment
files <- list.files("/home/guadalesandro/buckets/b1/exp",
                    pattern = "^prediccion_mid\\.txt$",
                    recursive = TRUE,
                    full.names = TRUE)

# Without this guard an empty listing only fails later, with an unrelated
# "column not found" error
if (length(files) == 0L) {
  stop("no prediccion_mid.txt files found under the experiments directory",
       call. = FALSE)
}


# 3) Read every file (tab separated, with header) and stack them
pred_list <- lapply(files, function(f) {
  fread(f, sep = "\t", header = TRUE, encoding = "UTF-8")
})

pred <- rbindlist(pred_list, use.names = TRUE, fill = TRUE)


# 4) Attach the real gain to every prediction, joining on the client id
#    (cast to character on both sides so the key types match)
pred[, numero_de_cliente := as.character(numero_de_cliente)]
dfuture[, numero_de_cliente := as.character(numero_de_cliente)]
pred[, prob := as.numeric(prob)]


pred <- merge(
  pred[, .(numero_de_cliente, prob)],
  dfuture[, .(numero_de_cliente, ganancia)],
  by = "numero_de_cliente",
  all.x = TRUE
)

# A client with no known gain would turn cumsum() -- and therefore the final
# gain -- into NA, so such rows are reported and left out of the evaluation
clientes_sin_ganancia <- pred[is.na(ganancia), uniqueN(numero_de_cliente)]
if (clientes_sin_ganancia > 0L) {
  warning(sprintf("%d predicted clients have no known ganancia; dropped",
                  clientes_sin_ganancia),
          call. = FALSE)
  pred <- pred[!is.na(ganancia)]
}

# 5) Average the probability of each client across all models
pred_ensamble <- pred[, .(
  prob_mean = mean(prob),
  ganancia = first(ganancia)
), by = numero_de_cliente]

# 6) Sort from most to least probable and accumulate the gain
setorder(pred_ensamble, -prob_mean)
pred_ensamble[, gan_acum := cumsum(ganancia)]

# 7) Final gain when contacting the `envios` most probable clients
envios <- 11500
if (nrow(pred_ensamble) < envios) {
  warning(sprintf("only %d clients available, fewer than envios = %d",
                  nrow(pred_ensamble), envios),
          call. = FALSE)
}
# Cap the cutoff at the table size so a short table yields the gain of
# contacting everyone instead of NA
ganancia_final <- pred_ensamble[min(envios, nrow(pred_ensamble)), gan_acum]
ganancia_final

print(files)

 [1] "/home/guadalesandro/buckets/b1/exp/apo-505_C3_b2/prediccion_mid.txt"
 [2] "/home/guadalesandro/buckets/b1/exp/apo-505_C3_d/prediccion_mid.txt" 
 [3] "/home/guadalesandro/buckets/b1/exp/apo-505_C3_e/prediccion_mid.txt" 
 [4] "/home/guadalesandro/buckets/b1/exp/apo-505_C3_e2/prediccion_mid.txt"
 [5] "/home/guadalesandro/buckets/b1/exp/apo-505_C3_f/prediccion_mid.txt" 
 [6] "/home/guadalesandro/buckets/b1/exp/apo-505_C3_h/prediccion_mid.txt" 
 [7] "/home/guadalesandro/buckets/b1/exp/apo-505_C3_j/prediccion_mid.txt" 
 [8] "/home/guadalesandro/buckets/b1/exp/apo-505_C3_k/prediccion_mid.txt" 
 [9] "/home/guadalesandro/buckets/b1/exp/apo-505_C3_k2/prediccion_mid.txt"
[10] "/home/guadalesandro/buckets/b1/exp/apo-505_C3_k3/prediccion_mid.txt"
[11] "/home/guadalesandro/buckets/b1/exp/apo-505_C3_k4/prediccion_mid.txt"
[12] "/home/guadalesandro/buckets/b1/exp/apo-505_C3_k5/prediccion_mid.txt"
[13] "/home/guadalesandro/buckets/b1/exp/apo-505_C3_k6/prediccion_mid.txt"
[14] "/home/guadalesandro

In [12]:
# ====== Ensemble classification metrics ======
# Uses pred_ensamble (already sorted by descending prob_mean), envios and
# ganancia_final as left by the ensemble evaluation cell.

# Real class: 1 when the client is a BAJA+2 (positive gain), 0 otherwise
pred_ensamble[, clase_real := ifelse(ganancia == 780000, 1, 0)]

# Predicted class: 1 for the `envios` most probable clients, 0 for the rest.
# seq_len(min(...)) keeps the row range valid when the table has fewer than
# `envios` rows; 1:envios would be out of range in that case.
pred_ensamble[, clase_pred := 0]
pred_ensamble[seq_len(min(envios, nrow(pred_ensamble))), clase_pred := 1]

# Confusion matrix counts
TP <- pred_ensamble[clase_real == 1 & clase_pred == 1, .N]
FP <- pred_ensamble[clase_real == 0 & clase_pred == 1, .N]
TN <- pred_ensamble[clase_real == 0 & clase_pred == 0, .N]
FN <- pred_ensamble[clase_real == 1 & clase_pred == 0, .N]

# Metrics derived from the confusion matrix
accuracy  <- (TP + TN) / (TP + TN + FP + FN)
precision <- TP / (TP + FP)
recall    <- TP / (TP + FN)
f1        <- 2 * precision * recall / (precision + recall)

# Show the results
list(
  ganancia_final = ganancia_final,
  accuracy  = accuracy,
  precision = precision,
  recall    = recall,
  f1        = f1,
  TP = TP, FP = FP, TN = TN, FN = FN
)


In [14]:
# ========= Metrics for each individual model ==========

# Reload every prediction file, tagging its rows with the model they come from
pred_list <- lapply(files, function(f) {
  df <- fread(f, sep = "\t", header = TRUE, encoding = "UTF-8")

  # The experiment folder name identifies the model
  df[, modelo := basename(dirname(f))]

  df
})

pred_all <- rbindlist(pred_list, use.names = TRUE, fill = TRUE)

# Fix column types and join with the real outcome
pred_all[, numero_de_cliente := as.character(numero_de_cliente)]
pred_all[, prob := as.numeric(prob)]

# Work on a copy so the type cast does not touch dfuture itself
dfuture2 <- copy(dfuture)
dfuture2[, numero_de_cliente := as.character(numero_de_cliente)]

pred_all <- merge(
  pred_all,
  dfuture2[, .(numero_de_cliente, ganancia)],
  by = "numero_de_cliente",
  all.x = TRUE
)

# Real class: 1 when the client is a BAJA+2 (positive gain), 0 otherwise.
# Rows whose ganancia is NA get an NA class and are counted in none of the
# four confusion-matrix cells below.
pred_all[, clase_real := ifelse(ganancia == 780000, 1, 0)]

envios <- 11500

# Confusion matrix and metrics per model, predicting 1 for each model's
# `envios` most probable clients
metricas_modelo <- pred_all[
  , {
      # Positions of the top-`envios` rows of this model by descending prob;
      # order() is stable, so tied probabilities keep their current row order.
      # min() keeps the selection valid for models with fewer rows than envios.
      top_k <- order(-prob)[seq_len(min(envios, .N))]

      clase_pred <- integer(.N)
      clase_pred[top_k] <- 1L

      # na.rm = TRUE leaves out rows with an unknown real class
      TP <- sum(clase_real == 1 & clase_pred == 1L, na.rm = TRUE)
      FP <- sum(clase_real == 0 & clase_pred == 1L, na.rm = TRUE)
      TN <- sum(clase_real == 0 & clase_pred == 0L, na.rm = TRUE)
      FN <- sum(clase_real == 1 & clase_pred == 0L, na.rm = TRUE)

      accuracy  <- (TP + TN) / (TP + TN + FP + FN)
      precision <- TP / (TP + FP)
      recall    <- TP / (TP + FN)
      f1        <- 2 * precision * recall / (precision + recall)

      list(
        accuracy = accuracy,
        precision = precision,
        recall = recall,
        f1 = f1,
        TP = TP, FP = FP, TN = TN, FN = FN
      )
    },
  by = modelo
]

metricas_modelo


modelo,accuracy,precision,recall,f1,TP,FP,TN,FN
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,<int>
apo-505_C3_b2,0.9324892,0.07113043,0.6554487,0.1283339,818,10682,152666,430
apo-505_C3_d,0.9323313,0.07,0.6450321,0.1262943,805,10695,152653,443
apo-505_C3_e,0.9324406,0.07078261,0.6522436,0.1277063,814,10686,152662,434
apo-505_C3_e2,0.9324892,0.07113043,0.6554487,0.1283339,818,10682,152666,430
apo-505_C3_f,0.9324528,0.07086957,0.6530449,0.1278632,815,10685,152663,433
apo-505_C3_h,0.932392,0.07043478,0.6490385,0.1270788,810,10690,152658,438
apo-505_C3_j,0.9324042,0.07052174,0.6498397,0.1272356,811,10689,152659,437
apo-505_C3_k,0.9325986,0.07191304,0.6626603,0.1297458,827,10673,152675,421
apo-505_C3_k2,0.9325135,0.07130435,0.6570513,0.1286476,820,10680,152668,428
apo-505_C3_k3,0.9323799,0.07034783,0.6482372,0.1269219,809,10691,152657,439


In [None]:
## predicción final ensamble

In [21]:
# Rows of month 202109. This overwrites the dfuture built earlier for 202107;
# the final-prediction cell uses its client ids to filter the ensemble.
dfuture <- dataset[foto_mes == 202109]

In [28]:
# 1 - Collect the final prediction file of every experiment and stack them
# NOTE(review): the listing printed by this cell includes experiments whose
# folder name has no "C3" (e.g. apo-505_a, apo-505_base_integrada), and the
# trailing "/C3" remark below suggests a narrower folder was considered --
# confirm that every listed model belongs in this ensemble.
files <- list.files("/home/guadalesandro/buckets/b1/exp", #/C3
                    pattern = "^prediccion\\.txt$",
                    recursive = TRUE,
                    full.names = TRUE)

print(files)

# Without this guard an empty listing only fails later, with an unrelated
# "column not found" error
if (length(files) == 0L) {
  stop("no prediccion.txt files found under the experiments directory",
       call. = FALSE)
}

pred_list <- lapply(files, function(f) {
  fread(f, sep = "\t", header = TRUE, encoding = "UTF-8")
})
pred <- rbindlist(pred_list, use.names = TRUE, fill = TRUE)


pred[, numero_de_cliente := as.character(numero_de_cliente)]
pred[, prob := as.numeric(prob)]

# Average the probability of each client across models and sort from most to
# least probable
pred_ensamble <- pred[, .(prob_mean = mean(prob, na.rm = TRUE)),
                      by = numero_de_cliente]
setorder(pred_ensamble, -prob_mean)

# Keep only the clients present in dfuture. The ids are compared as character
# on both sides instead of relying on the implicit coercion done by %in%.
clientes_future <- unique(as.character(dfuture$numero_de_cliente))
pred_ensamble <- pred_ensamble[numero_de_cliente %in% clientes_future]


# Flag the `envios` most probable clients. seq_len(min(...)) keeps the row
# range valid when fewer than `envios` clients remain after the filter;
# 1:envios would be out of range in that case.
envios <- 11500
if (nrow(pred_ensamble) < envios) {
  warning(sprintf("only %d clients available, fewer than envios = %d",
                  nrow(pred_ensamble), envios),
          call. = FALSE)
}
pred_ensamble[, Predicted := 0L]
pred_ensamble[seq_len(min(envios, nrow(pred_ensamble))), Predicted := 1L]

cat("Cantidad de registros en el archivo final:", nrow(pred_ensamble), "\n")

# Save the result in the final output folder
output_dir <- "/home/guadalesandro/dmeyf2025/babycode/R_C3/C3_final"

# Create the folder if it does not exist yet
if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE)
archivo_final <- file.path(output_dir, "C3_final_Alesandro.csv")

fwrite(
  pred_ensamble[, .(numero_de_cliente, Predicted)],
  file = archivo_final,
  sep = ","
)

cat("Archivo guardado en:", archivo_final, "\n")


 [1] "/home/guadalesandro/buckets/b1/exp/apo-505_a/prediccion.txt"                                                 
 [2] "/home/guadalesandro/buckets/b1/exp/apo-505_a2/prediccion.txt"                                                
 [3] "/home/guadalesandro/buckets/b1/exp/apo-505_base_integrada/prediccion.txt"                                    
 [4] "/home/guadalesandro/buckets/b1/exp/apo-505_C3_b/prediccion.txt"                                              
 [5] "/home/guadalesandro/buckets/b1/exp/apo-505_C3_c/prediccion.txt"                                              
 [6] "/home/guadalesandro/buckets/b1/exp/apo-505_C3_d/prediccion.txt"                                              
 [7] "/home/guadalesandro/buckets/b1/exp/apo-505_C3_e/prediccion.txt"                                              
 [8] "/home/guadalesandro/buckets/b1/exp/apo-505_C3_e2/prediccion.txt"                                             
 [9] "/home/guadalesandro/buckets/b1/exp/apo-505_C3_f/prediccion.txt"   