## El siguiente código es un prototipo del código real que buscará el mejor modelo para predecir los horarios óptimos de llamada por votante

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# 1) Start the Spark session (getOrCreate returns the active session if one
#    already exists); Hive support is enabled so that metastore tables can
#    be looked up by name.
spark = (
    SparkSession.builder
    .appName("Modelo_Prediccion_Respuesta_Llamada")
    .enableHiveSupport()
    .getOrCreate()
)

# 2) Load the tables registered in the metastore
VOTERS_TABLE = "call_center_db.silver_voters"
CALLS_TABLE = "call_center_db.silver_calls"

voters_df = spark.table(VOTERS_TABLE)
calls_df = spark.table(CALLS_TABLE)

# 3) Attach voter attributes to every call, matching on voter_id (the unique
#    voter code). This is a LEFT join from calls: a call whose voter_id has
#    no row in the voters table is kept, with the voter columns set to NULL.
#
#    voter_id is read from the calls side ("c"). Because the join is declared
#    with on="voter_id", the result exposes a single voter_id column that
#    comes from the left (calls) input. Asking for "v.voter_id" instead either
#    fails to resolve or yields NULL for unmatched calls, depending on the
#    Spark version.
joined_df = (
    calls_df.alias("c")
    .join(voters_df.alias("v"), on="voter_id", how="left")
    .select(
        "c.voter_id",
        "v.full_name",
        "v.district_number",
        "v.age",
        "v.gender",
        "c.call_date",
        "c.phone_number_dialed",
        "c.status",
    )
)

# 4) Derive the hour of the call and the binary target 'answered'
from pyspark.sql.functions import col, hour, lower, to_timestamp, when

# Hour of day, parsed from call_date using the fixed pattern below.
call_hour_expr = hour(to_timestamp("call_date", "yyyy-MM-dd HH:mm:ss"))

# 1 when status (compared in lower case) is "answered" or "completed", else 0.
# NOTE(review): a NULL status also ends up as 0 because it falls through to
# otherwise() -- confirm that "unknown outcome" should count as "not answered".
answered_expr = when(
    lower(col("status")).isin("answered", "completed"), 1
).otherwise(0)

# Keep only the model inputs and the label, then drop every row that has a
# NULL in any of them.
processed_df = joined_df.select(
    "age",
    "gender",
    call_hour_expr.alias("call_hour"),
    answered_expr.alias("answered"),
).dropna()

# 5) Preprocessing of categorical variables
# handleInvalid="keep" sends gender values that were not seen while fitting
# into an extra "unknown" bucket. With the default ("error"), a value that
# only shows up in the test split (or in later scoring data) makes
# model.transform() fail.
index_gender = StringIndexer(
    inputCol="gender",
    outputCol="gender_index",
    handleInvalid="keep",
)
onehot_gender = OneHotEncoder(inputCol="gender_index", outputCol="gender_vec")

# 6) Assemble the feature vector
assembler = VectorAssembler(
    inputCols=["age", "call_hour", "gender_vec"],
    outputCol="features",
)

# 7) Classification model (fixed seed for reproducible forests)
rf = RandomForestClassifier(labelCol="answered", featuresCol="features", seed=42)

# 8) Pipeline: index gender -> one-hot encode -> assemble -> classify
pipeline = Pipeline(stages=[index_gender, onehot_gender, assembler, rf])

# 9) Train the model: random split with weights 0.8 / 0.2 (fixed seed), then
#    fit the whole pipeline on the training part only.
splits = processed_df.randomSplit([0.8, 0.2], seed=123)
train_data = splits[0]
test_data = splits[1]

model = pipeline.fit(train_data)

# 10) Evaluation on the held-out split
# The evaluator is created with only labelCol set, so the metric it reports
# is its default one (area under the ROC curve per the Spark ML docs).
evaluator = BinaryClassificationEvaluator(labelCol="answered")

predictions = model.transform(test_data)
auc = evaluator.evaluate(predictions)

print(f"AUC del modelo: {auc:.4f}")
