In [None]:
!pip install pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('tf').getOrCreate()



In [None]:
# Read the CSV: the first row supplies the column names and Spark infers each
# column's type from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("BankChurners.csv")
)

# Inspect the inferred schema and a small sample of rows.
df.printSchema()
df.show(5)

root
 |-- CLIENTNUM: integer (nullable = true)
 |-- Attrition_Flag: string (nullable = true)
 |-- Customer_Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Dependent_count: integer (nullable = true)
 |-- Education_Level: string (nullable = true)
 |-- Marital_Status: string (nullable = true)
 |-- Income_Category: string (nullable = true)
 |-- Card_Category: string (nullable = true)
 |-- Months_on_book: integer (nullable = true)
 |-- Total_Relationship_Count: integer (nullable = true)
 |-- Months_Inactive_12_mon: integer (nullable = true)
 |-- Contacts_Count_12_mon: integer (nullable = true)
 |-- Credit_Limit: double (nullable = true)
 |-- Total_Revolving_Bal: integer (nullable = true)
 |-- Avg_Open_To_Buy: double (nullable = true)
 |-- Total_Amt_Chng_Q4_Q1: double (nullable = true)
 |-- Total_Trans_Amt: integer (nullable = true)
 |-- Total_Trans_Ct: integer (nullable = true)
 |-- Total_Ct_Chng_Q4_Q1: double (nullable = true)
 |-- Avg_Utilization_Ratio: double (n

In [None]:
# Columns kept for modelling: the target label followed by the numeric predictors.
columnas_modelo = [
    'Attrition_Flag',
    'Total_Amt_Chng_Q4_Q1',
    'Total_Ct_Chng_Q4_Q1',
    'Total_Revolving_Bal',
    'Contacts_Count_12_mon',
    'Avg_Utilization_Ratio',
    'Total_Trans_Amt',
    'Months_Inactive_12_mon',
    'Total_Relationship_Count',
]
df = df.select(*columnas_modelo)

# Drop every row that has a missing value in any of the kept columns.
df = df.dropna()

df.show(5)

+-----------------+--------------------+-------------------+-------------------+---------------------+---------------------+---------------+----------------------+------------------------+
|   Attrition_Flag|Total_Amt_Chng_Q4_Q1|Total_Ct_Chng_Q4_Q1|Total_Revolving_Bal|Contacts_Count_12_mon|Avg_Utilization_Ratio|Total_Trans_Amt|Months_Inactive_12_mon|Total_Relationship_Count|
+-----------------+--------------------+-------------------+-------------------+---------------------+---------------------+---------------+----------------------+------------------------+
|Existing Customer|               1.335|              1.625|                777|                    3|                0.061|           1144|                     1|                       5|
|Existing Customer|               1.541|              3.714|                864|                    2|                0.105|           1291|                     1|                       6|
|Existing Customer|               2.594|              2

In [None]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,
                                OneHotEncoder,StringIndexer)
from pyspark.ml import Pipeline

In [None]:
# Numeric predictors that are packed into the single 'features' vector column.
columnas_features = [
    'Total_Amt_Chng_Q4_Q1',
    'Total_Ct_Chng_Q4_Q1',
    'Total_Revolving_Bal',
    'Contacts_Count_12_mon',
    'Avg_Utilization_Ratio',
    'Total_Trans_Amt',
    'Months_Inactive_12_mon',
    'Total_Relationship_Count',
]
assembler = VectorAssembler(inputCols=columnas_features, outputCol='features')

# Map the string label in 'Attrition_Flag' to a numeric index column.
attrition_indexer = StringIndexer(
    inputCol='Attrition_Flag',
    outputCol='Attrition_Flag_Index',
)

In [None]:
# Preprocessing pipeline: index the label first, then assemble the feature vector.
etapas = [attrition_indexer, assembler]
pipeline = Pipeline(stages=etapas)

In [None]:
# Fit the pipeline stages on the data, then apply the fitted pipeline to the same
# data to append the 'Attrition_Flag_Index' and 'features' columns.
pipeline_ajustado = pipeline.fit(df)
df = pipeline_ajustado.transform(df)
df.show(5)

+-----------------+--------------------+-------------------+-------------------+---------------------+---------------------+---------------+----------------------+------------------------+--------------------+--------------------+
|   Attrition_Flag|Total_Amt_Chng_Q4_Q1|Total_Ct_Chng_Q4_Q1|Total_Revolving_Bal|Contacts_Count_12_mon|Avg_Utilization_Ratio|Total_Trans_Amt|Months_Inactive_12_mon|Total_Relationship_Count|Attrition_Flag_Index|            features|
+-----------------+--------------------+-------------------+-------------------+---------------------+---------------------+---------------+----------------------+------------------------+--------------------+--------------------+
|Existing Customer|               1.335|              1.625|                777|                    3|                0.061|           1144|                     1|                       5|                 0.0|[1.335,1.625,777....|
|Existing Customer|               1.541|              3.714|                

In [None]:
# Keep only what the classifiers need: the indexed label and the feature vector.
df = df.select('Attrition_Flag_Index', 'features')

# 70/30 train/test split. The seed is fixed so that the split -- and therefore
# every metric computed further down -- is reproducible across kernel restarts.
df_train, df_test = df.randomSplit([0.7, 0.3], seed=42)
df_train.show(5)

+--------------------+--------------------+
|Attrition_Flag_Index|            features|
+--------------------+--------------------+
|                 0.0|[0.256,0.522,1519...|
|                 0.0|[0.276,0.174,2189...|
|                 0.0|[0.289,0.391,1901...|
|                 0.0|[0.293,1.167,2076...|
|                 0.0|[0.296,0.429,0.0,...|
+--------------------+--------------------+
only showing top 5 rows



In [None]:
from pyspark.ml.classification import (DecisionTreeClassifier,
                                       RandomForestClassifier,
                                       GBTClassifier)
from pyspark.ml.classification import LogisticRegression

In [None]:
# Every estimator reads the same label and feature columns.
columnas_modelo_args = {
    "labelCol": "Attrition_Flag_Index",
    "featuresCol": "features",
}

# Decision tree
dt = DecisionTreeClassifier(maxDepth=5, minInstancesPerNode=1, **columnas_modelo_args)

# Random forest
rf = RandomForestClassifier(numTrees=100, **columnas_modelo_args)

# Gradient-boosted trees
gb = GBTClassifier(maxIter=20, **columnas_modelo_args)

# Logistic regression
rl = LogisticRegression(**columnas_modelo_args)

In [None]:
# Fit all four estimators on the training split, one after another:
# decision tree, random forest, gradient boosting, logistic regression.
modelo_DT, modelo_RF, modelo_GB, modelo_RL = [
    estimador.fit(df_train) for estimador in (dt, rf, gb, rl)
]

In [None]:
# Score the test split with the fitted decision tree.
preds_DT = modelo_DT.transform(df_test)

# Full decision-tree output (label, features and the columns added by the model).
preds_DT.show(n=5)

# Narrower view: only the predicted class, the true label index and the features.
columnas_resumen = ["prediction", "Attrition_Flag_Index", "features"]
preds_DT.select(*columnas_resumen).show(n=5)

+--------------------+--------------------+--------------+--------------------+----------+
|Attrition_Flag_Index|            features| rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------+--------------------+----------+
|                 0.0|(8,[0,1,5,7],[0.8...|  [111.0,32.0]|[0.77622377622377...|       0.0|
|                 0.0|[0.278,0.217,1704...|  [297.0,39.0]|[0.88392857142857...|       0.0|
|                 0.0|[0.294,0.172,1752...|   [73.0,53.0]|[0.57936507936507...|       0.0|
|                 0.0|[0.308,0.679,1277...|[4032.0,153.0]|[0.96344086021505...|       0.0|
|                 0.0|[0.31,0.65,0.0,1....|  [790.0,23.0]|[0.97170971709717...|       0.0|
+--------------------+--------------------+--------------+--------------------+----------+
only showing top 5 rows

+----------+--------------------+--------------------+
|prediction|Attrition_Flag_Index|            features|
+----------+--------------------+-------------

In [None]:
# Score the test split with the fitted random forest.
preds_RF = modelo_RF.transform(df_test)

# Random forest results
preds_RF.show(n=5)

+--------------------+--------------------+--------------------+--------------------+----------+
|Attrition_Flag_Index|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+--------------------+----------+
|                 0.0|(8,[0,1,5,7],[0.8...|[71.4329799679194...|[0.71432979967919...|       0.0|
|                 0.0|[0.278,0.217,1704...|[83.4404768634218...|[0.83440476863421...|       0.0|
|                 0.0|[0.294,0.172,1752...|[72.4643381838908...|[0.72464338183890...|       0.0|
|                 0.0|[0.308,0.679,1277...|[88.395880369172,...|[0.88395880369172...|       0.0|
|                 0.0|[0.31,0.65,0.0,1....|[78.4430750061674...|[0.78443075006167...|       0.0|
+--------------------+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [None]:
# Score the test split with the fitted gradient-boosted trees model.
preds_GB = modelo_GB.transform(df_test)

# Gradient boosting results
preds_GB.show(n=5)

+--------------------+--------------------+--------------------+--------------------+----------+
|Attrition_Flag_Index|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+--------------------+----------+
|                 0.0|(8,[0,1,5,7],[0.8...|[0.79123055741689...|[0.82955278950431...|       0.0|
|                 0.0|[0.278,0.217,1704...|[0.85877443284756...|[0.84781284528982...|       0.0|
|                 0.0|[0.294,0.172,1752...|[0.35834162771226...|[0.67187622959874...|       0.0|
|                 0.0|[0.308,0.679,1277...|[1.12950097307150...|[0.90542420105831...|       0.0|
|                 0.0|[0.31,0.65,0.0,1....|[1.34075299516463...|[0.93592649427219...|       0.0|
+--------------------+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [None]:
# Score the test split with the fitted logistic regression model.
preds_RL = modelo_RL.transform(df_test)

# Logistic regression results
preds_RL.show(n=5)

+--------------------+--------------------+--------------------+--------------------+----------+
|Attrition_Flag_Index|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+--------------------+----------+
|                 0.0|(8,[0,1,5,7],[0.8...|[5.58061140995187...|[0.99624390182566...|       0.0|
|                 0.0|[0.278,0.217,1704...|[0.16975759056321...|[0.54233777354872...|       0.0|
|                 0.0|[0.294,0.172,1752...|[-0.7457223343307...|[0.32175409629306...|       1.0|
|                 0.0|[0.308,0.679,1277...|[3.26953514264053...|[0.96336877072950...|       0.0|
|                 0.0|[0.31,0.65,0.0,1....|[1.38631697664469...|[0.80000361845941...|       0.0|
+--------------------+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [None]:
from pyspark.ml.evaluation import (BinaryClassificationEvaluator,
                                   MulticlassClassificationEvaluator)
import numpy as np
from sklearn.metrics import classification_report

In [None]:
# Accuracy evaluator: compares the predicted class against the label index.
evaluador = MulticlassClassificationEvaluator(
    labelCol="Attrition_Flag_Index",
    predictionCol="prediction",
    metricName="accuracy",
)

# AUC evaluator. The ROC curve has to be built from the model's continuous score
# ("rawPrediction"), not from the thresholded 0/1 "prediction" column: scoring
# hard labels collapses the curve to a single operating point, so the reported
# area no longer reflects how well the model ranks the rows.
evaluadorAUC = BinaryClassificationEvaluator(
    labelCol="Attrition_Flag_Index",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)

In [None]:
# Decision-tree metrics on the test predictions.
exactitud_dt = evaluador.evaluate(preds_DT)
auc_dt = evaluadorAUC.evaluate(preds_DT)

print("Usando Árboles de decisión: exactitud={}, AUC={:.3f}".format(exactitud_dt, auc_dt))

# Collect label and prediction together in one Spark job so that every
# (label, prediction) pair comes from the same row, then hand scikit-learn
# plain 1-D lists instead of lists of single-field Row objects.
filas_dt = preds_DT.select('Attrition_Flag_Index', 'prediction').collect()
y_true_dt = [fila[0] for fila in filas_dt]
y_pred_dt = [fila[1] for fila in filas_dt]
print(classification_report(y_true_dt, y_pred_dt))

Usando Árboles de decisión: exactitud=0.9048723897911833, AUC=0.792
              precision    recall  f1-score   support

         0.0       0.94      0.95      0.94      2559
         1.0       0.71      0.63      0.67       458

    accuracy                           0.90      3017
   macro avg       0.82      0.79      0.81      3017
weighted avg       0.90      0.90      0.90      3017



In [None]:
# Random-forest metrics on the test predictions.
exactitud_rf = evaluador.evaluate(preds_RF)
auc_rf = evaluadorAUC.evaluate(preds_RF)

# "{:.3f}" (three decimals) replaces the original "{:3f}", which only set a
# minimum field width of 3 and therefore printed six decimals.
print("Usando Random Forest: exactitud={:.3f}, AUC={:.3f}".format(exactitud_rf, auc_rf))

# Collect label and prediction together in one Spark job so that every
# (label, prediction) pair comes from the same row, then hand scikit-learn
# plain 1-D lists instead of lists of single-field Row objects.
filas_rf = preds_RF.select('Attrition_Flag_Index', 'prediction').collect()
y_true_rf = [fila[0] for fila in filas_rf]
y_pred_rf = [fila[1] for fila in filas_rf]
print(classification_report(y_true_rf, y_pred_rf))

Usando Random Forest: exactitud=0.910507, AUC=0.752
              precision    recall  f1-score   support

         0.0       0.92      0.98      0.95      2559
         1.0       0.82      0.52      0.64       458

    accuracy                           0.91      3017
   macro avg       0.87      0.75      0.79      3017
weighted avg       0.91      0.91      0.90      3017



In [None]:
# Gradient-boosting metrics on the test predictions.
exactitud_gb = evaluador.evaluate(preds_GB)
auc_gb = evaluadorAUC.evaluate(preds_GB)

# "{:.3f}" (three decimals) replaces the original "{:3f}", which only set a
# minimum field width of 3 and therefore printed six decimals.
print("Usando Gradient Boosting: exactitud={:.3f}, AUC={:.3f}".format(exactitud_gb, auc_gb))

# Collect label and prediction together in one Spark job so that every
# (label, prediction) pair comes from the same row, then hand scikit-learn
# plain 1-D lists instead of lists of single-field Row objects.
filas_gb = preds_GB.select('Attrition_Flag_Index', 'prediction').collect()
y_true_gb = [fila[0] for fila in filas_gb]
y_pred_gb = [fila[1] for fila in filas_gb]
print(classification_report(y_true_gb, y_pred_gb))

Usando Gradient Boosting: exactitud=0.937355, AUC=0.841
              precision    recall  f1-score   support

         0.0       0.95      0.98      0.96      2559
         1.0       0.86      0.70      0.77       458

    accuracy                           0.94      3017
   macro avg       0.90      0.84      0.87      3017
weighted avg       0.93      0.94      0.93      3017



In [None]:
# Logistic-regression metrics on the test predictions.
exactitud_rl = evaluador.evaluate(preds_RL)
# Use the AUC evaluator here: the original reused the accuracy evaluator, so the
# value reported as "AUC" was just the accuracy printed a second time.
auc_rl = evaluadorAUC.evaluate(preds_RL)

# "{:.3f}" (three decimals) replaces the original "{:3f}", which only set a
# minimum field width of 3 and therefore printed six decimals.
print("Usando Regresión Logística: exactitud={:.3f}, AUC={:.3f}".format(exactitud_rl, auc_rl))

# Collect label and prediction together in one Spark job so that every
# (label, prediction) pair comes from the same row, then hand scikit-learn
# plain 1-D lists instead of lists of single-field Row objects.
filas_rl = preds_RL.select('Attrition_Flag_Index', 'prediction').collect()
y_true_rl = [fila[0] for fila in filas_rl]
y_pred_rl = [fila[1] for fila in filas_rl]
print(classification_report(y_true_rl, y_pred_rl))

Usando Regresión Logística: exactitud=0.885648, AUC=0.886
              precision    recall  f1-score   support

         0.0       0.90      0.97      0.94      2559
         1.0       0.72      0.40      0.51       458

    accuracy                           0.89      3017
   macro avg       0.81      0.69      0.72      3017
weighted avg       0.87      0.89      0.87      3017



In [None]:
import numpy as np
from sklearn.metrics import average_precision_score

# Average-precision score between the 'Attrition_Flag_Index' labels and the
# 'prediction' column of `salida.predictions`, each collected to the driver.
# NOTE(review): `salida` is not defined in any cell visible in this notebook, so
# this cell would raise NameError on a fresh kernel unless it is defined
# elsewhere. Presumably it is a model summary / evaluation result from a removed
# cell -- confirm which model it belongs to and define it explicitly.
# NOTE(review): labels and predictions are collected by two separate calls;
# verify that the two lists are row-aligned.
average_precision = average_precision_score(salida.predictions.select('Attrition_Flag_Index').collect(), salida.predictions.select('prediction').collect())

# Report the score rounded to two decimals.
print('Average precision-recall score: {0:0.2f}'.format(
      average_precision))

#np.array(salida.predictions.select('Attrition_Flag_Index','prediction').collect())


Average precision-recall score: 0.17
