<a href="https://colab.research.google.com/github/MiguelAngeloTr/BIGDATA/blob/main/C2/ProyectoFinal/Proyectofinal1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Info base de datos: https://archive.ics.uci.edu/dataset/222/bank+marketing

In [1]:
!pip install -q pyspark
!pip install -q findspark

In [2]:
import zipfile
import os, pathlib, PIL, shutil, glob
from google.colab import files

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.window as W
import pyspark.sql.types as T

import seaborn as sns
import matplotlib.pyplot as plt

from pandas.core.dtypes.api import is_numeric_dtype, is_string_dtype
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

from pyspark.ml.classification import LogisticRegression
from mlxtend.plotting import plot_confusion_matrix #falsos positivos falsos negativos
from mlxtend.evaluate import confusion_matrix
from yellowbrick.classifier import ROCAUC
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [None]:
# Upload kaggle.json from the local machine.
# The trailing semicolon suppresses the cell's return value, which would
# otherwise echo the uploaded file's contents (including the API key) into the
# saved notebook output.
files.upload();

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"bonangelrock","key":"<REDACTED>"}'}

In [None]:
# Copy the uploaded kaggle.json into ~/.kaggle/ (the location the Kaggle CLI
# is expected to read credentials from).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the dataset archive with the Kaggle CLI, then list the working
# directory to confirm the .zip file arrived.
!kaggle datasets download -d janiobachmann/bank-marketing-dataset
!ls

Dataset URL: https://www.kaggle.com/datasets/janiobachmann/bank-marketing-dataset
License(s): CC0-1.0
bank-marketing-dataset.zip  kaggle.json  sample_data


In [None]:
# Extract the downloaded archive into the 'Data' folder (any folder name works,
# as long as later cells read from the same place).
# The context manager closes the archive even if extraction raises.
with zipfile.ZipFile('bank-marketing-dataset.zip', 'r') as zip_ref:
    zip_ref.extractall('Data')

In [None]:
# Create (or reuse) the SparkSession that every later cell relies on.
spark = SparkSession.builder.appName("LogReg PySpark").getOrCreate()
# Bare expression so the notebook renders the session summary.
spark

In [None]:
# Load the extracted CSV: the first row holds the column names and the column
# types are inferred from the data.
datos = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('/content/Data/bank.csv')
)
# Preview the first rows without truncating long values.
datos.show(n=5, truncate=False)

In [None]:
# Print column names and inferred types to verify the schema inference.
datos.printSchema()

# Información faltante y columnas innecesarias


In [None]:
# Remove the "duration" column before any analysis or modelling.
# NOTE(review): presumably dropped because call duration is only known once the
# call has ended — confirm against the dataset documentation.
datos_f1 = datos.drop("duration")
datos_f1.show(5, truncate=False)

In [None]:
def faltantes(df):
  """Display the number of null values found in each column of `df`.

  Builds a one-row DataFrame holding one null count per column (aliased with
  the column's own name) and prints it with `show()`. Since `show()` returns
  None, this function returns None as well.
  """
  conteos = []
  for nombre in df.columns:
    # 1 for a null cell, 0 otherwise; summing gives the per-column null count.
    es_nulo = F.col(nombre).isNull().cast('int')
    conteos.append(F.sum(es_nulo).alias(nombre))
  resumen = df.select(conteos)
  return resumen.show()

In [None]:
# Show the per-column null counts for the frame without "duration".
faltantes(datos_f1)

# Análisis Exploratorio de Datos (EDA)

In [None]:
# Collect every column whose Spark type is numeric. Testing against
# T.NumericType (instead of the literal dtype strings 'int' / 'double') also
# picks up bigint, float, decimal, etc., so no numeric column is silently skipped.
numeric_features = [
    field.name
    for field in datos_f1.schema.fields
    if isinstance(field.dataType, T.NumericType)
]
# Summary statistics, transposed so that each feature becomes a row.
datos_f1.select(numeric_features).describe().toPandas().transpose()

In [None]:
# Bring the data to pandas for plotting and record which columns were treated
# as numeric and which as categorical.
datos_EDA = datos_f1.toPandas()
num_list = []
cat_list = []

# Two rows of plots, one pair of axes per column. The width scales with the
# number of columns instead of using a fixed, very large figure size.
n_cols = len(datos_EDA.columns)
fig, axes = plt.subplots(2, n_cols, figsize=(4 * n_cols, 8), squeeze=False)
fig.subplots_adjust(hspace=0.4, wspace=0.4)

# Index of the next free pair of axes (top row / bottom row).
graph_count = 0

for column in datos_EDA.columns:
    top_ax = axes[0, graph_count]
    bottom_ax = axes[1, graph_count]

    if is_numeric_dtype(datos_EDA[column]):
        # Histogram on the first row, boxplot on the second row.
        sns.histplot(datos_EDA[column], kde=True, ax=top_ax)
        sns.boxplot(x=datos_EDA[column], ax=bottom_ax)
        bottom_ax.set_title(f"{column}")
        num_list.append(column)
    elif is_string_dtype(datos_EDA[column]):
        # Bar chart of category frequencies; the second row stays empty.
        sns.countplot(data=datos_EDA, x=column, ax=top_ax)
        # Category names overlap when drawn horizontally.
        top_ax.tick_params(axis="x", labelrotation=90)
        bottom_ax.axis("off")
        cat_list.append(column)
    else:
        # Neither numeric nor string: no plot, and the axes pair is not consumed.
        continue

    top_ax.set_title(f"{column}")
    graph_count += 1

# Hide any axes left over because some columns were skipped above.
for unused_ax in axes[:, graph_count:].ravel():
    unused_ax.axis("off")

plt.show()


In [None]:
# Pairwise scatter plots of the numeric features. The columns are read from
# datos_f1 — the frame numeric_features was derived from — rather than from the
# raw `datos`, so the data lineage stays consistent.
numeric_data = datos_f1.select(numeric_features).toPandas()
sns.pairplot(numeric_data, height=1.4)
plt.show()

In [None]:
# Spearman rank correlation between the numeric features, drawn as an
# annotated heatmap.
spearman_corr = numeric_data.corr(method='spearman')
sns.heatmap(spearman_corr, annot=True)
plt.show()

Es evidente que no hay variables numéricas altamente correlacionadas; por lo tanto, las mantendremos todas para el modelo. Sin embargo, las columnas de día y mes no son realmente útiles, por lo que eliminaremos estas dos columnas.

In [None]:
# Columns kept for modelling, listed explicitly in the order they should appear.
columnas_modelo = [
    'age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
    'loan', 'contact', 'campaign', 'pdays', 'previous', 'poutcome', 'deposit',
]
datos_f2 = datos_f1.select(*columnas_modelo)
cols = datos_f2.columns
datos_f2.printSchema()

In [None]:
# Name of the target column.
salida = 'deposit'

# Compare names with `!=`. Because `salida` is a str, `x not in salida` is a
# substring test, which would also drop any column whose name happens to be a
# fragment of the target's name.
caracteristicas = [car for car in cols if car != salida]

# Categorical features: string-typed columns other than the target.
cat_car = [c for c, t in datos_f2.dtypes if t == 'string' and c != salida]

# Numeric features: every numeric Spark type (int, bigint, float, double,
# decimal, ...), not just the 'int' / 'double' dtype strings, so no numeric
# column is silently left out of the model.
num_car = [
    field.name
    for field in datos_f2.schema.fields
    if isinstance(field.dataType, T.NumericType) and field.name != salida
]
print('Cat:',cat_car,'\nNum:', num_car)

In [None]:
# Step 1: map every categorical string column to a numeric index ("c_<name>").
indexers = [StringIndexer(inputCol=col, outputCol="c_{}".format(col))
            for col in cat_car]

# Step 2: one-hot encode each index column ("o_<name>").
encoders = [OneHotEncoder(inputCol="c_{}".format(col), outputCol="o_{}".format(col))
            for col in cat_car]

# Step 3: concatenate numeric columns and encoded categoricals into "features".
assembler = VectorAssembler(inputCols=num_car + ["o_{}".format(col) for col in cat_car],
                            outputCol="features")

# The target is only indexed when it is present in the frame.
labelIndexer = StringIndexer(inputCol=salida, outputCol='label') if salida in datos_f2.columns else None

# Build the stage list and add the label indexer only when it exists: a None
# entry is not a valid pipeline stage and would make fit() fail.
etapas = indexers + encoders + [assembler]
if labelIndexer is not None:
    etapas.append(labelIndexer)

preprocessor = Pipeline(stages=etapas).fit(datos_f2)


In [None]:
# Run the fitted preprocessing pipeline, then keep only the two columns the
# classifier consumes.
datos_f3 = preprocessor.transform(datos_f2)
selectedCols = ['label', 'features']  # extend with `cols` to keep the raw columns too
datos_f4 = datos_f3.select(*selectedCols)
datos_f4.printSchema()

In [None]:
# Preview the label and the assembled feature vector without truncation.
datos_f4.show(5,truncate=False)

## División de conjuntos de ajuste (entrenamiento) y prueba

In [None]:
# 70/30 random split; the fixed seed keeps the split repeatable across runs.
train, test = datos_f4.randomSplit(weights=[0.7, 0.3], seed=2024)
print(f"Training Dataset Count: {train.count()}")
print(f"Test Dataset Count: {test.count()}")

# Aplicación del Algoritmo de Clasificación: Regresión Logística

In [None]:
# Configure the estimator (at most 10 optimizer iterations), then fit it on the
# training split only.
lr = LogisticRegression(
    maxIter=10,
    labelCol='label',
    featuresCol='features',
)
lrModel = lr.fit(train)

In [None]:
# Inspect the coefficient vector learned by the fitted model.
print(lrModel.coefficients)

# Evaluación del Modelo

In [None]:
def calculate_metrics(predictions, label_col):
    """Compute evaluation metrics for a DataFrame of model predictions.

    Parameters
    ----------
    predictions : DataFrame produced by a fitted model's `transform`; it must
        contain `label_col` and a "prediction" column, plus whatever score
        column BinaryClassificationEvaluator reads by default.
    label_col : str
        Name of the column holding the true labels.

    Returns
    -------
    dict
        Keys "AUC-ROC", "Accuracy", "Precision", "Recall" and "F1 Score".
        "Precision" and "Recall" are the evaluator's `weightedPrecision` and
        `weightedRecall` metrics, and "F1 Score" is its `f1` metric.
    """
    binary_eval = BinaryClassificationEvaluator(labelCol=label_col)
    multi_eval = MulticlassClassificationEvaluator(labelCol=label_col, predictionCol="prediction")

    def _score(evaluator, metric):
        # Override the evaluator's metric for this single evaluation.
        return evaluator.evaluate(predictions, {evaluator.metricName: metric})

    results = {"AUC-ROC": _score(binary_eval, "areaUnderROC")}

    # Display name -> Spark metric name, in the order they are reported.
    multiclass_metrics = (
        ("Accuracy", "accuracy"),
        ("Precision", "weightedPrecision"),
        ("Recall", "weightedRecall"),
        ("F1 Score", "f1"),
    )
    for display_name, spark_metric in multiclass_metrics:
        results[display_name] = _score(multi_eval, spark_metric)

    return results

In [None]:
def conf_matrix(y_log):
  """Plot the confusion matrix of true labels versus predictions.

  Parameters
  ----------
  y_log : object exposing `label` and `prediction` attributes; the notebook
      passes a pandas DataFrame with those two columns.

  The matrix is drawn with both absolute counts and normalized values and is
  displayed immediately. Nothing is returned.
  """
  matriz = confusion_matrix(y_log.label, y_log.prediction)
  plot_confusion_matrix(matriz, show_normed=True, figsize=(3,3))
  # plt.show() renders on notebook (non-GUI) backends as well, where Figure.show()
  # does nothing or only warns; it also matches how the other cells display plots.
  plt.show()


In [None]:
# Score both splits with the fitted model, training split first.
train_predictions, test_predictions = (
    lrModel.transform(particion) for particion in (train, test)
)

## Datos de Entrenamiento

In [None]:
# Report every metric for the training split, one "name: value" line each.
print("Métricas en datos de entrenamiento:\n")
train_metrics = calculate_metrics(train_predictions, 'label')
print("\n".join(f"{metric}: {value}" for metric, value in train_metrics.items()))

In [None]:
# Collect only the label/prediction pairs as pandas and plot the confusion
# matrix for the training split.
conf_matrix(train_predictions.select('label','prediction').toPandas())

## Datos de Prueba

In [None]:
# Report every metric for the held-out test split, one "name: value" line each.
print("\nMétricas en datos de prueba:\n")
test_metrics = calculate_metrics(test_predictions, 'label')
print("\n".join(f"{metric}: {value}" for metric, value in test_metrics.items()))

In [None]:
# Collect only the label/prediction pairs as pandas and plot the confusion
# matrix for the test split.
conf_matrix(test_predictions.select('label','prediction').toPandas())