In [7]:
import socket, os
print("Host:", socket.gethostname())
print("CWD :", os.getcwd())


Host: 9bd8768ed5ea
CWD : /content


In [5]:
# =============================================================================
# TÍTULO
# Particionamiento de Datos de Préstamos LendingClub (2007–2020Q3) en PySpark
# =============================================================================

# DESCRIPCIÓN
# Este conjunto de datos contiene información histórica de préstamos personales
# emitidos por LendingClub entre 2007 y el tercer trimestre de 2020. Incluye
# variables como monto solicitado, propósito del préstamo, duración, estado del
# préstamo, historial crediticio, ingresos del solicitante, entre otros.

# REGLAS DE PARTICIONAMIENTO
# Cada préstamo se asigna a un estrato exclusivo definido por la combinación de
# las variables `grade` (A–G) y `loan_status` (Fully Paid, Charged Off, Default,
# Current, etc.).  Estrato = grade × loan_status → 68 combinaciones no vacías.

# OBJETIVO
# Extraer sub-muestras para cada combinación identificada y verificar el
# correcto funcionamiento del código. Estas muestras son preliminares y servirán
# como base para construir conjuntos de entrenamiento y prueba en futuras etapas.
# =============================================================================

In [8]:
# =============================================================================
# CARGA DE DATOS
# =============================================================================
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import (
    col, countDistinct, min, max, mean, stddev, expr, percentile_approx
)
from pathlib import Path

# -----------------------------------------------------------------------------
# 0) MONTAJE DE GOOGLE DRIVE (solo Colab) –- omítelo si trabajas local
# -----------------------------------------------------------------------------
try:
    # Si ya está montado, esto no hace nada; si no, te pedirá autorización
    from google.colab import drive
    drive.mount("/content/drive", force_remount=False)
except ModuleNotFoundError:
    # No estás en Colab; continúa sin montar Drive
    pass


# -----------------------------------------------------------------------------
# 1) DETECTAR RUTA DEL DATASET
#    - Busca primero en /content (copiado localmente).
#    - Si no lo encuentra, busca en Drive.
# -----------------------------------------------------------------------------
def find_dataset():
    candidates = [
        Path("/content/Loan_status_2007-2020Q3.gzip"),
        Path("/content/Loan_status_2007-2020Q3.csv.gz"),
        Path("/content/drive/MyDrive/Proyecto/Loan_status_2007-2020Q3.gzip"),
        Path("/content/drive/MyDrive/Proyecto/Loan_status_2007-2020Q3.csv.gz"),
    ]
    for p in candidates:
        if p.exists():
            return str(p)
    raise FileNotFoundError(
        "No se encontró el archivo Loan_status_2007-2020Q3 (.gzip o .csv.gz). "
        "Verifica la ruta o súbelo a /content."
    )

file_path = find_dataset()
print("→ Leyendo archivo:", file_path)

# -----------------------------------------------------------------------------
# 2) INICIAR SPARK Y LEER EL ARCHIVO COMPROMIDO
# -----------------------------------------------------------------------------
spark = (
    SparkSession.builder
               .appName("LoanStatusPartitioning")
               .getOrCreate()
)

df = (
    spark.read
         .option("header", "true")
         .option("inferSchema", "true")
         .option("compression", "gzip")   # Descomprime .gz/.gzip al vuelo
         .csv(file_path)
)

# =============================================================================
# LIMPIEZA BÁSICA
# =============================================================================
categorical_vars = [
    "term", "grade", "emp_length", "home_ownership",
    "verification_status", "purpose", "loan_status"
]

numerical_vars = [
    "loan_amnt", "int_rate", "installment", "fico_range_low", "fico_range_high",
    "annual_inc", "dti", "open_acc", "total_acc", "revol_bal", "revol_util"
]

# 1) Eliminar registros con nulos en variables clave
df = df.dropna(subset=categorical_vars + numerical_vars)

# 2) Quitar el símbolo % y castear a double SOLO en las columnas que lo necesitan
percent_cols = ["int_rate", "revol_util"]
for c in percent_cols:
    df = df.withColumn(c, F.regexp_replace(col(c), '%', '').cast('double'))

# 3) Asegurar que el resto de numéricos sean double
for c in set(numerical_vars) - set(percent_cols):
    df = df.withColumn(c, col(c).cast('double'))

# 4) Columna de estrato para muestreo estratificado
df = df.withColumn("grade_status", F.concat_ws("_", "grade", "loan_status"))

df.cache()   # El DF se usará varias veces
print("✔ Dataset cargado, limpiado y listo para análisis.")

Mounted at /content/drive
→ Leyendo archivo: /content/drive/MyDrive/Proyecto/Loan_status_2007-2020Q3.gzip
✔ Dataset cargado, limpiado y listo para análisis.


In [9]:
# =============================================================================
# CARACTERIZACIÓN DE LA POBLACIÓN
# =============================================================================
print("\n================ ESTADÍSTICAS CATEGÓRICAS ================\n")
for c in categorical_vars:
    print(f"\n► {c}")
    (
        df.groupBy(c).count()
          .orderBy("count", ascending=False)
          .show(truncate=False)
    )

print("\n================= ESTADÍSTICAS NUMÉRICAS =================\n")
for c in numerical_vars:
    print(f"\n► {c}")
    (
        df.select(
            countDistinct(col(c)).alias("Valores_únicos"),
            min(col(c)).alias("Mínimo"),
            percentile_approx(col(c), 0.5).alias("Mediana"),
            mean(col(c)).alias("Media"),
            stddev(col(c)).alias("Desv_Estándar"),
            max(col(c)).alias("Máximo")
        )
        .show(truncate=False)
    )

# =============================================================================
# EXPLORACIÓN DE COMBINACIONES (estratos reales existentes)
# =============================================================================
print("\n================ LISTA DE ESTRATOS EXISTENTES ================\n")
df.select("grade_status").distinct().orderBy("grade_status").show(truncate=False)

# =============================================================================
# MUESTREO ESTRATIFICADO (10 % DE CADA ESTRATO)
# =============================================================================
estratos  = [r.grade_status for r in df.select("grade_status").distinct().collect()]
fractions = {e: 0.10 for e in estratos}           # 10 % en cada estrato
sample_df = df.sampleBy("grade_status", fractions, seed=42)

# Verificar distribución población vs. muestra
total_pop = df.count()
pop_dist = (
    df.groupBy("grade_status")
      .count()
      .withColumnRenamed("count", "original")
      .withColumn("pct_original", F.round(col("original") / total_pop * 100, 2))
)
sample_dist = (
    sample_df.groupBy("grade_status")
             .count()
             .withColumnRenamed("count", "muestra")
)
comparison = (
    pop_dist.join(sample_dist, "grade_status")
            .withColumn(
                "proporcion",
                F.round(col("muestra") / col("original") * 100, 2)
            )
            .orderBy("grade_status")
)

print("\n=========== COMPARACIÓN POBLACIÓN vs. MUESTRA ===========\n")
comparison.show(20, truncate=False)

# =============================================================================
# EJEMPLO: MOSTRAR ALGUNOS REGISTROS DE UN ESTRATO ESPECÍFICO
# =============================================================================
print("\n====== EJEMPLO DE REGISTROS MUESTREADOS (B, Fully Paid) ======\n")
(
    sample_df.filter(
        (col("grade") == "B") & (col("loan_status") == "Fully Paid")
    ).select(
        "grade", "loan_status", "loan_amnt", "int_rate", "annual_inc", "dti"
    ).show(3, truncate=False)
)




► term
+----------+-------+
|term      |count  |
+----------+-------+
| 36 months|1899703|
| 60 months|817879 |
+----------+-------+


► grade
+-----+------+
|grade|count |
+-----+------+
|B    |796324|
|C    |744780|
|A    |611109|
|D    |384498|
|E    |129949|
|F    |39420 |
|G    |11502 |
+-----+------+


► emp_length
+----------+------+
|emp_length|count |
+----------+------+
|10+ years |945379|
|< 1 year  |271703|
|2 years   |261642|
|3 years   |232108|
|1 year    |193968|
|5 years   |182216|
|4 years   |176008|
|6 years   |130614|
|7 years   |115977|
|8 years   |112614|
|9 years   |95352 |
| reactors"|1     |
+----------+------+


► home_ownership
+--------------+-------+
|home_ownership|count  |
+--------------+-------+
|MORTGAGE      |1344604|
|RENT          |1082750|
|OWN           |287044 |
|ANY           |2958   |
|OTHER         |177    |
|NONE          |48     |
|2 years       |1      |
+--------------+-------+


► verification_status
+-------------------+-------+
|verif