<a href="https://colab.research.google.com/github/Junior11995/Challenge-Telecom-X-an-lisis-de-evasi-n-de-clientes/blob/main/desaf_o_telecom_x.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  Encabezado y librerías




In [1]:
# --- Encabezado ---
# =========================================================
# 0) ENCABEZADO / CONFIG
# =========================================================
PROYECTO = "Telecom X - Churn"
AUTOR = "Junior Valera"
REPO = "https://github.com/Junior11995/Challenge-Telecom-X-an-lisis-de-evasi-n-de-clientes"

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
from pandas import json_normalize

pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", lambda x: f"{x:,.2f}")

def normalize_columns(cols: pd.Index) -> pd.Index:
    return (cols
            .str.strip().str.lower()
            .str.replace(r"\s+", "_", regex=True)
            .str.normalize("NFKD").str.encode("ascii", errors="ignore").str.decode("utf-8"))

def quality_report(df: pd.DataFrame) -> pd.DataFrame:
    rep = pd.DataFrame({
        "dtype": df.dtypes.astype(str),
        "n_null": df.isna().sum(),
        "pct_null": df.isna().mean().round(4),
        "n_unique": df.nunique(dropna=True)
    })
    rep["example_values"] = [df[c].dropna().unique()[:5] for c in df.columns]
    return rep

print(PROYECTO, AUTOR, REPO)



Telecom X - Churn Junior Valera https://github.com/Junior11995/Challenge-Telecom-X-an-lisis-de-evasi-n-de-clientes


# Extracción de datos desde API JSON (TelecomX_Data.json)

In [2]:
# === EXTRACCIÓN DESDE LA API (JSON) ===
# Fuente oficial (ramo main del repo que compartiste):
URL_FUENTE = "https://raw.githubusercontent.com/ingridcristh/challenge2-data-science-LATAM/main/TelecomX_Data.json"

import pandas as pd
import requests
from pandas import json_normalize

# Descarga segura
resp = requests.get(URL_FUENTE, timeout=30)
resp.raise_for_status()
data = resp.json()

# Conversión robusta a DataFrame
if isinstance(data, list):
    df_raw = pd.DataFrame(data)
elif isinstance(data, dict):
    try:
        df_raw = json_normalize(data, max_level=1)
    except Exception:
        df_raw = pd.DataFrame([data])
else:
    raise ValueError("Formato JSON no soportado.")

print("RAW shape:", df_raw.shape)
display(df_raw.head(5))

# (Opcional) guardar copia cruda para trazabilidad
df_raw.to_csv("telecomx_raw.csv", index=False)

RAW shape: (7267, 6)


Unnamed: 0,customerID,Churn,customer,phone,internet,account
0,0002-ORFBO,No,"{'gender': 'Female', 'SeniorCitizen': 0, 'Part...","{'PhoneService': 'Yes', 'MultipleLines': 'No'}","{'InternetService': 'DSL', 'OnlineSecurity': '...","{'Contract': 'One year', 'PaperlessBilling': '..."
1,0003-MKNFE,No,"{'gender': 'Male', 'SeniorCitizen': 0, 'Partne...","{'PhoneService': 'Yes', 'MultipleLines': 'Yes'}","{'InternetService': 'DSL', 'OnlineSecurity': '...","{'Contract': 'Month-to-month', 'PaperlessBilli..."
2,0004-TLHLJ,Yes,"{'gender': 'Male', 'SeniorCitizen': 0, 'Partne...","{'PhoneService': 'Yes', 'MultipleLines': 'No'}","{'InternetService': 'Fiber optic', 'OnlineSecu...","{'Contract': 'Month-to-month', 'PaperlessBilli..."
3,0011-IGKFF,Yes,"{'gender': 'Male', 'SeniorCitizen': 1, 'Partne...","{'PhoneService': 'Yes', 'MultipleLines': 'No'}","{'InternetService': 'Fiber optic', 'OnlineSecu...","{'Contract': 'Month-to-month', 'PaperlessBilli..."
4,0013-EXCHZ,Yes,"{'gender': 'Female', 'SeniorCitizen': 1, 'Part...","{'PhoneService': 'Yes', 'MultipleLines': 'No'}","{'InternetService': 'Fiber optic', 'OnlineSecu...","{'Contract': 'Month-to-month', 'PaperlessBilli..."


# Conoce el conjunto de datos

In [6]:
# =========================================================
# CONOCE EL CONJUNTO DE DATOS (perfilado inicial)
# Requiere: df_raw ya cargado desde la etapa de Extracción
# =========================================================

import pandas as pd
import numpy as np

# Trabajamos sobre una copia normalizada
df = df_raw.copy()
df.columns = (df.columns
              .str.strip().str.lower()
              .str.replace(r"\s+", "_", regex=True)
              .str.normalize("NFKD").str.encode("ascii", errors="ignore").str.decode("utf-8"))

# 1) Vista general
print("Shape (filas, columnas):", df.shape)
display(df.head(3))
df.info()          # tipos y nulos a alto nivel
display(df.dtypes) # tipos por columna

# 2) Resumen de calidad por columna
perfil = pd.DataFrame({
    "dtype": df.dtypes.astype(str),
    "n_null": df.isna().sum(),
    "pct_null": df.isna().mean().round(4),
    "n_unique": df.nunique(dropna=True)
}).sort_values(["pct_null","n_unique"], ascending=[False, True])
display(perfil.head(20))

# 3) Posibles columnas objetivo (churn/baja/cancelación)
target_candidates = [c for c in df.columns
                     if any(k in c for k in ["churn","baja","cancel","evas"])]

print("Posibles columnas objetivo:", target_candidates)

# Elegimos provisionalmente la 1ª columna candidata que parezca binaria
target = None
for c in target_candidates:
    if df[c].astype(str).str.lower().isin(["0","1","true","false","yes","no","si","sí"]).any():
        target = c
        break
if target is None and len(target_candidates) > 0:
    target = target_candidates[0]
print("Target provisional:", target)

# 4) Separación de numéricas y categóricas para orientar el EDA
num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
cat_cols = [c for c in df.columns if c not in num_cols]

print(f"Número de columnas numéricas: {len(num_cols)}")
print(f"Número de columnas categóricas: {len(cat_cols)}")

# 5) Top columnas potencialmente informativas:
#    - numéricas con baja proporción de nulos y varianza > 0
#    - categóricas con cardinalidad moderada (2..30 categorías)
informativas_num = [c for c in num_cols
                    if df[c].notna().mean() > 0.9 and df[c].nunique() > 1]
informativas_cat = [c for c in cat_cols
                    if 2 <= df[c].nunique(dropna=True) <= 30]

print("Numéricas informativas (candidatas):", informativas_num[:10])
print("Categóricas informativas (candidatas):", informativas_cat[:10])

# 6) Si hay target binaria, medimos relación rápida:
if target is not None:
    # Codificamos target a 0/1 de forma robusta
    y = df[target].astype(str).str.lower().map(
        {"1":1,"true":1,"yes":1,"si":1,"sí":1,
         "0":0,"false":0,"no":0}
    )
    # Si sigue con NaN (p.ej. 'No'/'Yes' capitalizado o 'Y'/'N'), intentamos fallback
    if y.isna().mean() > 0.3:
        y = (df[target].astype(str).str.lower().isin(["1","true","yes","si","sí","y"])).astype(int)

    # a) Correlación punto-biserial aproximada para numéricas
    import math
    corr_num = {}
    for c in informativas_num:
        s = pd.to_numeric(df[c], errors="coerce")
        m = pd.concat([s, y], axis=1).dropna()
        if m[target].nunique() == 2 and m[c].nunique() > 1:
            try:
                corr_num[c] = m[c].corr(m[target])
            except Exception:
                pass
    corr_num = pd.Series(corr_num).sort_values(key=lambda s: s.abs(), ascending=False).head(10)
    print("\nTop numéricas por |correlación| con target:")
    display(corr_num)

    # b) Dispersión de tasas de churn por categoría (gap máx - min) como señal
    churn_spread = {}
    for c in informativas_cat[:20]:
        tmp = pd.concat([df[c], y], axis=1).dropna()
        if tmp[c].nunique() >= 2:
            rates = tmp.groupby(c)[target].mean()
            churn_spread[c] = float(rates.max() - rates.min())
    churn_spread = pd.Series(churn_spread).sort_values(ascending=False).head(10)
    print("\nTop categóricas por diferencia de tasas de churn (max-min):")
    display(churn_spread)

Shape (filas, columnas): (7267, 6)


Unnamed: 0,customerid,churn,customer,phone,internet,account
0,0002-ORFBO,No,"{'gender': 'Female', 'SeniorCitizen': 0, 'Part...","{'PhoneService': 'Yes', 'MultipleLines': 'No'}","{'InternetService': 'DSL', 'OnlineSecurity': '...","{'Contract': 'One year', 'PaperlessBilling': '..."
1,0003-MKNFE,No,"{'gender': 'Male', 'SeniorCitizen': 0, 'Partne...","{'PhoneService': 'Yes', 'MultipleLines': 'Yes'}","{'InternetService': 'DSL', 'OnlineSecurity': '...","{'Contract': 'Month-to-month', 'PaperlessBilli..."
2,0004-TLHLJ,Yes,"{'gender': 'Male', 'SeniorCitizen': 0, 'Partne...","{'PhoneService': 'Yes', 'MultipleLines': 'No'}","{'InternetService': 'Fiber optic', 'OnlineSecu...","{'Contract': 'Month-to-month', 'PaperlessBilli..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7267 entries, 0 to 7266
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   customerid  7267 non-null   object
 1   churn       7267 non-null   object
 2   customer    7267 non-null   object
 3   phone       7267 non-null   object
 4   internet    7267 non-null   object
 5   account     7267 non-null   object
dtypes: object(6)
memory usage: 340.8+ KB


Unnamed: 0,0
customerid,object
churn,object
customer,object
phone,object
internet,object
account,object


TypeError: unhashable type: 'dict'