In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Build the path to the cleaned churn dataset, relative to the current working directory.
root_dir = Path("../")
data_path = root_dir / "data" / "interim" / "teleco_churn_limpio.csv"

# Only the read lives inside the try block. When the file is missing we report it
# and re-raise: swallowing the error would leave `df` undefined, so every later
# cell would fail with a NameError that hides the real cause.
try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    print("Error: No se encuentra el archivo.")
    raise

# Report the loaded shape as (rows, columns).
print(f"Dimensiones del dataset: {df.shape} (Filas, Columnas)")

Dimensiones del dataset: (7032, 21) (Filas, Columnas)


In [3]:
# List the names of the columns stored with the generic `object` dtype.
df.select_dtypes(include="object").columns

Index(['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService',
       'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
       'Contract', 'PaperlessBilling', 'PaymentMethod'],
      dtype='object')

In [4]:
# Remove the customerID text column so it is not treated as a categorical feature
# (presumably a per-customer identifier — confirm against the data dictionary).
# errors="ignore" keeps the cell re-runnable: a second execution, when the column
# is already gone, is a no-op instead of raising KeyError.
df = df.drop(columns=["customerID"], errors="ignore")


In [5]:
# Re-check which columns still have the `object` dtype at this point.
df.select_dtypes(include="object").columns

Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod'],
      dtype='object')

In [6]:
# Preview the first 20 values of the InternetService column.
df.loc[:, "InternetService"].head(20)

0             DSL
1             DSL
2             DSL
3             DSL
4     Fiber optic
5     Fiber optic
6     Fiber optic
7             DSL
8     Fiber optic
9             DSL
10            DSL
11             No
12    Fiber optic
13    Fiber optic
14    Fiber optic
15    Fiber optic
16             No
17    Fiber optic
18            DSL
19    Fiber optic
Name: InternetService, dtype: object

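Nota: La columna 'InternetService' tiene tres categorías: 'DSL', 'Fiber optic' y 'No'. Al convertirla en variables dummy con `drop_first=True`, pandas creará solo dos columnas nuevas ('InternetService_Fiber optic' e 'InternetService_No'); la primera categoría ('DSL') se omite y queda como categoría de referencia. Así se evita la trampa de las variables ficticias (multicolinealidad perfecta entre las columnas dummy).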

In [7]:
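# Discarded experiment, intentionally left disabled: running it would replace
# `df` with only the 'InternetService' column.
#df = df['InternetService'].drop(columns=['customerID'])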

In [8]:
# No `columns=` argument is passed, so pandas picks the text/categorical columns
# to encode on its own and leaves the numeric ones as they are.
# drop_first=True drops the first level of each encoded column (k-1 indicators);
# dtype=int stores the indicators as integers.
df_processed = pd.get_dummies(
    data=df,
    dtype=int,
    drop_first=True,
)

In [9]:
# Print the encoded frame's summary (column names, non-null counts, dtypes)
# to check the result of the encoding step.
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 31 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   SeniorCitizen                          7032 non-null   int64  
 1   tenure                                 7032 non-null   int64  
 2   MonthlyCharges                         7032 non-null   float64
 3   TotalCharges                           7032 non-null   float64
 4   Churn                                  7032 non-null   int64  
 5   gender_Male                            7032 non-null   int64  
 6   Partner_Yes                            7032 non-null   int64  
 7   Dependents_Yes                         7032 non-null   int64  
 8   PhoneService_Yes                       7032 non-null   int64  
 9   MultipleLines_No phone service         7032 non-null   int64  
 10  MultipleLines_Yes                      7032 non-null   int64  
 11  Inte

In [10]:
# Show the first five rows of the encoded frame as a visual sanity check.
df_processed.head(5)

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,0,1,0,0,1,...,0,0,0,0,0,0,1,0,1,0
1,0,34,56.95,1889.5,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
2,0,2,53.85,108.15,1,1,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1
3,0,45,42.3,1840.75,0,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,0,2,70.7,151.65,1,0,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0


In [11]:
# Write the encoded dataset to data/processed without the index column.
output_path = root_dir / "data" / "processed" / "teleco_churn_final.csv"
# to_csv does not create missing directories, so make sure the target folder
# exists first; exist_ok=True keeps the cell safe to re-run.
output_path.parent.mkdir(parents=True, exist_ok=True)
df_processed.to_csv(output_path, index=False)