In [64]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [65]:
df = pd.read_csv('data/raw/telco_customer_churn.csv')

In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [67]:
def Encoder(df):
    encoder = LabelEncoder()
    
    df['Churn'] = encoder.fit_transform(df['Churn'])

    for col in df.columns:
        if col == 'Churn':
            continue
        if df[col].dtype == 'object':
            if df[col].nunique() <= 5:
                dummies = pd.get_dummies(df[col], prefix=col, dtype=int)
                df = pd.concat([df.drop(columns=[col]), dummies], axis=1)
            else:
                df[col] = encoder.fit_transform(df[col])
    return df


In [68]:
df = Encoder(df)

In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 47 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   customerID                               7043 non-null   int64  
 1   SeniorCitizen                            7043 non-null   int64  
 2   tenure                                   7043 non-null   int64  
 3   MonthlyCharges                           7043 non-null   float64
 4   TotalCharges                             7043 non-null   int64  
 5   Churn                                    7043 non-null   int64  
 6   gender_Female                            7043 non-null   int64  
 7   gender_Male                              7043 non-null   int64  
 8   Partner_No                               7043 non-null   int64  
 9   Partner_Yes                              7043 non-null   int64  
 10  Dependents_No                            7043 no

In [70]:
df.head(10)

Unnamed: 0,customerID,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,5375,0,1,29.85,2505,0,1,0,0,1,...,0,1,0,0,0,1,0,0,1,0
1,3962,0,34,56.95,1466,0,0,1,1,0,...,0,0,1,0,1,0,0,0,0,1
2,2564,0,2,53.85,157,1,0,1,1,0,...,0,1,0,0,0,1,0,0,0,1
3,5535,0,45,42.3,1400,0,0,1,1,0,...,0,0,1,0,1,0,1,0,0,0
4,6511,0,2,70.7,925,1,1,0,1,0,...,0,1,0,0,0,1,0,0,1,0
5,6551,0,8,99.65,6104,1,1,0,1,0,...,1,1,0,0,0,1,0,0,1,0
6,1002,0,22,89.1,1550,0,0,1,1,0,...,0,1,0,0,0,1,0,1,0,0
7,4770,0,10,29.75,2609,0,1,0,1,0,...,0,1,0,0,1,0,0,0,0,1
8,5604,0,28,104.8,2646,1,1,0,0,1,...,1,1,0,0,0,1,0,0,1,0
9,4534,0,62,56.15,3022,0,0,1,1,0,...,0,0,1,0,1,0,1,0,0,0


In [71]:
def Scaler(df):
    scaler = MinMaxScaler()

    num_col = df.select_dtypes(include=['float64', 'int64']).columns.drop('Churn')
    df[num_col] = scaler.fit_transform(df[num_col])
    
    return df

In [72]:
df = Scaler(df)

In [73]:
df.head(10)

Unnamed: 0,customerID,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0.763277,0.0,0.013889,0.115423,0.383614,0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.562624,0.0,0.472222,0.385075,0.224502,0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.364101,0.0,0.027778,0.354229,0.024043,1,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.785998,0.0,0.625,0.239303,0.214395,0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.924595,0.0,0.027778,0.521891,0.141654,1,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5,0.930275,0.0,0.111111,0.80995,0.934763,1,1.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
6,0.142289,0.0,0.305556,0.704975,0.237366,0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
7,0.677364,0.0,0.138889,0.114428,0.399541,0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
8,0.795797,0.0,0.388889,0.861194,0.405207,1,1.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
9,0.643851,0.0,0.861111,0.377114,0.462787,0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [74]:
import os

In [75]:
output_folder = 'data/preprocessed'

os.makedirs(output_folder, exist_ok=True)
output_path = os.path.join(output_folder, 'preprocessed_telco_customer_churn.csv')
df.to_csv(output_path, index=False)

print(f"preprocessed dataset saved to: {output_path}")

preprocessed dataset saved to: data/preprocessed\preprocessed_telco_customer_churn.csv
