In [7]:
import pandas as pd

# Load the customer records from the CSV file in the working directory
customers_df = pd.read_csv(filepath_or_buffer="customers.csv")
# Preview the first five rows
customers_df.head(n=5)

Unnamed: 0,ID,CHURNRISK,GENDER,STATUS,CHILDREN,ESTINCOME,HOMEOWNER,AGE,TOTALDOLLARVALUETRADED,TOTALUNITSTRADED,LARGESTSINGLETRANSACTION,SMALLESTSINGLETRANSACTION,PERCENTCHANGECALCULATION,DAYSSINCELASTLOGIN,DAYSSINCELASTTRADE,NETREALIZEDGAINS_YTD,NETREALIZEDLOSSES_YTD
0,1703,Medium,,M,2,28766.9,N,47,6110.61,58,1527.6525,152.76525,8.7,2,13,0.0,152.76525
1,1704,Low,,M,2,91272.2,Y,25,26992.7,13,13496.35,1349.635,3.25,4,10,1349.635,0.0
2,1705,Low,,S,0,73228.3,N,42,22472.25,28,11236.125,1123.6125,7.0,4,5,1123.6125,0.0
3,1706,High,M,M,1,64792.3,N,52,13051.31,36,6525.655,652.5655,9.0,3,6,0.0,652.5655
4,1707,High,F,S,0,93322.1,Y,40,29922.99,8,14961.495,1496.1495,2.0,4,9,0.0,1496.1495


In [8]:
# Per-column tally of missing entries (isnull is the alias of isna)
customers_df.isnull().sum(axis=0)

ID                           0
CHURNRISK                    0
GENDER                       3
STATUS                       0
CHILDREN                     0
ESTINCOME                    0
HOMEOWNER                    0
AGE                          0
TOTALDOLLARVALUETRADED       0
TOTALUNITSTRADED             0
LARGESTSINGLETRANSACTION     0
SMALLESTSINGLETRANSACTION    0
PERCENTCHANGECALCULATION     0
DAYSSINCELASTLOGIN           0
DAYSSINCELASTTRADE           0
NETREALIZEDGAINS_YTD         0
NETREALIZEDLOSSES_YTD        0
dtype: int64

In [9]:
# Drop unnecessary columns.
# errors='ignore' keeps this cell idempotent: re-running it after 'ID' has
# already been removed is a no-op instead of raising KeyError.
customers_df = customers_df.drop(columns='ID', errors='ignore')

In [10]:
# Names of the columns stored as 64-bit integers or floats
numerical_columns = customers_df.select_dtypes(
    include=["float64", "int64"],
).columns
numerical_columns

Index(['CHILDREN', 'ESTINCOME', 'AGE', 'TOTALDOLLARVALUETRADED',
       'TOTALUNITSTRADED', 'LARGESTSINGLETRANSACTION',
       'SMALLESTSINGLETRANSACTION', 'PERCENTCHANGECALCULATION',
       'DAYSSINCELASTLOGIN', 'DAYSSINCELASTTRADE', 'NETREALIZEDGAINS_YTD',
       'NETREALIZEDLOSSES_YTD'],
      dtype='object')

In [11]:
# Names of the columns stored with the generic object dtype (strings here)
# NOTE(review): CHURNRISK is selected here too; if it is the prediction
# target, confirm it is meant to be encoded together with the features.
categorical_columns = customers_df.select_dtypes(
    include=["object"],
).columns
categorical_columns

Index(['CHURNRISK', 'GENDER', 'STATUS', 'HOMEOWNER'], dtype='object')

In [12]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Building blocks shared by the per-type pipelines below
std_scaler = StandardScaler()
impute_categorical = SimpleImputer(strategy='most_frequent')
onehot_categorical = OneHotEncoder(handle_unknown='ignore')

# Categorical columns: fill missing values with the most frequent value,
# then one-hot encode the result
categorical_transformer = Pipeline(steps=[
    ('impute', impute_categorical),
    ('onehot', onehot_categorical)
    ])

# Numerical columns: standardize with StandardScaler
numerical_transformer = Pipeline(steps=[('scale', std_scaler)])

# Create the preprocessor.
# sparse_threshold=0 forces a dense array result: the one-hot encoder emits
# sparse output, and ColumnTransformer would otherwise return a sparse matrix
# whenever the overall density is low, which the DataFrame construction
# below is not written to handle.
preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_columns),
    ('num', numerical_transformer, numerical_columns)
    ], remainder='passthrough', sparse_threshold=0)

# Fit and transform the data
df_transformed = preprocessor.fit_transform(customers_df)

# Rebuild a labelled frame; reuse the source index so rows stay aligned with
# customers_df even if its index is not the default 0..n-1 range
transformed_df = pd.DataFrame(
    df_transformed,
    columns=preprocessor.get_feature_names_out(),
    index=customers_df.index,
)

# Print the transformed data
transformed_df.info()
transformed_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2066 entries, 0 to 2065
Data columns (total 22 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   cat__CHURNRISK_High             2066 non-null   float64
 1   cat__CHURNRISK_Low              2066 non-null   float64
 2   cat__CHURNRISK_Medium           2066 non-null   float64
 3   cat__GENDER_F                   2066 non-null   float64
 4   cat__GENDER_M                   2066 non-null   float64
 5   cat__STATUS_D                   2066 non-null   float64
 6   cat__STATUS_M                   2066 non-null   float64
 7   cat__STATUS_S                   2066 non-null   float64
 8   cat__HOMEOWNER_N                2066 non-null   float64
 9   cat__HOMEOWNER_Y                2066 non-null   float64
 10  num__CHILDREN                   2066 non-null   float64
 11  num__ESTINCOME                  2066 non-null   float64
 12  num__AGE                        20

Unnamed: 0,cat__CHURNRISK_High,cat__CHURNRISK_Low,cat__CHURNRISK_Medium,cat__GENDER_F,cat__GENDER_M,cat__STATUS_D,cat__STATUS_M,cat__STATUS_S,cat__HOMEOWNER_N,cat__HOMEOWNER_Y,...,num__AGE,num__TOTALDOLLARVALUETRADED,num__TOTALUNITSTRADED,num__LARGESTSINGLETRANSACTION,num__SMALLESTSINGLETRANSACTION,num__PERCENTCHANGECALCULATION,num__DAYSSINCELASTLOGIN,num__DAYSSINCELASTTRADE,num__NETREALIZEDGAINS_YTD,num__NETREALIZEDLOSSES_YTD
0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.281434,-1.011294,-0.010312,-1.168367,-1.168367,-0.332917,-0.602899,0.857719,-0.679273,-0.510607
1,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,-1.19346,1.094057,-0.790986,1.084439,1.084439,-0.723373,0.732138,0.151724,1.835463,-0.795935
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,-0.053769,0.638301,-0.530761,0.659009,0.659009,-0.45471,0.732138,-1.024934,1.414322,-0.795935
3,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.616638,-0.311526,-0.391974,-0.227619,-0.227619,-0.311424,0.064619,-0.789603,-0.679273,0.422898
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,-0.18785,1.389491,-0.877727,1.360216,1.360216,-0.812928,0.732138,-0.083608,-0.679273,1.998506
