In [110]:
import pandas as pd
import numpy as np
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer

In [92]:
df = pd.read_parquet('./data/cleaner_sample.parquet')
df.head().T

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


## 1. pandas get_dummy

- `pd.get_dummies`, can add prefix, but can't split by separator

    this one is quite powerful, it can ignore none object type columns.

- `series.str.get_dummies`, can split values by separator, but can't add prefix.

In [9]:
# method 1,

pd.get_dummies(df.paymentmethod, prefix='payment').head()

Unnamed: 0,payment_bank_transfer_(automatic),payment_credit_card_(automatic),payment_electronic_check,payment_mailed_check
0,0,0,1,0
1,0,0,0,1
2,0,0,0,1
3,1,0,0,0
4,0,0,1,0


In [37]:
# method 1.1, apply to all dataframe, automatically ignore none object type

pd.get_dummies(df, prefix='dummy').head().T

Unnamed: 0,0,1,2,3,4
seniorcitizen,0.00,0.00,0.00,0.00,0.00
tenure,1.00,34.00,2.00,45.00,2.00
monthlycharges,29.85,56.95,53.85,42.30,70.70
totalcharges,29.85,1889.50,108.15,1840.75,151.65
churn,0.00,0.00,1.00,0.00,1.00
...,...,...,...,...,...
dummy_yes,1.00,0.00,1.00,0.00,1.00
dummy_bank_transfer_(automatic),0.00,0.00,0.00,1.00,0.00
dummy_credit_card_(automatic),0.00,0.00,0.00,0.00,0.00
dummy_electronic_check,1.00,0.00,0.00,0.00,1.00


In [35]:
# method 2
series = pd.Series(dtype=str)
series.at[0] = 'c1,c2'
series.at[1] = 'c1'
series.at[2] = 'c3'
series.at[3] = 'c2,c3'
series.at[4] = 'c1,c3'
series

0    c1,c2
1       c1
2       c3
3    c2,c3
4    c1,c3
dtype: object

In [31]:
df_dummy = series.str.get_dummies(',').add_prefix('prefix_')

In [34]:
## finally, merge back with concat
pd.concat([df, df_dummy], axis=1).head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn,prefix_c1,prefix_c2,prefix_c3
0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,no,...,no,month-to-month,yes,electronic_check,29.85,29.85,0,1.0,1.0,0.0
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,...,no,one_year,no,mailed_check,56.95,1889.5,0,1.0,0.0,0.0
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,...,no,month-to-month,yes,mailed_check,53.85,108.15,1,0.0,0.0,1.0
3,7795-cfocw,male,0,no,no,45,no,no_phone_service,dsl,yes,...,no,one_year,no,bank_transfer_(automatic),42.3,1840.75,0,0.0,1.0,1.0
4,9237-hqitu,female,0,no,no,2,yes,no,fiber_optic,no,...,no,month-to-month,yes,electronic_check,70.7,151.65,1,1.0,0.0,1.0


## Sklearn

- `DictVectorizer` can ignore none object type automatically.
- `OneHotEncoder`
- `LabelBinarizer`

In [41]:
# method 1.
dv = DictVectorizer()

## can pass whole data frame, it will ignore numerical types
dv.fit(df[['paymentmethod']].to_dict(orient='records'))

In [60]:
v = dv.transform(df[['paymentmethod']].to_dict(orient='records'))
v.toarray()[:10]

array([[0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.]])

In [55]:
dv.get_feature_names_out()

array(['paymentmethod=bank_transfer_(automatic)',
       'paymentmethod=credit_card_(automatic)',
       'paymentmethod=electronic_check', 'paymentmethod=mailed_check'],
      dtype=object)

In [59]:
dv.inverse_transform(v.toarray()[:5])

[{'paymentmethod=electronic_check': 1.0},
 {'paymentmethod=mailed_check': 1.0},
 {'paymentmethod=mailed_check': 1.0},
 {'paymentmethod=bank_transfer_(automatic)': 1.0},
 {'paymentmethod=electronic_check': 1.0}]

In [107]:
df.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
churn                 int64
dtype: object

In [135]:
## method 2.
transformer = make_column_transformer(
    (OneHotEncoder(), ['paymentmethod']),
    remainder='passthrough'
)
v2 = transformer.fit_transform((df))

In [145]:
# order changed, all columns has prefix
pd.DataFrame(v2, columns=transformer.get_feature_names_out()).head()

Unnamed: 0,onehotencoder__paymentmethod_bank_transfer_(automatic),onehotencoder__paymentmethod_credit_card_(automatic),onehotencoder__paymentmethod_electronic_check,onehotencoder__paymentmethod_mailed_check,remainder__customerid,remainder__gender,remainder__seniorcitizen,remainder__partner,remainder__dependents,remainder__tenure,...,remainder__onlinebackup,remainder__deviceprotection,remainder__techsupport,remainder__streamingtv,remainder__streamingmovies,remainder__contract,remainder__paperlessbilling,remainder__monthlycharges,remainder__totalcharges,remainder__churn
0,0.0,0.0,1.0,0.0,7590-vhveg,female,0,yes,no,1,...,yes,no,no,no,no,month-to-month,yes,29.85,29.85,0
1,0.0,0.0,0.0,1.0,5575-gnvde,male,0,no,no,34,...,no,yes,no,no,no,one_year,no,56.95,1889.5,0
2,0.0,0.0,0.0,1.0,3668-qpybk,male,0,no,no,2,...,yes,no,no,no,no,month-to-month,yes,53.85,108.15,1
3,1.0,0.0,0.0,0.0,7795-cfocw,male,0,no,no,45,...,no,yes,yes,no,no,one_year,no,42.3,1840.75,0
4,0.0,0.0,1.0,0.0,9237-hqitu,female,0,no,no,2,...,no,no,no,no,no,month-to-month,yes,70.7,151.65,1
