In [1]:
from category_encoders import TargetEncoder 
## http://contrib.scikit-learn.org/category_encoders/targetencoder.html
import pandas as pd

### Regression tasks 
+ target is numerical
+ feature: categorical -> numerical, replaced by a blend of the expected value of the target given that category and the expected value of the target over all the training data

In [2]:
# Read the house-prices CSV files; the test file is bound directly to X_test.
train = pd.read_csv("house-prices-advanced-regression-techniques/train.csv")
X_test = pd.read_csv("house-prices-advanced-regression-techniques/test.csv")

In [3]:
# Separate the regression target (SalePrice) from the predictor columns.
y_train = train['SalePrice']
X_train = train.drop(columns=['SalePrice'])

In [4]:
# Show the column labels of the training feature frame.
X_train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [5]:
# Category frequencies of MSZoning in the training features;
# dropna=False would add a NaN row if any value were missing.
X_train['MSZoning'].value_counts(dropna=False)

RL         1151
RM          218
FV           65
RH           16
C (all)      10
Name: MSZoning, dtype: int64

In [6]:
# Same frequency table for the test features; dropna=False keeps missing
# values visible as their own NaN row.
X_test['MSZoning'].value_counts(dropna=False)

RL         1114
RM          242
FV           74
C (all)      15
RH           10
NaN           4
Name: MSZoning, dtype: int64

In [7]:
# Fit a target encoder for MSZoning using the training features and target.
# `cols` is documented as a list of column names, so pass a list rather than
# relying on a bare string being coerced.
te = TargetEncoder(cols=['MSZoning']).fit(X_train, y_train)

In [8]:
# Apply the fitted encoder to the training features; MSZoning becomes numeric.
X_train_encoded = te.transform(X_train)

In [9]:
# Each original MSZoning category should now appear as one encoded number
# with the same row count as before.
X_train_encoded['MSZoning'].value_counts()

191004.994787    1151
126316.830275     218
214014.061538      65
131558.390100      16
74541.128343       10
Name: MSZoning, dtype: int64

In [10]:
# Encode the test features with the encoder that was fitted on the training data.
X_test_encoded = te.transform(X_test)

In [11]:
# Rows whose MSZoning was NaN receive the overall mean of the training target.
X_test_encoded['MSZoning'].value_counts()

191004.994787    1114
126316.830275     242
214014.061538      74
74541.128343       15
131558.390100      10
180921.195890       4
Name: MSZoning, dtype: int64

In [13]:
# Overall mean of the training target, for comparison with the encoded
# value given to the test rows whose MSZoning was missing.
y_train.mean()

180921.19589041095

### Classification tasks
+ target is categorical
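+ feature: categorical -> numerical, replaced by a blend of the posterior probability of the target given that category and the prior probability of the target over all the training data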

In [17]:
# Read the telco churn CSV files.
# NOTE: this rebinds `train`, which held the house-prices frame until now.
train = pd.read_csv("telco_churn/train.csv")
test = pd.read_csv("telco_churn/test.csv")

In [22]:
# Preview the first rows of the churn training frame.
train.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,5442-PPTJY,Male,0,Yes,Yes,12,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.7,258.35,No
1,6261-RCVNS,Female,0,No,No,42,Yes,No,DSL,Yes,...,Yes,Yes,No,Yes,One year,No,Credit card (automatic),73.9,3160.55,Yes
2,2176-OSJUV,Male,0,Yes,No,71,Yes,Yes,DSL,Yes,...,No,Yes,No,No,Two year,No,Bank transfer (automatic),65.15,4681.75,No
3,6161-ERDGD,Male,0,Yes,Yes,71,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,No,Electronic check,85.45,6300.85,No
4,2364-UFROM,Male,0,No,No,30,Yes,No,DSL,Yes,...,No,Yes,Yes,No,One year,No,Electronic check,70.4,2044.75,No


In [20]:
# Category frequencies of PaymentMethod in the training frame;
# dropna=False would add a NaN row if any value were missing.
train['PaymentMethod'].value_counts(dropna=False)

Electronic check             1893
Mailed check                 1305
Bank transfer (automatic)    1219
Credit card (automatic)      1217
Name: PaymentMethod, dtype: int64

In [21]:
# Same frequency table for the test frame, with missing values counted too.
test['PaymentMethod'].value_counts(dropna=False)

Electronic check             472
Bank transfer (automatic)    325
Mailed check                 307
Credit card (automatic)      305
Name: PaymentMethod, dtype: int64

In [26]:
# Encode the churn label as 0/1: 'Yes' -> 1, anything else -> 0.
# Also accepting an already-encoded 1 keeps this cell idempotent: the column
# is overwritten in place, and without it a second run would compare the
# integers against 'Yes' and silently reset every label to 0.
train['Churn'] = train['Churn'].isin(['Yes', 1]).astype('int64')
train['Churn'].value_counts()

0    4113
1    1521
Name: Churn, dtype: int64

In [27]:
# Separate the binary churn target from the predictors, and drop the label
# column from the test frame so it holds features only.
y_train = train['Churn']
X_train = train.drop(columns=['Churn'])
X_test = test.drop(columns=['Churn'])

In [29]:
# Count how many training rows fall in each churn class.
y_train.value_counts()

0    4113
1    1521
Name: Churn, dtype: int64

In [30]:
# Fit a target encoder for PaymentMethod using the churn features and 0/1 target.
# `cols` is documented as a list of column names, so pass a list rather than
# relying on a bare string being coerced.
te = TargetEncoder(cols=['PaymentMethod']).fit(X_train, y_train)

In [31]:
# Encode the training features, then check that each payment method maps
# to a single numeric value.
X_train_encoded = te.transform(X_train)
X_train_encoded['PaymentMethod'].value_counts()

0.455890    1893
0.193870    1305
0.168171    1219
0.164339    1217
Name: PaymentMethod, dtype: int64

In [32]:
# Encode the test features with the encoder fitted on the training data,
# then tabulate the resulting values.
X_test_encoded = te.transform(X_test)
X_test_encoded['PaymentMethod'].value_counts()

0.455890    472
0.168171    325
0.193870    307
0.164339    305
Name: PaymentMethod, dtype: int64