#### WOE function for categorical variables

In [1]:
import pandas as pd
import numpy as np
data = pd.read_csv("C:/Users/User/Desktop/Mayada Kh/University/Дипломна/materials/datasets/application_train.csv")

In [2]:
data.TARGET

0         1
1         0
2         0
3         0
4         0
         ..
307506    0
307507    0
307508    0
307509    1
307510    0
Name: TARGET, Length: 307511, dtype: int64

In [3]:
def WOE(df, var, target):
    df[var] = df[var].fillna('missing')
    k = df[[var,target]].groupby(var)[target].agg(['count','sum']).reset_index()
    k.columns = [var,'Count','Good']
    k['Bad'] = k['Count'] - k['Good']
    k['Good %'] = (k['Good'] / k['Good'].sum()*100).round(2)
    k['Bad %'] = (k['Bad'] / k['Bad'].sum()*100).round(2)
    k[var+'_WOE'] = np.log(k['Good %'] / k['Bad %']).round(2)
    k = k.sort_values(by=var+'_WOE')
    return(k)

Testing on the **OCCUPATION_TYPE** column

In [4]:
df_woe = WOE(data, 'OCCUPATION_TYPE', 'TARGET') 
df_woe

Unnamed: 0,OCCUPATION_TYPE,Count,Good,Bad,Good %,Bad %,OCCUPATION_TYPE_WOE
0,Accountants,9813,474,9339,1.91,3.3,-0.55
6,High skill tech staff,11380,701,10679,2.82,3.78,-0.29
10,Managers,21371,1328,20043,5.35,7.09,-0.28
3,Core staff,27570,1738,25832,7.0,9.14,-0.27
5,HR staff,563,36,527,0.15,0.19,-0.24
12,Private service staff,2652,175,2477,0.7,0.88,-0.23
18,missing,96391,6278,90113,25.29,31.88,-0.23
11,Medicine staff,8537,572,7965,2.3,2.82,-0.2
7,IT staff,526,34,492,0.14,0.17,-0.19
15,Secretaries,1305,92,1213,0.37,0.43,-0.15


In the above result we can see each unique value in the **OCCUPATION_TYPE** column, its value counts, number of Good and Bad customers and the corresponding WOE value. We can then use the resulting WOE values in our models.

In [5]:
#Creating a mapping dictionary
a = df_woe.OCCUPATION_TYPE.values
b =  df_woe.OCCUPATION_TYPE_WOE.values
mapping_dict = dict(zip(a,b))
print(mapping_dict)

{'Accountants': -0.55, 'High skill tech staff': -0.29, 'Managers': -0.28, 'Core staff': -0.27, 'HR staff': -0.24, 'Private service staff': -0.23, 'missing': -0.23, 'Medicine staff': -0.2, 'IT staff': -0.19, 'Secretaries': -0.15, 'Realty agents': 0.0, 'Cleaning staff': 0.19, 'Sales staff': 0.19, 'Cooking staff': 0.29, 'Laborers': 0.3, 'Security staff': 0.32, 'Waiters/barmen staff': 0.37, 'Drivers': 0.37, 'Low-skill Laborers': 0.87}


In [6]:
#Add mapped column to dataset
data['OCCUPATION_TYPE_WOE'] = data.OCCUPATION_TYPE.replace(mapping_dict)

In [7]:
data.OCCUPATION_TYPE_WOE.head()

0    0.30
1   -0.27
2    0.30
3    0.30
4   -0.27
Name: OCCUPATION_TYPE_WOE, dtype: float64

#### Target encoding 

In [8]:
def target_encoder(df, column, target, index=None, method='mean'):

    index = df.index if index is None else index # Encode the entire input df if no specific indices is supplied

    if method == 'mean':
        encoded_column = df[column].map(df.iloc[index].groupby(column)[target].mean())
    elif method == 'median':
        encoded_column = df[column].map(df.iloc[index].groupby(column)[target].median())
    elif method == 'std':
        encoded_column = df[column].map(df.iloc[index].groupby(column)[target].std())
    else:
        raise ValueError("Incorrect method supplied: '{}'. Must be one of 'mean', 'median', 'std'".format(method))

    return encoded_column

Testing on the same column

In [9]:
data['OCCUPATION_TYPE'] = data['OCCUPATION_TYPE'].fillna('NoData')

In [10]:
data['OCCUPATION_TYPE_Target'] = target_encoder(data, 'OCCUPATION_TYPE', 'TARGET')

In [11]:
data.OCCUPATION_TYPE_Target.head()

0    0.105788
1    0.063040
2    0.105788
3    0.105788
4    0.063040
Name: OCCUPATION_TYPE_Target, dtype: float64

#### ENCODING FUNCTION

In [12]:
def encode(var, target, df, method = "WoE", index=None):
    
    index = df.index 
    if method == 'WoE':
        df[var] = df[var].fillna('missing')
        k = df[[var,target]].groupby(var)[target].agg(['count','sum']).reset_index()
        k.columns = [var,'Count','Good']
        k['Bad'] = k['Count'] - k['Good']
        k['Good %'] = (k['Good'] / k['Good'].sum()*100).round(2)
        k['Bad %'] = (k['Bad'] / k['Bad'].sum()*100).round(2)
        k[var+'_WOE'] = np.log(k['Good %'] / k['Bad %']).round(2)
        k = k.sort_values(by=var+'_WOE')
        dictionary = dict(zip(k[var], k[var+'_WOE']))

    elif method == 'target_enc':
        k = df.iloc[index].groupby(var)[target].mean().reset_index()
        k.columns = [var, var+'_DepMean']
        k = k.sort_values(by=var+'_DepMean')
        dictionary = dict(zip(k[var], k[var+'_DepMean']))
    
    return dictionary

In [13]:
mapping_dictionary = encode(var ='OCCUPATION_TYPE', method ='target_enc', df =data, target = 'TARGET' )

In [14]:
mapping_dictionary

{'Accountants': 0.04830327117089575,
 'High skill tech staff': 0.06159929701230228,
 'Managers': 0.06214028356183613,
 'Core staff': 0.06303953572723975,
 'HR staff': 0.06394316163410302,
 'IT staff': 0.06463878326996197,
 'missing': 0.06513056198192778,
 'Private service staff': 0.06598793363499246,
 'Medicine staff': 0.06700245988052009,
 'Secretaries': 0.07049808429118774,
 'Realty agents': 0.07856191744340879,
 'Cleaning staff': 0.09606705351386202,
 'Sales staff': 0.0963179864182917,
 'Cooking staff': 0.10443995963673057,
 'Laborers': 0.10578769977892943,
 'Security staff': 0.10742449040321381,
 'Waiters/barmen staff': 0.11275964391691394,
 'Drivers': 0.11326130194054722,
 'Low-skill Laborers': 0.17152412804586717}