* Backward difference coding is another variant of contrast coding: it compares the mean of the dependent variable for one level of a categorical feature with the mean of the dependent variable for the previous level.
* It is quite useful when working with ordinal data, and it also captures the linear effect of the category on the dependent variable `y`.

In [2]:
#6th contrast coding technique

import category_encoders as ce
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from statsmodels.formula.api import ols



In [3]:
# Load the iris measurements from the local CSV.
iris_data = pd.read_csv("./datasets/iris.csv")

# Peek at rows from across the file; a fixed random_state makes the preview
# identical on every Restart & Run All instead of changing per execution.
iris_data.sample(10, random_state=0)

Unnamed: 0,Species,sepal_length,sepal_width,petal_length,petal_width
77,Iris-versicolor,6.7,3.0,5.0,1.7
34,Iris-setosa,4.9,3.1,1.5,0.1
118,Iris-virginica,7.7,2.6,6.9,2.3
120,Iris-virginica,6.9,3.2,5.7,2.3
27,Iris-setosa,5.2,3.5,1.5,0.2
40,Iris-setosa,5.0,3.5,1.3,0.3
149,Iris-virginica,5.9,3.0,5.1,1.8
32,Iris-setosa,5.2,4.1,1.5,0.1
96,Iris-versicolor,5.7,2.9,4.2,1.3
108,Iris-virginica,6.7,2.5,5.8,1.8


In [4]:
# Keep only the categorical predictor (Species) and the response (petal_length).
# Re-binding the name instead of using inplace=True, together with
# errors="ignore", keeps this cell re-runnable: executing it a second time no
# longer raises KeyError for columns that were already removed.
iris_data = iris_data.drop(
    columns=["sepal_length", "sepal_width", "petal_width"],
    errors="ignore",
)

# Seeded preview so the displayed rows are reproducible.
iris_data.sample(5, random_state=0)

Unnamed: 0,Species,petal_length
140,Iris-virginica,5.6
24,Iris-setosa,1.9
48,Iris-setosa,1.5
71,Iris-versicolor,4.0
100,Iris-virginica,6.0


In [6]:
# Summary statistics of the numeric columns (only petal_length is left at this point).
iris_data.describe()

Unnamed: 0,petal_length
count,150.0
mean,3.758667
std,1.76442
min,1.0
25%,1.6
50%,4.35
75%,5.1
max,6.9


In [7]:
# Average petal length within each species.
(
    iris_data
    .groupby("Species")
    .mean()
)

Unnamed: 0_level_0,petal_length
Species,Unnamed: 1_level_1
Iris-setosa,1.464
Iris-versicolor,4.26
Iris-virginica,5.552


In [8]:
# Step between consecutive species means: each row is that species' mean minus
# the mean of the species listed just before it (the first row has no
# predecessor, hence NaN).
(
    iris_data
    .groupby("Species")
    .mean()
    .diff()
)

Unnamed: 0_level_0,petal_length
Species,Unnamed: 1_level_1
Iris-setosa,
Iris-versicolor,2.796
Iris-virginica,1.292


In [9]:
# Backward difference encoding through the formula interface: wrapping the
# factor as C(Species, Diff) requests the Diff contrast for Species.
# In the fitted summary the two Diff coefficients reproduce the adjacent-level
# mean differences computed above (2.796 and 1.292).
mod = ols("petal_length ~ C(Species, Diff)", data=iris_data)
res = mod.fit()
res.summary()

0,1,2,3
Dep. Variable:,petal_length,R-squared:,0.941
Model:,OLS,Adj. R-squared:,0.941
Method:,Least Squares,F-statistic:,1179.0
Date:,"Tue, 12 Jan 2021",Prob (F-statistic):,3.05e-91
Time:,14:40:39,Log-Likelihood:,-84.84
No. Observations:,150,AIC:,175.7
Df Residuals:,147,BIC:,184.7
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.7587,0.035,106.978,0.000,3.689,3.828
"C(Species, Diff)[D.Iris-setosa]",2.7960,0.086,32.488,0.000,2.626,2.966
"C(Species, Diff)[D.Iris-versicolor]",1.2920,0.086,15.012,0.000,1.122,1.462

0,1,2,3
Omnibus:,4.393,Durbin-Watson:,2.0
Prob(Omnibus):,0.111,Jarque-Bera (JB):,5.37
Skew:,0.121,Prob(JB):,0.0682
Kurtosis:,3.895,Cond. No.,3.0


**Alternative way of doing backward difference encoding (with `category_encoders`)**

In [11]:
# Build the backward-difference encoder; `cols` restricts it to the Species column.
encoder = ce.BackwardDifferenceEncoder(cols=["Species"])
encoder

BackwardDifferenceEncoder(cols=['Species'])

In [13]:
# Fit the encoder and transform in one step. As the preview shows, the result
# carries an `intercept` column plus the contrast columns Species_0 / Species_1
# in place of Species, while petal_length passes through unchanged.
species_encoded = encoder.fit_transform(iris_data)

# Seeded preview so the displayed rows are the same on every run.
species_encoded.sample(5, random_state=0)

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,intercept,Species_0,Species_1,petal_length
90,1,0.333333,-0.333333,4.4
62,1,0.333333,-0.333333,4.0
110,1,0.333333,0.666667,5.1
115,1,0.333333,0.666667,5.3
139,1,0.333333,0.666667,5.4


In [14]:
# Put the original Species labels next to their encoded columns so each
# label can be read against its contrast values.
encoded_iris = pd.concat([iris_data["Species"], species_encoded], axis=1)

# Seeded preview so the displayed rows are the same on every run.
encoded_iris.sample(6, random_state=0)

Unnamed: 0,Species,intercept,Species_0,Species_1,petal_length
77,Iris-versicolor,1,0.333333,-0.333333,5.0
53,Iris-versicolor,1,0.333333,-0.333333,4.0
39,Iris-setosa,1,-0.666667,-0.333333,1.5
76,Iris-versicolor,1,0.333333,-0.333333,4.8
98,Iris-versicolor,1,0.333333,-0.333333,3.0
79,Iris-versicolor,1,0.333333,-0.333333,3.5


In [16]:
# Response: the petal length column.
y = encoded_iris["petal_length"]

# Design matrix: everything except the raw label and the response,
# i.e. the encoder's intercept and contrast columns.
X = encoded_iris.drop(columns=["Species", "petal_length"])

In [17]:
# The encoded data already contains an explicit `intercept` column, so the
# model is fitted without an intercept of its own.
# (LinearRegression is imported with the other dependencies at the top of the notebook.)
linear_model = LinearRegression(fit_intercept=False)

linear_model.fit(X, y)
print("Training score: ", linear_model.score(X, y))

Training score:  0.9413189735606261


In [18]:
# Fitted coefficients, one per column of X in column order
# (intercept, Species_0, Species_1).
linear_model.coef_

array([3.75866667, 2.796     , 1.292     ])