# Box-Cox transformation

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import plotly.express as px

## Implementation using NumPy

In [None]:
import numpy as np

def box_cox(x, lmbda):
  """Apply the Box-Cox transformation to ``x`` using NumPy.

  Args:
    x: Scalar or array-like of values to transform.
    lmbda: Transformation parameter.

  Returns:
    ``np.log(x)`` when ``lmbda`` equals 0, otherwise
    ``(x ** lmbda - 1) / lmbda`` evaluated element-wise.
  """
  if lmbda != 0:
    powered = np.power(x, lmbda)
    return (powered - 1) / lmbda

  # lmbda == 0 is the logarithmic special case of the transformation.
  return np.log(x)

## Implementation using Python Standard library

In [None]:
import math

def box_cox_(x, lmbda):
  """Apply the Box-Cox transformation using only the standard library.

  Args:
    x: Iterable of numbers to transform.
    lmbda: Transformation parameter.

  Returns:
    A list with ``math.log(v)`` for every ``v`` in ``x`` when ``lmbda``
    equals 0, otherwise ``(math.pow(v, lmbda) - 1) / lmbda``.
  """
  if lmbda == 0:
    # Logarithmic special case.
    return list(map(math.log, x))

  def _power_transform(value):
    return (math.pow(value, lmbda) - 1) / lmbda

  return list(map(_power_transform, x))

### Assert

In [None]:
array = [1,2,3]
lmbda = 2

# Compare the standard-library and NumPy implementations element-wise.
# np.isclose is used instead of `==` because the two versions compute the
# power with different routines (math.pow vs np.power), so exact float
# equality is not guaranteed for arbitrary lambdas.
np.isclose(box_cox_(array, lmbda), box_cox(array, lmbda))

array([ True,  True,  True])

# Shift values to positive

In [None]:
def shift_to_positive(x):
  """Shift ``x`` so that its smallest value becomes strictly positive.

  Args:
    x: Array-like of numbers.

  Returns:
    A tuple ``(shifted, shift)``. When the minimum of ``x`` is already
    greater than 0, ``x`` is returned unchanged together with a shift of 0.
    Otherwise the shift is ``abs(min) + 1`` and ``shifted`` is ``x + shift``.
  """
  smallest = np.min(x)
  if not smallest > 0:
    # Adding abs(min) + 1 moves the minimum to exactly 1.
    offset = np.abs(smallest) + 1
    return x + offset, offset

  return x, 0

In [None]:
# Demo: the minimum is -2, so the applied shift is abs(-2) + 1 = 3.
array = np.array([-2, -1, 0])
shift_to_positive(array)

(array([1, 2, 3]), 3)

# Lambda search using SciPy

In [None]:
# Calling stats.boxcox without an explicit lambda returns two values, unpacked
# here as the transformed data and the lambda that SciPy selected.
x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
x_transformed, lmbda_for_x = stats.boxcox(x)

# Show the transformed values (rounded to 3 decimals) and the chosen lambda.
print(f'x_transformed: {x_transformed.round(3)} \nlmbda_for_x: {lmbda_for_x}')

x_transformed: [0.    0.899 1.674 2.38  3.036 3.657 4.249 4.818 5.368] 
lmbda_for_x: 0.7200338588580096


## Real-world scenario

In [None]:
# Load the gapminder sample dataset via plotly express and preview it.
df_gap_minder = px.data.gapminder() 
df_gap_minder.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,iso_alpha,iso_num
0,Afghanistan,Asia,1952,28.801,8425333,779.445314,AFG,4
1,Afghanistan,Asia,1957,30.332,9240934,820.85303,AFG,4
2,Afghanistan,Asia,1962,31.997,10267083,853.10071,AFG,4
3,Afghanistan,Asia,1967,34.02,11537966,836.197138,AFG,4
4,Afghanistan,Asia,1972,36.088,13079460,739.981106,AFG,4


In [None]:
# Keep only the rows for year 2007 and take their 'pop' column.
population_in_2007 =  df_gap_minder.loc[lambda df: df.year == 2007]['pop']

### Input data distribution

In [None]:
# Histogram of the untransformed 2007 population values.
px.histogram(x=population_in_2007)

In [None]:
# Box-Cox transform of the 2007 populations; no lambda is passed, so SciPy
# also returns the lambda it selected (per the variable name, presumably the
# one maximizing the log-likelihood).
population_in_2007_box_cox, lmda_maximinzing_ll_for_population_in_2007 = stats.boxcox(population_in_2007)

In [None]:

from plotly.subplots import make_subplots
import plotly.graph_objects as go

# One histogram panel per variant of the data, keyed by its subplot title:
# raw values, natural log, and the Box-Cox transformed values.
histogram_panels = {
    'X histogram': population_in_2007,
    'log(X) histogram': np.log(population_in_2007),
    'boxcox(X) histogram': population_in_2007_box_cox,
}

fig = make_subplots(
    rows=1,
    cols=len(histogram_panels),
    subplot_titles=list(histogram_panels),
)

# Place the panels left to right in a single row (columns are 1-based).
for panel_column, panel_values in enumerate(histogram_panels.values(), start=1):
    fig.add_trace(go.Histogram(x=panel_values), row=1, col=panel_column)

fig.update_layout(showlegend=False)

# Implement Box-Cox as scikit-learn Transformer

In [None]:
from sklearn.base import (
    TransformerMixin, 
    BaseEstimator
)
class BoxCoxTransformer(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer applying the Box-Cox transformation.

    ``fit`` estimates lambda from the data with ``scipy.stats.boxcox`` and
    ``transform`` applies the Box-Cox transformation with that fixed lambda.
    """

    # Lambda estimated by ``fit``; not set until ``fit`` has been called.
    fitted_lambda: float

    def fit(self, x: np.ndarray, y=None) -> 'BoxCoxTransformer':
        """Estimate the Box-Cox lambda from ``x``.

        Args:
            x: Data passed to ``scipy.stats.boxcox`` for lambda estimation.
            y: Ignored. Accepted so the transformer also works when a target
                is passed along (e.g. ``fit_transform(X, y)``).

        Returns:
            The fitted transformer itself.
        """
        _, self.fitted_lambda = stats.boxcox(x)
        return self

    def transform(self, x: np.ndarray) -> np.ndarray:
        """Apply Box-Cox to ``x`` using the lambda found by ``fit``.

        Args:
            x: Array-like of values to transform.

        Returns:
            A NumPy array with the transformed values.

        Raises:
            AttributeError: If called before ``fit`` (no ``fitted_lambda``).
        """
        from scipy import special

        # Apply the fixed-lambda transform directly. Going through
        # stats.boxcox can raise for single-element input in some SciPy
        # releases, which would make transforming a single sample impossible.
        # np.asarray keeps the ndarray return type for pandas input.
        return special.boxcox(np.asarray(x), self.fitted_lambda)
        

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the data as a test set. No random_state is passed, so the
# split presumably differs between runs.
X_train, X_test = train_test_split(population_in_2007, test_size=.2, shuffle=True)
# Estimate lambda on the training data only, then apply that same lambda to
# both the training and the test data.
transformer = BoxCoxTransformer().fit(X_train)
X_train_transformed = transformer.transform(X_train)
X_test_transformed = transformer.transform(X_test)

In [None]:
# Long-format frame with one row per transformed value; `kind` records which
# split the value came from so the two distributions can be compared.
df = pd.concat([
    pd.DataFrame(dict(value=X_train_transformed, kind='X_train_transformed')),
    pd.DataFrame(dict(value=X_test_transformed, kind='X_test_transformed')),
])

In [None]:
# Preview the first two rows of the combined frame.
df.head(2)

Unnamed: 0,value,kind
0,19.42937,X_train_transformed
1,16.746938,X_train_transformed


In [None]:
# Overlay the histograms of both splits, colored by `kind`.
# The title is Polish for "Comparison of the distribution shape of the
# training and test sets".
# NOTE(review): 'rozkłazdu' looks like a misspelling of 'rozkładu' - confirm.
px.histogram(df, x='value', color='kind', title='Porównanie kształtu rozkłazdu zbioru treningowego i testowego')