In [None]:
import os
import sys

# Put the parent of the current working directory on sys.path (once),
# presumably so the local `sonic_screwdrivers` package imported below
# resolves -- TODO confirm the repository layout.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [1]:
from __future__ import print_function

import pandas as pd
import numpy as np
import scipy as sp

from scipy.stats import skew

In [2]:
from sonic_screwdrivers import DFLogNormalizer

[nltk_data] Downloading package 'stopwords' to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package 'punkt' to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package 'maxent_treebank_pos_tagger' to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package maxent_treebank_pos_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package 'wordnet' to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [25]:
# Build the normalizer with its default settings; the repr printed after
# fitting shows them as fillna_after=True, fillna_strategy='raise',
# skew_cutoff=1.4.
Norm = DFLogNormalizer()

In [26]:
# Three small numeric fixtures:
#   x1 - mildly skewed counts
#   x2 - a single large negative outlier followed by zeros
#   x3 - the mirror image of x2 (a single large positive outlier)
x1 = [1, 1, 0, 2, 3, 0]
x2 = [-10] + [0] * 5
x3 = [10] + [0] * 5

In [27]:
# Sample skewness of x2; negative because of the single -10 outlier.
skew(x2)

-1.7888543819998317

In [28]:
# Sample skewness of x3; same magnitude as x2 but positive.
skew(x3)

1.7888543819998317

In [29]:
# Training data: three categorical columns (A ordered, B unordered, C with an
# extra unused category 'd') plus the numeric fixtures x1, x2, x3 as D, E, F.
df = pd.DataFrame({
    "A": pd.Categorical(list('abcabc'), ordered=True),
    "B": pd.Categorical(list('abcabc'), ordered=False),
    "C": pd.Categorical(list('abcabc'), categories=list('abcd')),
    "D": x1,
    "E": x2,
    "F": x3,
})

In [30]:
# Test data: same column layout as the training frame, five rows, with
# numeric values that differ from the training fixtures.
df_test = pd.DataFrame({
    "A": pd.Categorical(list('abbab')),
    "B": pd.Categorical(list('abcab')),
    "C": pd.Categorical(list('abcab')),
    "D": list(range(1, 6)),
    "E": [-2, -3, -4, -5, 1],
    "F": [1, 2, 4, 4, 4],
})

Use the transform 
===

In [31]:
# Learn the per-column normalization parameters from the training frame.
Norm.fit(df)

DFLogNormalizer(fillna_after=True, fillna_strategy='raise', skew_cutoff=1.4)

In [32]:
# Inspect the fitted parameters; the output has one entry per numeric column
# (D, E, F) and none for the categorical columns.
Norm.norm_params_map_

{'D': {'mean': 1.1666666666666667,
  'na_replacement': nan,
  'std': 1.0671873729054748,
  'take_log': False},
 'E': {'max_val_for_log': 0,
  'mean': None,
  'na_replacement': nan,
  'std': 0.89364280548375985,
  'take_log': True},
 'F': {'mean': None,
  'min_val_for_log': 0,
  'na_replacement': nan,
  'std': 0.89364280548375985,
  'take_log': True}}

In [33]:
# Apply the fitted normalization to the training data itself.
Norm.transform(df)

Unnamed: 0,A,B,C,D,E,F
0,a,a,a,-0.156174,2.683282,2.683282
1,b,b,b,-0.156174,0.0,0.0
2,c,c,c,-1.093216,0.0,0.0
3,a,a,a,0.780869,0.0,0.0
4,b,b,b,1.717911,0.0,0.0
5,c,c,c,-1.093216,0.0,0.0


In [34]:
# Apply the parameters learned on the training data to the test frame.
Norm.transform(df_test)

Unnamed: 0,A,B,C,D,E,F
0,a,a,a,-0.156174,1.229364,0.775642
1,b,b,b,0.780869,1.551285,1.229364
2,b,c,c,1.717911,1.800986,1.800986
3,a,a,a,2.654954,2.005006,1.800986
4,b,b,b,3.591997,0.0,1.800986


### Here I'm manually checking the data to ensure the transformer does the correct thing

#### For unskewed data, subtract mean and divide by standard deviation

In [35]:
# Mean of x1, ignoring NaNs.
m1 = np.nanmean(x1)

In [36]:
# Standard deviation of x1, ignoring NaNs (np.nanstd default, i.e. ddof=0).
s1 = np.nanstd(x1)

In [37]:
# Skewness of x1, to compare against the transformer's skew cutoff.
skew1 = skew(x1)

In [38]:
# Display the skewness of x1.
skew1

0.48756686638394064

In [39]:
# Manual z-score of x1; should match column D of Norm.transform(df).
# x1 is a plain list, so convert it explicitly before doing array arithmetic.
(np.asarray(x1) - m1) / s1

array([-0.15617376, -0.15617376, -1.09321633,  0.78086881,  1.71791138,
       -1.09321633])

#### For negatively skewed data, subtract the data from (max value + 1) and take the log. Then normalize by dividing by the standard deviation of the logged values

In [40]:
# Mean of the raw x2 values. Note: m2 is not used in the manual check below.
m2 = np.nanmean(x2)

In [41]:
# Skewness of the raw x2 values.
skew2 = skew(x2)

In [42]:
# Display the skewness of x2 (negative).
skew2

-1.7888543819998317

In [43]:
# Largest value of x2; used to reflect the data before taking the log.
max_val = np.max(x2)

In [44]:
# Reflect x2 around (max + 1) so every value is >= 1, then take the log.
# x2 is a plain list, so convert it explicitly before doing array arithmetic.
log_x2 = np.log((max_val + 1) - np.asarray(x2))

In [45]:
# Display the reflected-and-logged x2 values.
log_x2

array([ 2.39789527,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ])

In [46]:
# Standard deviation of the logged values, ignoring NaNs.
s2 = np.nanstd(log_x2)

In [47]:
# Skewness after the reflect-and-log step.
skew(log_x2)

1.7888543819998313

In [48]:
# Scale by the standard deviation only (no mean is subtracted here); the
# result should match column E of Norm.transform(df).
log_x2/s2

array([ 2.68328157,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ])

#### For positively skewed data, shift the data by (1 - minimum value) so that the minimum becomes 1, and then take the log. Then normalize by dividing by the standard deviation of the logged values.

In [49]:
# Mean of the raw x3 values. Note: m3 is not used in the manual check below.
m3 = np.nanmean(x3)

In [50]:
# Skewness of the raw x3 values.
skew3 = skew(x3)

In [51]:
# Display the skewness of x3 (positive).
skew3

1.7888543819998317

In [52]:
# Smallest value of x3; used to shift the data before taking the log.
min_val = np.min(x3)

In [53]:
# Shift x3 so its minimum becomes 1, then take the log.
# x3 is a plain list, so convert it explicitly before doing array arithmetic.
log_x3 = np.log(np.asarray(x3) + (1 - min_val))

In [54]:
# Display the shifted-and-logged x3 values.
log_x3

array([ 2.39789527,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ])

In [55]:
# Standard deviation of the logged values, ignoring NaNs.
s3 = np.nanstd(log_x3)

In [56]:
# Skewness after the shift-and-log step.
skew(log_x3)

1.7888543819998313

In [57]:
# Scale by the standard deviation only (no mean is subtracted here); the
# result should match column F of Norm.transform(df).
log_x3/s3

array([ 2.68328157,  0.        ,  0.        ,  0.        ,  0.        ,  0.        ])

#### Check for the missing-value exception when fillna_strategy is 'raise' (the default)

In [58]:
# Show the untouched training frame for reference.
df

Unnamed: 0,A,B,C,D,E,F
0,a,a,a,1,-10,10
1,b,b,b,1,0,0
2,c,c,c,0,0,0
3,a,a,a,2,0,0
4,b,b,b,3,0,0
5,c,c,c,0,0,0


In [59]:
# Copy the training frame and blank out one value in column D to create a
# frame with a missing value. `np.nan` is used because the `np.NaN` alias was
# removed in NumPy 2.0.
df_w_na = df.copy()
df_w_na.loc[0, 'D'] = np.nan
df_w_na

Unnamed: 0,A,B,C,D,E,F
0,a,a,a,,-10,10
1,b,b,b,1.0,0,0
2,c,c,c,0.0,0,0
3,a,a,a,2.0,0,0
4,b,b,b,3.0,0,0
5,c,c,c,0.0,0,0


In [60]:
# With fillna_strategy='raise' (the default shown in the repr above), fitting
# on a frame that contains NaNs is expected to fail. The error is caught and
# printed so the notebook still runs top to bottom; narrow `Exception` to the
# library's SonicScrewdriverError once its import path is confirmed.
try:
    Norm.fit_transform(df_w_na)
except Exception as err:
    print('{}: {}'.format(type(err).__name__, err))
else:
    raise AssertionError('expected fit_transform to reject a frame with NaNs')

SonicScrewdriverError: The DataFrame contains NaNs. Handle NaNs before applying this transformer or use the fillna_strategy option to set a method for replacing the missing data.

In [61]:
# Sanity check: the same transformer still fits and transforms the NaN-free
# training frame.
Norm.fit_transform(df)

Unnamed: 0,A,B,C,D,E,F
0,a,a,a,-0.156174,2.683282,2.683282
1,b,b,b,-0.156174,0.0,0.0
2,c,c,c,-1.093216,0.0,0.0
3,a,a,a,0.780869,0.0,0.0
4,b,b,b,1.717911,0.0,0.0
5,c,c,c,-1.093216,0.0,0.0


In [62]:
# Repeat the NaN check after a successful fit: the transformer should still
# reject the frame with missing data. The error is caught and printed so the
# notebook still runs top to bottom; narrow `Exception` to the library's
# SonicScrewdriverError once its import path is confirmed.
try:
    Norm.fit_transform(df_w_na)
except Exception as err:
    print('{}: {}'.format(type(err).__name__, err))
else:
    raise AssertionError('expected fit_transform to reject a frame with NaNs')

SonicScrewdriverError: The DataFrame contains NaNs. Handle NaNs before applying this transformer or use the fillna_strategy option to set a method for replacing the missing data.

#### Check fillna_strategies

In [63]:
# Reference frame: the training data without row 0, i.e. without the row
# whose D value was set to NaN in df_w_na.
df_anticipated=df.iloc[1:].copy()

In [64]:
# fillna_strategy="mean": the NaN in D should become the mean of the
# normalized non-missing values. The reference is built by normalizing the
# frame without row 0 and prepending that column's mean.
Norm = DFLogNormalizer(skew_cutoff=2, fillna_strategy="mean")
print('input\n', df_w_na['D'].values)
print('result\n', Norm.fit_transform(df_w_na)['D'].values)
fit_wo_na = Norm.fit_transform(df_anticipated)
print('anticipated\n',
      np.insert(fit_wo_na['D'].values, 0, fit_wo_na['D'].mean()))


input
 [ nan   1.   0.   2.   3.   0.]
result
 [ 0.         -0.17149859 -1.02899151  0.68599434  1.54348727 -1.02899151]
anticipated
 [ 0.         -0.17149859 -1.02899151  0.68599434  1.54348727 -1.02899151]


In [65]:
# fillna_strategy="median": the NaN in D should become the median of the
# normalized non-missing values. The reference is built by normalizing the
# frame without row 0 and prepending that column's median.
Norm = DFLogNormalizer(skew_cutoff=2, fillna_strategy="median")
print('input\n', df_w_na['D'].values)
print('result\n', Norm.fit_transform(df_w_na)['D'].values)
fit_wo_na = Norm.fit_transform(df_anticipated)
print('anticipated\n',
      np.insert(fit_wo_na['D'].values, 0, fit_wo_na['D'].median()))


input
 [ nan   1.   0.   2.   3.   0.]
result
 [-0.17149859 -0.17149859 -1.02899151  0.68599434  1.54348727 -1.02899151]
anticipated
 [-0.17149859 -0.17149859 -1.02899151  0.68599434  1.54348727 -1.02899151]


In [66]:
# fillna_strategy="most_frequent": the NaN in D should become the mode of the
# normalized non-missing values. The reference is built by normalizing the
# frame without row 0 and prepending that column's mode.
Norm = DFLogNormalizer(skew_cutoff=2, fillna_strategy="most_frequent")
print('input\n', df_w_na.D.values)
print('result\n', Norm.fit_transform(df_w_na).D.values)
fit_wo_na = Norm.fit_transform(df_anticipated)
# Series.mode() returns a Series (several entries when there are ties), so take
# the first entry; otherwise np.insert would prepend more than one value.
print('anticipated\n', np.insert(fit_wo_na.D.values, 0, fit_wo_na.D.mode()[0]))


input
 [ nan   1.   0.   2.   3.   0.]
result
 [-1.02899151 -0.17149859 -1.02899151  0.68599434  1.54348727 -1.02899151]
anticipated
 [-1.02899151 -0.17149859 -1.02899151  0.68599434  1.54348727 -1.02899151]


### Test filling na before log normalizing

In [67]:
# Start again from a full copy of the training frame; row 0 of D is
# overwritten with the expected fill value in the cells that follow.
df_anticipated=df.copy()

In [68]:
# fillna_after=False with "mean": the NaN is replaced by the mean of the raw
# non-missing values before normalization. The reference frame has that mean
# written into row 0 of D by hand.
df_anticipated.loc[0, 'D'] = df_anticipated['D'].iloc[1:].mean()
Norm = DFLogNormalizer(
    skew_cutoff=2, fillna_strategy="mean", fillna_after=False)
print('input\n', df_w_na['D'].values)
print('result\n', Norm.fit_transform(df_w_na)['D'].values)
print('anticipated\n', Norm.fit_transform(df_anticipated)['D'].values)


input
 [ nan   1.   0.   2.   3.   0.]
result
 [ 0.         -0.18786729 -1.12720372  0.75146915  1.69080559 -1.12720372]
anticipated
 [ 0.         -0.18786729 -1.12720372  0.75146915  1.69080559 -1.12720372]


In [69]:
# fillna_after=False with "median": the NaN is replaced by the median of the
# raw non-missing values before normalization. The reference frame has that
# median written into row 0 of D by hand.
df_anticipated.loc[0, 'D'] = df_anticipated['D'].iloc[1:].median()
Norm = DFLogNormalizer(
    skew_cutoff=2, fillna_strategy="median", fillna_after=False)
print('input\n', df_w_na['D'].values)
print('result\n', Norm.fit_transform(df_w_na)['D'].values)
print('anticipated\n', Norm.fit_transform(df_anticipated)['D'].values)


input
 [ nan   1.   0.   2.   3.   0.]
result
 [-0.15617376 -0.15617376 -1.09321633  0.78086881  1.71791138 -1.09321633]
anticipated
 [-0.15617376 -0.15617376 -1.09321633  0.78086881  1.71791138 -1.09321633]


In [70]:
# fillna_after=False with "most_frequent": the NaN is replaced by the mode of
# the raw non-missing values before normalization. The reference frame has
# that mode written into row 0 of D by hand.
df_anticipated.loc[0, 'D'] = df_anticipated['D'].iloc[1:].mode()[0]
Norm = DFLogNormalizer(
    skew_cutoff=2, fillna_strategy="most_frequent", fillna_after=False)
print('input\n', df_w_na['D'].values)
print('result\n', Norm.fit_transform(df_w_na)['D'].values)
print('anticipated\n', Norm.fit_transform(df_anticipated)['D'].values)


input
 [ nan   1.   0.   2.   3.   0.]
result
 [-0.8660254   0.         -0.8660254   0.8660254   1.73205081 -0.8660254 ]
anticipated
 [-0.8660254   0.         -0.8660254   0.8660254   1.73205081 -0.8660254 ]


### check error handling for inappropriate strategy

In [71]:
# "mode" is not a supported fillna_strategy (the supported name is
# "most_frequent"), so fitting is expected to fail. The error is caught and
# printed so the notebook still runs top to bottom; narrow `Exception` to the
# library's SonicScrewdriverError once its import path is confirmed.
# The former df_anticipated assignment and "anticipated" print were dropped:
# the assignment repeated the previous cell and the print was never reached.
Norm = DFLogNormalizer(skew_cutoff=2, fillna_strategy="mode", fillna_after=False)
print('input\n', df_w_na.D.values)
try:
    print('result\n', Norm.fit_transform(df_w_na).D.values)
except Exception as err:
    print('{}: {}'.format(type(err).__name__, err))
else:
    raise AssertionError('expected an error for fillna_strategy="mode"')


input
 [ nan   1.   0.   2.   3.   0.]


SonicScrewdriverError: Can only use these fillna strategies: ['raise', 'mean', 'median', 'most_frequent', 'drop'] got strategy=mode

### Check most_frequent strategy for condition of no mode

In [72]:
# Make every non-missing value in D unique so that no mode exists; fitting
# with "most_frequent" is then expected to fail. The error is caught and
# printed so the notebook still runs top to bottom; narrow `Exception` to the
# library's SonicScrewdriverError once its import path is confirmed.
df_w_na.loc[2, 'D'] = 4
Norm = DFLogNormalizer(skew_cutoff=2, fillna_strategy="most_frequent", fillna_after=False)
print('input\n', df_w_na.D.values)
try:
    print('result\n', Norm.fit_transform(df_w_na).D.values)
except Exception as err:
    print('{}: {}'.format(type(err).__name__, err))
else:
    raise AssertionError('expected an error when column D has no repeated value')


input
 [ nan   1.   4.   2.   3.   0.]


SonicScrewdriverError: No values in column D occur at least 2 times. Replacement for missing data could not be determined. Select fillna_strategy other than "most_frequent".

### Test most_frequent with ties

In [73]:
# Create a tie for the most frequent value in D (4 and 0 each occur twice)
# and check which one the transformer uses to fill the NaN.
df_w_na.loc[[3, 4], 'D'] = [4, 0]

Norm = DFLogNormalizer(
    skew_cutoff=2, fillna_strategy="most_frequent", fillna_after=False)
print('input\n', df_w_na['D'].values)
print('result\n', Norm.fit_transform(df_w_na)['D'].values)


input
 [ nan   1.   4.   4.   0.   0.]
result
 [-0.83205029 -0.2773501   1.38675049  1.38675049 -0.83205029 -0.83205029]


### test drop strategy

In [74]:
# fillna_strategy="drop": the row holding the NaN should be removed, so the
# result has one value fewer than the input.
Norm = DFLogNormalizer(
    skew_cutoff=2, fillna_strategy="drop", fillna_after=False)
print('input\n', df_w_na['D'].values)
print('result\n', Norm.fit_transform(df_w_na)['D'].values)


input
 [ nan   1.   4.   4.   0.   0.]
result
 [-0.43643578  1.2001984   1.2001984  -0.98198051 -0.98198051]


### check error handling for column of NaNs

In [75]:
# Blank out the whole D column; no replacement value can be derived, so
# fitting is expected to fail. `np.nan` is used because the `np.NaN` alias
# was removed in NumPy 2.0. The error is caught and printed so the notebook
# still runs top to bottom; narrow `Exception` to the library's
# SonicScrewdriverError once its import path is confirmed.
df_w_na.loc[:, 'D'] = np.nan

Norm = DFLogNormalizer(skew_cutoff=2, fillna_strategy="most_frequent", fillna_after=False)
print('input\n', df_w_na.D.values)
try:
    print('result\n', Norm.fit_transform(df_w_na).D.values)
except Exception as err:
    print('{}: {}'.format(type(err).__name__, err))
else:
    raise AssertionError('expected an error for a column that contains only NaNs')

input
 [ nan  nan  nan  nan  nan  nan]


SonicScrewdriverError: A column in the Dataframe contains only NaNs. Replacement for missing data can not be determined

### check error handling for dropping all rows with drop strategy

In [76]:
# Leave only two non-missing values in D; with "drop" only rows 1 and 3
# should survive. `np.nan` is used because the `np.NaN` alias was removed in
# NumPy 2.0.
df_w_na.loc[:, 'D'] = [np.nan, 1, np.nan, 4, np.nan, np.nan]

Norm = DFLogNormalizer(skew_cutoff=2, fillna_strategy="drop", fillna_after=False)
print('input\n', df_w_na)
print('result\n', Norm.fit_transform(df_w_na))

input
    A  B  C    D   E   F
0  a  a  a  NaN -10  10
1  b  b  b  1.0   0   0
2  c  c  c  NaN   0   0
3  a  a  a  4.0   0   0
4  b  b  b  NaN   0   0
5  c  c  c  NaN   0   0
result
    A  B  C    D  E  F
1  b  b  b -1.0  0  0
3  a  a  a  1.0  0  0


In [77]:
# Add a NaN in column E on row 1 so that only row 3 is free of missing data.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df_w_na.loc[1, 'E'] = np.nan

Norm = DFLogNormalizer(skew_cutoff=2, fillna_strategy="drop", fillna_after=False)
print('input\n', df_w_na)
print('result\n', Norm.fit_transform(df_w_na))

input
    A  B  C    D     E   F
0  a  a  a  NaN -10.0  10
1  b  b  b  1.0   NaN   0
2  c  c  c  NaN   0.0   0
3  a  a  a  4.0   0.0   0
4  b  b  b  NaN   0.0   0
5  c  c  c  NaN   0.0   0
result
    A  B  C    D    E  F
3  a  a  a  0.0  0.0  0


In [78]:
# Add a NaN in column F on row 3: now every row has at least one NaN, so
# "drop" would remove everything and fitting is expected to fail.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
# The error is caught and printed so the notebook still runs top to bottom;
# narrow `Exception` to the library's SonicScrewdriverError once its import
# path is confirmed.
df_w_na.loc[3, 'F'] = np.nan

Norm = DFLogNormalizer(skew_cutoff=2, fillna_strategy="drop", fillna_after=False)
print('input\n', df_w_na)
try:
    print('result\n', Norm.fit_transform(df_w_na))
except Exception as err:
    print('{}: {}'.format(type(err).__name__, err))
else:
    raise AssertionError('expected an error when every row contains a NaN')

input
    A  B  C    D     E     F
0  a  a  a  NaN -10.0  10.0
1  b  b  b  1.0   NaN   0.0
2  c  c  c  NaN   0.0   0.0
3  a  a  a  4.0   0.0   NaN
4  b  b  b  NaN   0.0   0.0
5  c  c  c  NaN   0.0   0.0


SonicScrewdriverError: Each row contains at least one NaN. Select a fillna_strategy other than 'drop'.

### check processing of extremely skewed data

In [79]:
# Column 0: one huge value among zeros (extreme positive skew).
# Column 1: a roughly uniform ramp with a few missing values.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df_w_na = pd.DataFrame([[100000000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, np.nan],
                        [2, 2, np.nan, 4, 5, 6, np.nan, 8, 9, 10, 11, 12, 13, np.nan]]).T

Norm = DFLogNormalizer(skew_cutoff=2, fillna_strategy="most_frequent", fillna_after=False)
print('input\n', df_w_na)
print('result\n', Norm.fit_transform(df_w_na))

input
                0     1
0   1.000000e+11   2.0
1   0.000000e+00   2.0
2   0.000000e+00   NaN
3   0.000000e+00   4.0
4   0.000000e+00   5.0
5   0.000000e+00   6.0
6   0.000000e+00   NaN
7   0.000000e+00   8.0
8   0.000000e+00   9.0
9   0.000000e+00  10.0
10  0.000000e+00  11.0
11  0.000000e+00  12.0
12  0.000000e+00  13.0
13           NaN   NaN
result
            0         1
0   3.882901 -1.074172
1   0.000000 -1.074172
2   0.000000 -1.074172
3   0.000000 -0.572892
4   0.000000 -0.322252
5   0.000000 -0.071611
6   0.000000 -1.074172
7   0.000000  0.429669
8   0.000000  0.680309
9   0.000000  0.930949
10  0.000000  1.181590
11  0.000000  1.432230
12  0.000000  1.682870
13  0.000000 -1.074172


In [80]:
# Inspect the fitted parameters, including which column was log-transformed
# (take_log) and the value used to replace NaNs (na_replacement).
Norm.norm_params_map_

{0: {'mean': None,
  'min_val_for_log': 0.0,
  'na_replacement': 0.0,
  'std': 6.5230696291454002,
  'take_log': True},
 1: {'mean': 6.2857142857142856,
  'na_replacement': 2.0,
  'std': 3.9897828696482693,
  'take_log': False}}

In [81]:
# Column 0: mixed magnitudes with several missing values.
# Column 1: the same ramp with missing values as in the previous example.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df_w_na = pd.DataFrame([[10, 10, 20, 10, 50, 10, 0.1, 0, np.nan, np.nan, np.nan, 0, 0, np.nan],
                        [2, 2, np.nan, 4, 5, 6, np.nan, 8, 9, 10, 11, 12, 13, np.nan]]).T

Norm = DFLogNormalizer(skew_cutoff=2, fillna_strategy="most_frequent", fillna_after=False)
print('input\n', df_w_na)
print('result\n', Norm.fit_transform(df_w_na))

input
        0     1
0   10.0   2.0
1   10.0   2.0
2   20.0   NaN
3   10.0   4.0
4   50.0   5.0
5   10.0   6.0
6    0.1   NaN
7    0.0   8.0
8    NaN   9.0
9    NaN  10.0
10   NaN  11.0
11   0.0  12.0
12   0.0  13.0
13   NaN   NaN
result
            0         1
0   0.421672 -1.074172
1   0.421672 -1.074172
2   0.943468 -1.074172
3   0.421672 -0.572892
4   1.659477 -0.322252
5   0.421672 -0.071611
6  -1.436398 -1.074172
7  -1.513308  0.429669
8   0.421672  0.680309
9   0.421672  0.930949
10  0.421672  1.181590
11 -1.513308  1.432230
12 -1.513308  1.682870
13  0.421672 -1.074172


In [82]:
# Inspect the fitted parameters for this second example.
Norm.norm_params_map_

{0: {'mean': 1.8753443166170745,
  'min_val_for_log': 0.0,
  'na_replacement': 10.0,
  'std': 1.2392347679873852,
  'take_log': True},
 1: {'mean': 6.2857142857142856,
  'na_replacement': 2.0,
  'std': 3.9897828696482693,
  'take_log': False}}

## check if max val in test data is greater than train

In [84]:
# Show the training frame again; column F has a maximum of 10 and column E a
# minimum of -10.
df

Unnamed: 0,A,B,C,D,E,F
0,a,a,a,1,-10,10
1,b,b,b,1,0,0
2,c,c,c,0,0,0
3,a,a,a,2,0,0
4,b,b,b,3,0,0
5,c,c,c,0,0,0


In [86]:
# 11 is larger than the training maximum of 10 in column F.
df_test.loc[0, 'F'] = 11

In [93]:
# Fresh normalizer with default settings for the out-of-range checks.
Norm = DFLogNormalizer()

In [94]:
# Fit on the original training frame.
Norm.fit(df)

DFLogNormalizer(fillna_after=True, fillna_strategy='raise', skew_cutoff=1.4)

In [95]:
# Inspect the fitted parameters; they should be the same as in the first fit
# on df near the top of the notebook.
Norm.norm_params_map_

{'D': {'mean': 1.1666666666666667,
  'na_replacement': nan,
  'std': 1.0671873729054748,
  'take_log': False},
 'E': {'max_val_for_log': 0,
  'mean': None,
  'na_replacement': nan,
  'std': 0.89364280548375985,
  'take_log': True},
 'F': {'mean': None,
  'min_val_for_log': 0,
  'na_replacement': nan,
  'std': 0.89364280548375985,
  'take_log': True}}

In [96]:
# Transform test data whose F value (11) exceeds the maximum seen in training.
Norm.transform(df_test)

Unnamed: 0,A,B,C,D,E,F
0,a,a,a,-0.156174,2.780649,2.780649
1,b,b,b,0.780869,1.551285,1.229364
2,b,c,c,1.717911,1.800986,1.800986
3,a,a,a,2.654954,2.005006,1.800986
4,b,b,b,3.591997,0.0,1.800986


## check if min val in test data is less than train

In [97]:
# -11 is smaller than the training minimum of -10 in column E.
df_test.loc[0, 'E'] = -11

In [98]:
# Transform test data whose E value (-11) is below the minimum seen in training.
Norm.transform(df_test)

Unnamed: 0,A,B,C,D,E,F
0,a,a,a,-0.156174,2.780649,2.780649
1,b,b,b,0.780869,1.551285,1.229364
2,b,c,c,1.717911,1.800986,1.800986
3,a,a,a,2.654954,2.005006,1.800986
4,b,b,b,3.591997,0.0,1.800986
