In [1]:
import datatable as dt
import pandas as pd
from tqdm.notebook import tqdm

from sklearn.feature_selection import chi2

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
# Let pandas display up to 200 columns and up to 400 characters per cell.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_colwidth', 400)

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

### Load csv to datatable dataframe 

In [11]:
# Folder holding the input data. Keep the trailing slash: the file name is
# appended to it as-is.
FPATH = "data/"

# Parse the credit-card clients CSV into a datatable Frame.
train_df = dt.fread(f"{FPATH}credit_card_clients.csv")

#### Basic
##### remove constant features


In [12]:
# Standard deviation of every non-string, non-date column. A value at or
# near zero flags a (quasi-)constant feature.
skipped_stypes = (dt.stype.str32, dt.stype.str64, dt.stype.date32)
column_stypes = train_df.stypes

variables = []
std_variable = []
# tqdm wraps the tuple of names (not the enumerate iterator) so it can read
# the length and render a progress bar with a real total.
for position, name in enumerate(tqdm(train_df.names, desc="std per feature")):
    if column_stypes[position] in skipped_stypes:
        continue
    variables.append(name)
    # Reducing a single column with sd() gives a one-cell frame; unwrap it.
    std_variable.append(train_df[:, dt.sd(dt.f[position])].to_list()[0][0])

constant_features = pd.DataFrame({"feature": variables, "std": std_variable})
constant_features

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Unnamed: 0,feature,std
0,id,8660.398374
1,limit_bal,129747.661567
2,sex,0.489129
3,education,0.790349
4,marriage,0.52197
5,age,9.217904
6,pay_0,1.123802
7,pay_2,1.197186
8,pay_3,1.196868
9,pay_4,1.169139


In [13]:
# Features whose standard deviation is exactly zero, i.e. strictly constant.
constant_features.loc[constant_features['std'].eq(0), 'feature'].to_list()

[]

In [14]:
# Standard-deviation cutoff: features at or below this value are treated as quasi-constant.
threshold=0.01

In [15]:
# Quasi-constant features: standard deviation at or below `threshold`.
constant_features_lst = constant_features.loc[
    constant_features['std'].le(threshold), 'feature'
].to_list()
constant_features_lst

[]

The dataset has no constant or quasi-constant features: no column has a standard deviation at or below the 0.01 threshold.

### Correlation  
#### Brute force approach

In [16]:
def frame_corr(dt_frame):
    """Build a pairwise correlation matrix for the columns of a datatable Frame.

    ``dt.corr`` is evaluated for every ordered pair of columns; the one-row
    results (one per first column) are stacked with ``dt.rbind`` and the
    stacked frame is converted to pandas.

    Parameters
    ----------
    dt_frame : datatable Frame
        Nothing here filters columns by type, so presumably every column
        must be one ``dt.corr`` accepts (numeric) — TODO confirm.

    Returns
    -------
    pandas.DataFrame
        Square table whose columns and index both carry the column names
        of ``dt_frame``.
    """
    # NOTE(review): iterating a Frame is assumed to yield its columns one by one — confirm against the datatable docs.
    numcols = [col for col in dt_frame]
    # One row per col1, holding corr(col1, col2) for every col2; rows are stacked in column order.
    result = dt.rbind([dt_frame[:, [dt.corr(col1, col2) for col2 in numcols]] for col1 in numcols])
    # Restore the original column names on the stacked result.
    result.names = dt_frame[:,numcols].names
    corr_result = result.to_pandas()

    # Label the rows with the same names as the columns.
    return corr_result.set_index([pd.Index(corr_result.columns)])

In [17]:
def correlation(corr_matrix, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    Only the part of the matrix below the diagonal is inspected: column ``i``
    is flagged when the absolute coefficient between it and any column
    ``j < i`` is strictly greater than ``threshold``. For each correlated
    pair it is therefore the later column that is reported; no other
    criterion is used to pick which one to drop.

    Parameters
    ----------
    corr_matrix : pandas.DataFrame
        Square correlation matrix, read positionally with ``.iloc``.
    threshold : float
        Absolute correlation above which a column is flagged.

    Returns
    -------
    set
        Names of the flagged columns.
    """
    names = corr_matrix.columns
    return {
        names[row]
        for row in range(len(names))
        for col in range(row)
        if abs(corr_matrix.iloc[row, col]) > threshold
    }

In [22]:
# Columns left out of the correlation analysis (id, categorical fields, target).
cat_vars = ["id", "sex", "education", "marriage", "target"]

# Every remaining column, in the frame's original column order.
numeric_vars = [name for name in train_df.names if name not in cat_vars]

In [23]:
# Pairwise correlation matrix (a pandas DataFrame) for the columns listed in numeric_vars.
corr = frame_corr(train_df[:, numeric_vars])

In [25]:
# Columns whose absolute correlation with an earlier column exceeds 0.9.
# `correlation` takes the absolute value of each coefficient itself.
correlation(corr, threshold=0.9)

{'bill_amt2', 'bill_amt3', 'bill_amt4', 'bill_amt5', 'bill_amt6'}

In [38]:
# Long-format table of strongly correlated feature pairs.
# Only the magnitude of the coefficient matters, not its sign.
corrmat = corr.abs().unstack()
corrmat = corrmat.sort_values(ascending=False)
corrmat = corrmat[corrmat >= 0.9]

# Remove each feature's correlation with itself by comparing the two names.
# Filtering on `corr < 1` would also discard distinct features that are
# perfectly collinear, and would keep a diagonal entry that floating-point
# error left slightly off 1.0.
is_self_pair = corrmat.index.get_level_values(0) == corrmat.index.get_level_values(1)
corrmat = corrmat[~is_self_pair]

corrmat = pd.DataFrame(corrmat).reset_index()
corrmat.columns = ['feature1', 'feature2', 'corr']
corrmat.head()

Unnamed: 0,feature1,feature2,corr
0,bill_amt1,bill_amt2,0.951484
1,bill_amt2,bill_amt1,0.951484
2,bill_amt5,bill_amt6,0.946197
3,bill_amt6,bill_amt5,0.946197
4,bill_amt4,bill_amt5,0.940134


In [39]:
# Collect blocks of correlated features: walk the feature1 values in order
# and, for each one not already assigned to a block, take all of its rows.

correlated_groups = []
grouped_feature_ls = []

for feature in corrmat['feature1'].unique():
    # Skip features that an earlier block already covered.
    if feature in grouped_feature_ls:
        continue

    # All pairs in which this feature is the first member.
    correlated_block = corrmat.loc[corrmat['feature1'] == feature]

    # Mark both the partners and the feature itself as grouped.
    grouped_feature_ls += [*correlated_block['feature2'].unique(), feature]

    correlated_groups.append(correlated_block)

print('found {} correlated groups'.format(len(correlated_groups)))
print('out of {} total features'.format(train_df.shape[1]))

found 3 correlated groups
out of 25 total features


In [40]:
# Print every correlated block, separated by a blank line. Some blocks hold
# a single correlated pair; others list several features correlated with
# the same first feature.

for group in correlated_groups:
    print(group, end="\n\n")

    feature1   feature2      corr
0  bill_amt1  bill_amt2  0.951484

    feature1   feature2      corr
2  bill_amt5  bill_amt6  0.946197
5  bill_amt5  bill_amt4  0.940134

    feature1   feature2      corr
6  bill_amt3  bill_amt2  0.928326
9  bill_amt3  bill_amt4  0.923969



In [41]:
# Inspect the first correlated block on its own.
correlated_groups[0]

Unnamed: 0,feature1,feature2,corr
0,bill_amt1,bill_amt2,0.951484


In [42]:
# Columns to discard: the `feature1` of each correlated block printed above.
var2drop = ['bill_amt1', 'bill_amt5', 'bill_amt3']

In [43]:
# Drop the columns listed in `var2drop` instead of repeating their names
# here. Selecting the columns to keep by name preserves the original column
# order and makes the cell safe to re-run: once the columns are gone the
# selection simply keeps everything.
train_df = train_df[:, [name for name in train_df.names if name not in var2drop]]

In [44]:
# Sanity check: the frame should have three fewer columns than before the drop.
train_df.shape

(30000, 22)

## Filter Methods - Fisher Score

Compute chi-squared stats between each non-negative feature and class. 

- This score should be used to evaluate categorical variables in a classification task.

It compares the observed distribution of the target classes (Y) across the categories of the feature against the distribution that would be expected if the target were independent of the feature's categories. I explained this in more detail in the introductory lecture of this section.

In [47]:
# Categorical predictors to score against the target.
cat_vars = ["sex", "education", "marriage"]
# The same predictors followed by the label column.
# NOTE(review): `vars` shadows the Python builtin of the same name; the name
# is kept because the next cell reads it.
vars = [*cat_vars, "target"]

In [48]:
# Pull the columns named in `vars` out of the datatable Frame and convert them to a pandas DataFrame.
train_catvars_df = train_df[:, vars].to_pandas()

In [52]:
# Chi-squared test of each categorical predictor against the target.
# sklearn's chi2 returns two arrays with one entry per feature column:
# the chi-squared statistics and the matching p-values. Only the p-values
# are kept here.
# Each feature column is scored independently, so a single call over all of
# `cat_vars` gives the same p-values as calling chi2 once per feature, and
# the target does not need to be passed as an extra feature column.
# NOTE(review): chi2 expects non-negative features such as counts or
# booleans; confirm that applying it to integer-coded categories is the
# intended test.
chi2_stats, chi2_pvalues = chi2(train_catvars_df[cat_vars], train_catvars_df['target'])

varnames = list(cat_vars)
# Despite its name, this list holds p-values; the name is kept in case later cells read it.
f_score = list(chi2_pvalues)

pvalues = pd.Series(f_score, index=varnames)
pvalues.sort_values(ascending=False)

marriage     0.077336
sex          0.007512
education    0.004859
dtype: float64