1. Importing Data

In [14]:
from collections import OrderedDict

import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.feature_selection import RFE, chi2
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.svm import SVR, LinearSVC

In [3]:
# Read the dataset from the CSV file in the working directory and preview
# its first five rows (the default for `head`).
data = pd.read_csv("creditcard.csv")
data.head(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
# Every column except the last is a feature; the last column is the target.
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Stratified 80/20 split so both partitions keep the class ratio of `y`.
# The seed is written as the integer 1: the original passed the boolean
# `True`, which is an int subclass equal to 1, so the split is unchanged
# while the intent (a fixed seed, not a flag) is now explicit.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)
print(X)
print(y)

[[ 0.00000000e+00 -1.35980713e+00 -7.27811733e-02 ...  1.33558377e-01
  -2.10530535e-02  1.49620000e+02]
 [ 0.00000000e+00  1.19185711e+00  2.66150712e-01 ... -8.98309914e-03
   1.47241692e-02  2.69000000e+00]
 [ 1.00000000e+00 -1.35835406e+00 -1.34016307e+00 ... -5.53527940e-02
  -5.97518406e-02  3.78660000e+02]
 ...
 [ 1.72788000e+05  1.91956501e+00 -3.01253846e-01 ...  4.45477214e-03
  -2.65608286e-02  6.78800000e+01]
 [ 1.72788000e+05 -2.40440050e-01  5.30482513e-01 ...  1.08820735e-01
   1.04532821e-01  1.00000000e+01]
 [ 1.72792000e+05 -5.33412522e-01 -1.89733337e-01 ... -2.41530880e-03
   1.36489143e-02  2.17000000e+02]]
[0 0 0 ... 0 0 0]


2. Feature Selection

In [5]:
# Names of the feature columns: every column label except the last one.
feature_names = data.columns[:-1].tolist()

2.1 Filter Based - Chi2

In [12]:
# Runtime: 0.2s

# Significance level below which a feature is reported.
P_VALUE_THRESHOLD = 0.05

# Scale features to [0,1] because chi2 needs non-negative data
min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X_train)

chi2_stats, p_values = chi2(X_train_minmax, y_train)

# Map each feature name to its chi2 p-value.
feature_dict = dict(zip(feature_names, p_values))

# Order features by ascending p-value (most significant first). The key puts
# NaN p-values last: comparisons against NaN are always False, so sorting on
# the raw values would give an unreliable order if chi2 returned any NaN.
sorted_dict = OrderedDict(
    sorted(feature_dict.items(), key=lambda item: (np.isnan(item[1]), item[1]))
)

print(f"Sorted feature importance based on Chi Squared Sampling with p-value < {P_VALUE_THRESHOLD}")

# NaN p-values fail the `<` test, so they are never reported as significant.
for key, value in sorted_dict.items():
    if value < P_VALUE_THRESHOLD:
        print(key, value)


Sorted feature importance based on Chi Squared Sampling with p-value < 0.05
V11 1.1625016088982962e-16
V4 3.896224045410223e-15
V14 9.55050954444407e-09
V12 3.180737932355024e-08
V17 1.7572094650447125e-05
V16 0.0001696850178127307
V18 0.00029138722000186763
V10 0.0015501771988898833
V3 0.009639517281485386
V9 0.013882236970447126
Time 0.027349413572258584


2.2 Wrapper Based - RFE

In [15]:
# The target is a binary class label, so the wrapped estimator is a linear
# SVM *classifier* rather than the SVR regressor used before. The kernel-based
# SVR also scales more than quadratically with the number of samples, which
# is impractical for RFE's repeated refits on the full training set.
# NOTE(review): LinearSVC is not numerically identical to a linear-kernel
# SVM (different loss/regularisation defaults) -- confirm this is acceptable.

# Standardize features so coefficient magnitudes, which RFE uses to rank
# features, are comparable across columns with very different ranges.
X_train_std = preprocessing.StandardScaler().fit_transform(X_train)

# dual=False solves the primal problem, the recommended setting when there
# are more samples than features.
estimator = LinearSVC(dual=False)
selector = RFE(estimator, n_features_to_select=10, step=1)
selector = selector.fit(X_train_std, y_train)

# support_: boolean mask of the 10 selected features;
# ranking_: rank per feature, where 1 marks a selected feature.
print(selector.support_)
print(selector.ranking_)

2.3 Embedded - Lasso

In [None]:
# parameters to be tested on GridSearchCV
# np.linspace gives 500 evenly spaced alphas in [1e-5, 10]. The previous
# np.arange(0.00001, 10, 500) treated 500 as the *step*, so the "grid"
# contained the single value 1e-5 and nothing was actually searched.
params = {"alpha": np.linspace(0.00001, 10, 500)}

# Number of Folds and adding the random state for replication
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initializing the Model
lasso = Lasso()

# GridSearchCV with model, params and folds.
# 500 candidates x 5 folds is 2500 fits, so run them on all available cores;
# n_jobs only affects parallelism, not the selected parameters.
lasso_cv = GridSearchCV(lasso, param_grid=params, cv=kf, n_jobs=-1)
lasso_cv.fit(X_train, y_train)
print("Best Params {}".format(lasso_cv.best_params_))