#### Make new features
1. Focus on featured_importance and domain knowledge.
2. Recall top 20 importance:
['EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH', 'EXT_SOURCE_1', 'DAYS_ID_PUBLISH', 'DAYS_REGISTRATION', 'df_avg_bureau_full_DAYS_CREDIT', 'df_avg_bureau_full_DAYS_CREDIT_ENDDATE', 'AMT_PAYMENT_df_avg_install', 'AMT_ANNUITY', 'DAYS_EMPLOYED', 'df_avg_bureau_full_DAYS_CREDIT_UPDATE', 'df_avg_pos_cash_CNT_INSTALMENT_FUTURE', 'AMT_INSTALMENT_df_avg_install', 'DAYS_LAST_PHONE_CHANGE', 'AMT_CREDIT', 'DAYS_ENTRY_PAYMENT_df_avg_install', 'DAYS_INSTALMENT_df_avg_install', 'df_avg_previous_app_DAYS_FIRST_DUE', 'df_avg_previous_app_DAYS_DECISION', 'df_avg_previous_app_HOUR_APPR_PROCESS_START', 'df_avg_previous_app_AMT_ANNUITY', 'df_avg_previous_app_AMT_CREDIT', 'df_avg_previous_app_AMT_GOODS_PRICE', 'df_avg_previous_app_AMT_APPLICATION', 'df_avg_previous_app_SELLERPLACE_AREA', 'REGION_POPULATION_RELATIVE', 'df_avg_previous_app_DAYS_LAST_DUE_1ST_VERSION', 'df_avg_bureau_full_AMT_CREDIT_SUM', 'AMT_INCOME_TOTAL']

In [81]:
## IMPORTS ##

# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system manangement
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib for plotting
import matplotlib.pyplot as plt

# garbage collector
import gc

In [82]:
# Load data fcn
def load_credit_data(data_path):
    csv_path = os.path.join("data", data_path)
    return pd.read_csv(csv_path)

In [83]:
# Load training data
training_df = load_credit_data ("application_train.csv")
print (training_df.shape)
training_df.head()

(307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
# Load test data
testing_df = load_credit_data ("application_test.csv")
print (testing_df.shape)
testing_df.head()

(48744, 121)


Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,,,,,,


In [85]:
tr_features_df = training_df[['SK_ID_CURR']]
te_features_df = testing_df[['SK_ID_CURR']]

In [86]:
# New application features
def add_new_features (df_new, existing_df):
    df_new['APP_NEW_CREDIT_TO_ANNUITY_RATIO'] = existing_df['AMT_CREDIT'] / existing_df['AMT_ANNUITY']
    df_new['APP_NEW_CREDIT_TO_GOODS_RATIO'] = existing_df['AMT_CREDIT'] / existing_df['AMT_GOODS_PRICE']
    df_new['APP_NEW_INC_PER_CHLD'] = existing_df['AMT_INCOME_TOTAL'] / (1 + existing_df['CNT_CHILDREN'])
    df_new['APP_NEW_ANNUITY_TO_INCOME_RATIO'] = existing_df['AMT_ANNUITY'] / (1 + existing_df['AMT_INCOME_TOTAL'])
    df_new['APP_NEW_SOURCES_PROD'] = existing_df['EXT_SOURCE_1'] * existing_df['EXT_SOURCE_2'] * existing_df['EXT_SOURCE_3']
    df_new['APP_NEW_EXT_SOURCES_MEAN'] = existing_df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
    df_new['APP_NEW_SCORES_STD'] = existing_df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
    df_new['APP_NEW_CAR_TO_BIRTH_RATIO'] = existing_df['OWN_CAR_AGE'] / existing_df['DAYS_BIRTH']
    df_new['APP_NEW_CAR_TO_EMPLOY_RATIO'] = existing_df['OWN_CAR_AGE'] / existing_df['DAYS_EMPLOYED']
    df_new['APP_NEW_PHONE_TO_BIRTH_RATIO'] = existing_df['DAYS_LAST_PHONE_CHANGE'] / existing_df['DAYS_BIRTH']
    df_new['APP_NEW_PHONE_TO_EMPLOY_RATIO'] = existing_df['DAYS_LAST_PHONE_CHANGE'] / existing_df['DAYS_EMPLOYED']
    df_new['APP_NEW_CREDIT_TO_INCOME_RATIO'] = existing_df['AMT_CREDIT'] / existing_df['AMT_INCOME_TOTAL']
    
    return df_new

In [87]:
tr_features_df = add_new_features (tr_features_df, training_df)
tr_features_df.fillna(tr_features_df.median(), inplace=True)
tr_features_df.head()

Unnamed: 0,SK_ID_CURR,APP_NEW_CREDIT_TO_ANNUITY_RATIO,APP_NEW_CREDIT_TO_GOODS_RATIO,APP_NEW_INC_PER_CHLD,APP_NEW_ANNUITY_TO_INCOME_RATIO,APP_NEW_SOURCES_PROD,APP_NEW_EXT_SOURCES_MEAN,APP_NEW_SCORES_STD,APP_NEW_CAR_TO_BIRTH_RATIO,APP_NEW_CAR_TO_EMPLOY_RATIO,APP_NEW_PHONE_TO_BIRTH_RATIO,APP_NEW_PHONE_TO_EMPLOY_RATIO,APP_NEW_CREDIT_TO_INCOME_RATIO
0,100002,16.461104,1.158397,202500.0,0.121977,0.003043,0.161787,0.092026,-0.000627,-0.004357,0.11986,1.78022,2.007889
1,100003,36.234085,1.145199,270000.0,0.132216,0.119932,0.466757,0.219895,-0.000627,-0.004357,0.049389,0.69697,4.79075
2,100004,20.0,1.0,67500.0,0.099999,0.119932,0.642739,0.122792,-0.001365,-0.115556,0.042791,3.622222,2.0
3,100006,10.532818,1.052803,135000.0,0.219898,0.119932,0.650442,0.136021,-0.000627,-0.004357,0.032465,0.203027,2.316167
4,100007,23.461618,1.0,121500.0,0.179961,0.119932,0.322738,0.136021,-0.000627,-0.004357,0.055489,0.364055,4.222222


In [88]:
te_features_df = add_new_features (te_features_df, testing_df)
te_features_df.fillna(te_features_df.median(), inplace=True)
te_features_df.head()

Unnamed: 0,SK_ID_CURR,APP_NEW_CREDIT_TO_ANNUITY_RATIO,APP_NEW_CREDIT_TO_GOODS_RATIO,APP_NEW_INC_PER_CHLD,APP_NEW_ANNUITY_TO_INCOME_RATIO,APP_NEW_SOURCES_PROD,APP_NEW_EXT_SOURCES_MEAN,APP_NEW_SCORES_STD,APP_NEW_CAR_TO_BIRTH_RATIO,APP_NEW_CAR_TO_EMPLOY_RATIO,APP_NEW_PHONE_TO_BIRTH_RATIO,APP_NEW_PHONE_TO_EMPLOY_RATIO,APP_NEW_CREDIT_TO_INCOME_RATIO
0,100001,27.664697,1.264,135000.0,0.152299,0.094803,0.567263,0.353601,-0.000624,-0.004,0.090432,0.747102,4.213333
1,100005,12.82487,1.2376,99000.0,0.175453,0.071345,0.429869,0.136694,-0.000624,-0.004,-0.0,-0.0,2.250182
2,100013,9.505482,1.0528,202500.0,0.344576,0.119686,0.655389,0.062788,-0.00025,-0.001122,0.042719,0.192014,3.275378
3,100028,32.130726,1.0,105000.0,0.155614,0.164177,0.549372,0.055432,-0.000624,-0.004,0.12915,0.96731,5.0
4,100038,19.506034,1.0,90000.0,0.178149,0.119686,0.313916,0.158068,-0.001227,-0.007303,0.06296,0.374715,3.475


In [89]:
# Bureau data - TBD

In [90]:
# Previous Applications

# Load  data
previous_app_df = load_credit_data ("previous_application.csv")

# Get counts of number previous applications
previous_app_df_prevs = previous_app_df[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
previous_app_df['SK_ID_PREV'] = previous_app_df['SK_ID_CURR'].map(previous_app_df_prevs['SK_ID_PREV'])

# Create new df 
new_prev_app_df = previous_app_df[['SK_ID_CURR', 'SK_ID_PREV']]

# Add on features
new_prev_app_df['PREV_NEW_PAYMENT_TO_CREDIT_RATIO'] =  previous_app_df['AMT_DOWN_PAYMENT'] / previous_app_df['AMT_CREDIT']
new_prev_app_df['PREV_NEW_CREDIT_TO_APPLICATION_RATIO'] = previous_app_df ['AMT_CREDIT'] / previous_app_df['AMT_APPLICATION']
new_prev_app_df['PREV_NEW_CREDIT_TO_ANNUITY_RATIO'] = previous_app_df['AMT_CREDIT'] / previous_app_df['AMT_ANNUITY']

# Drop PREV as would already exist in base file - could leave in if needed
new_prev_app_df = new_prev_app_df.drop ('SK_ID_PREV', axis=1)

# Fill before aggregate
new_prev_app_df.fillna(new_prev_app_df.median(), inplace=True)

# Aggregate - joined later to t*_features_df
previous_app_df_avg = new_prev_app_df.groupby('SK_ID_CURR').mean()

previous_app_df_avg.head()

Unnamed: 0_level_0,PREV_NEW_PAYMENT_TO_CREDIT_RATIO,PREV_NEW_CREDIT_TO_APPLICATION_RATIO,PREV_NEW_CREDIT_TO_ANNUITY_RATIO
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100001,0.10594,0.957782,6.020501
100002,0.0,1.0,19.353584
100003,0.050304,1.057664,8.677472
100004,0.241719,0.828021,3.753045
100005,0.080457,0.949975,9.212916


In [91]:
# Credit Card

# Load data
credit_card_df = load_credit_data ("credit_card_balance.csv")

# Get counts of number previous applications
credit_card_df_prevs = credit_card_df[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
credit_card_df['SK_ID_PREV'] = credit_card_df['SK_ID_CURR'].map(credit_card_df_prevs['SK_ID_PREV'])

# Create new df 
new_credit_card_df = credit_card_df[['SK_ID_CURR', 'SK_ID_PREV']]

# Add on features
new_credit_card_df['CREDIT_BAL_TO_LIMIT_RATIO'] =  credit_card_df['AMT_BALANCE'] / credit_card_df['AMT_CREDIT_LIMIT_ACTUAL']
new_credit_card_df['CREDIT_PAYMENT_TO_MININSTALLRATIO'] =  credit_card_df['AMT_PAYMENT_TOTAL_CURRENT'] / credit_card_df['AMT_INST_MIN_REGULARITY'] 

# Drop PREV as would already exist in base file - could leave in if needed
new_credit_card_df = new_credit_card_df.drop ('SK_ID_PREV', axis=1)

# Fill before aggregate
new_credit_card_df.fillna(new_credit_card_df.median(), inplace=True)

# Aggregate - joined later to t*_features_df
credit_card_df_avg = new_credit_card_df.groupby('SK_ID_CURR').mean()

credit_card_df_avg.head()

Unnamed: 0_level_0,CREDIT_BAL_TO_LIMIT_RATIO,CREDIT_PAYMENT_TO_MININSTALLRATIO
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1
100006,0.0,1.067054
100011,0.302678,1.093435
100013,0.115301,1.688193
100021,0.0,1.067054
100023,0.0,1.067054


In [92]:
# Cash Positions

# Load data
pos_cash_df = load_credit_data ("POS_CASH_balance.csv")

# Get counts of number previous applications
pos_cash_df_prevs = pos_cash_df[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
pos_cash_df['SK_ID_PREV'] = pos_cash_df['SK_ID_CURR'].map(pos_cash_df_prevs['SK_ID_PREV'])

# Create new df 
new_pos_cash_df = pos_cash_df[['SK_ID_CURR', 'SK_ID_PREV']]

# Add on features
new_pos_cash_df['CASH_FUTURE_TO_TERM_RATIO'] =  pos_cash_df['CNT_INSTALMENT_FUTURE'] / pos_cash_df['CNT_INSTALMENT']

# Drop PREV as would already exist in base file - could leave in if needed
new_pos_cash_df = new_pos_cash_df.drop ('SK_ID_PREV', axis=1)

# Fill before aggregate
new_pos_cash_df.fillna(new_pos_cash_df.median(), inplace=True)

# Aggregate - joined later to t*_features_df
pos_cash_df_avg = new_pos_cash_df.groupby('SK_ID_CURR').mean()

pos_cash_df_avg.head()


Unnamed: 0_level_0,CASH_FUTURE_TO_TERM_RATIO
SK_ID_CURR,Unnamed: 1_level_1
100001,0.361111
100002,0.625
100003,0.544643
100004,0.5625
100005,0.598485


In [93]:
# Installments

# Load data
installments_df = load_credit_data ("installments_payments.csv")

# Get counts of number previous applications
installments_df_prevs = installments_df[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
installments_df['SK_ID_PREV'] = installments_df['SK_ID_CURR'].map(installments_df_prevs['SK_ID_PREV'])

# Create new df 
new_installments_df = installments_df[['SK_ID_CURR', 'SK_ID_PREV']]

# Add on features
new_installments_df['INSTPAY_INSTALL_TO_PAY_RATIO'] =  installments_df['AMT_INSTALMENT'] / installments_df['AMT_PAYMENT']
new_installments_df['INSTPAY_DIFF_DAYS_INSTALL_DAYS_PAY'] =  installments_df['DAYS_INSTALMENT'] - installments_df['DAYS_ENTRY_PAYMENT']

# Drop PREV as would already exist in base file - could leave in if needed
new_installments_df = new_installments_df.drop ('SK_ID_PREV', axis=1)

# Fill before aggregate
new_installments_df.fillna(new_installments_df.median(), inplace=True)

# Aggregate - joined later to t*_features_df
installments_df_avg = new_installments_df.groupby('SK_ID_CURR').mean()

installments_df_avg.head()

Unnamed: 0_level_0,INSTPAY_INSTALL_TO_PAY_RATIO,INSTPAY_DIFF_DAYS_INSTALL_DAYS_PAY
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1
100001,1.0,7.285714
100002,1.0,20.421053
100003,1.0,7.16
100004,1.0,7.666667
100005,1.0,23.555556


In [94]:
 # print (new_installments_df_avg ['INSTPAY_INSTALL_TO_PAY_RATIO'] != 1)  # ---> A few True

In [95]:
# Merging - training
training_df_merged = tr_features_df
training_df_merged = training_df_merged.merge(right=previous_app_df_avg.reset_index(), on = 'SK_ID_CURR', how = 'left')
training_df_merged = training_df_merged.merge(right=credit_card_df_avg.reset_index(), on = 'SK_ID_CURR', how = 'left')
training_df_merged = training_df_merged.merge(right=pos_cash_df_avg.reset_index(), on = 'SK_ID_CURR', how = 'left')
training_df_merged = training_df_merged.merge(right=installments_df_avg.reset_index(), on = 'SK_ID_CURR', how = 'left')

# Deal with NULLs from the merges - if no info exists, just assume 0.
training_df_merged.fillna(0, inplace=True)

training_df_merged.head()

Unnamed: 0,SK_ID_CURR,APP_NEW_CREDIT_TO_ANNUITY_RATIO,APP_NEW_CREDIT_TO_GOODS_RATIO,APP_NEW_INC_PER_CHLD,APP_NEW_ANNUITY_TO_INCOME_RATIO,APP_NEW_SOURCES_PROD,APP_NEW_EXT_SOURCES_MEAN,APP_NEW_SCORES_STD,APP_NEW_CAR_TO_BIRTH_RATIO,APP_NEW_CAR_TO_EMPLOY_RATIO,...,APP_NEW_PHONE_TO_EMPLOY_RATIO,APP_NEW_CREDIT_TO_INCOME_RATIO,PREV_NEW_PAYMENT_TO_CREDIT_RATIO,PREV_NEW_CREDIT_TO_APPLICATION_RATIO,PREV_NEW_CREDIT_TO_ANNUITY_RATIO,CREDIT_BAL_TO_LIMIT_RATIO,CREDIT_PAYMENT_TO_MININSTALLRATIO,CASH_FUTURE_TO_TERM_RATIO,INSTPAY_INSTALL_TO_PAY_RATIO,INSTPAY_DIFF_DAYS_INSTALL_DAYS_PAY
0,100002,16.461104,1.158397,202500.0,0.121977,0.003043,0.161787,0.092026,-0.000627,-0.004357,...,1.78022,2.007889,0.0,1.0,19.353584,0.0,0.0,0.625,1.0,20.421053
1,100003,36.234085,1.145199,270000.0,0.132216,0.119932,0.466757,0.219895,-0.000627,-0.004357,...,0.69697,4.79075,0.050304,1.057664,8.677472,0.0,0.0,0.544643,1.0,7.16
2,100004,20.0,1.0,67500.0,0.099999,0.119932,0.642739,0.122792,-0.001365,-0.115556,...,3.622222,2.0,0.241719,0.828021,3.753045,0.0,0.0,0.5625,1.0,7.666667
3,100006,10.532818,1.052803,135000.0,0.219898,0.119932,0.650442,0.136021,-0.000627,-0.004357,...,0.203027,2.316167,0.078823,1.008456,15.206011,0.0,1.067054,0.571429,1.0,19.375
4,100007,23.461618,1.0,121500.0,0.179961,0.119932,0.322738,0.136021,-0.000627,-0.004357,...,0.364055,4.222222,0.091961,1.046356,12.644075,0.0,0.0,0.557561,333.751175,3.636364


In [96]:
# Merging - training
testing_df_merged = te_features_df
testing_df_merged = testing_df_merged.merge(right=previous_app_df_avg.reset_index(), on = 'SK_ID_CURR', how = 'left')
testing_df_merged = testing_df_merged.merge(right=credit_card_df_avg.reset_index(), on = 'SK_ID_CURR', how = 'left')
testing_df_merged = testing_df_merged.merge(right=pos_cash_df_avg.reset_index(), on = 'SK_ID_CURR', how = 'left')
testing_df_merged = testing_df_merged.merge(right=installments_df_avg.reset_index(), on = 'SK_ID_CURR', how = 'left')

# Deal with NULLs from the merges - if no info exists, just assume 0.
testing_df_merged.fillna(0, inplace=True)

testing_df_merged.head()

Unnamed: 0,SK_ID_CURR,APP_NEW_CREDIT_TO_ANNUITY_RATIO,APP_NEW_CREDIT_TO_GOODS_RATIO,APP_NEW_INC_PER_CHLD,APP_NEW_ANNUITY_TO_INCOME_RATIO,APP_NEW_SOURCES_PROD,APP_NEW_EXT_SOURCES_MEAN,APP_NEW_SCORES_STD,APP_NEW_CAR_TO_BIRTH_RATIO,APP_NEW_CAR_TO_EMPLOY_RATIO,...,APP_NEW_PHONE_TO_EMPLOY_RATIO,APP_NEW_CREDIT_TO_INCOME_RATIO,PREV_NEW_PAYMENT_TO_CREDIT_RATIO,PREV_NEW_CREDIT_TO_APPLICATION_RATIO,PREV_NEW_CREDIT_TO_ANNUITY_RATIO,CREDIT_BAL_TO_LIMIT_RATIO,CREDIT_PAYMENT_TO_MININSTALLRATIO,CASH_FUTURE_TO_TERM_RATIO,INSTPAY_INSTALL_TO_PAY_RATIO,INSTPAY_DIFF_DAYS_INSTALL_DAYS_PAY
0,100001,27.664697,1.264,135000.0,0.152299,0.094803,0.567263,0.353601,-0.000624,-0.004,...,0.747102,4.213333,0.10594,0.957782,6.020501,0.0,0.0,0.361111,1.0,7.285714
1,100005,12.82487,1.2376,99000.0,0.175453,0.071345,0.429869,0.136694,-0.000624,-0.004,...,-0.0,2.250182,0.080457,0.949975,9.212916,0.0,0.0,0.598485,1.0,23.555556
2,100013,9.505482,1.0528,202500.0,0.344576,0.119686,0.655389,0.062788,-0.00025,-0.001122,...,0.192014,3.275378,0.060075,1.039272,11.163349,0.115301,1.688193,0.631173,47.584916,5.180645
3,100028,32.130726,1.0,105000.0,0.155614,0.164177,0.549372,0.055432,-0.000624,-0.004,...,0.96731,5.0,0.057698,inf,14.073381,0.035934,inf,0.491129,1.486138,3.0
4,100038,19.506034,1.0,90000.0,0.178149,0.119686,0.313916,0.158068,-0.001227,-0.007303,...,0.374715,3.475,0.06858,1.131358,14.564047,0.0,0.0,0.487179,1.0,12.25


In [97]:
# Saving to files ... to be added to appropriate base file later
training_df_merged.to_csv('training_new_features_v1.csv', index = False)
testing_df_merged.to_csv('testing_new_features_v1.csv', index = False)