In [162]:
import pandas as pd
import os
from multiprocessing.pool import ThreadPool
from functools import partial, reduce
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
%matplotlib inline

# Folder that holds the input CSV files (relative path, resolved against the working directory)
SRC_FOLDER = 'dataset'

In [160]:
def loadData(src_folder, *filenames, merge=False):
    """Load CSV files from a folder into pandas DataFrames.

    Args:
        src_folder (str): Absolute or relative path to the folder containing the CSV files.
        *filenames (str): Names of the files to read, e.g.
            ``'application_train.csv', 'bureau.csv'``. Pass ``'all'`` as the first
            name to read every ``.csv`` file in ``src_folder`` (in sorted name order).
        merge (bool, optional): If true, combine the loaded frames into a single
            DataFrame by chaining ``pd.merge(..., on='SK_ID_CURR')`` from left to
            right. Defaults to False.

    Returns:
        list[pd.DataFrame]: One frame per file, in the same order as ``filenames``,
        when ``merge`` is false.
        pd.DataFrame: The merged frame when ``merge`` is true.
        None: When a file or folder cannot be found (a message is printed instead
        of raising).

    Raises:
        ValueError: If no filename is given.
    """
    if not filenames:
        raise ValueError("loadData needs at least one filename (or 'all')")

    try:
        if filenames[0] == 'all':
            # List the folder the caller asked for; sorting keeps the result
            # order (and therefore merge suffixes) stable across platforms.
            filenames = sorted(
                name for name in os.listdir(src_folder)
                if os.path.splitext(name)[1] == '.csv'
            )
        filepaths = [os.path.join(src_folder, name) for name in filenames]  # get full path

        # Read the files concurrently; pool.map preserves the input order.
        with ThreadPool() as pool:
            dfs = pool.map(partial(pd.read_csv, encoding='ISO-8859-1'), filepaths)

        if merge:
            dfs = reduce(lambda left, right: pd.merge(left, right, on='SK_ID_CURR'), dfs)
    except FileNotFoundError:
        print('The src_folder/filenames is not correct')
    else:
        return dfs

def encodeCategoryLabel(df):
    """Return a copy of ``df`` whose object-dtype columns are label-encoded.

    Each object column is cast to ``str`` and passed through
    ``LabelEncoder.fit_transform`` on its own; all other columns are left as they
    are. The input frame is not modified.

    Args:
        df (pd.DataFrame): Frame to encode.

    Returns:
        pd.DataFrame: Encoded copy of ``df``.
    """
    encoded = df.copy()
    object_columns = encoded.select_dtypes('object').columns
    encoder = LabelEncoder()
    # The same encoder instance is refit for every column, so nothing is shared between columns.
    codes = encoded[object_columns].astype('str').apply(encoder.fit_transform)
    if codes.empty:
        # Nothing was encoded; hand back the untouched copy.
        return encoded
    encoded[object_columns] = codes
    return encoded

def crossValidate(clf, X, y):
    """Fit ``clf`` on all of ``X``/``y`` and return its mean 10-fold ROC-AUC.

    Args:
        clf: Estimator exposing ``fit``; also handed to ``cross_val_score``.
        X: Feature matrix.
        y: Target values.

    Returns:
        float: Arithmetic mean of the per-fold ``roc_auc`` scores.
    """
    # NOTE(review): cross_val_score fits its own clones of clf per fold, so this
    # fit does not influence the returned score; it does leave clf fitted on the
    # full data for the caller.
    clf.fit(X, y)
    fold_scores = cross_val_score(clf, X, y, scoring='roc_auc', cv=10, n_jobs=-1)
    total = 0
    for fold_score in fold_scores:
        total += fold_score
    return total / len(fold_scores)

def dropTrivialFeatures(clf, X, y, feature_importances, start=0.003, stop=1, step=0.001):
    """Score ``clf`` on progressively smaller feature subsets.

    For every importance threshold ``start, start + step, ...`` below ``stop``,
    keep only the features whose importance is strictly greater than the
    threshold and record the score returned by ``crossValidate`` for that subset.

    Args:
        clf: Estimator passed through to ``crossValidate``.
        X (pd.DataFrame): Feature matrix; columns are selected by label.
        y: Target values passed through to ``crossValidate``.
        feature_importances (pd.DataFrame): Frame indexed by feature name with an
            ``'Importance'`` column.
        start (float, optional): First threshold. Defaults to 0.003.
        stop (float, optional): Exclusive upper bound for thresholds. Defaults to 1.
        step (float, optional): Increment between thresholds. Defaults to 0.001.

    Returns:
        dict[float, float]: Threshold -> cross-validation score. The sweep ends
        early at the first threshold that leaves no feature. Consecutive
        thresholds that select the same columns share one score rather than
        re-running cross-validation.

    Raises:
        ValueError: If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError('step must be positive')

    importance = feature_importances['Importance']
    res = {}
    prev_cols = None
    prev_score = None
    i = 0
    while True:
        # range() only accepts integers, so build each float threshold from an
        # integer counter; rounding removes accumulated floating-point noise.
        ratio = round(start + i * step, 10)
        if ratio >= stop:
            break
        cols = list(feature_importances.index[importance > ratio])
        if not cols:
            # Thresholds only grow, so no later threshold can select a feature.
            break
        if cols != prev_cols:
            prev_score = crossValidate(clf, X[cols], y)
            prev_cols = cols
        res[ratio] = prev_score
        i += 1
    return res

In [142]:
# Load both tables; with merge=True loadData returns one frame joined on SK_ID_CURR
# (a single DataFrame, despite the plural variable name).
dfs = loadData(
    SRC_FOLDER,
    'application_train.csv',
    'bureau.csv',
    merge=True,
)
dfs.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY_x,...,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY_y
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,-1038.0,,0,40761.0,,,0.0,Credit card,-1038,0.0
1,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,-48.0,,0,0.0,0.0,,0.0,Credit card,-47,
2,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,-1185.0,0.0,0,135000.0,0.0,0.0,0.0,Consumer credit,-1185,0.0
3,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,-911.0,3321.0,0,19071.0,,,0.0,Consumer credit,-906,0.0
4,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,-36.0,5043.645,0,120735.0,0.0,0.0,0.0,Consumer credit,-34,0.0


In [126]:
# Preprocessing
# Start from `dfs`, the merged frame produced by the loading cell, so this cell
# runs on a fresh kernel instead of depending on a leftover `df` variable.
df = encodeCategoryLabel(dfs)
# Remaining missing values are replaced with 0.
df = df.fillna(0)

In [41]:
# Separate the label column from the rest; every other column is used as a feature.
# NOTE(review): that presumably still includes the SK_ID_CURR join key — confirm an
# identifier column is wanted as a model feature.
y = df['TARGET']
X = df.drop(columns='TARGET')

In [128]:
# Fit a 100-tree random forest on all rows, with a fixed seed for repeatability.
rf = RandomForestClassifier(
    n_estimators=100,
    oob_score=True,
    random_state=123456,
    n_jobs=-1,
)
rf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=True, random_state=123456, verbose=0,
            warm_start=False)

In [149]:
# Rank the features by the fitted forest's importance (highest first), rounded to 5 decimals.
feature_importance = (
    pd.DataFrame(rf.feature_importances_, index=X.columns, columns=['Importance'])
    .sort_values('Importance', ascending=False)
    .round(5)
)
# Save the ranking so it can be inspected outside the notebook.
feature_importance.to_csv('feature_importance.csv')
# cutoff around 0.003

In [42]:
# Baseline: 10-fold ROC-AUC using every feature. The forest is rebuilt unfitted
# here; parallelism comes from cross_val_score's n_jobs.
rf = RandomForestClassifier(random_state=123456, oob_score=True, n_estimators=100)
scores = cross_val_score(rf, X, y, cv=10, scoring='roc_auc', n_jobs=-1)
print(scores)

[0.71250551 0.7112077  0.71155777 0.70733194 0.70463546 0.70048274
 0.7050165  0.70954615 0.70815394 0.70729026]


In [159]:
# Repeat the 10-fold ROC-AUC using only the features whose importance exceeds the cutoff.
ratio = 0.003
mask = feature_importance['Importance'].gt(ratio)
try_X = X.loc[:, feature_importance.index[mask]]
scores = cross_val_score(rf, try_X, y, cv=10, scoring='roc_auc', n_jobs=-1)
print(scores)

[0.71559182 0.7085414  0.71078638 0.70821836 0.70616287 0.70237009
 0.70110499 0.71022418 0.71007784 0.70476565]
