In [None]:
# Mount Google Drive at /content/drive so the CSV files read below are reachable (Colab only).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Explicit %cd magic, so the cell does not depend on IPython's automagic handling of a bare `cd`.
%cd /content/drive/My Drive/newdata

In [None]:
# %pip installs into the environment of the running kernel; `!pip` uses whichever pip the shell finds first.
%pip install fancyimpute
%pip install impyute

In [None]:
# To suppress warnings
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Basic Libraries for Data organization, Statistical operations and Plotting
import numpy as np
import pandas as pd
%matplotlib inline
# For loading .arff files
from scipy.io import arff
# To analyze the type of missing data
import missingno as msno
# Library for performing k-NN and MICE imputations 
import fancyimpute
# Library to perform Expectation-Maximization (EM) imputation
import impyute as impy
# To perform mean imputation
#To perform kFold Cross Validation
from sklearn.model_selection import KFold
# Formatted counter of class labels
from collections import Counter
# Ordered Dictionary
from collections import OrderedDict
# Library imbalanced-learn to deal with the data imbalance. To use SMOTE oversampling
from imblearn.over_sampling import SMOTE 
from scipy.stats.mstats import winsorize


# Importing classification models
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

import random

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve

In [None]:
# Load the raw training and test tables from the mounted Drive folder.
df_train = pd.read_csv('/content/drive/MyDrive/newdata/train_bankcruptcy.csv')
df_test = pd.read_csv('/content/drive/MyDrive/newdata/test_bankcruptcy.csv')

In [None]:
# Preview the first rows of the training table.
df_train.head()

# Module

In [None]:
def load_dataframes(a):
    """Build one DataFrame per entry of ``a`` from that entry's first element.

    Parameters
    ----------
    a : iterable
        Every entry is indexed with ``[0]`` and the result is passed to
        ``pd.DataFrame``. Presumably (data, meta) pairs as returned by an
        .arff loader -- TODO confirm against the caller.

    Return
    --------
    list of DataFrame
        One frame per entry, in input order.
    """
    frames = []
    for entry in a:
        frames.append(pd.DataFrame(entry[0]))
    return frames
def set_new_headers(dataframes):
    """Rename the columns of every frame, in place, to X1..Xn followed by Y.

    The number of columns is taken from the first frame; all but the last
    column become 'X1', 'X2', ... and the last column becomes 'Y'.

    Parameters
    ----------
    dataframes : list of DataFrame
        Frames to relabel. Each must have as many columns as the first one.
    """
    n_features = len(dataframes[0].columns) - 1
    cols = ['X{}'.format(pos) for pos in range(1, n_features + 1)] + ['Y']
    for frame in dataframes:
        frame.columns = cols



In [None]:
def set_test_headers(dataframes):
    """Rename the columns of every frame, in place, to X1..Xn (no label column).

    Parameters
    ----------
    dataframes : list of DataFrame
        Frames to relabel. The column count is taken from the first frame.
    """
    n_columns = len(dataframes[0].columns)
    cols = ['X{}'.format(pos) for pos in range(1, n_columns + 1)]
    for frame in dataframes:
        frame.columns = cols


# Preprocess

In [None]:
# Turn the '?' placeholder cells into NaN in both tables.
df_train = df_train.replace({'?':np.nan})
df_test = df_test.replace({'?':np.nan})
# Rename the training column 'class' to 'TARGET'.
df_train = df_train.rename(columns={"class":"TARGET"})

In [None]:
# (rows, columns) of the test table.
df_test.shape

In [None]:
# Cast the first 67 columns of the training table to float in one step.
# Selecting by position replaces the manual counter loop and the getattr()
# lookup, which would return a DataFrame attribute rather than the column if
# a column name ever matched one (e.g. 'size' or 'index').
train_float_cols = df_train.columns[:67]
df_train[train_float_cols] = df_train[train_float_cols].astype(float)

In [None]:
# Cast the first 66 columns of the test table to float in one step.
# Selecting by position replaces the manual counter loop and the getattr()
# lookup, which would return a DataFrame attribute rather than the column if
# a column name ever matched one (e.g. 'size' or 'index').
test_float_cols = df_test.columns[:66]
df_test[test_float_cols] = df_test[test_float_cols].astype(float)

In [None]:
# Split the training table into one frame per forecasting period (1-4).
# .copy() gives every subset its own data, so the in-place edits applied to
# these frames afterwards (header renaming, column drops) never act on a
# filtered selection of df_train.
df_train_1 = df_train[df_train['forecasting period']==1].copy()
df_train_2 = df_train[df_train['forecasting period']==2].copy()
df_train_3 = df_train[df_train['forecasting period']==3].copy()
df_train_4 = df_train[df_train['forecasting period']==4].copy()
dataframes = [df_train_1, df_train_2, df_train_3, df_train_4]
# Relabel the columns by position: X1..Xn for all but the last, Y for the last.
set_new_headers(dataframes)


In [None]:
# Split the test table into one frame per forecasting period (1-4).
# .copy() gives every subset its own data, so later assignments to these
# frames (header renaming, adding the predicted 'class' column) never act on
# a filtered selection of df_test.
df_test_1 = df_test[df_test['forecasting period']==1].copy()
df_test_2 = df_test[df_test['forecasting period']==2].copy()
df_test_3 = df_test[df_test['forecasting period']==3].copy()
df_test_4 = df_test[df_test['forecasting period']==4].copy()
dataframes_test = [df_test_1, df_test_2, df_test_3, df_test_4]
# Relabel the columns by position as X1..Xn.
set_test_headers(dataframes_test)

In [None]:
# (rows, columns) of the period-1 test frame.
df_test_1.shape

In [None]:
# Show the (rows, columns) shape of the fourth training frame (forecasting period 4)
dataframes[3].shape

# Data preprocessing


In [None]:
############################################################
# Get clean dataframes by dropping all the rows which have missing values
def drop_nan_rows(dataframes, verbose=False):
    """Return copies of the frames with every row containing a NaN removed.

    Parameters
    ----------
    dataframes : list of DataFrame
        Input frames; they are not modified.
    verbose : bool, default = False
        When True, print one line per frame with the original length, the
        cleaned length and the number of rows removed.

    Return
    --------
    list of DataFrame
        The cleaned frames, in input order.
    """
    clean_dataframes = []
    for frame in dataframes:
        clean_dataframes.append(frame.dropna(axis=0, how='any'))
    if verbose:
        for pos, (raw, cleaned) in enumerate(zip(dataframes, clean_dataframes), start=1):
            n_raw = len(raw)
            n_clean = len(cleaned)
            print(str(pos)+'year:','Original Length=', n_raw, '\tCleaned Length=', n_clean, '\tMissing Data=', n_raw-n_clean)
    return clean_dataframes

# Quick report of how many rows contain missing values in each of the per-period training dataframes
nan_dropped_dataframes = drop_nan_rows(dataframes, verbose=True)
#nan_dropped_dataframes = drop_nan_rows(dataframes_test, verbose=True)

In [None]:
# Preview the first rows of the period-1 training frame.
dataframes[0].head()

In [None]:
def drop_missing(df):
  """Drop columns 'X39' and 'X23' from every frame, in place.

  Parameters
  ----------
  df : list of DataFrame
      Frames to modify; each must contain both columns.

  Return
  --------
  list of DataFrame
      The same list object that was passed in.
  """
  for frame in df:
    frame.drop(columns=['X39', 'X23'], inplace=True)
  return df
# Only the training frames lose X39/X23 here; the matching call for the test frames is commented out.
# NOTE(review): set_new_headers is applied again after imputation and numbers columns by position,
# so training and test X-numbers may no longer refer to the same source columns -- confirm.
dataframes = drop_missing(dataframes)
#test_frames = drop_missing(test_frames)

*Data imputation*

In [None]:
#KNN
def perform_knn_imputation(dfs):
    """Fill missing values in each frame with fancyimpute's KNN (k=100).

    Every frame is imputed on its own. The results are wrapped in new
    DataFrames without column names, so they carry default integer labels.

    Parameters
    ----------
    dfs : list of DataFrame
        Frames containing NaN values to impute.

    Return
    --------
    list of DataFrame
        One imputed frame per input frame, in input order.
    """
    imputed_arrays = []
    for frame in dfs:
        imputed_arrays.append(fancyimpute.KNN(k=100,verbose=True).fit_transform(frame))
    return [pd.DataFrame(data=values) for values in imputed_arrays]
# Impute the test frames and the training frames (each frame is fitted on its own).
knn_imputed_dataframes_test = perform_knn_imputation(dataframes_test)
knn_imputed_dataframes = perform_knn_imputation(dataframes)
# The imputed frames come back with default integer column labels, so the
# X*/Y headers are re-applied here by position.
set_new_headers(knn_imputed_dataframes)
set_test_headers(knn_imputed_dataframes_test)

In [None]:
# list.copy() is shallow: these are new lists holding the same DataFrame objects,
# so in-place changes to a frame are visible through both lists.
df = knn_imputed_dataframes.copy()
test_frames = knn_imputed_dataframes_test.copy()

# Handling outliers


In [None]:
# Keep columns X1 and X2 of every training frame; out_liers drops them from the frames later.
train_index = []
for i in range(4):
  a = df[i][['X1','X2']]
  train_index.append(a)

In [None]:
# Keep columns X1 and X2 of every test frame; out_liers drops them from the frames later.
# The source is test_frames: reading from df would store the training rows a second time.
test_index= []
for i in range(4):
  b = test_frames[i][['X1','X2']]
  test_index.append(b)

In [None]:
# (rows, columns) of the first entry of test_index.
test_index[0].shape

In [None]:
def refeature(dfs):
  """Add derived columns to every frame in ``dfs``, in place.

  Three groups of columns are appended, in this order:

  * "domain" columns (``total_assets``, ``ebit``, ``sales``, ...) computed
    from the X-columns and from each other;
  * six score columns (``springate_model``, ``Altman_pattern``,
    ``springate_model2``, ``zmijewski_pattern``, ``grover_pattern``,
    ``fulmer_pattern``), each with a 0/1 ``bin_*`` companion obtained by
    comparing the score with a fixed cut-off;
  * "synthetic" columns ``S_*`` built from sums, differences, products and
    ratios of X-columns.

  Divisions are not guarded, so a zero denominator yields inf/NaN values.

  Parameters
  ----------
  dfs : list of DataFrame
      Frames with the X-columns referenced below.

  Return
  --------
  list of DataFrame
      The same list object, with the modified frames.
  """
  for frame in dfs:
    #======================domain_feature============================
    frame['total_assets'] = frame['X57']/frame['X5']
    frame['ebit'] = frame['X9']*frame['total_assets']
    frame['sales'] = frame['X11']*frame['total_assets']
    frame['inventory'] = frame['sales']/frame['X62']
    frame['cost_of_product_sold'] = frame['inventory']*365/frame['X49']
    frame['current_liabilities'] = frame['cost_of_product_sold']*frame['X34']/365
    frame['retained_earnings'] = frame['total_assets']*frame['X8']
    frame['inventory_turnover'] = 365/frame['X49']
    frame['gross_profit'] = frame['X20']*frame['total_assets']
    frame['interest_cost'] = frame['ebit']-frame['gross_profit']
    frame['interest_cost_divided_to_sales'] = frame['interest_cost']/frame['sales']
    frame['equity'] = frame['total_assets']*frame['X12']
    frame['operating_profit'] = frame['X24']*frame['total_assets']
    frame['total_liabilities'] = frame['X4']*frame['total_assets']
    frame['market_value_of_equity'] = frame['X10']*frame['total_liabilities']
    frame['cash_flow']= frame['X28']*frame['total_liabilities']
    frame['current_assets'] = frame['X52']*frame['total_liabilities']


    #====================================classification_score=====================================
    # Each bin_* column is 1 on the side of the cut-off selected by the second
    # entry of the value list (first entry for Zmijewski) and 0 otherwise;
    # rows matching neither condition (NaN scores) get np.select's default 0.

    #Springate1
    frame['springate_model'] = 1.03*frame['X5'] + 3.07*frame['X9']+0.66*(frame['ebit']/frame['current_liabilities'])+0.4*frame['X11']
    springate_cases = [(frame['springate_model']>=0.862), frame['springate_model']<0.862]
    values = [0,1]
    frame['bin_springate_model'] = np.select(springate_cases, values)
    #Altman1
    frame['Altman_pattern'] = 0.717*(1/frame['X5'])+0.847*(1/frame['X8'])+3.108*(1/frame['X9'])+0.428*(frame['total_liabilities']/frame['market_value_of_equity'])+0.987*(1/frame['X38'])
    altman_cases = [(frame['Altman_pattern']>=1.81), frame['Altman_pattern']<1.81]
    values2 = [0,1]
    frame['bin_Altman_pattern'] = np.select(altman_cases, values2)
    #Springate2
    frame['springate_model2'] = 1.03*(1/frame['X5']) + 3.07*frame['X9']+0.66*(frame['X28'])+0.4*frame['X11']
    springate2_cases = [(frame['springate_model2']>=0.862), frame['springate_model2']<0.862]
    values3 = [0,1]
    frame['bin_springate_model2'] = np.select(springate2_cases, values3)
    #Zmijewski
    frame['zmijewski_pattern'] = -4.3 - 4.5*frame['X3']+5.7*frame['X4']-0.004*(frame['current_assets']/frame['current_liabilities'])
    zmijewski_cases = [(frame['zmijewski_pattern']>0), frame['zmijewski_pattern']<=0]
    values4 = [1,0]
    frame['bin_zmijewski_pattern'] = np.select(zmijewski_cases, values4)
    #Grover pattern
    frame['grover_pattern'] = 1.65*(1/frame['X5'])+3.404*(1/frame['X9'])-0.016*(1/frame['X3'])+0.057
    grover_cases = [(frame['grover_pattern']>=0.02), frame['grover_pattern']<0.02]
    values5 = [0,1]
    frame['bin_grover_pattern'] = np.select(grover_cases, values5)
    #Fulmer pattern
    frame['fulmer_pattern']= 5.528*frame['X8']+0.212*frame['X11']+0.073*(frame['ebit']/frame['equity'])+1.207*frame['X28']-0.12*frame['X4']+2.335*(frame['current_liabilities']/frame['total_assets'])+0.575*frame['X31']+1.083*frame['X57']/frame['total_liabilities']+0.894*(frame['ebit']/frame['interest_cost'])- 6.075
    fulmer_cases = [(frame['fulmer_pattern']>=0), frame['fulmer_pattern']<0]
    values6 = [0,1]
    frame['bin_fulmer_pattern'] = np.select(fulmer_cases, values6)



    #=====================synthetic_feature======================================================
    frame['S_23'] = frame['X49']/frame['X29']
    frame['S_26'] = frame['X20']-frame['X36']
    frame['S_29'] = frame['X15']-frame['X48']
    frame['S_30'] = frame['X13']*frame['X46']
    frame['S_31'] = frame['X23']+frame['X64']
    frame['S_32'] = frame['X4']-frame['X47']
    # NOTE(review): S_35 is computed from the same expression as S_31 -- confirm this is intended.
    frame['S_35'] = frame['X23']+frame['X64']
    frame['S_36'] = frame['X63']+frame['X23']
    frame['S_38'] = frame['X19']-frame['X7']
    #frame['S_41'] = frame['X64']-frame['X37']
  return dfs
# refeature modifies the frames in place and returns the list it was given,
# so clean_dataframes is the same list object as df.
clean_dataframes = refeature(df)
test_frames = refeature(test_frames)

In [None]:
from sklearn import preprocessing
# Module-level scaler instance; out_liers calls scaler.fit_transform on each frame.
scaler = preprocessing.RobustScaler()


In [None]:
def out_liers(df):
  """Robust-scale every frame and clip each feature to its 2%-98% quantile range.

  For each frame the columns 'X1' and 'X2' are removed, the remaining feature
  columns are transformed with the module-level ``scaler`` (fitted on that
  frame alone) and every feature column is then clipped once to its own
  2nd and 98th percentiles.

  Every frame, including the last one, is clipped exactly once. A label
  column named 'Y', when present, is carried over unchanged: it is neither
  scaled nor clipped, so a rare positive class cannot be clipped away.

  Parameters
  ----------
  df : list of DataFrame
      Frames containing 'X1' and 'X2' plus numeric feature columns.
      The list entries are replaced; the original frame objects are not
      modified.

  Return
  --------
  list of DataFrame
      The same list object, whose entries are new frames with a fresh
      0..n-1 index and the original column order (minus 'X1' and 'X2').
  """
  for i in range(len(df)):
    kept = df[i].drop(columns=['X1', 'X2'])
    feature_cols = [name for name in kept.columns if name != 'Y']
    scaled = pd.DataFrame(scaler.fit_transform(kept[feature_cols]), columns=feature_cols)
    clipped = scaled.apply(lambda x: np.clip(x, a_min=np.quantile(x,q=0.02), a_max=np.quantile(x,q=0.98)))
    if 'Y' in kept.columns:
      # to_numpy() attaches the labels by row position, independent of the old index.
      clipped['Y'] = kept['Y'].to_numpy()
    # Restore the original column order (the label is not necessarily last).
    df[i] = clipped[list(kept.columns)]
  return df
# out_liers returns the list it was given, so clean_dataframes and df stay the same list.
clean_dataframes = out_liers(df)
# NOTE(review): the scaler is re-fitted on every frame, so the test frames are scaled with
# statistics of the test data rather than those of the training data -- confirm this is intended.
test_frames = out_liers(test_frames)


# Polynomial features

In [None]:
# Columns that are expanded into degree-2 polynomial terms in the next cell.
imp_feat_list = ['X5','X31','X51','X14','X59','X42','X53','X36','X6','X48']
# Select those columns directly from every per-period frame, so the cell does
# not rely on ad-hoc aliases (a, b, c, d, a_test, ...) that only exist in a
# particular kernel session.
poly = [frame[imp_feat_list] for frame in clean_dataframes]
poly_test = [frame[imp_feat_list] for frame in test_frames]

In [None]:
from sklearn.preprocessing import PolynomialFeatures
poly_transform = PolynomialFeatures(degree=2)

def _poly_feature_names(transformer, input_features):
    """Return the output column names of a fitted PolynomialFeatures.

    Uses get_feature_names_out when the installed scikit-learn provides it
    and falls back to the older get_feature_names otherwise.
    """
    namer = getattr(transformer, 'get_feature_names_out', None)
    if namer is None:
        namer = transformer.get_feature_names
    return list(namer(input_features))

# Expand the selected columns of every test frame, then of every training frame.
for i in range(4):
    poly_test[i] = poly_transform.fit_transform(poly_test[i])
    poly_feat_name_list = _poly_feature_names(poly_transform, imp_feat_list)
    poly_test[i] = pd.DataFrame(poly_test[i],columns=poly_feat_name_list)

for i in range(4):
    poly[i] = poly_transform.fit_transform(poly[i])
    poly_feat_name_list = _poly_feature_names(poly_transform, imp_feat_list)
    poly[i] = pd.DataFrame(poly[i],columns=poly_feat_name_list)

In [None]:
# Attach the polynomial columns to the remaining columns of each frame by row
# position. Both sides describe the same rows in the same order, so a
# positional join keeps exactly one output row per input row; an inner merge
# on the float-valued key columns could drop or multiply rows.
for i in range(4):
    expanded = poly[i].reset_index(drop=True)
    remainder = clean_dataframes[i].drop(columns=imp_feat_list).reset_index(drop=True)
    assert len(expanded) == len(remainder), 'polynomial and base frames differ in length'
    clean_dataframes[i] = pd.concat([expanded, remainder], axis=1)

for i in range(4):
    expanded = poly_test[i].reset_index(drop=True)
    remainder = test_frames[i].drop(columns=imp_feat_list).reset_index(drop=True)
    assert len(expanded) == len(remainder), 'polynomial and base frames differ in length'
    test_frames[i] = pd.concat([expanded, remainder], axis=1)

# Dealing with imbalanced data

In [None]:
def check_data_imbalance(dfs):
    """Print the class distribution of column 'Y' for every frame.

    For each frame this prints a header, the row count per distinct 'Y'
    value, the percentage of rows whose 'Y' equals 1, and a separator line.

    Parameters
    ----------
    dfs : list of DataFrame
        Frames that contain a 'Y' column.
    """
    for i in range(len(dfs)):
        print('Dataset: '+str(i+1)+'year')
        print(dfs[i].groupby('Y').size())
        labels = dfs[i]['Y']
        n_rows = len(labels)
        # An empty frame has no minority share: report 0 instead of dividing by zero.
        # This replaces a bare except that would also have hidden unrelated errors.
        if n_rows:
            minority_percent = (int((labels == 1).sum()) / n_rows)*100
        else:
            minority_percent = 0
        print('Minority (label 1) percentage: '+  str(minority_percent) + '%')
        print('-'*64)
        
# Print the class distribution of each training frame.
check_data_imbalance(clean_dataframes)

Feature selection with LightGBM

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt



def plot_feature_importances(df, threshold = 0.9):
    """
    Plot the 15 most important features and the cumulative importance curve.
    Print how many features are needed to exceed ``threshold`` cumulative importance.

    Parameters
    --------
    df : dataframe
        Feature importances. Columns must be ``feature`` and ``importance``.
    threshold : float, default = 0.9
        Cumulative-importance level reported in the printed message.

    Return
    --------
    df : dataframe
        The importances sorted in descending order, with the previous index
        kept as column ``index`` and two added columns:
        ``importance_normalized`` (sums to 1) and ``cumulative_importance``.

    """

    plt.rcParams['font.size'] = 18

    # Rank the features and derive the normalized / cumulative shares
    ranked = df.sort_values('importance', ascending = False).reset_index()
    total_importance = ranked['importance'].sum()
    ranked['importance_normalized'] = ranked['importance'] / total_importance
    ranked['cumulative_importance'] = np.cumsum(ranked['importance_normalized'])

    # Bar positions are reversed so that the most important feature is drawn on top
    top_n = 15
    bar_positions = list(reversed(list(ranked.index[:top_n])))

    # Horizontal bar chart of the top features
    plt.figure(figsize = (10, 6))
    ax = plt.subplot()
    ax.barh(bar_positions,
            ranked['importance_normalized'].head(top_n),
            align = 'center', edgecolor = 'k')
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(ranked['feature'].head(top_n))
    plt.xlabel('Normalized Importance')
    plt.title('Feature Importances')
    plt.show()

    # Cumulative importance curve
    plt.figure(figsize = (8, 6))
    plt.plot(list(range(len(ranked))), ranked['cumulative_importance'], 'r-')
    plt.xlabel('Number of Features')
    plt.ylabel('Cumulative Importance')
    plt.title('Cumulative Feature Importance')
    plt.show()

    # Position of the first feature whose cumulative share exceeds the threshold
    importance_index = np.min(np.where(ranked['cumulative_importance'] > threshold))
    print('%d features required for %0.2f of cumulative importance' % (importance_index + 1, threshold))

    return ranked
def identify_zero_importance_features(train, train_labels, iterations = 2):
    """
    Average LightGBM feature importances over several train/validation splits
    and list the features whose averaged importance is exactly zero.

    Parameters
    --------
    train : dataframe
        Feature columns.
    train_labels : array-like
        Labels matching the rows of ``train``.
    iterations : int, default = 2
        Number of fits; fit number ``k`` uses ``random_state = k`` for its split.

    Return
    --------
    zero_features : list
        Names of the features with 0.0 averaged importance.
    feature_importances : dataframe
        Columns ``feature`` and ``importance``, sorted by importance (descending).
    """
    # The same estimator object is re-fitted on every split
    model = lgb.LGBMClassifier(objective='binary', boosting_type = 'goss', n_estimators = 10000, class_weight = 'balanced')

    # Running average of the importances, one slot per feature column
    averaged = np.zeros(train.shape[1])

    for seed in range(iterations):
        # 75% / 25% split that changes with the iteration number
        train_features, valid_features, train_y, valid_y = train_test_split(train, train_labels, test_size = 0.25, random_state = seed)

        # Fit with early stopping monitored on the validation part
        model.fit(train_features, train_y, early_stopping_rounds=100, eval_set = [(valid_features, valid_y)],
                  eval_metric = 'auc', verbose = 200)

        averaged += model.feature_importances_ / iterations

    feature_importances = pd.DataFrame({'feature': list(train.columns), 'importance': averaged})
    feature_importances = feature_importances.sort_values('importance', ascending = False)

    # Features that never contributed in any fit
    is_zero = feature_importances['importance'] == 0.0
    zero_features = list(feature_importances[is_zero]['feature'])
    print('\nThere are %d features with 0.0 importance' % len(zero_features))

    return zero_features, feature_importances

def feature_selection(df, iterations = 2, keep_threshold = 0.99):
  """
  Keep the features that carry the bulk of the LightGBM importance.

  A LightGBM classifier is fitted ``iterations`` times on different
  train/validation splits of ``df``; the importances are averaged, plotted
  twice through ``plot_feature_importances`` (thresholds 0.9 and 0.95), and
  the features whose cumulative normalized importance is below
  ``keep_threshold`` are kept. Features with zero importance sit at the end
  of the ranking with cumulative importance 1.0 and are therefore never kept.

  The input frame is not modified.

  Parameters
  --------
  df : dataframe
      Feature columns plus the label column 'Y'.
  iterations : int, default = 2
      Number of fits; fit number ``k`` uses ``random_state = k`` for its split.
  keep_threshold : float, default = 0.99
      Cumulative-importance cut-off for the kept features.

  Return
  --------
  df_thin : dataframe
      A new frame with the kept feature columns (most important first)
      followed by 'Y'.
  """
  model = lgb.LGBMClassifier(objective='binary', boosting_type = 'goss', n_estimators = 1000, class_weight = 'balanced')
  feature_dfs = df.drop(columns='Y')
  label_dfs = df['Y']
  feat_names = feature_dfs.columns
  feature_importances = np.zeros(feature_dfs.shape[1])

  for i in range(iterations):
    train_features, valid_features, train_y, valid_y = train_test_split(feature_dfs, label_dfs, test_size = 0.25, random_state = i)
    model.fit(train_features, train_y, early_stopping_rounds=100, eval_set = [(valid_features, valid_y)],
              eval_metric = 'auc', verbose = 200)
    feature_importances += model.feature_importances_
  # Average over the fits
  feature_importances = feature_importances / iterations

  feature_importances = pd.DataFrame({'feature': feat_names, 'importance': feature_importances}).sort_values('importance', ascending = False)

  # Diagnostic plots at two reporting thresholds; the second call also supplies
  # the ranked table used for the selection below.
  plot_feature_importances(feature_importances)
  norm_feature_importances = plot_feature_importances(feature_importances, threshold = 0.95)

  # Extract the features to keep
  below_cutoff = norm_feature_importances['cumulative_importance'] < keep_threshold
  features_to_keep = list(norm_feature_importances[below_cutoff]['feature'])

  # .copy() makes df_thin an independent frame, so adding 'Y' is a plain
  # column assignment and never writes through to df.
  df_thin = df[features_to_keep].copy()
  df_thin['Y'] = label_dfs
  return df_thin







In [None]:
def reshape_df(clean_dataframes):
  """Apply feature_selection to every frame and return the results in input order.

  Parameters
  ----------
  clean_dataframes : list of DataFrame
      Frames passed one by one to ``feature_selection``.

  Return
  --------
  list
      One ``feature_selection`` result per input frame.
  """
  return [feature_selection(frame) for frame in clean_dataframes]
# Replace every training frame by its feature-selected version.
clean_dataframes = reshape_df(clean_dataframes=clean_dataframes)

In [None]:
# Restrict every test frame to the feature columns kept for the matching training frame.
test = []
for i in range(len(test_frames)):
  a = clean_dataframes[i].drop(columns='Y')  # selected training features of frame i
  b = test_frames[i][a.columns]              # the same columns, in the same order, from the test frame
  test.append(b)

In [None]:
# From here on test_frames refers to the column-aligned test frames built in the previous cell.
test_frames = test

In [None]:
#df= clean_dataframes[3]
#df.to_csv('df_synthetic_4y.csv')

In [None]:
from xgboost import XGBClassifier


In [None]:
# XGBoost settings, one dictionary per forecasting period; each is unpacked
# into XGBClassifier(**param[k]) by the training cells.
# NOTE(review): the values look like the output of a hyper-parameter search -- source not shown here.
param_a = {'booster': 'dart',
 'lambda': 0.0003152874317077816,
 'alpha': 0.041176956950987297,
 'max_depth': 7,
 'eta': 0.9825770244495555,
 'gamma': 0.8781445883047275,
 'grow_policy': 'lossguide',
 'sample_type': 'uniform',
 'normalize_type': 'tree',
 'rate_drop': 0.07516367633155714,
 'skip_drop': 0.00012781727831260767}
param_b = {'booster': 'gbtree',
 'lambda': 0.7453988037016246,
 'alpha': 1.7182754110102324e-07,
 'max_depth': 9,
 'eta': 0.6930606999370981,
 'gamma': 0.010731654947300806,
 'grow_policy': 'depthwise'}
param_c = {'booster': 'dart',
 'lambda': 2.0184275036888347e-06,
 'alpha': 0.01361968260753759,
 'max_depth': 9,
 'eta': 0.7079474373774491,
 'gamma': 1.938059463499507e-07,
 'grow_policy': 'lossguide',
 'sample_type': 'uniform',
 'normalize_type': 'tree',
 'rate_drop': 0.0002843621852759622,
 'skip_drop': 1.8323176726706248e-07}

param_d = {'booster': 'dart',
 'lambda': 2.0415476800032797e-08,
 'alpha': 0.00158681936909514,
 'max_depth': 9,
 'eta': 0.6675522927486546,
 'gamma': 9.22087534202792e-07,
 'grow_policy': 'depthwise',
 'sample_type': 'weighted',
 'normalize_type': 'tree',
 'rate_drop': 0.035899546173585185,
 'skip_drop': 1.283206678881014e-07}
# param[k] is used for clean_dataframes[k] / test_frames[k].
param = [param_a, param_b, param_c, param_d]

In [None]:
def split_dataframes_features_labels(dfs):
    """Separate every frame into its feature columns and its 'Y' label column.

    Parameters
    ----------
    dfs : list of DataFrame
        Frames that contain a 'Y' column.

    Return
    --------
    feature_dfs : list of DataFrame
        Each frame without the 'Y' column.
    label_dfs : list of Series
        The 'Y' column of each frame.
    """
    feature_dfs = []
    label_dfs = []
    for frame in dfs:
        feature_dfs.append(frame.drop(columns='Y'))
        label_dfs.append(frame.Y)
    return feature_dfs, label_dfs

from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
feature_dfs, label_dfs = split_dataframes_features_labels(clean_dataframes)   


In [None]:
# SMOTE oversampler (10 neighbours, fixed seed). It is not used by any active code in the visible cells.
smote = SMOTE(sampling_strategy='auto' , random_state=42, k_neighbors=10)

# PCA


In [None]:
from sklearn.decomposition import PCA
# 30-component PCA. It is not used by any active code in the visible cells.
pca = PCA(n_components=30)

# Run on one year

In [None]:
# Period 1: hold out 20% of the training rows for validation, fit XGBoost and
# predict the matching test frame. The split is seeded so reruns give the same result.
X_train, X_test ,y_train, y_test = train_test_split(feature_dfs[0], label_dfs[0], test_size=.2, random_state=42)

# Ratio of label-0 to label-1 rows in the training split, passed to XGBoost as scale_pos_weight.
ratio = float(np.sum(y_train == 0)) / np.sum(y_train==1)
clf = XGBClassifier(**param[0],scale_pos_weight=ratio)
# NOTE(review): newer xgboost releases expect early_stopping_rounds / eval_metric on the
# constructor instead of fit() -- confirm against the installed version.
clf.fit(X_train, y_train, verbose=True, early_stopping_rounds=1000, eval_metric='logloss', eval_set=[[X_test,y_test]])
y_test_predicted = clf.predict(test_frames[0])
del X_train, X_test ,y_train, y_test, clf, ratio


In [None]:
# Period 2: hold out 20% of the training rows for validation, fit XGBoost and
# predict the matching test frame. The split is seeded so reruns give the same result.
X_train, X_test ,y_train, y_test = train_test_split(feature_dfs[1], label_dfs[1], test_size=.2, random_state=42)
# Ratio of label-0 to label-1 rows in the training split, passed to XGBoost as scale_pos_weight.
ratio = float(np.sum(y_train == 0)) / np.sum(y_train==1)
clf = XGBClassifier(**param[1], scale_pos_weight=ratio)
# NOTE(review): newer xgboost releases expect early_stopping_rounds / eval_metric on the
# constructor instead of fit() -- confirm against the installed version.
clf.fit(X_train, y_train, verbose=True, early_stopping_rounds=1000, eval_metric='logloss', eval_set=[[X_test,y_test]])
y_test_predicted2 = clf.predict(test_frames[1])
del X_train, X_test ,y_train, y_test, clf, ratio

In [None]:
# Period 3: hold out 20% of the training rows for validation, fit XGBoost and
# predict the matching test frame. The split is seeded so reruns give the same result.
X_train, X_test ,y_train, y_test = train_test_split(feature_dfs[2], label_dfs[2], test_size=.2, random_state=42)
# Ratio of label-0 to label-1 rows in the training split, passed to XGBoost as scale_pos_weight.
ratio = float(np.sum(y_train == 0)) / np.sum(y_train==1)
clf = XGBClassifier(**param[2], scale_pos_weight=ratio)
# NOTE(review): newer xgboost releases expect early_stopping_rounds / eval_metric on the
# constructor instead of fit() -- confirm against the installed version.
clf.fit(X_train, y_train, verbose=True, early_stopping_rounds=1000, eval_metric='logloss', eval_set=[[X_test,y_test]])
y_test_predicted3 = clf.predict(test_frames[2])
# ratio is deleted as well, matching the clean-up of the other period cells.
del X_train, X_test ,y_train, y_test, clf, ratio

In [None]:
# Period 4: hold out 20% of the training rows for validation, fit XGBoost and
# predict the matching test frame. The split is seeded so reruns give the same result.
X_train, X_test ,y_train, y_test = train_test_split(feature_dfs[3], label_dfs[3], test_size=.2, random_state=42)
# Ratio of label-0 to label-1 rows in the training split, passed to XGBoost as scale_pos_weight.
ratio = float(np.sum(y_train == 0)) / np.sum(y_train==1)
clf = XGBClassifier(**param[3], scale_pos_weight=ratio)
# NOTE(review): newer xgboost releases expect early_stopping_rounds / eval_metric on the
# constructor instead of fit() -- confirm against the installed version.
clf.fit(X_train, y_train, verbose=True, early_stopping_rounds=1000, eval_metric='logloss', eval_set=[[X_test,y_test]])
y_test_predicted4 = clf.predict(test_frames[3])
# ratio is deleted as well, matching the clean-up of the other period cells.
del X_train, X_test ,y_train, y_test, clf, ratio

In [None]:
# Collect the four prediction arrays in period order (index 0 = period 1).
test_label = [y_test_predicted, y_test_predicted2, y_test_predicted3, y_test_predicted4]

In [None]:
# Number of predictions produced for period 1.
len(test_label[0])

In [None]:
# Number of rows in the first entry of test_index, for comparison with the count above.
len(test_index[0])

# Build the submission file

In [None]:
# Attach the predicted labels to the per-period test frames. The arrays are assigned
# by row position, so each must have exactly as many entries as its frame has rows.
df_test_1['class'] = test_label[0]
df_test_2['class'] = test_label[1]
df_test_3['class'] = test_label[2]
df_test_4['class'] = test_label[3]


In [None]:
# Stack the four per-period test frames (now carrying 'class') into one table.
x = pd.concat([df_test_1, df_test_2, df_test_3, df_test_4])

In [None]:
# Display the table indexed and sorted by X1. The result is not assigned, so x itself is unchanged.
x.set_index('X1').sort_index()

In [None]:
# Copy column X1 into a column named 'id'.
# Presumably X1 holds the record identifier of the original test file -- TODO confirm.
x['id']= x['X1']

In [None]:
# Keep only the two columns that go into the output file.
out_put = x[['id','class']]


In [None]:
# Number of rows per predicted class.
out_put['class'].value_counts()

In [None]:
# Write the predictions to the current working directory. With pandas' default index=True
# the DataFrame index is written as an extra first column.
out_put.to_csv('out0601_1345.csv')

In [None]:
# (rows, columns) of the period-1 test frame after the 'class' column was added.
df_test_1.shape

In [None]:
# Predicted labels for period 1.
test_label[0]

In [None]:
# NOTE(review): random_state is imported from a private pandas module and is not used
# anywhere in this cell -- it looks like an accidental auto-import; confirm and remove.
from pandas.core.common import random_state
feature_dfs, label_dfs = split_dataframes_features_labels(clean_dataframes)   
verbose= False
k_folds = 5  # not used by the loop below, which does one hold-out split per frame
result = []
for df_index in range(len(clean_dataframes)):
  if verbose: print('\t\tDataset: ' + '\033[1m' + str(df_index+1) + 'year' + '\033[0m')

  # One 80/20 hold-out split per frame. train_test_split returns
  # (X_train, X_test, y_train, y_test) in that order; the names are unpacked
  # accordingly so that features and labels are not swapped.
  X_train_list, X_test_list, y_train_list, y_test_list = train_test_split(feature_dfs[df_index], label_dfs[df_index], test_size=.2)
  clf = XGBClassifier(**param[df_index])
  clf.fit(X_train_list, y_train_list, verbose=True, early_stopping_rounds=1000, eval_metric='logloss', eval_set=[[X_test_list,y_test_list]])
  # Predict the matching test frame and keep the predictions in period order.
  y_test_predicted = clf.predict(test_frames[df_index])
  result.append(y_test_predicted)
                    
    