In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
from sklearn.metrics import f1_score,accuracy_score
from sklearn.model_selection import train_test_split

## Load Data

In [2]:
df_data1 = pd.read_csv('train_1.csv')
df_data2 = pd.read_csv('train_2.csv')
df_data3 = pd.read_csv('train_3.csv')
df_data4 = pd.read_csv('train_4.csv')

## Standard scaler

In [3]:
def standardsc(df):
    data1 = sc.fit_transform(df.iloc[:,6:])
    return data1

In [4]:
df_data1.iloc[:,6:] = standardsc(df_data1)
df_data2.iloc[:,6:] = standardsc(df_data2)
df_data3.iloc[:,6:] = standardsc(df_data3)
df_data4.iloc[:,6:] = standardsc(df_data4)

## One hot encoding

In [5]:
#apply one hot encoding method to country code
def O_H_E_country(df):
    Country = pd.get_dummies(df['Country_Code'])
    df['Country_0']=  Country.iloc[:,0]
    df['Country_1']=  Country.iloc[:,1]
    df['Country_2']=  Country.iloc[:,2]
    df = df.drop(['Country_2'],axis =1)
    return df

df_data1 = O_H_E_country(df_data1)
df_data2 = O_H_E_country(df_data2)
df_data3 = O_H_E_country(df_data3)
df_data4 = O_H_E_country(df_data4)

In [6]:
#apply the one hot encoding method to period variable and separated the columns by years
def O_H_E_country_periods(df):   
    Periods = pd.get_dummies(df['Period'])
    Y2014Q4 = Periods.iloc[:,0]
    Y2015 = Periods.iloc[:,1:5].max(axis = 1)
    Y2016 = Periods.iloc[:,5:9].max(axis = 1)
    Y2017 = Periods.iloc[:,9:13].max(axis = 1)
    Y2018 = Periods.iloc[:,13:17].max(axis = 1)
    Y2019 = Periods.iloc[:,17:21].max(axis = 1)
    Y2020Q1 = Periods.iloc[:,21]
    
    df['Y2014Q4'] = Y2014Q4
    df['Y2015'] = Y2015
    df['Y2016'] = Y2016
    df['Y2017'] = Y2017
    df['Y2018'] = Y2018
    df['Y2019'] = Y2019
    df['Y2020Q1'] = Y2020Q1
    df = df.drop(['Y2020Q1'],axis =1)
    return df

df_data1 = O_H_E_country_periods(df_data1)
df_data2 = O_H_E_country_periods(df_data2)
df_data3 = O_H_E_country_periods(df_data3)
df_data4 = O_H_E_country_periods(df_data4)

In [8]:
#apply one hot encoding method to BR code 
def O_H_E_BR(df): 
    BR_data = pd.get_dummies(df['BR Code'])
    new_column_names = ['BR_Code_' + str(i) for i in range(len(BR_data.columns))]
    BR_data_renamed = BR_data.rename(columns=dict(zip(BR_data.columns, new_column_names)))
    BR_data_combined = pd.concat([df, BR_data_renamed], axis=1)
    BR_data_combined = BR_data_combined.drop(['BR_Code_0'],axis =1)
    return BR_data_combined

df_data1 = O_H_E_BR(df_data1)
df_data2 = O_H_E_BR(df_data2)
df_data3 = O_H_E_BR(df_data3)
df_data4 = O_H_E_BR(df_data4)

In [9]:
#Drop the orignal columns
def drop_columns(df):
    df =  df.drop(['BR Code','Country_Code','Period','Client','Self_exclude_flag'],axis =1)
    return df

df_data1 = drop_columns(df_data1)
df_data2 = drop_columns(df_data2)
df_data3 = drop_columns(df_data3)
df_data4 = drop_columns(df_data4)

## Create Target and input

To ensure that each target dataset after splitting contains only unique values in the 'risk-rating' column, we encounter an issue with dataset 1, as it contains two instances with a '1' value in this column. Consequently, we will handle dataset 1 differently, following our usual splitting method. However, for the other three datasets, we will employ a loop to identify a random state that satisfies the aforementioned condition during the splitting process.

In [10]:
#check point
data_1 = df_data1.copy()
data_2 = df_data2.copy()
data_3 = df_data3.copy()
data_4 = df_data4.copy()

In [11]:
#set the input and target
inputs = data_1.iloc[:,1:]
targets = data_1.iloc[:,:1]

#Separated the dataset by 80-10-10 for dataset 1
x_train1,x_valid1,y_train1,y_valid1 = train_test_split(inputs,targets,train_size = 0.9, random_state = 12)
x_train1,x_test1,y_train1,y_test1 = train_test_split(x_train1,y_train1,test_size = 1/9 , random_state = 10)

In [12]:
#Using loop the to ensure all unique in column risk-rating in three dataset
def splitting(df):
    for n in range(0,100):
        x_train,x_valid,y_train,y_valid = train_test_split(inputs,targets,train_size = 0.9, random_state = n)
        x_train,x_test,y_train,y_test = train_test_split(x_train,y_train,test_size = 1/9 , random_state = 10)
        if len(y_valid['risk_rating'].unique()) == 16 and len(y_test['risk_rating'].unique()) == 16 and len(y_train['risk_rating'].unique()) == 16:
            break
    return x_train,x_valid,y_train,y_valid,x_test,y_test

x_train2,x_valid2,y_train2,y_valid2,x_test2,y_test2 = splitting(data_2)
x_train3,x_valid3,y_train3,y_valid3,x_test3,y_test3 = splitting(data_3)
x_train4,x_valid4,y_train4,y_valid4,x_test4,y_test4 = splitting(data_4)

## Desicion tree for dataset

In [15]:
#employed desicion tree to test the each dataset
DT_1 = DecisionTreeClassifier()
DT_1.fit(x_train1, y_train1)
y_pred_1 = DT_1.predict(x_valid1)
print(f"model with 1st dataset: {f1_score(y_valid1, y_pred_1, average='micro')}")

DT_2 = DecisionTreeClassifier()
DT_2.fit(x_train2, y_train2)
y_pred_2 = DT_2.predict(x_valid2)
print(f"model with 2st dataset: {f1_score(y_valid2, y_pred_2, average='micro')}")

DT_3 = DecisionTreeClassifier()
DT_3.fit(x_train3, y_train3)
y_pred_3 = DT_3.predict(x_valid3)
print(f"model with 3st dataset: {f1_score(y_valid3, y_pred_3, average='micro')}")

DT_4 = DecisionTreeClassifier()
DT_4.fit(x_train4, y_train4)
y_pred_4 = DT_4.predict(x_valid4)
print(f"model with 4st dataset: {f1_score(y_valid4, y_pred_4, average='micro')}")


model with 1st dataset: 0.16963696369636963
model with 2st dataset: 0.16435643564356436
model with 3st dataset: 0.17227722772277226
model with 4st dataset: 0.1808580858085809


In [16]:
scores = pd.DataFrame(data ={'Drop_20': f1_score(y_valid1, y_pred_1, average='micro'),
                     'mean': f1_score(y_valid2, y_pred_2, average='micro'),
                     'median': f1_score(y_valid3, y_pred_3, average='micro'),
                     'regression': f1_score(y_valid4, y_pred_4, average='micro')},
                      index = ['F1_score'])

In [17]:
scores

Unnamed: 0,Drop_20,mean,median,regression
F1_score,0.169637,0.164356,0.172277,0.180858


In general, data_4 stands out as the best-performing dataset for the decision tree algorithm, as it retains more information from the original dataset. Consequently, data_4 will be selected for future training purposes.

In [18]:
#Save the data
x_train4.to_csv('final_x_train.csv',index =False)
y_train4.to_csv('final_y_train.csv',index =False)
x_valid4.to_csv('final_x_valid.csv',index =False)
y_valid4.to_csv('final_y_valid.csv',index =False)
x_test4.to_csv('final_x_test.csv',index =False)
y_test4.to_csv('final_y_test.csv',index =False)