# Training

## Importing Libraries

In [119]:
import pandas as pd 
import numpy as np

from sklearn.preprocessing import LabelEncoder

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

#Ignores warning messages.
import warnings
warnings.filterwarnings('ignore')

# #Prints all statements
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import itertools

import seaborn as sns
import matplotlib.pyplot as plt

In [120]:
le = LabelEncoder()

In [121]:
unique_SSN = []

# open file and read the content in a list
with open(r'list_files//unique_SSN.txt', 'r') as fp:
    for line in fp:
        # remove linebreak from a current name
        # linebreak is the last character of each line
        x = line[:-1]

        # add current item to the list
        unique_SSN.append(int(x))

## Import Dataset

In [122]:
df = pd.read_csv('checkpoints//df_final.csv')

In [123]:
print(f'Length of DataFrame : {len(df)}')
print(f'DataFrame Columns : {list(df.columns)}')

Length of DataFrame : 100000
DataFrame Columns : ['Month', 'Age', 'SSN', 'Occupation', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Type_of_Loan', 'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt', 'Credit_Utilization_Ratio', 'Credit_History_Age', 'Payment_of_Min_Amount', 'Total_EMI_per_month', 'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance', 'Credit_Score']


In [124]:
df.head()

Unnamed: 0,Month,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,January,23,821000265,Scientist,19114.12,1824.843333,3,4,3,4,...,Good,809.98,26.82262,265,No,49.574949,80.415295,High_spent_Small_value_payments,312.494089,Good
1,February,23,821000265,Scientist,19114.12,1824.843333,3,4,3,4,...,Good,809.98,31.94496,266,No,49.574949,118.280222,Low_spent_Large_value_payments,284.629162,Good
2,March,23,821000265,Scientist,19114.12,1824.843333,3,4,3,4,...,Good,809.98,28.609352,267,No,49.574949,81.699521,Low_spent_Medium_value_payments,331.209863,Good
3,April,23,821000265,Scientist,19114.12,1824.843333,3,4,3,4,...,Good,809.98,31.377862,268,No,49.574949,199.458074,Low_spent_Small_value_payments,223.45131,Good
4,May,23,821000265,Scientist,19114.12,1824.843333,3,4,3,4,...,Good,809.98,24.797347,269,No,49.574949,41.420153,High_spent_Medium_value_payments,341.489231,Good


### Selecting Features

##### We select relevant columns from our previous notebooks.

In [125]:
columns_to_keep = set(['SSN',
        'Monthly_Inhand_Salary',
        'Num_Bank_Accounts',
        'Num_Credit_Card',
        'Interest_Rate',
        'Delay_from_due_date',
        'Changed_Credit_Limit',
        'Credit_Mix',
        'Outstanding_Debt',
        'Credit_Utilization_Ratio',
        'Payment_of_Min_Amount',
        'Payment_Behaviour',
        'Monthly_Balance',
        'Type_of_Loan',
        'Credit_History_Age',
        'Credit_Score'])
columns_to_drop = set(df.columns) - columns_to_keep
print(f'Columns to keep : {columns_to_keep}')
print(f'Columns to drop : {columns_to_drop}')

Columns to keep : {'Num_Credit_Card', 'Num_Bank_Accounts', 'Credit_Score', 'Outstanding_Debt', 'Credit_History_Age', 'Payment_Behaviour', 'SSN', 'Type_of_Loan', 'Payment_of_Min_Amount', 'Credit_Utilization_Ratio', 'Monthly_Balance', 'Changed_Credit_Limit', 'Monthly_Inhand_Salary', 'Delay_from_due_date', 'Interest_Rate', 'Credit_Mix'}
Columns to drop : {'Num_of_Delayed_Payment', 'Annual_Income', 'Num_Credit_Inquiries', 'Amount_invested_monthly', 'Total_EMI_per_month', 'Occupation', 'Month', 'Num_of_Loan', 'Age'}


### Drop Irrelevant Columns

In [126]:
df.drop(columns = [col for col in columns_to_drop],  axis = 1,inplace = True)


### Details on new dataframe

In [127]:
df.columns

Index(['SSN', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card',
       'Interest_Rate', 'Type_of_Loan', 'Delay_from_due_date',
       'Changed_Credit_Limit', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Payment_Behaviour', 'Monthly_Balance',
       'Credit_Score'],
      dtype='object')

In [128]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 16 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   SSN                       100000 non-null  int64  
 1   Monthly_Inhand_Salary     100000 non-null  float64
 2   Num_Bank_Accounts         100000 non-null  int64  
 3   Num_Credit_Card           100000 non-null  int64  
 4   Interest_Rate             100000 non-null  int64  
 5   Type_of_Loan              100000 non-null  object 
 6   Delay_from_due_date       100000 non-null  int64  
 7   Changed_Credit_Limit      100000 non-null  float64
 8   Credit_Mix                100000 non-null  object 
 9   Outstanding_Debt          100000 non-null  float64
 10  Credit_Utilization_Ratio  100000 non-null  float64
 11  Credit_History_Age        100000 non-null  int64  
 12  Payment_of_Min_Amount     100000 non-null  object 
 13  Payment_Behaviour         100000 non-null  ob

# Preparing Data

### Converting Object Data types to Numerical

### Type Of Loan

In [129]:
df['Type_of_Loan'].head()

0    Auto, Credit-Builder, Personal, and HomeEquity
1    Auto, Credit-Builder, Personal, and HomeEquity
2    Auto, Credit-Builder, Personal, and HomeEquity
3    Auto, Credit-Builder, Personal, and HomeEquity
4    Auto, Credit-Builder, Personal, and HomeEquity
Name: Type_of_Loan, dtype: object

In [130]:
df['Type_of_Loan'].nunique()

6260

### One hot Encoding

In [131]:
corresponding_unique_type_of_loan_values = {}
for ssn in unique_SSN:
    corresponding_unique_type_of_loan_values[ssn] = list(df[df['SSN'] == ssn]['Type_of_Loan'].unique())

In [132]:
types_of_loan = set()

In [133]:
for values in corresponding_unique_type_of_loan_values.values():
    for i in range(len(values)):
        types_of_loan.add(values[0].split(',')[i].split(' Loan')[0].replace(' ','').replace('and',''))

In [134]:
dataframe_type_of_loan_list = sorted(['Credit-Builder Loan',
                               'Not Specified',
                               'Debt Consolidation Loan',
                               'Auto Loan',
                               'Student Loan',
                               'Home Equity Loan', 
                               'Mortgage Loan',
                               'Personal Loan',
                               'Payday Loan'])
dataframe_type_of_loan_list

processed_types_of_loan_list = sorted(list(types_of_loan))
processed_types_of_loan_list

['Auto Loan',
 'Credit-Builder Loan',
 'Debt Consolidation Loan',
 'Home Equity Loan',
 'Mortgage Loan',
 'Not Specified',
 'Payday Loan',
 'Personal Loan',
 'Student Loan']

['Auto',
 'Credit-Builder',
 'DebtConsolidation',
 'HomeEquity',
 'Mortgage',
 'NotSpecified',
 'Payday',
 'Personal',
 'Student']

In [135]:
def map_type_of_loan(txt):
    txt = str(txt)
    for i in range(len(processed_types_of_loan_list)):
        txt = txt.replace(' and ', ' ')
        txt = txt.replace(dataframe_type_of_loan_list[i],processed_types_of_loan_list[i])
    return txt

In [136]:
df['Type_of_Loan'].head()

0    Auto, Credit-Builder, Personal, and HomeEquity
1    Auto, Credit-Builder, Personal, and HomeEquity
2    Auto, Credit-Builder, Personal, and HomeEquity
3    Auto, Credit-Builder, Personal, and HomeEquity
4    Auto, Credit-Builder, Personal, and HomeEquity
Name: Type_of_Loan, dtype: object

In [137]:
index = 0
for type_of_loan in processed_types_of_loan_list:
    df.insert(loc = 11 + index, column = type_of_loan , value = 0)
    index += 1

In [138]:
for type_of_loan in processed_types_of_loan_list:    
    df[type_of_loan] = df['Type_of_Loan'].apply(lambda x:1 if type_of_loan in x else 0)

In [139]:
df.drop('Type_of_Loan', axis = 1, inplace = True)

In [140]:
df.head()

Unnamed: 0,SSN,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Delay_from_due_date,Changed_Credit_Limit,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,...,Mortgage,NotSpecified,Payday,Personal,Student,Credit_History_Age,Payment_of_Min_Amount,Payment_Behaviour,Monthly_Balance,Credit_Score
0,821000265,1824.843333,3,4,3,3,11.27,Good,809.98,26.82262,...,0,0,0,1,0,265,No,High_spent_Small_value_payments,312.494089,Good
1,821000265,1824.843333,3,4,3,3,11.27,Good,809.98,31.94496,...,0,0,0,1,0,266,No,Low_spent_Large_value_payments,284.629162,Good
2,821000265,1824.843333,3,4,3,3,11.27,Good,809.98,28.609352,...,0,0,0,1,0,267,No,Low_spent_Medium_value_payments,331.209863,Good
3,821000265,1824.843333,3,4,3,5,6.27,Good,809.98,31.377862,...,0,0,0,1,0,268,No,Low_spent_Small_value_payments,223.45131,Good
4,821000265,1824.843333,3,4,3,6,11.27,Good,809.98,24.797347,...,0,0,0,1,0,269,No,High_spent_Medium_value_payments,341.489231,Good


### Credit_Mix

In [141]:
df['Credit_Mix'].head()

0    Good
1    Good
2    Good
3    Good
4    Good
Name: Credit_Mix, dtype: object

In [142]:
df['Credit_Mix'].unique()
df['Credit_Mix'].value_counts()

array(['Good', 'Standard', 'Bad'], dtype=object)

Standard    45848
Good        30384
Bad         23768
Name: Credit_Mix, dtype: int64

In [143]:
map_credit_mix = {'Bad' : 0, 'Good' : 1,'Standard' : 2}
df['Credit_Mix'] = df['Credit_Mix'].map(map_credit_mix)

In [144]:
df['Credit_Mix'].unique()
df['Credit_Mix'].value_counts()

array([1, 2, 0], dtype=int64)

2    45848
1    30384
0    23768
Name: Credit_Mix, dtype: int64

### Payment_of_Min_Amount

In [145]:
df['Payment_of_Min_Amount'].head()

0    No
1    No
2    No
3    No
4    No
Name: Payment_of_Min_Amount, dtype: object

In [146]:
df['Payment_of_Min_Amount'].unique()
df['Payment_of_Min_Amount'].value_counts()

array(['No', 'Yes'], dtype=object)

Yes    59432
No     40568
Name: Payment_of_Min_Amount, dtype: int64

In [147]:
map_payment_of_min_amount = {'Yes' : 1, 'No' : 0}
df['Payment_of_Min_Amount'] = df['Payment_of_Min_Amount'].map(map_payment_of_min_amount)

In [148]:
df['Payment_of_Min_Amount'].unique()
df['Payment_of_Min_Amount'].value_counts()

array([0, 1], dtype=int64)

1    59432
0    40568
Name: Payment_of_Min_Amount, dtype: int64

In [149]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 24 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   SSN                       100000 non-null  int64  
 1   Monthly_Inhand_Salary     100000 non-null  float64
 2   Num_Bank_Accounts         100000 non-null  int64  
 3   Num_Credit_Card           100000 non-null  int64  
 4   Interest_Rate             100000 non-null  int64  
 5   Delay_from_due_date       100000 non-null  int64  
 6   Changed_Credit_Limit      100000 non-null  float64
 7   Credit_Mix                100000 non-null  int64  
 8   Outstanding_Debt          100000 non-null  float64
 9   Credit_Utilization_Ratio  100000 non-null  float64
 10  Auto                      100000 non-null  int64  
 11  Credit-Builder            100000 non-null  int64  
 12  DebtConsolidation         100000 non-null  int64  
 13  HomeEquity                100000 non-null  in

### Payment_Behaviour

In [150]:
df['Payment_Behaviour'].unique()
df['Payment_Behaviour'].value_counts()

array(['High_spent_Small_value_payments',
       'Low_spent_Large_value_payments',
       'Low_spent_Medium_value_payments',
       'Low_spent_Small_value_payments',
       'High_spent_Medium_value_payments', 'NotSpecified',
       'High_spent_Large_value_payments'], dtype=object)

Low_spent_Small_value_payments      25513
High_spent_Medium_value_payments    17540
Low_spent_Medium_value_payments     13861
High_spent_Large_value_payments     13721
High_spent_Small_value_payments     11340
Low_spent_Large_value_payments      10425
NotSpecified                         7600
Name: Payment_Behaviour, dtype: int64

In [151]:
df['Payment_Behaviour'] = le.fit_transform(df['Payment_Behaviour'])

In [152]:
df['Payment_Behaviour'].unique()
df['Payment_Behaviour'].value_counts()

array([2, 3, 4, 5, 1, 6, 0])

5    25513
1    17540
4    13861
0    13721
2    11340
3    10425
6     7600
Name: Payment_Behaviour, dtype: int64

In [153]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 24 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   SSN                       100000 non-null  int64  
 1   Monthly_Inhand_Salary     100000 non-null  float64
 2   Num_Bank_Accounts         100000 non-null  int64  
 3   Num_Credit_Card           100000 non-null  int64  
 4   Interest_Rate             100000 non-null  int64  
 5   Delay_from_due_date       100000 non-null  int64  
 6   Changed_Credit_Limit      100000 non-null  float64
 7   Credit_Mix                100000 non-null  int64  
 8   Outstanding_Debt          100000 non-null  float64
 9   Credit_Utilization_Ratio  100000 non-null  float64
 10  Auto                      100000 non-null  int64  
 11  Credit-Builder            100000 non-null  int64  
 12  DebtConsolidation         100000 non-null  int64  
 13  HomeEquity                100000 non-null  in

### Credit_Score

In [154]:
df['Credit_Score'].unique()
df['Credit_Score'].value_counts()

array(['Good', 'Standard', 'Poor'], dtype=object)

Standard    53174
Poor        28998
Good        17828
Name: Credit_Score, dtype: int64

In [155]:
map_credit_score   = {'Good' : 0,'Poor' : 1, 'Standard' : 2}
df['Credit_Score'] = df['Credit_Score'].map(map_credit_score)

In [156]:
df['Credit_Score'].unique()
df['Credit_Score'].value_counts()

array([0, 2, 1], dtype=int64)

2    53174
1    28998
0    17828
Name: Credit_Score, dtype: int64

## Select Features

In [157]:
X,y=df.drop(columns = ['Credit_Score'],axis=1),df['Credit_Score']

In [158]:
X.corrwith(y)

SSN                         0.011919
Monthly_Inhand_Salary      -0.078955
Num_Bank_Accounts           0.173234
Num_Credit_Card             0.109595
Interest_Rate               0.120170
Delay_from_due_date         0.096984
Changed_Credit_Limit        0.188842
Credit_Mix                  0.229920
Outstanding_Debt            0.037153
Credit_Utilization_Ratio   -0.014371
Auto                        0.026571
Credit-Builder              0.024562
DebtConsolidation           0.032146
HomeEquity                  0.026376
Mortgage                    0.020499
NotSpecified               -0.010108
Payday                      0.025630
Personal                    0.036713
Student                     0.016661
Credit_History_Age         -0.102603
Payment_of_Min_Amount       0.278235
Payment_Behaviour           0.018793
Monthly_Balance            -0.061176
dtype: float64

### Selecting Features

In [159]:
columns_to_drop =[]
for col in X.columns:
    if abs(X[col].corr(y)) < 0.1:
        columns_to_drop.append(col)

In [160]:
columns_to_drop

['SSN',
 'Monthly_Inhand_Salary',
 'Delay_from_due_date',
 'Outstanding_Debt',
 'Credit_Utilization_Ratio',
 'Auto',
 'Credit-Builder',
 'DebtConsolidation',
 'HomeEquity',
 'Mortgage',
 'NotSpecified',
 'Payday',
 'Personal',
 'Student',
 'Payment_Behaviour',
 'Monthly_Balance']

In [161]:
X,y=df.drop(columns = columns_to_drop,axis=1),df['Credit_Score']

In [162]:
X.head()

Unnamed: 0,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Changed_Credit_Limit,Credit_Mix,Credit_History_Age,Payment_of_Min_Amount,Credit_Score
0,3,4,3,11.27,1,265,0,0
1,3,4,3,11.27,1,266,0,0
2,3,4,3,11.27,1,267,0,0
3,3,4,3,6.27,1,268,0,0
4,3,4,3,11.27,1,269,0,0


In [163]:
train_X,test_X,train_y,test_y = train_test_split(X , y , test_size = 0.2, random_state = 11)

# Training

## KNN

In [164]:
params = {'n_neighbors':[5,7,9,11,13],
        'metric':['cosine','euclidean','manhattan'],
        'weights':['uniform','distance']},

In [165]:
max_score = 0.001
### for selecting the best hyperparameters
for combination in itertools.product(*params.values()):
    param_dict = dict(zip(params.keys(), combination))
    print("Training with hyper parameters:", param_dict)

    knn = KNeighborsClassifier(**param_dict)
    knn.fit(train_X, train_y)

    model_score = knn.score(test_X, test_y)
    print('Test Accuracy Score : ',model_score)
    print('------------------------------------')
    
    if model_score > max_score:
        max_score = model_score
        hyper_parameters = combination
print('Best hyper-parameter : ', hyper_parameters)
print('Score : ', max_score)

AttributeError: 'tuple' object has no attribute 'values'

## Decision Tree

In [None]:
params = {'criterion':['gini','entropy'],
        'max_depth':[2,4,6,8,10,12]
        }

In [None]:
max_score = 0.001
### for selecting the best hyperparameters
for combination in itertools.product(*params.values()):
    param_dict = dict(zip(params.keys(), combination))
    print("Training with hyper parameters:", param_dict)

    dtc = DecisionTreeClassifier(**param_dict)
    dtc.fit(train_X, train_y)

    model_score = dtc.score(test_X, test_y)
    print('Test Accuracy Score : ',model_score)
    print('------------------------------------')
    
    if model_score > max_score:
        max_score = model_score
        hyper_parameters = combination
print('Best hyper-parameter : ', hyper_parameters)
print('Score : ', max_score)

## Random Forest Classifier

In [None]:
params = {
        'criterion':['gini','entropy'],
        'max_depth':[2,4,6,8,10,12]
    }

In [None]:
max_score = 0.001
### for selecting the best hyperparameters
for combination in itertools.product(*params.values()):
    param_dict = dict(zip(params.keys(), combination))
    print("Training with hyper parameters:", param_dict)

    rtc = RandomForestClassifier(**param_dict)
    rtc.fit(train_X, train_y)

    model_score = rtc.score(test_X, test_y)
    print('Test Accuracy Score : ',model_score)
    print('------------------------------------')
    
    if model_score > max_score:
        max_score = model_score
        hyper_parameters = combination
print('Best hyper-parameter : ', hyper_parameters)
print('Score : ', max_score)

## MultinomialNB

In [None]:
params = {
        'alpha': [1.0,2.0],
        'fit_prior': [True]
    }

In [None]:
max_score = 0.001
### for selecting the best hyperparameters
for combination in itertools.product(*params.values()):
    param_dict = dict(zip(params.keys(), combination))
    print("Training with hyper parameters:", param_dict)

    mnb = MultinomialNB(**param_dict)
    mnb.fit(train_X, train_y)

    model_score = mnb.score(test_X, test_y)
    print('Test Accuracy Score : ',model_score)
    print('------------------------------------')
    
    if model_score > max_score:
        max_score = model_score
        hyper_parameters = combination
print('Best hyper-parameter : ', hyper_parameters)
print('Score : ', max_score)

## SVC

In [None]:
params = {'C':[0.1,1,10,100], 
            'gamma':[1,0.1,0.01], 
            'kernel':['rbf','linear','poly']
        }

In [None]:
max_score = 0.001
### for selecting the best hyperparameters
for combination in itertools.product(*params.values()):
    param_dict = dict(zip(params.keys(), combination))
    print("Training with hyper parameters:", param_dict)

    svc = SVC(**param_dict)
    svc.fit(train_X, train_y)

    model_score = svc.score(test_X, test_y)
    print('Test Accuracy Score : ',model_score)
    print('------------------------------------')
    
    if model_score > max_score:
        max_score = model_score
        hyper_parameters = combination
print('Best hyper-parameter : ', hyper_parameters)
print('Score : ', max_score)