In [1]:
import os

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score
from sklearn.model_selection import GridSearchCV, cross_val_predict

# Credit scoring 

The goal is to train a Random Forest classifier that predicts `SeriousDlqin2yrs`.

In [2]:
# Location of the credit-scoring CSV.
# Set the CREDIT_SCORING_CSV environment variable to point at the file when
# running on another machine; when it is unset the original local path is used,
# so existing behaviour is unchanged.
url = os.environ.get(
    'CREDIT_SCORING_CSV',
    '/Users/liamhettinger/Documents/Portfolio_work/Data/credit_scoring.csv',
)
credit_scoring = pd.read_csv(url)

# Preview the first rows to sanity-check the load.
credit_scoring.head()

Unnamed: 0,SeriousDlqin2yrs,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,NumberOfTimes90DaysLate,NumberOfTime60-89DaysPastDueNotWorse,MonthlyIncome,NumberOfDependents
0,0,64,0,0.249908,0,0,8158.0,0.0
1,0,58,0,3870.0,0,0,,0.0
2,0,41,0,0.456127,0,0,6666.0,0.0
3,0,43,0,0.00019,0,0,10500.0,2.0
4,1,49,0,0.27182,0,0,400.0,0.0


**Data Description**

| Feature | Description |
| :- | -: |
|SeriousDlqin2yrs (target variable) | Customer hasn't paid the loan debt within 90 days 
|age	| Customer age
|DebtRatio | Total monthly loan payments (loan, alimony, etc.) / Total monthly income percentage
|NumberOfTime30-59DaysPastDueNotWorse | The number of cases when the client was 30-59 days overdue (not worse) on other loans during the last 2 years
|NumberOfTimes90DaysLate | Number of cases when the customer was 90+ days past due (dpd) on other credits
|NumberOfTime60-89DaysPastDueNotWorse | Number of cases when the customer was 60-89 days past due (not worse) during the last 2 years
|NumberOfDependents | The number of customer dependents


In [3]:
# Number of rows in the raw data set.
credit_scoring.shape[0]

45063

In [4]:
# Count the missing values in each column.
credit_scoring.isna().sum()

SeriousDlqin2yrs                           0
age                                        0
NumberOfTime30-59DaysPastDueNotWorse       0
DebtRatio                                  0
NumberOfTimes90DaysLate                    0
NumberOfTime60-89DaysPastDueNotWorse       0
MonthlyIncome                           8643
NumberOfDependents                      1117
dtype: int64

In [5]:
# Discard every row that has at least one missing value.
credit_scoring = credit_scoring.dropna(axis=0, how='any')

In [6]:
# Re-count missing values per column to confirm none remain.
credit_scoring.isna().sum()

SeriousDlqin2yrs                        0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
NumberOfTimes90DaysLate                 0
NumberOfTime60-89DaysPastDueNotWorse    0
MonthlyIncome                           0
NumberOfDependents                      0
dtype: int64

In [7]:
# Target vector: the column the classifier learns to predict.
y = credit_scoring['SeriousDlqin2yrs']

# Feature matrix: every remaining column.
X = credit_scoring.drop(columns=['SeriousDlqin2yrs'])

In [8]:
# Base estimator handed to the grid search.
# random_state pins the forest's bootstrap sampling and feature selection so the
# cross-validation scores and the selected hyper-parameters are the same on
# every Restart & Run All. n_jobs=-1 builds the trees on all available cores.
forest_clf = RandomForestClassifier(n_jobs=-1, random_state=42)

In [None]:
# Hyper-parameter search space: 3 x 3 x 5 x 5 = 225 candidate combinations.
parameters = {
    "min_samples_leaf": [1, 4, 8],            # default 1
    "min_samples_split": [2, 4, 8],           # default 2
    "n_estimators": [10, 25, 50, 100, 150],   # default 100
    "max_depth": [2, 5, 10, 15, 20],
}

# Exhaustive search over the grid, scoring each candidate by accuracy with
# 5-fold cross-validation and running the fits in parallel on all cores.
grid = GridSearchCV(
    estimator=forest_clf,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=True,
)
grid.fit(X, y)

Fitting 5 folds for each of 225 candidates, totalling 1125 fits


In [None]:
# Best mean cross-validated score found by the search
# (the search was configured with scoring='accuracy' and cv=5).
grid.best_score_

In [None]:
# Hyper-parameter combination that achieved the best cross-validated score.
grid.best_params_

In [None]:
# Keep a handle on the winning model. The search does not override
# GridSearchCV's default refit=True, so this estimator has been refit on all of
# X, y using the best hyper-parameters.
best_clf = grid.best_estimator_

In [None]:
# best_clf was fit on all of X, y, so best_clf.predict(X) would score the model
# on its own training rows and overstate its quality.
# cross_val_predict instead clones the estimator and, for each of the 5
# stratified folds, fits on the other folds and predicts the held-out one.
# Every entry of y_test_pred is therefore a prediction for a row the
# predicting model never saw, and the array stays aligned with y.
# NOTE: the hyper-parameters were still selected on this same data, so the
# resulting metrics remain mildly optimistic; a separate held-out test set
# would be the strictest check.
y_test_pred = cross_val_predict(best_clf, X, y, cv=5)

In [None]:
# Fraction of rows whose predicted label in y_test_pred matches the true label.
accuracy_score(y_true=y, y_pred=y_test_pred)

In [None]:
# Recall for the positive class: the share of true positives in y that
# y_test_pred identifies.
recall_score(y_true=y, y_pred=y_test_pred)