# SECTION A [BATCH 1]
# WEEK 4

# Exercise 1

# 1.Use the German credit rating dataset “German Credit Data.csv” , Decision tree classifier to predict good or bad credit. Use “sklearn.model_selection” and GridSearchCV to search the hyperparameter values and report the most optimal one. Configure the grid search to search for optimal parameters: 

   # • Splitting criteria: gini or entropy.
   # • Maximum depth of decision tree ranging from 2 to 10.
   # • The searching of optimal parameter will be validated using 10-fold cross validation and the most optimal parameter will be chosen based on ROC AUC score.

**Used Resources:**

**1.)https://scikit-learn.org/stable/modules/tree.html** \
**2.)https://www.w3schools.com/python/python_ml_decision_tree.asp** \
**3.)https://medium.com/deep-math-machine-learning-ai/chapter-4-decision-trees-algorithms-b93975f7a1f1** \
**4.)https://www.youtube.com/watch?v=wxS5P7yDHRA** \
**5.)https://www.youtube.com/watch?v=HdlDYng8g9s** \
**6.)https://www.youtube.com/watch?v=gJo0uNL-5Qw**

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

%matplotlib inline

In [44]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score


In [28]:
df = pd.read_csv("German Credit Data.csv")
df

Unnamed: 0,checkin_acc,duration,credit_history,amount,savings_acc,present_emp_since,inst_rate,personal_status,residing_since,age,inst_plans,num_credits,job,status
0,A11,6,A34,1169,A65,A75,4,A93,4,67,A143,2,A173,0
1,A12,48,A32,5951,A61,A73,2,A92,2,22,A143,1,A173,1
2,A14,12,A34,2096,A61,A74,2,A93,3,49,A143,1,A172,0
3,A11,42,A32,7882,A61,A74,2,A93,4,45,A143,1,A173,0
4,A11,24,A33,4870,A61,A73,3,A93,4,53,A143,2,A173,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,A14,12,A32,1736,A61,A74,3,A92,4,31,A143,1,A172,0
996,A11,30,A32,3857,A61,A73,4,A91,4,40,A143,1,A174,0
997,A14,12,A32,804,A61,A75,4,A93,4,38,A143,1,A173,0
998,A11,45,A32,1845,A61,A73,4,A93,4,23,A143,1,A173,1


In [29]:
df.head()

Unnamed: 0,checkin_acc,duration,credit_history,amount,savings_acc,present_emp_since,inst_rate,personal_status,residing_since,age,inst_plans,num_credits,job,status
0,A11,6,A34,1169,A65,A75,4,A93,4,67,A143,2,A173,0
1,A12,48,A32,5951,A61,A73,2,A92,2,22,A143,1,A173,1
2,A14,12,A34,2096,A61,A74,2,A93,3,49,A143,1,A172,0
3,A11,42,A32,7882,A61,A74,2,A93,4,45,A143,1,A173,0
4,A11,24,A33,4870,A61,A73,3,A93,4,53,A143,2,A173,1


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   checkin_acc        1000 non-null   object
 1   duration           1000 non-null   int64 
 2   credit_history     1000 non-null   object
 3   amount             1000 non-null   int64 
 4   savings_acc        1000 non-null   object
 5   present_emp_since  1000 non-null   object
 6   inst_rate          1000 non-null   int64 
 7   personal_status    1000 non-null   object
 8   residing_since     1000 non-null   int64 
 9   age                1000 non-null   int64 
 10  inst_plans         1000 non-null   object
 11  num_credits        1000 non-null   int64 
 12  job                1000 non-null   object
 13  status             1000 non-null   int64 
dtypes: int64(7), object(7)
memory usage: 109.5+ KB


In [31]:
# Data Preprocessing:

df.isnull().sum()

checkin_acc          0
duration             0
credit_history       0
amount               0
savings_acc          0
present_emp_since    0
inst_rate            0
personal_status      0
residing_since       0
age                  0
inst_plans           0
num_credits          0
job                  0
status               0
dtype: int64

In [32]:
# Converting data into suitable form so that Decision Tree accepts it:

print(df['checkin_acc'].unique())
print(df['credit_history'].unique())
print(df['savings_acc'].unique())
print(df['present_emp_since'].unique())
print(df['personal_status'].unique())
print(df['inst_plans'].unique())
print(df['job'].unique())

['A11' 'A12' 'A14' 'A13']
['A34' 'A32' 'A33' 'A30' 'A31']
['A65' 'A61' 'A63' 'A64' 'A62']
['A75' 'A73' 'A74' 'A71' 'A72']
['A93' 'A92' 'A91' 'A94']
['A143' 'A141' 'A142']
['A173' 'A172' 'A174' 'A171']


In [33]:
# Encoding the attributes:

a = {'A11' : 0, 'A12' : 1, 'A14' : 2,'A13' : 3}
df['checkin_acc'] = df['checkin_acc'].map(a)

Unnamed: 0,checkin_acc,duration,credit_history,amount,savings_acc,present_emp_since,inst_rate,personal_status,residing_since,age,inst_plans,num_credits,job,status
0,0,6,A34,1169,A65,A75,4,A93,4,67,A143,2,A173,0
1,1,48,A32,5951,A61,A73,2,A92,2,22,A143,1,A173,1
2,2,12,A34,2096,A61,A74,2,A93,3,49,A143,1,A172,0
3,0,42,A32,7882,A61,A74,2,A93,4,45,A143,1,A173,0
4,0,24,A33,4870,A61,A73,3,A93,4,53,A143,2,A173,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2,12,A32,1736,A61,A74,3,A92,4,31,A143,1,A172,0
996,0,30,A32,3857,A61,A73,4,A91,4,40,A143,1,A174,0
997,2,12,A32,804,A61,A75,4,A93,4,38,A143,1,A173,0
998,0,45,A32,1845,A61,A73,4,A93,4,23,A143,1,A173,1


In [None]:
# You can also use LabelEncoder() for the same job instead of manual mapping:

#le_checkin_acc = LabelEncoder()
#df['checkin_acc_new'] = le_checkin_acc.fit_transform(df['checkin_acc'])
#df.drop(['checkin_acc'], axis = 1)

# Here a new column is created namely 'checkin_acc_new'

In [34]:
b = {'A34' : 0, 'A32' : 1, 'A33' : 2, 'A30' : 3, 'A31' : 4}
df['credit_history'] = df['credit_history'].map(b)

Unnamed: 0,checkin_acc,duration,credit_history,amount,savings_acc,present_emp_since,inst_rate,personal_status,residing_since,age,inst_plans,num_credits,job,status
0,0,6,0,1169,A65,A75,4,A93,4,67,A143,2,A173,0
1,1,48,1,5951,A61,A73,2,A92,2,22,A143,1,A173,1
2,2,12,0,2096,A61,A74,2,A93,3,49,A143,1,A172,0
3,0,42,1,7882,A61,A74,2,A93,4,45,A143,1,A173,0
4,0,24,2,4870,A61,A73,3,A93,4,53,A143,2,A173,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2,12,1,1736,A61,A74,3,A92,4,31,A143,1,A172,0
996,0,30,1,3857,A61,A73,4,A91,4,40,A143,1,A174,0
997,2,12,1,804,A61,A75,4,A93,4,38,A143,1,A173,0
998,0,45,1,1845,A61,A73,4,A93,4,23,A143,1,A173,1


In [35]:
c = {'A65': 0, 'A61' : 1, 'A63' : 2, 'A64' : 3, 'A62' : 4}
df['savings_acc'] = df['savings_acc'].map(c)

In [36]:
d = {'A75' : 0, 'A73' : 1, 'A74' : 2, 'A71' : 3, 'A72' : 4}
df['present_emp_since'] = df['present_emp_since'].map(d)

In [37]:
e = {'A93' : 0, 'A92' : 1, 'A91' : 2, 'A94' : 3}
df['personal_status'] = df['personal_status'].map(e)

In [38]:
f = {'A143' : 0, 'A141' : 1, 'A142' : 2}
df['inst_plans'] = df['inst_plans'].map(f)

In [39]:
g = {'A173' : 0, 'A172' : 1, 'A174' : 2, 'A171' : 3}
df['job'] = df['job'].map(g)

In [40]:
df

Unnamed: 0,checkin_acc,duration,credit_history,amount,savings_acc,present_emp_since,inst_rate,personal_status,residing_since,age,inst_plans,num_credits,job,status
0,0,6,0,1169,0,0,4,0,4,67,0,2,0,0
1,1,48,1,5951,1,1,2,1,2,22,0,1,0,1
2,2,12,0,2096,1,2,2,0,3,49,0,1,1,0
3,0,42,1,7882,1,2,2,0,4,45,0,1,0,0
4,0,24,2,4870,1,1,3,0,4,53,0,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2,12,1,1736,1,2,3,1,4,31,0,1,1,0
996,0,30,1,3857,1,1,4,2,4,40,0,1,2,0
997,2,12,1,804,1,0,4,0,4,38,0,1,0,0
998,0,45,1,1845,1,1,4,0,4,23,0,1,0,1


In [45]:
x = df.iloc[:, 0:13]
y = df.iloc[:, -1]

In [48]:
# Using GridSearchCV with given conditions:

dtc = DecisionTreeClassifier()

clf = GridSearchCV(dtc, 
                   {"criterion" : ['gini', 'entropy'], "max_depth" : range(2, 10)},
                   scoring = 'roc_auc', n_jobs = -1, cv = 10, return_train_score = False)

In [49]:
clf.fit(x, y)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(2, 10)},
             scoring='roc_auc')

In [50]:
clf.cv_results_

{'mean_fit_time': array([0.00669668, 0.00473878, 0.00334015, 0.00232856, 0.00327466,
        0.00345006, 0.00368772, 0.00408707, 0.0019043 , 0.00305843,
        0.00325091, 0.00297873, 0.00444164, 0.00457134, 0.00523617,
        0.00543222]),
 'std_fit_time': array([0.00048468, 0.00181591, 0.00048317, 0.00042522, 0.00071506,
        0.00055139, 0.00063843, 0.00053742, 0.00055592, 0.00085525,
        0.00098316, 0.00062883, 0.00076822, 0.00057376, 0.00072651,
        0.00099559]),
 'mean_score_time': array([0.00398691, 0.00309978, 0.00193598, 0.00205035, 0.00199287,
        0.00187457, 0.00179431, 0.00169339, 0.00159419, 0.00205393,
        0.00192158, 0.00174665, 0.00195396, 0.00229075, 0.0025871 ,
        0.00193622]),
 'std_score_time': array([2.14576721e-07, 1.04021792e-03, 5.66068556e-04, 1.97962345e-04,
        4.45530752e-04, 6.46996224e-04, 5.98184969e-04, 6.38422135e-04,
        7.96944372e-04, 2.98083162e-04, 6.49610322e-04, 6.71039686e-04,
        5.52971456e-04, 4.51046484e-

In [51]:
df_dt_clf = pd.DataFrame(clf.cv_results_)
df_dt_clf

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.006697,0.000485,0.003987,2.145767e-07,gini,2,"{'criterion': 'gini', 'max_depth': 2}",0.715238,0.676429,0.739762,0.744762,0.71881,0.763333,0.623095,0.709524,0.715476,0.67,0.707643,0.039112,8
1,0.004739,0.001816,0.0031,0.001040218,gini,3,"{'criterion': 'gini', 'max_depth': 3}",0.812619,0.669286,0.719524,0.747619,0.71619,0.762381,0.681905,0.748333,0.707619,0.700238,0.726571,0.040112,4
2,0.00334,0.000483,0.001936,0.0005660686,gini,4,"{'criterion': 'gini', 'max_depth': 4}",0.803333,0.642143,0.740714,0.742857,0.674048,0.789524,0.71881,0.739048,0.724048,0.686905,0.726143,0.046906,6
3,0.002329,0.000425,0.00205,0.0001979623,gini,5,"{'criterion': 'gini', 'max_depth': 5}",0.765238,0.661905,0.751905,0.743095,0.711667,0.809048,0.699048,0.717381,0.712619,0.652143,0.722405,0.044742,7
4,0.003275,0.000715,0.001993,0.0004455308,gini,6,"{'criterion': 'gini', 'max_depth': 6}",0.72119,0.694762,0.772619,0.670952,0.704286,0.703333,0.712619,0.721667,0.69119,0.56119,0.695381,0.051366,11
5,0.00345,0.000551,0.001875,0.0006469962,gini,7,"{'criterion': 'gini', 'max_depth': 7}",0.64381,0.709762,0.731429,0.674524,0.705714,0.679048,0.651905,0.722381,0.673333,0.535476,0.672738,0.053406,13
6,0.003688,0.000638,0.001794,0.000598185,gini,8,"{'criterion': 'gini', 'max_depth': 8}",0.649048,0.606667,0.764286,0.647381,0.69381,0.686905,0.617619,0.742143,0.738333,0.537381,0.668357,0.066995,14
7,0.004087,0.000537,0.001693,0.0006384221,gini,9,"{'criterion': 'gini', 'max_depth': 9}",0.64119,0.571667,0.715,0.632619,0.647619,0.632857,0.664524,0.714524,0.665476,0.587857,0.647333,0.044096,16
8,0.001904,0.000556,0.001594,0.0007969444,entropy,2,"{'criterion': 'entropy', 'max_depth': 2}",0.715238,0.676429,0.739762,0.744762,0.71881,0.763333,0.623095,0.709524,0.715476,0.67,0.707643,0.039112,8
9,0.003058,0.000855,0.002054,0.0002980832,entropy,3,"{'criterion': 'entropy', 'max_depth': 3}",0.863571,0.666429,0.719524,0.735,0.711905,0.765476,0.681905,0.745,0.739762,0.702619,0.733119,0.051855,3


In [52]:
df_dt_clf[['param_criterion', 'param_max_depth', 'mean_test_score']]

Unnamed: 0,param_criterion,param_max_depth,mean_test_score
0,gini,2,0.707643
1,gini,3,0.726571
2,gini,4,0.726143
3,gini,5,0.722405
4,gini,6,0.695381
5,gini,7,0.672738
6,gini,8,0.668357
7,gini,9,0.647333
8,entropy,2,0.707643
9,entropy,3,0.733119


In [53]:
dir(clf)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_feature_names',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_pairwise',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_select_best_index',
 '_validate_data',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'feature_names_in_',
 'fit',
 'get_params',
 'inverse_transform',
 'multim

In [56]:
# Most Optimal parameters:

clf.best_params_

{'criterion': 'entropy', 'max_depth': 4}

In [57]:
# The Highest Score or Score of Optimal parameter:

clf.best_score_

0.7463571428571429

# 2. Visualize the tree using graphviz software.

**Used Resources:**

**1.)https://towardsdatascience.com/visualizing-decision-trees-with-python-scikit-learn-graphviz-matplotlib-1c50b4aa68dc** \
**2.)https://github.com/bhattbhavesh91/visualize-decision-tree/blob/master/visualize-dt-notebook.ipynb**

In [59]:
# Training the Decision Tree model with Optimal parameters:

dt_model = DecisionTreeClassifier(criterion = "entropy", max_depth = 4, random_state = 42)

In [60]:
dt_model.fit(x, y)

DecisionTreeClassifier(criterion='entropy', max_depth=4)

In [61]:
dt_model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': 4,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [94]:
# fn
x.columns    #dt_model.feature_names_in_

Index(['checkin_acc', 'duration', 'credit_history', 'amount', 'savings_acc',
       'present_emp_since', 'inst_rate', 'personal_status', 'residing_since',
       'age', 'inst_plans', 'num_credits', 'job'],
      dtype='object')

In [96]:
#cn
dt_model.classes_

array([0, 1], dtype=int64)

In [87]:
# Visualizing the tree with graphviz:

from sklearn import tree
import graphviz

fn = ['Checkin Account', 'Duration','Credit History', 'Amount','Savings Account',
            'Present Emp Since', 'Interest Rate', 'Personal Status', 
            'Residing Since', 'Age', 'Interest Plans', 'Number of Credits', 'Job']

cn = ['Bad Credit', 'Good Credit']

tree.export_graphviz(dt_model, out_file = "GCD_tree.dot", feature_names = fn, class_names = cn, filled = True)

#dot_data = tree.export_graphviz(dt_model, out_file = "GCD_tree.dot", feature_names = fn, class_names = cn, filled = True)

! dot -Tpng GCD_tree.dot -o GCD_tree.png       # Converting the 'dot' file to 'png'

# ! dot -Tpng -Gdpi = 300 GCD_tree.dot -o GCD_tree.png

#graph = graphviz.Source(dot_data)
#graph

# A PNG File named "GCD_tree.png" will be created in the current directory, to view the tree we can open the png file.

In [89]:
# Opening the "GCD_tree.png" File:

! GCD_tree.png

# 3. Display the text representation of the rules learnt.

**Used Resources:**

**1.)https://mljar.com/blog/extract-rules-decision-tree/** \
**2.)https://www.youtube.com/watch?v=dQY8-7Q7ijM**

In [92]:
from sklearn.tree import _tree

def get_rules(tree, feature_names, class_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]

    paths = []
    path = []
    
    def recurse(node, path, paths):
        
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            p1, p2 = list(path), list(path)
            p1 += [f"({name} <= {np.round(threshold, 3)})"]
            recurse(tree_.children_left[node], p1, paths)
            p2 += [f"({name} > {np.round(threshold, 3)})"]
            recurse(tree_.children_right[node], p2, paths)
        else:
            path += [(tree_.value[node], tree_.n_node_samples[node])]
            paths += [path]
            
    recurse(0, path, paths)

    # sort by samples count
    samples_count = [p[-1][1] for p in paths]
    ii = list(np.argsort(samples_count))
    paths = [paths[i] for i in reversed(ii)]
    
    rules = []
    for path in paths:
        rule = "if "
        
        for p in path[:-1]:
            if rule != "if ":
                rule += " and "
            rule += str(p)
        rule += " then "
        if class_names is None:
            rule += "response: "+str(np.round(path[-1][0][0][0],3))
        else:
            classes = path[-1][0][0]
            l = np.argmax(classes)
            rule += f"class: {class_names[l]} (proba: {np.round(100.0*classes[l]/np.sum(classes),2)}%)"
        rule += f" | based on {path[-1][1]:,} samples"
        rules += [rule]
        
    return rules

In [98]:
rules = get_rules(dt_model, fn, cn)
for r in rules:
    print(r, '\n')

if (checkin_acc <= 1.5) and (duration <= 22.5) and (credit_history <= 2.5) and (duration > 11.5) then class: Bad Credit (proba: 64.14%) | based on 198 samples 

if (checkin_acc > 1.5) and (inst_plans <= 0.5) and (credit_history > 0.5) and (num_credits <= 1.5) then class: Bad Credit (proba: 89.19%) | based on 185 samples 

if (checkin_acc <= 1.5) and (duration > 22.5) and (savings_acc > 0.5) and (duration <= 47.5) then class: Good Credit (proba: 55.49%) | based on 164 samples 

if (checkin_acc > 1.5) and (inst_plans <= 0.5) and (credit_history <= 0.5) and (amount <= 11867.0) then class: Bad Credit (proba: 96.62%) | based on 148 samples 

if (checkin_acc <= 1.5) and (duration <= 22.5) and (credit_history <= 2.5) and (duration <= 11.5) then class: Bad Credit (proba: 82.5%) | based on 80 samples 

if (checkin_acc > 1.5) and (inst_plans > 0.5) and (age <= 44.5) and (residing_since > 1.5) then class: Bad Credit (proba: 58.82%) | based on 51 samples 

if (checkin_acc > 1.5) and (inst_plans <=

In [None]:
# Try using and experimenting with ccp_alpha parameter !

In [106]:
#dt_model.feature_importances_

In [107]:
#feature_importance = pd.DataFrame(dt_model.feature_importances_, index = fn).sort_values(0, )
#feature_importance

In [108]:
#features = list(feature_importance[feature_importance[0]>0].index)
#features

In [109]:
#feature_importance.head(10).plot(kind = 'bar')