In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import random
import progressbar
from google.colab import files

In [None]:
# Read the raw dataset, then discard the 'ID' column so it is not used as a feature.
credit = pd.read_csv('UCI_Credit_Card.csv')
credit = credit.drop(columns=['ID'])

In [None]:
# Preview the first five rows as a quick sanity check of the load.
credit.head(n=5)

In [None]:
# Drop every row containing a missing value. Rebinding the name (rather than
# mutating with inplace=True) leaves any earlier reference to the frame intact.
credit = credit.dropna()

In [None]:
# Columns treated as categorical rather than numeric.
cats = [
    'SEX', 'EDUCATION', 'MARRIAGE',
    'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
]
# Cast each of them to the pandas 'category' dtype so get_dummies encodes them.
for c in cats:
    credit[c] = credit[c].astype('category')

# One-hot encode the categorical columns, dropping the first level of each.
credit_ohe = pd.get_dummies(credit, drop_first=True)

# Separate the label column from the feature matrix.
y = credit_ohe.loc[:, 'default.payment.next.month']
X = credit_ohe.drop('default.payment.next.month', axis=1)

In [None]:
# Stratify on the label so both splits keep the same class balance, and fix the
# seed so the split (and every metric derived from it) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [None]:
# Fit a 100-tree random forest on the training split; the fixed seed makes the
# fitted model (and its feature importances) repeatable across runs.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [None]:
# Predict once per split and reuse the result for every metric instead of
# re-running the forest for each printed line.
train_pred = rf.predict(X_train)
test_pred = rf.predict(X_test)

# (label, scorer) pairs, in the order they are reported.
metric_fns = (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1:', f1_score),
)

print('Training set metrics:')
for label, scorer in metric_fns:
    print(label, scorer(y_train, train_pred))
print('---------------')
print('Test set metrics:')
for label, scorer in metric_fns:
    print(label, scorer(y_test, test_pred))

In [None]:
# Positions of the features ordered from least to most important.
importances = rf.feature_importances_
col_sorted_by_importance = importances.argsort()

# Table pairing each column name with its importance, in that same order.
feat_imp = pd.DataFrame(
    {
        'cols': X.columns[col_sorted_by_importance],
        'imps': importances[col_sorted_by_importance],
    }
)

In [None]:
!pip install plotly_express --upgrade -q

In [None]:
import plotly_express as px
import plotly.offline as po
# Bar chart of the 25 features with the highest forest importance.
top_features = feat_imp.sort_values(['imps'], ascending=False)[:25]
px.bar(
    top_features,
    x='cols',
    y='imps',
    labels={'cols': 'column', 'imps': 'feature importance'},
)

In [None]:
def PermImportance(X, y, clf, metric, num_iterations=100, random_state=None):
    '''
    Calculates the permutation importance of features in a dataset.

    For every column of X the column's values are shuffled, the classifier is
    re-scored on the perturbed frame, and the drop relative to the unperturbed
    score is recorded. This is repeated num_iterations times per column.

    Inputs:
    X: dataframe with all the features (it is copied, never modified)
    y: array-like sequence of labels
    clf: sklearn classifier, already trained on training data; only clf.predict is used
    metric: sklearn metric called as metric(y, predictions), such as accuracy_score,
        precision_score or recall_score
    num_iterations: no. of repetitive runs of the permutation per column
    random_state: optional seed for the shuffling. When None (the default) the
        module-level `random` generator is used, so random.seed() still applies.
    Outputs:
    baseline: the baseline metric without any of the columns permutated
    scores: differences in baseline metric caused by permutation of each feature
        (baseline minus permuted score), dict in the format {feature:[diffs]}
    '''
    # A private generator makes seeded runs repeatable without touching global state.
    if random_state is None:
        shuffle = random.shuffle
    else:
        shuffle = random.Random(random_state).shuffle

    bar = progressbar.ProgressBar(max_value=len(X.columns))
    baseline_metric = metric(y, clf.predict(X))
    scores = {c: [] for c in X.columns}
    for done, c in enumerate(X.columns, start=1):
        # Fresh copy per column so only column c is ever perturbed.
        X1 = X.copy(deep=True)
        for _ in range(num_iterations):
            temp = X1[c].tolist()
            shuffle(temp)
            X1[c] = temp
            score = metric(y, clf.predict(X1))
            scores[c].append(baseline_metric - score)
        # Report the number of completed columns so the bar can reach max_value.
        bar.update(done)
    bar.finish()
    return baseline_metric, scores

In [None]:
# Seed the module-level generator that PermImportance shuffles with, so the
# reported recall drops are repeatable from run to run.
random.seed(42)
baseline, scores = PermImportance(X_test, y_test, rf, recall_score, num_iterations=10)

 98% (81 of 82) |####################### | Elapsed Time: 0:02:46 ETA:   0:00:01

In [None]:
# Express each recorded metric drop as a percentage of the baseline metric.
percent_changes = {col: [] for col in X.columns}
for col, diffs in scores.items():
    percent_changes[col].extend(diff / baseline * 100 for diff in diffs)

In [None]:
# Average the per-iteration percentage changes for each feature, ordered from
# the largest mean change downwards.
mean_pct_change = (
    pd.DataFrame.from_dict(percent_changes)
    .melt()
    .groupby(['variable'])
    .mean()
    .reset_index()
    .sort_values(['value'], ascending=False)
)

# Bar chart of the 25 features with the largest mean change.
px.bar(
    mean_pct_change[:25],
    x='variable',
    y='value',
    labels={'variable': 'column', 'value': '% change in recall'},
)