In [None]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')

In [None]:
# Folder that holds the dataset. The original machine-specific location stays the
# default; set the GERMANCREDIT_DIR environment variable to run the notebook elsewhere.
DATA_DIR = os.environ.get(
    'GERMANCREDIT_DIR',
    r'D:\BML Munjal University\Module 6\Applied machine learning',
)

# reading data
# The file is opened by its full path instead of changing the kernel's working
# directory with os.chdir, which would silently affect every later relative path.
df = pd.read_csv(os.path.join(DATA_DIR, "germancredit.csv"))

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.head()

In [None]:
df['Default'].value_counts().plot(kind='bar')

We can see that the data is very imbalanced. This will affect the learning of the model. To improve the class distribution, we can use the SMOTE technique.

In [None]:
df.Default.value_counts()

In [None]:
# Histogram grid (30 bins each) of the columns pandas can plot.
# The figure size is written to rcParams, so it also applies to later figures
# that do not set their own size.
plt.rcParams.update({'figure.figsize': (15, 10)})
df.hist(bins=30)
plt.show()

In [None]:
df.isnull().sum()

In [None]:
def hist(variable):
    """Plot a histogram with a KDE overlay for one column of the global ``df``.

    variable: name of the column in ``df`` placed on the x-axis.
    """
    sns.histplot(df, x=variable, kde=True)

def box(var):
    """Draw a box plot of one column of the global ``df``, values on the x-axis.

    var: name of the column in ``df`` to plot.
    """
    column = df[var]
    sns.boxplot(x=column)

In [None]:
hist("duration")

In [None]:
box("duration")

In [None]:
hist("amount")

In [None]:
box("amount")

In [None]:
hist("installment")

In [None]:
box("installment")

In [None]:
hist("residence")

In [None]:
box("residence")

In [None]:
hist("age")

In [None]:
box("age")

In [None]:
hist("liable")

In [None]:
box("liable")

In [None]:
# Credit amount aggregated per credit-history category (seaborn's default bar estimator).
plt.figure(figsize=(20, 7))
sns.barplot(x="history", y="amount", data=df)

In [None]:
import seaborn as sns

# Distribution of the credit amount for every loan purpose.
sns.set(style='whitegrid')
sns.violinplot(data=df, x="purpose", y="amount")
sns.despine(left=True)

In [None]:
# Distribution of `liable` for every `status` category.
sns.set(style='whitegrid')
sns.violinplot(data=df, x="status", y="liable")

In [None]:
import squarify

# Number of loans per purpose code.
df1 = df.groupby('purpose').size().reset_index(name='counts')
# One tile label per purpose: "<code>\n (<count>)".
# The row fields are read by column name: integer keys such as x[0] on a row whose
# index holds string labels depend on a positional fallback that pandas has deprecated.
labels = df1.apply(lambda x: str(x['purpose']) + "\n (" + str(x['counts']) + ")", axis=1)
sizes = df1['counts'].values.tolist()
# Evenly spaced colours from the Spectral colormap, one per tile.
colors = [plt.cm.Spectral(i/float(len(labels))) for i in range(len(labels))]

# Draw Plot
plt.figure(figsize=(12,8), dpi= 80)
squarify.plot(sizes=sizes, label=labels, color=colors, alpha=.8)

# Decorate
plt.title('Treemap of Purpose')
plt.axis('off')
plt.show()

We can see that A47 (vacation) does not exist. It means that no one has taken credit for vacations.

In [None]:
df_numeric = df.select_dtypes(exclude = 'object')
df_numeric.head()

In [None]:
# Numeric feature columns without the target. The frame is re-derived from `df`, so
# re-running this cell no longer fails with a KeyError once 'Default' is already gone.
df_numeric = df.select_dtypes(exclude = 'object').drop(['Default'],axis = 1)

In [None]:
plt.figure(figsize=(11,12))
sns.boxplot(data=df_numeric)

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# fit_transform returns a bare NumPy array; wrap the result so that the column names
# and the row index are kept instead of being lost at this step.
# NOTE(review): the scaler is fitted on all rows, before the train/test split made
# further down, so statistics of the test rows leak into the training features.
df_numeric = pd.DataFrame(
    scaler.fit_transform(df_numeric),
    columns=df_numeric.columns,
    index=df_numeric.index,
)

In [None]:
plt.figure(figsize=(11,12))
sns.boxplot(data=df_numeric)

In [None]:
df_numeric = pd.DataFrame(df_numeric)

In [None]:
# Label the scaled columns with the feature names. The names are taken from `df`
# itself (its non-object columns minus the target) rather than typed by hand, so they
# cannot drift out of order or length relative to the columns that were scaled.
df_numeric.columns = df.select_dtypes(exclude='object').columns.drop('Default')

In [None]:
df_numeric.columns

In [None]:
type(df_numeric)

In [None]:
def _midpoint_percentile(values, q):
    """Return the q-th percentile of `values` using 'midpoint' interpolation.

    NumPy 1.22 renamed the `interpolation` keyword to `method`; the new name is
    tried first and the old one is used only when the installed NumPy rejects it.
    """
    try:
        return np.percentile(values, q, method='midpoint')
    except TypeError:
        return np.percentile(values, q, interpolation='midpoint')


def _iqr_limits(values):
    """Return (Q1 - 1.5*IQR, Q3 + 1.5*IQR) for `values`."""
    q1 = _midpoint_percentile(values, 25)
    q3 = _midpoint_percentile(values, 75)
    iqr = q3 - q1
    return q1 - 1.5 * iqr, q3 + 1.5 * iqr


def _report_outliers(column):
    """Print the IQR limits and the outlier count of df_numeric[column].

    Returns the (lower, upper) limits so that the capping cell can reuse them.
    """
    values = df_numeric[column]
    low, up = _iqr_limits(values)
    print('low_limit is', low)
    print('up_limit is', up)
    n_outliers = int(((values > up) | (values < low)).sum())
    print(' Number of outlier in ' + column + ' is', n_outliers)
    return low, up


# Each column is reported with its OWN limits; the copy-pasted version printed the
# 'duration' limits in the 'cards' section.
# The numbered names are kept because the capping cell reads them.
low_lim1, up_lim1 = _report_outliers('duration')
low_lim2, up_lim2 = _report_outliers('amount')
low_lim3, up_lim3 = _report_outliers('age')
low_lim4, up_lim4 = _report_outliers('cards')
low_lim5, up_lim5 = _report_outliers('liable')



In [None]:
# Cap each column at the IQR limits computed in the outlier-report cell:
# values above the upper limit become the upper limit, values below the lower
# limit become the lower limit, everything else is left as it is.
_caps = {
    "duration": (low_lim1, up_lim1),
    "amount": (low_lim2, up_lim2),
    "age": (low_lim3, up_lim3),
    "cards": (low_lim4, up_lim4),
    "liable": (low_lim5, up_lim5),
}

for _col, (_low, _up) in _caps.items():
    if not _low < _up:
        # Zero IQR: both limits coincide, so capping would overwrite every value with
        # the same constant and erase the feature. Leave such a column untouched.
        print('Skipping outlier capping for', _col, '- lower and upper limit coincide')
        continue
    df_numeric[_col] = df_numeric[_col].clip(lower=_low, upper=_up)


In [None]:
plt.figure(figsize=(11,12))
sns.boxplot(data=df_numeric)

In [None]:
plt.figure(figsize=(20,20))
# Correlate the numeric columns only. `df` also carries string-coded categorical
# columns; pandas 2.x raises on them in DataFrame.corr() instead of dropping them.
sns.heatmap(df.select_dtypes(exclude='object').corr(), linewidths=0.5, annot=True, fmt=".2f", cmap = 'viridis')

As the credit amount increases, the duration for which the amount is taken also increases. Therefore, they are highly correlated.

In [None]:
# One-hot encode the categorical attributes; columns not listed here are carried over as they are.
categorical_columns = [
    'checkingstatus1', 'history', 'purpose', 'savings', 'employ',
    'status', 'others', 'property', 'otherplans', 'housing', 'job',
    'tele', 'foreign',
]
df_cat_encoded = pd.get_dummies(df, columns=categorical_columns)

In [None]:
df_numeric.head()

In [None]:
df_cat_encoded.columns

In [None]:
# Keep only the dummy columns: the target and the numeric features are removed here
# (the numeric ones are re-attached from df_numeric when the frames are combined).
non_dummy_columns = [
    'Default', 'duration', 'amount', 'installment',
    'residence', 'age', 'cards', 'liable',
]
df_cat_encoded = df_cat_encoded.drop(columns=non_dummy_columns)

In [None]:
df_cat_encoded.columns

In [None]:
# Column-wise join of the one-hot columns and the scaled numeric columns
# (pandas aligns the two frames on their row index).
df_combine = pd.concat([df_cat_encoded, df_numeric], axis="columns")
df_combine.head()

# Modelling

## Using Logistic Regression

In [None]:
# Feature matrix and target vector used by every model below.
X1, y1 = df_combine, df['Default']

In [None]:
X1.head()

In [None]:
y1.head()

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split; the fixed random_state makes the partition reproducible.
split_parts = train_test_split(X1, y1, random_state=0, test_size=0.2)
X1_train, X1_test, y1_train, y1_test = split_parts

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression fitted on the original (not resampled) training split.
classifier_logreg1 = LogisticRegression(random_state=0, solver='liblinear')
classifier_logreg1.fit(X=X1_train, y=y1_train)

In [None]:
from sklearn.metrics import roc_curve

# Class probabilities for the test split; the second column is used as the score.
y_pred_logreg_proba1 = classifier_logreg1.predict_proba(X1_test)
fpr1, tpr1, thresholds = roc_curve(y1_test, y_pred_logreg_proba1[:, 1])

# ROC curve (green) against the chance diagonal (dashed black).
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr1, tpr1, '-g', linewidth=1)
ax.plot([0, 1], [0, 1], 'k--')
ax.set_title('ROC curve for Logistic Regression Model')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel('True Positive Rate')
plt.show()


In [None]:
pd.DataFrame({"fpr1":fpr1, "tpr1":tpr1, "threshold1":thresholds})

In [None]:
thresholds[np.argmax(tpr1 - fpr1)]

In [None]:
pred_proba1 = y_pred_logreg_proba1[:,1]

In [None]:
preds1 = np.where(pred_proba1>0.18, 1, 0)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix of the thresholded predictions on the test split, drawn as a heatmap.
cm1 = confusion_matrix(y1_test, preds1)
s1 = sns.heatmap(cm1, annot=True, fmt='d')
s1.set_xlabel('Predicted')
s1.set_ylabel('Actual')

accuracy1 = accuracy_score(y1_test, preds1)
print("Model accuracy for model 1:", accuracy1)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the first logistic model on the test split.
report1 = classification_report(y1_test, preds1)
print(report1)

## Using Decision Trees with depth 8

In [None]:
from sklearn.tree import DecisionTreeClassifier
# from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Decision tree limited to depth 8, fitted on the training split.
classifier_dt1 = DecisionTreeClassifier(random_state=0, max_depth=8)
classifier_dt1.fit(X=X1_train, y=y1_train)

In [None]:
# Get probabilities of records belonging to each class
y_pred_logreg_proba_dt1 = classifier_dt1.predict_proba(X1_test)
y_pred_logreg_proba_dt1

In [None]:
from sklearn.metrics import roc_curve

# ROC curve of the depth-8 tree on the test split, with the chance diagonal for reference.
fpr_dt1, tpr_dt1, thresholds_dt1 = roc_curve(y1_test, y_pred_logreg_proba_dt1[:, 1])

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr_dt1, tpr_dt1, '-g', linewidth=1)
ax.plot([0, 1], [0, 1], 'k--')
ax.set_title('ROC curve for Decision Tree Model')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel('True Positive Rate')
plt.show()

In [None]:
pred_proba_dt1 = y_pred_logreg_proba_dt1[:,1]

In [None]:
preds_dt1 = np.where(pred_proba_dt1>0.70, 1, 0)

In [None]:
# Confusion matrix and accuracy of the depth-8 tree's thresholded test predictions.
cm_dt1 = confusion_matrix(y1_test, preds_dt1)
cm_display_dt1 = ConfusionMatrixDisplay(confusion_matrix=cm_dt1)
cm_display_dt1.plot()

accuracy_dt1 = accuracy_score(y1_test, preds_dt1)
print("Model accuracy of Decision Tree:", accuracy_dt1)

In [None]:
print(classification_report(y1_test, preds_dt1))

## Using Decision Trees with depth 6

In [None]:
# Shallower decision tree (depth 6), fitted on the same training split.
classifier_dt2 = DecisionTreeClassifier(random_state=0, max_depth=6)
classifier_dt2.fit(X=X1_train, y=y1_train)

In [None]:
# Get probabilities of records belonging to each class
y_pred_logreg_proba_dt = classifier_dt2.predict_proba(X1_test)
y_pred_logreg_proba_dt

In [None]:
from sklearn.metrics import roc_curve

# ROC curve of the depth-6 tree on the test split, with the chance diagonal for reference.
fpr_dt, tpr_dt, thresholds_dt = roc_curve(y1_test, y_pred_logreg_proba_dt[:, 1])

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr_dt, tpr_dt, '-g', linewidth=1)
ax.plot([0, 1], [0, 1], 'k--')
ax.set_title('ROC curve for Decision Tree Model')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel('True Positive Rate')
plt.show()

In [None]:
pred_proba_dt = y_pred_logreg_proba_dt[:,1]

In [None]:
preds_dt = np.where(pred_proba_dt>0.70, 1, 0)

In [None]:
# Confusion matrix and accuracy of the depth-6 tree's thresholded test predictions.
cm_dt = confusion_matrix(y1_test, preds_dt)
cm_display_dt = ConfusionMatrixDisplay(confusion_matrix=cm_dt)
cm_display_dt.plot()

accuracy_dt = accuracy_score(y1_test, preds_dt)
print("Model accuracy of Decision Tree:", accuracy_dt)

In [None]:
print(classification_report(y1_test, preds_dt))

## Using Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest of 20 trees, each limited to depth 8, fitted on the training split.
classifier_rf = RandomForestClassifier(max_depth=8, n_estimators=20, random_state=0)
classifier_rf.fit(X=X1_train, y=y1_train)

In [None]:
from sklearn.metrics import roc_curve

# Class probabilities of the forest on the test split; the second column is the score.
y_pred_logreg_proba_rf = classifier_rf.predict_proba(X1_test)
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y1_test, y_pred_logreg_proba_rf[:, 1])

# ROC curve (green) against the chance diagonal (dashed black).
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr_rf, tpr_rf, '-g', linewidth=1)
ax.plot([0, 1], [0, 1], 'k--')
ax.set_title('ROC curve for Random forest Model')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel('True Positive Rate')
plt.show()

In [None]:
pred_proba_rf = y_pred_logreg_proba_rf[:,1]

In [None]:
preds_rf = np.where(pred_proba_rf>0.70, 1, 0)

In [None]:
# Confusion matrix and accuracy of the forest's thresholded test predictions.
cm_rf = confusion_matrix(y1_test, preds_rf)
cm_display_rf = ConfusionMatrixDisplay(confusion_matrix=cm_rf)
cm_display_rf.plot()

accuracy_rf = accuracy_score(y1_test, preds_rf)
print("Model accuracy:", accuracy_rf)

In [None]:
print(classification_report(y1_test, preds_rf))

## Using Random Forest with 30 estimators and a maximum depth of 12

In [None]:
# Larger forest: 30 trees of depth up to 12. This reuses (and overwrites) the
# `classifier_rf` name of the previous forest.
classifier_rf = RandomForestClassifier(max_depth=12, n_estimators=30, random_state=0)
classifier_rf.fit(X=X1_train, y=y1_train)

In [None]:
from sklearn.metrics import roc_curve

# Class probabilities of the 30-tree forest on the test split; the second column is the score.
y_pred_logreg_proba_rf = classifier_rf.predict_proba(X1_test)
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y1_test, y_pred_logreg_proba_rf[:, 1])

# ROC curve (green) against the chance diagonal (dashed black).
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr_rf, tpr_rf, '-g', linewidth=1)
ax.plot([0, 1], [0, 1], 'k--')
ax.set_title('ROC curve for Random forest Model')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel('True Positive Rate')
plt.show()

In [None]:
pred_proba_rf = y_pred_logreg_proba_rf[:,1]

In [None]:
preds_rf = np.where(pred_proba_rf>0.70, 1, 0)

In [None]:
# Confusion matrix and accuracy of the 30-tree forest's thresholded test predictions.
cm_rf = confusion_matrix(y1_test, preds_rf)
cm_display_rf = ConfusionMatrixDisplay(confusion_matrix=cm_rf)
cm_display_rf.plot()

accuracy_rf = accuracy_score(y1_test, preds_rf)
print("Model accuracy:", accuracy_rf)

In [None]:
print(classification_report(y1_test, preds_rf))

## Modelling with SMOTE data

In [None]:
from imblearn.over_sampling import SMOTE

# Resample the training split only; the test split is not passed to SMOTE.
sm = SMOTE(random_state=42)
resampled = sm.fit_resample(X1_train, y1_train)
X_res, y_res = resampled

In [None]:
np.bincount(y_res)

In [None]:
np.bincount(y1_train)

In [None]:
from sklearn.linear_model import LogisticRegression

# Same logistic-regression setup as before, now fitted on the SMOTE-resampled training data.
classifier_logreg = LogisticRegression(random_state=0, solver='liblinear')
classifier_logreg.fit(X=X_res, y=y_res)

In [None]:
y_pred_logreg_proba = classifier_logreg.predict_proba(X_res)
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_res, y_pred_logreg_proba[:,1])
plt.figure(figsize=(6,4))
plt.plot(fpr,tpr,'-g',linewidth=1)
plt.plot([0,1], [0,1], 'k--' )
plt.title('ROC curve for Logistic Regression Model')
plt.xlabel("False Positive Rate")
plt.ylabel('True Positive Rate')
plt.show()


In [None]:
pd.DataFrame({"fpr":fpr, "tpr":tpr, "threshold":thresholds})

In [None]:
thresholds[np.argmax(tpr - fpr)]

In [None]:
pred_proba = y_pred_logreg_proba[:,1]

In [None]:
preds = np.where(pred_proba>0.18, 1, 0)

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(y_res, preds)
s = sns.heatmap(cm ,annot=True ,fmt='d')
s.set(xlabel='Predicted', ylabel='Actual')
print("Model accuracy:",accuracy_score(y_res, preds))

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_res, preds))