## Importing necessary libraries

In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw loan-status export into a DataFrame. low_memory=False makes pandas
# infer each column's dtype from the whole file rather than chunk by chunk.
# NOTE(review): the path is relative to the notebook's working directory; the
# FileNotFoundError recorded below means the file was not there on the last run.
df = pd.read_csv("Loan_status_2007-2020Q3.gzip", low_memory=False)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Loan_status_2007-2020Q3.gzip'

In [None]:
# (rows, columns) of the raw frame. The frame is wide; later cells drop columns (features) by applying several filters.
df.shape

In [None]:
# Summary of the frame: column dtypes and memory usage.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) for the numeric columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Number of loans in each loan_status category.
df['loan_status'].value_counts()

In [None]:
# Pie chart of each loan_status category's share of all loans.
counts = df['loan_status'].value_counts()
plt.figure(figsize=(30, 15))
plt.pie(x=counts, labels=counts.index, autopct='%.1f%%')
plt.title('Value Counts of loan_status')
plt.show()

## Preprocessing: Dropping columns with a large share of missing data

In [None]:
# Fraction of missing entries per column, most incomplete columns first.
missing_data = (
    df.isna()
      .mean()
      .sort_values(ascending=False)
)
missing_data.head(5)

In [None]:
# Histogram of how incomplete the features are (x: missing fraction, y: number of features).
plt.figure(dpi=90, figsize=(10, 6))
missing_data.plot(kind='hist')
plt.xlabel('Fraction of missing data')
plt.title('Histogram of Feature Incompleteness')
plt.ylabel('Feature count')

In [None]:
# Names of the columns with more than 60 percent of their values missing, sorted alphabetically.
drop_list = sorted(missing_data.loc[missing_data.gt(0.6)].index)
print(drop_list)
print("Drop Features: ", len(drop_list))

In [None]:
# Remove the columns listed in drop_list (more than 60 percent of their values missing).
# errors='ignore' keeps the cell re-runnable: without it a second execution raises
# KeyError because the columns have already been removed from df.
df.drop(columns=drop_list, inplace=True, errors='ignore')
df.shape

In [None]:
# Length of the loan_status column (one entry per row of df).
df['loan_status'].shape

In [None]:
# Binary target: 1 when the loan was charged off, 0 for every other listed status.
# Both charged-off statuses map to 1, including the
# 'Does not meet the credit policy. Status:Charged Off' variant.
# NOTE(review): 'Default', the late statuses and 'Current' are labelled 0 although their
# final outcome is not known yet — confirm whether those rows should be excluded instead.
charged_off_map = {
    'Fully Paid': 0,
    'Charged Off': 1,
    'Current': 0,
    'Late (31-120 days)': 0,
    'In Grace Period': 0,
    'Late (16-30 days)': 0,
    'Issued': 0,
    'Does not meet the credit policy. Status:Fully Paid': 0,
    'Does not meet the credit policy. Status:Charged Off': 1,
    'Default': 0,
}
# Statuses that are not keys of the map (including missing values) become NaN.
df['Charged_Off'] = df['loan_status'].map(charged_off_map)
df[['loan_status', 'Charged_Off']].head(15)

In [None]:
# Length of the new target column (one entry per row of df).
df['Charged_Off'].shape

In [None]:
# Rank the numeric features by their Pearson correlation with the Charged_Off target.
# Only numeric/bool columns are used: on pandas >= 2.0 DataFrame.corr() raises on string
# columns instead of silently skipping them. corrwith() correlates each column with the
# target directly, which avoids building the full feature-by-feature matrix.
numeric_df = df.select_dtypes(include=['number', 'bool'])
numeric_df.corrwith(numeric_df['Charged_Off']).dropna().sort_values(ascending=False)

In [None]:
# Target column plus the features kept for modelling, one group per line.
features_selected = [
    'Charged_Off',
    'recoveries', 'collection_recovery_fee', 'total_rec_late_fee',
    'inq_last_6mths', 'pub_rec', 'pub_rec_bankruptcies',
    'loan_amnt', 'funded_amnt', 'dti', 'id', 'funded_amnt_inv',
    'delinq_2yrs', 'installment', 'open_acc', 'total_acc', 'annual_inc',
    'fico_range_high', 'fico_range_low',
    'last_pymnt_amnt', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
    'last_fico_range_low', 'last_fico_range_high',
]
len(features_selected)

In [None]:
# Working frame restricted to the selected columns. Reusing features_selected avoids
# keeping a second hand-maintained copy of the same 25 names, and .copy() makes df2
# independent of df so the in-place cleaning applied to df2 later operates on its own data.
df2 = df.loc[:, features_selected].copy()

In [None]:
# (rows, columns) of the feature-selected frame.
df2.shape

## Further Cleaning

In [None]:
# Missing-value count for each selected column.
df2.isnull().sum()

In [None]:
# Drop every row that has a missing value in any selected column (in place),
# then show the per-column null counts again.
df2.dropna(inplace=True)
df2.isnull().sum()

In [None]:
# Count of fully duplicated rows. The author noted there were none — TODO confirm on re-run.
df2.duplicated().sum()

In [None]:
# (rows, columns) after dropping rows with missing values.
df2.shape

In [None]:
# Write the cleaned frame to disk as CSV, without the index column.
# NOTE(review): the output file name has no ".csv" extension — confirm this is intended.
df2.to_csv("cleaned_dataset", index=False)

## Exploratory Data Analysis

In [None]:
# Count plot of open_acc values.
# The x tick labels are rotated on the axes seaborn returns, after the bars and their
# labels have been created, so the rotation does not depend on pyplot state that
# existed before the plot was drawn.
plt.figure(figsize=(30, 15))
ax = sns.countplot(x="open_acc", data=df)
ax.tick_params(axis='x', rotation=90)

In [None]:
# Bar chart of the number of loans per loan_status category.
plt.figure(figsize=(10, 6))
df['loan_status'].value_counts().plot(kind='bar')
plt.title('Loan Status Bar Plot')

In [None]:
# Horizontal box plots: distribution of loan_amnt within each loan_status category.
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='loan_amnt', y='loan_status')
plt.title('Box Plot of loan_status vs loan-amnt')
plt.show()


In [None]:
# Bar chart of how often each int_rate value occurs, ordered by the value itself.
# NOTE(review): plt.xlim(0, 14) restricts the visible x-range — confirm this window
# is intended for the dtype int_rate actually has in this dataset.
value_counts = df['int_rate'].value_counts().sort_index()
plt.figure(figsize=(10, 6))
plt.bar(value_counts.index, value_counts.values)
plt.xlim(0, 14)
plt.xlabel('Interest Rate')
plt.ylabel('Counts')
plt.title('Bar Plot of Interest Rates')
plt.show()

In [None]:
# Sub-grade distribution. The categories are passed in sorted order so the bars (and the
# coolwarm colour gradient) follow the sub-grade labels instead of the order in which
# the values happen to appear in the data.
plt.figure(figsize=(10, 6))
subgrade_order = sorted(df['sub_grade'].dropna().unique())
sns.countplot(x='sub_grade', data=df, order=subgrade_order, palette='coolwarm')
plt.title('Subgrade Distribution');

In [None]:
# Convert the emp_length labels to integers (years of employment, capped at 10).
emp_len_map = {
    '< 1 year': 0,
    '1 year': 1,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
    '10+ years': 10,
}

# Rows without an emp_length are left out; .copy() gives df_emp its own data so the
# column assignment below does not write into a filtered view of df.
df_emp = df.dropna(subset=['emp_length']).copy()

# Fail loudly on labels that are not in the map (the row-wise dict lookup used
# previously raised KeyError in that case as well).
unknown_labels = set(df_emp['emp_length'].unique()) - emp_len_map.keys()
if unknown_labels:
    raise KeyError(f"Unmapped emp_length values: {sorted(map(str, unknown_labels))}")

# Vectorised lookup on the single column instead of a Python-level apply over every row.
df_emp['emp_length'] = df_emp['emp_length'].map(emp_len_map)

In [None]:
# Count plot of the integer-encoded emp_length, bars ordered by increasing length.
plt.figure(figsize=(10, 6))
vc = df_emp['emp_length'].value_counts().sort_index()
sns.countplot(data=df_emp, x='emp_length', order=vc.index, palette='viridis')
plt.title('emp_length Distribution')

In [None]:
# Histogram of dti (debt-to-income ratio) for values below 50.
# sns.histplot is axes-level and draws into the figure created here; the figure-level
# sns.displot opens a figure of its own, which left this one empty and ignored its
# figsize/dpi. Missing dti values need no separate filter: NaN < 50 is False.
plt.figure(figsize=(10, 6), dpi=90)
sns.histplot(df.loc[df['dti'] < 50, 'dti'], kde=False)
plt.xlabel('Debt to income Ratio')
plt.ylabel('Count')
plt.title('Debt to income Ratio Plot')

In [None]:
# Heatmap of the pairwise correlations between the selected features.
# Non-numeric columns are excluded before calling corr(): on pandas >= 2.0 corr() raises
# on string columns instead of skipping them.
plt.figure(figsize=(30, 10))
sns.heatmap(df2.select_dtypes(include=['number', 'bool']).corr(), annot=True, cmap='coolwarm')
plt.title('Feature Correlation')

In [None]:
# The heatmap shows strong correlations between several features. Highly correlated
# features carry redundant information and can bias the model, so one feature is kept
# per correlated group and the others are dropped:
#   - funded_amnt and funded_amnt_inv are highly correlated with loan_amnt  -> keep loan_amnt
#   - collection_recovery_fee is highly correlated with recoveries          -> keep recoveries
#   - total_pymnt_inv and total_rec_prncp are highly correlated with
#     total_pymnt                                                           -> keep total_pymnt
#   - fico_range_low is highly correlated with fico_range_high              -> keep fico_range_high
redundant_features = [
    'funded_amnt_inv',
    'total_pymnt_inv',
    'funded_amnt',
    'collection_recovery_fee',
    'total_rec_prncp',
    'fico_range_low',
]

# errors='ignore' keeps the cell re-runnable once the columns have already been removed.
df2.drop(columns=redundant_features, inplace=True, errors='ignore')


## More preprocessing

Performing Feature Scaling

In [None]:
# (rows, columns) after removing the redundant correlated features.
df2.shape

In [None]:
# Drop any rows that still contain missing values, then show the per-column null counts.
# The check comes last because a notebook cell only renders its final expression; placed
# before dropna() its result was computed and then discarded.
df2.dropna(inplace=True)
df2.isnull().sum()

In [None]:
# (rows, columns) of the frame that goes into scaling.
df2.shape

In [None]:
from sklearn.preprocessing import StandardScaler

# Label column, kept aside so it is not standardised with the predictors.
target = df2['Charged_Off']

# Predictors: every non-object, non-category column except the label itself.
numeric_features = (
    df2.select_dtypes(exclude=['object', 'category'])
       .drop(columns='Charged_Off')
)

# Standardise each predictor column (StandardScaler with its default settings).
# NOTE(review): the scaler is fit on all rows before the train/test split made later in
# the notebook, so test rows contribute to the scaling statistics.
sc = StandardScaler()
scaled = sc.fit_transform(numeric_features)

In [None]:
# Cast the label to int, then print its shape and show its first rows as a sanity check.
target = target.astype(int)
print(target.shape)
target.head()

In [None]:
# Names of the predictor columns that were passed to the scaler.
numeric_features.columns

In [None]:
# Wrap the scaled array in a DataFrame that carries the same column names and row index
# as the frame it was computed from. Taking the names from numeric_features (instead of a
# hand-typed list) keeps them aligned with the array whenever the feature set changes.
scaled_features = pd.DataFrame(
    scaled,
    columns=numeric_features.columns,
    index=numeric_features.index,
)
scaled_features.head()

In [None]:
# Summarise, then display, the object/category columns that remain in df2.
df2.select_dtypes(['object', 'category']).info()
df2.select_dtypes(['object', 'category'])


In [None]:
# df2 has only one categorical column with object dtype: 'id'.
# 'id' is just a unique identifier assigned to each loan, so it plays no meaningful role in prediction.
# It is therefore not used as a feature, and there is no need to one-hot encode it.

In [None]:
# (rows, columns) of the predictor matrix.
scaled_features.shape

In [None]:
# Length of the target vector (one label per row).
target.shape

Creating Train_Test Split

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV

X = scaled_features
Y = target

# 80/20 split with a fixed seed. stratify=Y keeps the proportion of charged-off loans the
# same in the training and test sets, so the scores are not distorted by an uneven draw
# of the rarer class.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, train_size=0.8, shuffle=True, random_state=42, stratify=Y
)


In [None]:
# (rows, columns) of the full predictor matrix before splitting.
X.shape


In [None]:
# Length of the full label vector before splitting.
Y.shape

In [None]:
# Decision tree classifier; random_state fixes the tie-breaking between equally good
# splits so repeated runs give the same tree.
dt = DecisionTreeClassifier(random_state=42)

# Hyper-parameter grid explored by the search (4 x 4 x 4 = 64 combinations).
parameters = {'max_depth': [2, 4, 6, 8],
              'min_samples_split': [2, 5, 10, 15],
              'min_samples_leaf': [1, 2, 4, 8]}

# 5-fold cross-validated grid search.
grid_search = GridSearchCV(dt, param_grid=parameters, cv=5)

# Fit on the training split only. Fitting on the full (X, Y) would let the model see the
# test rows during tuning and training, which makes the test score meaningless.
# NOTE(review): several predictors (e.g. recoveries, total_rec_late_fee, last_pymnt_amnt)
# look like values recorded after a loan's outcome — confirm they are available at
# prediction time.
grid_search.fit(X_train, Y_train)

# Best hyper-parameter combination found by cross-validation.
print("Best parameters:", grid_search.best_params_)

# Score of the refitted best estimator on each split.
train_score = grid_search.score(X_train, Y_train)
test_score = grid_search.score(X_test, Y_test)

Y_pred = grid_search.predict(X_test)

# Accuracy and class-frequency-weighted F1 on the held-out test set.
accuracy = accuracy_score(Y_test, Y_pred)
f1_score_dt = f1_score(Y_test, Y_pred, average='weighted')

print("Train score: ", train_score)
print("Test score: ", test_score)
print("f1 score: ", f1_score_dt)
print("Accuracy: ", accuracy)

# Confusion matrix shown as each cell's share of all test samples.
cm = confusion_matrix(Y_test, Y_pred)
sns.heatmap(cm / np.sum(cm), annot=True, fmt='.2%', cmap='coolwarm')
plt.title('Confusion Matrix of Decision Tree Classifier', fontweight='bold', fontsize=18)
plt.xlabel('Predicted', fontweight='bold', fontsize=14)
plt.ylabel('Actual', fontweight='bold', fontsize=14)
plt.show()

Feature Importance Plot

In [None]:
# Importance fraction of each feature, taken from the best estimator found by the grid search.
best_estimator = grid_search.best_estimator_
importance_fraction = best_estimator.feature_importances_

# Pair every feature name with its importance and order from most to least important.
feat_importances = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importance_fraction})
      .sort_values('Importance', ascending=False)
)

# Horizontal bar plot of the ten most important features.
sns.barplot(data=feat_importances.head(10), x='Importance', y='Feature', color="steelblue")
plt.title('Feature Importance Plot')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()
