In [1]:
# Data manipulation stuff
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import PoissonRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

path_application_train = "/home/dark/VS-CodePythonProjects/DataScience-Club/Credit-Risk-Model/Data/application_train.csv"
df = pd.read_csv(path_application_train)

# Figuring out the missing values

miss_val = df.isnull().sum()
miss_val_perc = 100 * df.isnull().sum() / len(df)
greater_0 = miss_val_perc[miss_val_perc != 0].round(2)
greater_0 = greater_0.sort_values(ascending=False)
list_col = greater_0.index.to_list()
df = df.drop(columns=list_col)



missing_threshold = 0.5
missing_perc = df.isnull().mean()

cols_to_drop = missing_perc[missing_perc > missing_threshold].index
df.drop(columns=cols_to_drop, inplace=True)

cols_to_drop = ['FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7',
                'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13',
                'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19',
                'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21', 'ORGANIZATION_TYPE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE',
                'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'REGION_POPULATION_RELATIVE', 'DAYS_REGISTRATION', 'REGION_RATING_CLIENT',
                'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY']

df.drop(columns=cols_to_drop, inplace=True)
df = df.select_dtypes(exclude='object')

X = df.drop(columns=['TARGET'])
y = df['TARGET']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

model = make_pipeline(StandardScaler(), PoissonRegressor())
model.fit(X_train_resampled, y_train_resampled)

y_pred = model.predict(X_test)
y_pred = (y_pred >= 0.5).astype(int)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.93      0.61      0.74    113064
           1       0.09      0.44      0.15      9941

    accuracy                           0.60    123005
   macro avg       0.51      0.52      0.44    123005
weighted avg       0.86      0.60      0.69    123005



In [6]:
# Data manipulation stuff
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
import statsmodels.api as sm

path_application_train = "/home/dark/VS-CodePythonProjects/DataScience-Club/Credit-Risk-Model/Data/application_train.csv"
df = pd.read_csv(path_application_train)

# Figuring out the missing values
miss_val = df.isnull().sum()
miss_val_perc = 100 * miss_val / len(df)
missing_threshold = 0.9
cols_to_drop = miss_val_perc[miss_val_perc > missing_threshold].index
df.drop(columns=cols_to_drop, inplace=True)

# Drop unnecessary columns
cols_to_drop = ['FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7',
                'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13',
                'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19',
                'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21', 'ORGANIZATION_TYPE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE',
                'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'REGION_POPULATION_RELATIVE', 'DAYS_REGISTRATION', 'REGION_RATING_CLIENT',
                'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY']

df.drop(columns=cols_to_drop, inplace=True)
df = df.select_dtypes(exclude='object')

# Handle missing values
df.fillna(df.mean(), inplace=True)

X = df.drop(columns=['TARGET'])
y = df['TARGET']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

X_train_resampled = sm.add_constant(X_train_resampled)

glm_model = sm.GLM(y_train_resampled, X_train_resampled, family=sm.families.Binomial())
glm_results = glm_model.fit()

X_test_with_const = sm.add_constant(X_test)
y_pred = glm_results.predict(X_test_with_const)


y_pred_binary = (y_pred >= 0.9).astype(int)
print(classification_report(y_test, y_pred_binary))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96    113014
           1       0.25      0.02      0.04      9991

    accuracy                           0.92    123005
   macro avg       0.59      0.51      0.50    123005
weighted avg       0.87      0.92      0.88    123005



In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X = df.drop(columns=['TARGET'])
y = df['TARGET']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

logistic_model = LogisticRegression()

logistic_model.fit(X_train, y_train)

y_pred = logistic_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9195161211648212


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Credit Risk Modelling - Default Risk Prediction

## Introduction:

Credit Risk itself is a very broad topic and has a lot of different approaches, as talked about in the earlier slides, We will use Generalised Linear Models (GLMs)

In [None]:
"""
Here import libraries
"""