In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer 
from sklearn.preprocessing import OneHotEncoder, scale, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import pickle

In [2]:
data = pd.read_csv(r"C:\Users\nki\Desktop\defaulting\Loan_Default.csv")

In [3]:
data.head()

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,1,45.0
1,24891,2019,cf,Male,nopre,type2,p1,l1,nopc,b/c,...,EQUI,552,EXP,55-64,to_inst,,North,direct,1,
2,24892,2019,cf,Male,pre,type1,p1,l1,nopc,nob/c,...,EXP,834,CIB,35-44,to_inst,80.019685,south,direct,0,46.0
3,24893,2019,cf,Male,nopre,type1,p4,l1,nopc,nob/c,...,EXP,587,CIB,45-54,not_inst,69.3769,North,direct,0,42.0
4,24894,2019,cf,Joint,pre,type1,p1,l1,nopc,nob/c,...,CRIF,602,EXP,25-34,not_inst,91.886544,North,direct,0,39.0


In [4]:
data.columns


Index(['ID', 'year', 'loan_limit', 'Gender', 'approv_in_adv', 'loan_type',
       'loan_purpose', 'Credit_Worthiness', 'open_credit',
       'business_or_commercial', 'loan_amount', 'rate_of_interest',
       'Interest_rate_spread', 'Upfront_charges', 'term', 'Neg_ammortization',
       'interest_only', 'lump_sum_payment', 'property_value',
       'construction_type', 'occupancy_type', 'Secured_by', 'total_units',
       'income', 'credit_type', 'Credit_Score', 'co-applicant_credit_type',
       'age', 'submission_of_application', 'LTV', 'Region', 'Security_Type',
       'Status', 'dtir1'],
      dtype='object')

In [5]:
data.drop(['ID','year'], axis=1, inplace=True)

In [6]:
data['construction_type'].value_counts()

construction_type
sb    148637
mh        33
Name: count, dtype: int64

In [7]:
data['Secured_by'].value_counts()

Secured_by
home    148637
land        33
Name: count, dtype: int64

In [8]:
data['Security_Type'].value_counts()

Security_Type
direct      148637
Indriect        33
Name: count, dtype: int64

In [9]:
data.drop(['construction_type','Secured_by','Security_Type'], axis=1, inplace=True)

In [10]:
data.shape

(148670, 29)

In [11]:
data['loan_limit'].value_counts()

loan_limit
cf     135348
ncf      9978
Name: count, dtype: int64

In [12]:
cat_vars = []
num_vars = []
for i in data.columns:
    if(data[i].dtype == "object"):
        cat_vars.append(i)
    else:
        num_vars.append(i)
print("Categorical Variables:\n",cat_vars,"\n")
print("Numerical Variables:\n",num_vars,"\n")

Categorical Variables:
 ['loan_limit', 'Gender', 'approv_in_adv', 'loan_type', 'loan_purpose', 'Credit_Worthiness', 'open_credit', 'business_or_commercial', 'Neg_ammortization', 'interest_only', 'lump_sum_payment', 'occupancy_type', 'total_units', 'credit_type', 'co-applicant_credit_type', 'age', 'submission_of_application', 'Region'] 

Numerical Variables:
 ['loan_amount', 'rate_of_interest', 'Interest_rate_spread', 'Upfront_charges', 'term', 'property_value', 'income', 'Credit_Score', 'LTV', 'Status', 'dtir1'] 



In [13]:
df_num = data[num_vars]
knn = KNNImputer(n_neighbors = 3)
knn.fit(df_num)
X = knn.fit_transform(df_num)

KeyboardInterrupt: 

In [54]:
df_num.isna().sum()

loan_amount             0
rate_of_interest        0
Interest_rate_spread    0
Upfront_charges         0
term                    0
property_value          0
income                  0
Credit_Score            0
LTV                     0
Status                  0
dtir1                   0
dtype: int64

In [55]:
df_num = pd.DataFrame(X, columns=num_vars)
df_num.isna().sum()

loan_amount             0
rate_of_interest        0
Interest_rate_spread    0
Upfront_charges         0
term                    0
property_value          0
income                  0
Credit_Score            0
LTV                     0
Status                  0
dtir1                   0
dtype: int64

In [56]:

df_cat = data[cat_vars]

for i in cat_vars :
    mode = data[i].mode()
    mode = mode[0]
    df_cat[i].fillna(value=mode, inplace=True)

df_cat.isna().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cat[i].fillna(value=mode, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat[i].fillna(value=mode, inplace=True)


loan_limit                   0
Gender                       0
approv_in_adv                0
loan_type                    0
loan_purpose                 0
Credit_Worthiness            0
open_credit                  0
business_or_commercial       0
Neg_ammortization            0
interest_only                0
lump_sum_payment             0
occupancy_type               0
total_units                  0
credit_type                  0
co-applicant_credit_type     0
age                          0
submission_of_application    0
Region                       0
dtype: int64

In [57]:
df_full = pd.concat([df_num, df_cat], axis=1, join='inner')
df_full.isna().sum()

loan_amount                  0
rate_of_interest             0
Interest_rate_spread         0
Upfront_charges              0
term                         0
property_value               0
income                       0
Credit_Score                 0
LTV                          0
Status                       0
dtir1                        0
loan_limit                   0
Gender                       0
approv_in_adv                0
loan_type                    0
loan_purpose                 0
Credit_Worthiness            0
open_credit                  0
business_or_commercial       0
Neg_ammortization            0
interest_only                0
lump_sum_payment             0
occupancy_type               0
total_units                  0
credit_type                  0
co-applicant_credit_type     0
age                          0
submission_of_application    0
Region                       0
dtype: int64

In [58]:
df_full.to_csv('imputed.csv', index=False)

In [59]:
df_full.shape

(148670, 29)

In [75]:
df_shuffled = df_full.sample(frac=1)

In [76]:
df1 = df_shuffled.iloc[:30000]

In [77]:
df2 = df_shuffled.iloc[30001:]

In [78]:
df1.shape

(30000, 29)

In [79]:
df2.shape

(118669, 29)

In [80]:
df2.columns


Index(['loan_amount', 'rate_of_interest', 'Interest_rate_spread',
       'Upfront_charges', 'term', 'property_value', 'income', 'Credit_Score',
       'LTV', 'Status', 'dtir1', 'loan_limit', 'Gender', 'approv_in_adv',
       'loan_type', 'loan_purpose', 'Credit_Worthiness', 'open_credit',
       'business_or_commercial', 'Neg_ammortization', 'interest_only',
       'lump_sum_payment', 'occupancy_type', 'total_units', 'credit_type',
       'co-applicant_credit_type', 'age', 'submission_of_application',
       'Region'],
      dtype='object')

In [81]:
df2.drop(['Status'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.drop(['Status'], axis=1, inplace=True)


In [82]:
df2.columns

Index(['loan_amount', 'rate_of_interest', 'Interest_rate_spread',
       'Upfront_charges', 'term', 'property_value', 'income', 'Credit_Score',
       'LTV', 'dtir1', 'loan_limit', 'Gender', 'approv_in_adv', 'loan_type',
       'loan_purpose', 'Credit_Worthiness', 'open_credit',
       'business_or_commercial', 'Neg_ammortization', 'interest_only',
       'lump_sum_payment', 'occupancy_type', 'total_units', 'credit_type',
       'co-applicant_credit_type', 'age', 'submission_of_application',
       'Region'],
      dtype='object')

In [83]:
df1.columns
#

Index(['loan_amount', 'rate_of_interest', 'Interest_rate_spread',
       'Upfront_charges', 'term', 'property_value', 'income', 'Credit_Score',
       'LTV', 'Status', 'dtir1', 'loan_limit', 'Gender', 'approv_in_adv',
       'loan_type', 'loan_purpose', 'Credit_Worthiness', 'open_credit',
       'business_or_commercial', 'Neg_ammortization', 'interest_only',
       'lump_sum_payment', 'occupancy_type', 'total_units', 'credit_type',
       'co-applicant_credit_type', 'age', 'submission_of_application',
       'Region'],
      dtype='object')

In [84]:
df2.to_csv('df2.csv', index=False)


In [85]:
df1.to_csv('df1.csv', index=False)

In [86]:
df1.isna().sum()

loan_amount                  0
rate_of_interest             0
Interest_rate_spread         0
Upfront_charges              0
term                         0
property_value               0
income                       0
Credit_Score                 0
LTV                          0
Status                       0
dtir1                        0
loan_limit                   0
Gender                       0
approv_in_adv                0
loan_type                    0
loan_purpose                 0
Credit_Worthiness            0
open_credit                  0
business_or_commercial       0
Neg_ammortization            0
interest_only                0
lump_sum_payment             0
occupancy_type               0
total_units                  0
credit_type                  0
co-applicant_credit_type     0
age                          0
submission_of_application    0
Region                       0
dtype: int64

In [87]:
df1.isna().sum()

loan_amount                  0
rate_of_interest             0
Interest_rate_spread         0
Upfront_charges              0
term                         0
property_value               0
income                       0
Credit_Score                 0
LTV                          0
Status                       0
dtir1                        0
loan_limit                   0
Gender                       0
approv_in_adv                0
loan_type                    0
loan_purpose                 0
Credit_Worthiness            0
open_credit                  0
business_or_commercial       0
Neg_ammortization            0
interest_only                0
lump_sum_payment             0
occupancy_type               0
total_units                  0
credit_type                  0
co-applicant_credit_type     0
age                          0
submission_of_application    0
Region                       0
dtype: int64

In [32]:
df_no_default = df1.loc[(dftest['Status']==0)]
df_default = dftest.loc[(dftest['Status']==1)]

In [15]:
from sklearn.utils import resample

df_no_default_downsampled = resample(df_no_default, replace=False, n_samples=7000, random_state=42 )
df_default_downsampled = resample(df_default, replace=False, n_samples=7000, random_state=42 )

ValueError: Cannot sample 7000 out of arrays with dim 2258 when replace is False

In [17]:
df_downsample = pd.concat([df_no_default_downsampled, df_default_downsampled ])
len(df_downsample)

14000

In [18]:
df_downsample.shape

(14000, 29)

In [14]:
df1 = pd.read_csv('df1.csv')
df2 = pd.read_csv('df2.csv')

In [15]:
ohe = OneHotEncoder(sparse_output=False, dtype="int",handle_unknown='ignore')
ohe.fit(df1[cat_vars])
X_ohe_train = ohe.transform(df1[cat_vars])

X_ohe_train

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 1]])

In [16]:

transformed_ohe = pd.DataFrame(
    data=X_ohe_train,
    columns=ohe.get_feature_names_out(cat_vars),
    index=df1.index,
)
transformed_ohe.head()

Unnamed: 0,loan_limit_cf,loan_limit_ncf,Gender_Female,Gender_Joint,Gender_Male,Gender_Sex Not Available,approv_in_adv_nopre,approv_in_adv_pre,loan_type_type1,loan_type_type2,...,age_55-64,age_65-74,age_<25,age_>74,submission_of_application_not_inst,submission_of_application_to_inst,Region_North,Region_North-East,Region_central,Region_south
0,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,1,1,0,0,0
1,1,0,0,0,0,1,1,0,1,0,...,0,0,0,0,0,1,0,0,0,1
2,1,0,0,1,0,0,1,0,0,0,...,0,0,0,0,1,0,1,0,0,0
3,1,0,0,0,0,1,1,0,1,0,...,0,0,0,0,0,1,0,0,0,1
4,1,0,0,1,0,0,1,0,0,1,...,0,0,0,0,0,1,1,0,0,0


In [17]:
X_encoded = pd.concat([df1[['loan_amount', 'rate_of_interest', 'Interest_rate_spread', 'Upfront_charges', 'term', 'property_value', 'income', 'Credit_Score', 'LTV', 'dtir1'] ], transformed_ohe], axis=1)
X_encoded.head()

Unnamed: 0,loan_amount,rate_of_interest,Interest_rate_spread,Upfront_charges,term,property_value,income,Credit_Score,LTV,dtir1,...,age_55-64,age_65-74,age_<25,age_>74,submission_of_application_not_inst,submission_of_application_to_inst,Region_North,Region_North-East,Region_central,Region_south
0,426500.0,3.705,0.1962,5263.9,360.0,408000.0,6900.0,872.0,104.534314,58.0,...,0,0,0,0,0,1,1,0,0,0
1,306500.0,3.875,0.1077,2779.0,360.0,388000.0,6780.0,867.0,78.994845,43.0,...,0,0,0,0,0,1,0,0,0,1
2,316500.0,2.875,-0.6885,1459.13,360.0,368000.0,4560.0,549.0,86.005435,40.333333,...,0,0,0,0,1,0,1,0,0,0
3,416500.0,3.99,0.2617,5417.0,360.0,558000.0,9420.0,655.0,74.641577,36.0,...,0,0,0,0,0,1,0,0,0,1
4,186500.0,4.75,1.6116,930.0,360.0,218000.0,8160.0,628.0,85.550459,33.0,...,0,0,0,0,0,1,1,0,0,0


In [18]:
y = df1['Status'].copy()
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
#scale the dat
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [19]:
y_train.value_counts()

Status
0.0    15863
1.0     5137
Name: count, dtype: int64

In [20]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit LabelEncoder to y_train and transform y_train
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.fit_transform(y_test)

In [21]:
clf_svm = SVC(random_state = 42, probability=True) # you will get the same result everytime you run
clf_svm.fit(X_train_scaled, y_train_encoded)

#calculate overall accuracy
y_pred = clf_svm.predict(X_test_scaled)
y_train_pred = clf_svm.predict(X_train_scaled)
accuracy = accuracy_score(y_test_encoded, y_pred)
acc = accuracy_score(y_train_encoded,y_train_pred)
print(f'Accuracy on test: {accuracy:.2%}')
print(f'Accuracy on train: {acc:.2%}')


Accuracy on test: 89.63%
Accuracy on train: 91.18%


In [23]:
clf_svm.predict(X_test_scaled)

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [24]:
import pickle
with open('ohe.pkl', 'wb') as file:
    pickle.dump(ohe, file)

In [25]:
import pickle
with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

In [26]:
import pickle
with open('model.pkl', 'wb') as file:
    pickle.dump(clf_svm, file)

In [103]:
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    tf.keras.layers.Dense(1, activation='sigmoid')  # Assuming binary classification task
])

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',  # Assuming binary classification task
              metrics=['accuracy'])

# Fit the model using transformed data
model.fit(X_train_scaled, y_train_encoded, epochs=30, batch_size=32, verbose=1)

# Evaluate the model on test data
y_pred = model.predict(X_test_scaled)
y_pred = (y_pred > 0.5).astype(int)  # Convert probabilities to binary predictions

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy on test set: {accuracy:.4f}")
train_loss, train_accuracy = model.evaluate(X_train, y_train, verbose=0)
print(f"Training Accuracy: {train_accuracy:.4f}")

Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 520us/step - accuracy: 0.8057 - loss: 0.4388
Epoch 2/30
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 528us/step - accuracy: 0.8904 - loss: 0.2854
Epoch 3/30
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 511us/step - accuracy: 0.9039 - loss: 0.2552
Epoch 4/30
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 529us/step - accuracy: 0.9112 - loss: 0.2394
Epoch 5/30
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 497us/step - accuracy: 0.9145 - loss: 0.2250
Epoch 6/30
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 589us/step - accuracy: 0.9189 - loss: 0.2133
Epoch 7/30
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 511us/step - accuracy: 0.9242 - loss: 0.1989
Epoch 8/30
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 509us/step - accuracy: 0.9274 - loss: 0.1933
Epoch 9/30
[1m657/657[0m [32m━━━

In [30]:
X = dftest.drop('Status', axis=1).copy()
y = dftest['Status'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [21]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 8392570906879052901
xla_global_id: -1
]


In [25]:
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Example data
# Assume X_train, X_test, y_train, y_test are your training and test data
# Replace this with your actual data loading and preprocessing steps



# Define the pipeline steps
transformers = [
    ('encoder', OneHotEncoder(sparse_output=False, dtype="int", handle_unknown='ignore'), cat_vars),
    ('scaler', StandardScaler(),  ['loan_amount', 'rate_of_interest', 'Interest_rate_spread', 'Upfront_charges', 'term', 'property_value', 'income', 'Credit_Score', 'LTV', 'dtir1'] )
]

# ColumnTransformer to apply transformations
preprocessor = ColumnTransformer(transformers)

# Transform data
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Define the neural network model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train_transformed.shape[1],)),
    tf.keras.layers.Dense(1, activation='sigmoid')  # Assuming binary classification task
])

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',  # Assuming binary classification task
              metrics=['accuracy'])

# Fit the model using transformed data
model.fit(X_train_transformed, y_train, epochs=50, batch_size=32, verbose=1)

# Evaluate the model on test data
y_pred = model.predict(X_test_transformed)
y_pred = (y_pred > 0.5).astype(int)  # Convert probabilities to binary predictions

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy on test set: {accuracy:.4f}")


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 452us/step - accuracy: 0.7488 - loss: 0.6596
Epoch 2/50
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 446us/step - accuracy: 0.7541 - loss: 0.5777
Epoch 3/50
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 486us/step - accuracy: 0.7527 - loss: 0.5617
Epoch 4/50
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 509us/step - accuracy: 0.7537 - loss: 0.5584
Epoch 5/50
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 446us/step - accuracy: 0.7586 - loss: 0.5528
Epoch 6/50
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 451us/step - accuracy: 0.7537 - loss: 0.5583
Epoch 7/50
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 452us/step - accuracy: 0.7522 - loss: 0.5599
Epoch 8/50
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 438us/step - accuracy: 0.7560 - loss: 0.5556
Epoch 9/50
[1m750/750[0m [32m━━━

In [31]:
steps = [
    ('onehotencoder', OneHotEncoder(sparse_output=False, dtype="int", handle_unknown='ignore')),
    ('scaler', StandardScaler()),
    ('svm', SVC())
]

# Create the pipeline object
pipeline = Pipeline(steps=steps)

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_pred = pipeline.predict(X_test)

# Evaluate model performance (replace with your chosen evaluation metric)
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy on test set: {accuracy:.4f}")

Model accuracy on test set: 0.4500


In [32]:
y  = pd.DataFrame(y_pred)

In [33]:
y.value_counts()

0
1    457
0    143
Name: count, dtype: int64

In [28]:
y

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
5995,0
5996,0
5997,0
5998,0
