In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix

# gradient boosting

from sklearn.ensemble import GradientBoostingClassifier

# Gradient Boosting

### Steps

* Encode categorical columns: use label (ordinal) encoding for those that have a natural order, such as education and employment type, and one-hot encoding (`pd.get_dummies`) for those that do not have a natural order.
* Split the data into training and validation sets; use `stratify` so that both sets keep the same class proportions as the full dataset.
* Create an instance of GradientBoostingClassifier.
* Define a parameter grid with options for `n_estimators`, `learning_rate`, `max_depth`, `min_samples_split`, and `min_samples_leaf`.
* Before training, drop irrelevant columns like `LoanID`
* Although Gradient Boosting is less sensitive to scaling, consider scaling input data if it improves model performance in your specific dataset.
* Conduct a randomized search over the parameter grid to find the best hyperparameters. Train the model by fitting it on the training data.
* Here, `learning_rate` and `n_estimators` are key parameters that control the learning process and regularization.
* Use the model with the best parameters to predict labels for the test data.


In [None]:
# Read the training data from disk into a DataFrame, then preview the first rows.
train_csv = 'train.csv'
df = pd.read_csv(train_csv)

df.head(n=5)

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,DRIRC89L0T,18,137576,209136,846,26,2,10.47,60,0.81,High School,Self-employed,Single,Yes,No,Business,No,0
1,TS0FIUNHNU,47,57194,5970,748,30,2,19.72,36,0.73,High School,Unemployed,Divorced,No,Yes,Education,No,0
2,I0YR284A1V,26,84328,95065,453,7,2,24.25,12,0.45,Master's,Self-employed,Married,No,No,Other,Yes,0
3,WB1T7NQV8A,53,49795,229582,533,107,3,14.44,60,0.17,Bachelor's,Self-employed,Single,Yes,No,Auto,Yes,1
4,J6GU9M4G1Z,49,115450,22072,840,0,4,24.48,12,0.11,Bachelor's,Part-time,Single,No,Yes,Education,Yes,0


In [None]:
# Keep a handle on the column index, then count the missing values in every column.
cols = df.columns
df.isnull().sum(axis=0)


Unnamed: 0,0
LoanID,0
Age,0
Income,0
LoanAmount,0
CreditScore,0
MonthsEmployed,0
NumCreditLines,0
InterestRate,0
LoanTerm,0
DTIRatio,0


In [None]:
# Drop LoanID: it is treated as having no significance for predicting Default.
# `errors='ignore'` keeps this cell re-runnable (no KeyError once LoanID is already
# gone), and reassigning instead of `inplace=True` makes the data lineage explicit.
df = df.drop(columns=['LoanID'], errors='ignore')

# Only the last expression of a cell is displayed, so print the shape explicitly.
print(df.shape)
df

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,18,137576,209136,846,26,2,10.47,60,0.81,High School,Self-employed,Single,Yes,No,Business,No,0
1,47,57194,5970,748,30,2,19.72,36,0.73,High School,Unemployed,Divorced,No,Yes,Education,No,0
2,26,84328,95065,453,7,2,24.25,12,0.45,Master's,Self-employed,Married,No,No,Other,Yes,0
3,53,49795,229582,533,107,3,14.44,60,0.17,Bachelor's,Self-employed,Single,Yes,No,Auto,Yes,1
4,49,115450,22072,840,0,4,24.48,12,0.11,Bachelor's,Part-time,Single,No,Yes,Education,Yes,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204272,40,116623,161673,651,79,2,23.44,12,0.87,Bachelor's,Part-time,Divorced,No,No,Home,Yes,0
204273,67,62958,189499,460,77,3,9.29,36,0.11,Bachelor's,Self-employed,Single,No,No,Business,Yes,0
204274,62,34372,59645,524,94,3,9.72,60,0.24,PhD,Full-time,Single,Yes,No,Auto,No,0
204275,44,146262,198454,489,7,4,4.31,48,0.30,High School,Self-employed,Married,Yes,No,Home,No,0


In [None]:
# Encode the Yes/No flag columns (HasMortgage, HasDependents, HasCoSigner) as 1/0.
# `.map` is used instead of `.replace`: replacing strings with integers relies on
# silent object -> int downcasting, which pandas has deprecated (FutureWarning).
yes_no_codes = {'Yes': 1, 'No': 0}

for flag_col in ['HasMortgage', 'HasDependents', 'HasCoSigner']:
    # Skip columns that are already numeric so re-running this cell is a no-op
    # instead of mapping the existing 1/0 values to NaN.
    if not pd.api.types.is_numeric_dtype(df[flag_col]):
        # A value outside the mapping becomes NaN, which makes astype(int) raise,
        # so unexpected categories are reported here rather than at model fit time.
        df[flag_col] = df[flag_col].map(yes_no_codes).astype(int)

  df['HasMortgage'] = df['HasMortgage'].replace({'Yes':1, 'No':0})
  df['HasDependents'] = df['HasDependents'].replace({'Yes':1, 'No':0})
  df['HasCoSigner'] = df['HasCoSigner'].replace({'Yes':1, 'No':0})


In [None]:
# Show the distinct education levels. Printed explicitly because a bare expression
# in the middle of a cell is never displayed.
print(df['Education'].unique())

# Ordinal-encode Education as 0..3 (High School < Bachelor's < Master's < PhD),
# since a natural order may exist.
# `.map` avoids the deprecated silent downcasting performed by `.replace`.
education_codes = {'High School': 0, "Bachelor's": 1, "Master's": 2, 'PhD': 3}

# Only encode while the column is still non-numeric, so the cell can be re-run safely.
if not pd.api.types.is_numeric_dtype(df['Education']):
    # astype(int) raises if any value was not covered by the mapping (NaN after map).
    df['Education'] = df['Education'].map(education_codes).astype(int)



  df['Education'] = df['Education'].replace({'High School':0, 'Bachelor\'s':1, 'Master\'s':2, 'PhD':3})


In [None]:
# Show the distinct employment types. Printed explicitly because a bare expression
# in the middle of a cell is never displayed.
print(df['EmploymentType'].unique())

# Ordinal-encode EmploymentType as 0..3
# (Unemployed, Part-time, Full-time, Self-employed), since a natural order may exist.
# `.map` avoids the deprecated silent downcasting performed by `.replace`.
employment_codes = {'Unemployed': 0, 'Part-time': 1, 'Full-time': 2, 'Self-employed': 3}

# Only encode while the column is still non-numeric, so the cell can be re-run safely.
if not pd.api.types.is_numeric_dtype(df['EmploymentType']):
    # astype(int) raises if any value was not covered by the mapping (NaN after map).
    df['EmploymentType'] = df['EmploymentType'].map(employment_codes).astype(int)


In [None]:
# Show the distinct marital statuses. Printed explicitly because a bare expression
# in the middle of a cell is never displayed.
print(df['MaritalStatus'].unique())

# Integer-encode MaritalStatus as 0..2 (Single, Married, Divorced).
# NOTE(review): these categories have no evident natural order, so the integer codes
# impose one; one-hot encoding may be more appropriate. The mapping is kept as-is
# because the test-set preprocessing applies the same codes.
# `.map` avoids the deprecated silent downcasting performed by `.replace`.
marital_codes = {'Single': 0, 'Married': 1, 'Divorced': 2}

# Only encode while the column is still non-numeric, so the cell can be re-run safely.
if not pd.api.types.is_numeric_dtype(df['MaritalStatus']):
    # astype(int) raises if any value was not covered by the mapping (NaN after map).
    df['MaritalStatus'] = df['MaritalStatus'].map(marital_codes).astype(int)


In [None]:
# One-hot encode LoanPurpose, which has no natural order.
# The guard keeps the cell re-runnable: once LoanPurpose has been replaced by its
# indicator columns there is nothing left to encode.
if 'LoanPurpose' in df.columns:
    # Show the distinct loan purposes (printed explicitly; a bare mid-cell
    # expression is never displayed).
    print(df['LoanPurpose'].unique())

    # `dtype=int` emits 0/1 indicator columns directly, so no separate
    # True/False -> 1/0 conversion over a hard-coded column list is required.
    df = pd.get_dummies(df, columns=['LoanPurpose'], dtype=int)

# Names of the generated indicator columns, derived from the frame itself so the
# list stays correct whichever loan purposes occur in the data.
columns_to_convert = [col for col in df.columns if col.startswith('LoanPurpose_')]


In [None]:
# Display the DataFrame in its current state to check the result of the encoding cells.
df



Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,...,MaritalStatus,HasMortgage,HasDependents,HasCoSigner,Default,LoanPurpose_Auto,LoanPurpose_Business,LoanPurpose_Education,LoanPurpose_Home,LoanPurpose_Other
0,18,137576,209136,846,26,2,10.47,60,0.81,0,...,0,1,0,0,0,0,1,0,0,0
1,47,57194,5970,748,30,2,19.72,36,0.73,0,...,2,0,1,0,0,0,0,1,0,0
2,26,84328,95065,453,7,2,24.25,12,0.45,2,...,1,0,0,1,0,0,0,0,0,1
3,53,49795,229582,533,107,3,14.44,60,0.17,1,...,0,1,0,1,1,1,0,0,0,0
4,49,115450,22072,840,0,4,24.48,12,0.11,1,...,0,0,1,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204272,40,116623,161673,651,79,2,23.44,12,0.87,1,...,2,0,0,1,0,0,0,0,1,0
204273,67,62958,189499,460,77,3,9.29,36,0.11,1,...,0,0,0,1,0,0,1,0,0,0
204274,62,34372,59645,524,94,3,9.72,60,0.24,3,...,0,1,0,0,0,1,0,0,0,0
204275,44,146262,198454,489,7,4,4.31,48,0.30,0,...,1,1,0,0,0,0,0,0,1,0


In [None]:
# Split the data into train and validation sets, 80:20.
# X holds every column except the target; y is the Default label.
X = df.drop(columns=['Default'])
y = df['Default']

# `stratify=y` keeps the proportion of Default = 1 rows the same in both partitions.
# Without it, a purely random split of an imbalanced target can shift the share of
# the minority class between train and validation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=114,
    stratify=y,
)


In [None]:
# Tune a GradientBoostingClassifier with a randomized hyper-parameter search,
# then evaluate the best model on the held-out validation split.
# GradientBoostingClassifier and classification_report come from the notebook's
# import cell; only RandomizedSearchCV is imported here.
from sklearn.model_selection import RandomizedSearchCV

# Fixed seed (same value as the train/validation split) so repeated runs give the same model.
gbc = GradientBoostingClassifier(random_state=114)

# Candidate values sampled by the search.
param_dist = {
    'n_estimators': [100, 300],
    'learning_rate': [0.01, 0.1],
    'max_depth': [7],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [2, 3]
}

# random_state makes the 10 sampled parameter combinations reproducible.
# NOTE(review): with no `scoring` argument the search ranks candidates by the
# classifier's default score (accuracy); for an imbalanced target consider
# scoring='f1' or scoring='roc_auc'.
random_search = RandomizedSearchCV(
    gbc,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=114,
)
random_search.fit(X_train, y_train)

# With the default refit=True the search has already refit the best parameter
# combination on all of X_train, so reuse that estimator instead of training a
# second, identical model from scratch.
gbc_best = random_search.best_estimator_
y_pred = gbc_best.predict(X_test)
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  _data = np.array(data, dtype=dtype, copy=copy,


              precision    recall  f1-score   support

           0       0.89      1.00      0.94     36147
           1       0.59      0.05      0.09      4709

    accuracy                           0.89     40856
   macro avg       0.74      0.52      0.51     40856
weighted avg       0.85      0.89      0.84     40856



In [None]:
# Predict Default for the rows in test.csv and write LoanID + prediction to a CSV file.
# pandas is already imported in the notebook's import cell.

# Load the test data; LoanID is kept in df_test1 for the output file and removed
# from the feature frame.
df_test1 = pd.read_csv('test.csv')
df_test = df_test1.drop(columns=['LoanID'])

# Apply the same encodings that were applied to the training data.
# `.map` is used instead of `.replace` to avoid the deprecated silent
# object -> int downcasting (FutureWarning); astype(int) raises if a value is not
# covered by its mapping, so unexpected categories are reported here.
yes_no_codes = {'Yes': 1, 'No': 0}
value_codes = {
    'HasMortgage': yes_no_codes,
    'HasDependents': yes_no_codes,
    'HasCoSigner': yes_no_codes,
    'Education': {'High School': 0, "Bachelor's": 1, "Master's": 2, 'PhD': 3},
    'EmploymentType': {'Unemployed': 0, 'Part-time': 1, 'Full-time': 2, 'Self-employed': 3},
    'MaritalStatus': {'Single': 0, 'Married': 1, 'Divorced': 2},
}
for col, codes in value_codes.items():
    df_test[col] = df_test[col].map(codes).astype(int)

df_test = pd.get_dummies(df_test, columns=['LoanPurpose'], dtype=int)

# Align the test features with the training features: same columns in the same order.
# A LoanPurpose indicator that is absent from the test file is added as all zeros;
# a column the model was not trained on is dropped.
df_test = df_test.reindex(columns=X_train.columns, fill_value=0)

print(df_test.shape)
# Make predictions on the test data
y_pred = gbc_best.predict(df_test)
print(y_pred.shape)
# Save LoanID and prediction
output = pd.DataFrame({'LoanID': df_test1['LoanID'], 'Default': y_pred})
output.to_csv('gradientbosting_out.csv', index=False)



  df_test['HasMortgage'] = df_test['HasMortgage'].replace({'Yes': 1, 'No': 0})
  df_test['HasDependents'] = df_test['HasDependents'].replace({'Yes': 1, 'No': 0})
  df_test['HasCoSigner'] = df_test['HasCoSigner'].replace({'Yes': 1, 'No': 0})


(51070, 20)


  df_test['Education'] = df_test['Education'].replace({'High School': 0, 'Bachelor\'s': 1, 'Master\'s': 2, 'PhD': 3})
  df_test['EmploymentType'] = df_test['EmploymentType'].replace({'Unemployed': 0, 'Part-time': 1, 'Full-time': 2, 'Self-employed': 3})
  df_test['MaritalStatus'] = df_test['MaritalStatus'].replace({'Single': 0, 'Married': 1, 'Divorced': 2})
