In [217]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
import sklearn




In [218]:
#loading our data files in pandas dataframe
raw_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

raw_df

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,DRIRC89L0T,18,137576,209136,846,26,2,10.47,60,0.81,High School,Self-employed,Single,Yes,No,Business,No,0
1,TS0FIUNHNU,47,57194,5970,748,30,2,19.72,36,0.73,High School,Unemployed,Divorced,No,Yes,Education,No,0
2,I0YR284A1V,26,84328,95065,453,7,2,24.25,12,0.45,Master's,Self-employed,Married,No,No,Other,Yes,0
3,WB1T7NQV8A,53,49795,229582,533,107,3,14.44,60,0.17,Bachelor's,Self-employed,Single,Yes,No,Auto,Yes,1
4,J6GU9M4G1Z,49,115450,22072,840,0,4,24.48,12,0.11,Bachelor's,Part-time,Single,No,Yes,Education,Yes,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204272,KYS1HKNGGE,40,116623,161673,651,79,2,23.44,12,0.87,Bachelor's,Part-time,Divorced,No,No,Home,Yes,0
204273,5MAOH3AOZO,67,62958,189499,460,77,3,9.29,36,0.11,Bachelor's,Self-employed,Single,No,No,Business,Yes,0
204274,5Y9Z6NW29X,62,34372,59645,524,94,3,9.72,60,0.24,PhD,Full-time,Single,Yes,No,Auto,No,0
204275,O51974F566,44,146262,198454,489,7,4,4.31,48,0.30,High School,Self-employed,Married,Yes,No,Home,No,0


In [219]:
# now lets create our training set and validation set
from sklearn.model_selection import train_test_split

X_with_target = raw_df.drop('LoanID', axis=1)
X= X_with_target.drop('Default', axis=1)
Y = raw_df['Default']

# Split the dataset into 80% training and 20% testing
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=0.2, random_state=14)

test_df_loanID = test_df['LoanID']
test_df = test_df.drop('LoanID', axis=1)
test_df

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner
0,55,112656,92393,581,113,2,23.54,36,0.15,PhD,Self-employed,Single,Yes,Yes,Home,No
1,56,91569,131575,641,54,1,15.19,12,0.43,High School,Part-time,Divorced,Yes,Yes,Education,Yes
2,26,78169,75417,569,105,3,18.02,12,0.29,Master's,Part-time,Married,Yes,Yes,Education,Yes
3,26,63033,10804,326,118,1,14.71,24,0.41,High School,Part-time,Single,No,No,Business,Yes
4,24,29665,21182,662,102,3,15.02,60,0.69,PhD,Unemployed,Single,No,Yes,Business,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51065,51,99473,170353,628,24,1,17.03,12,0.46,PhD,Self-employed,Divorced,Yes,Yes,Auto,Yes
51066,29,42016,111314,371,51,4,7.10,36,0.50,PhD,Self-employed,Married,No,No,Other,No
51067,67,88507,142666,731,51,1,22.89,48,0.79,Bachelor's,Part-time,Divorced,No,No,Education,No
51068,42,116649,190938,488,6,1,10.83,60,0.32,Bachelor's,Full-time,Married,No,Yes,Other,Yes


In [220]:
# now lets identify the numerical and categorical columns

numerical_columns = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_columns = X_train.select_dtypes('object').columns.tolist()

test_numerical_columns = test_df.select_dtypes(include=np.number).columns.tolist()
test_categorical_columns = test_df.select_dtypes('object').columns.tolist()

In [221]:
# lets lets impute stuff for numerical columns
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean').fit(raw_df[numerical_columns])
X_train[numerical_columns] = imputer.transform(X_train[numerical_columns])
X_validation[numerical_columns] = imputer.transform(X_validation[numerical_columns])

test_df[test_numerical_columns] = imputer.transform(test_df[test_numerical_columns])

X_validation[numerical_columns].isna().sum()

Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
dtype: int64

In [222]:
# now lets numeric values to the range (0,1)
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(raw_df[numerical_columns])
# transform the data
X_train[numerical_columns] = scaler.transform(X_train[numerical_columns])
X_validation[numerical_columns] = scaler.transform(X_validation[numerical_columns])

test_df[test_numerical_columns] = scaler.transform(test_df[test_numerical_columns])

X_validation.describe().loc[['min', 'max']]


Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,0.999941,0.999959,1.0,1.0,1.0,1.0,1.0,1.0


In [223]:
# now lets do encoding for categorical columns, using one-hot vectors

# first of all lets fill out all categorical columns with NaN with Unknown
X_train[categorical_columns] = X_train[categorical_columns].fillna('Unknown')
X_validation[categorical_columns] = X_validation[categorical_columns].fillna('Unknown')
test_df[categorical_columns] = test_df[test_categorical_columns].fillna('Unknown')

from sklearn.preprocessing import OneHotEncoder

# encoder for one-hot encoding columns
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore').fit(X_train[categorical_columns])

encoded_cols = list(encoder.get_feature_names_out(categorical_columns))
encoded_cols

["Education_Bachelor's",
 'Education_High School',
 "Education_Master's",
 'Education_PhD',
 'EmploymentType_Full-time',
 'EmploymentType_Part-time',
 'EmploymentType_Self-employed',
 'EmploymentType_Unemployed',
 'MaritalStatus_Divorced',
 'MaritalStatus_Married',
 'MaritalStatus_Single',
 'HasMortgage_No',
 'HasMortgage_Yes',
 'HasDependents_No',
 'HasDependents_Yes',
 'LoanPurpose_Auto',
 'LoanPurpose_Business',
 'LoanPurpose_Education',
 'LoanPurpose_Home',
 'LoanPurpose_Other',
 'HasCoSigner_No',
 'HasCoSigner_Yes']

In [224]:
X_train

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner
133064,0.764706,0.472537,0.269761,0.287796,0.705882,1.000000,0.497391,0.00,0.2000,Bachelor's,Part-time,Married,No,No,Other,Yes
128654,0.627451,0.011393,0.786945,0.384335,0.252101,0.666667,0.638261,0.75,0.3500,High School,Part-time,Single,Yes,No,Auto,Yes
69670,0.431373,0.434981,0.475302,0.590164,0.689076,0.666667,0.748696,0.00,0.8875,Master's,Unemployed,Divorced,Yes,No,Business,Yes
95361,0.313725,0.572701,0.216124,0.353370,0.176471,1.000000,0.034783,0.25,0.7625,Master's,Part-time,Married,Yes,No,Other,No
103832,0.705882,0.939363,0.747480,0.812386,0.243697,0.333333,0.259130,0.75,0.9625,PhD,Self-employed,Married,Yes,No,Education,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22855,0.843137,0.301580,0.147515,0.114754,0.932773,1.000000,0.371739,0.50,0.4750,High School,Part-time,Single,No,No,Home,No
117142,0.333333,0.127497,0.678018,0.424408,0.529412,0.000000,0.235217,1.00,0.9750,Bachelor's,Part-time,Divorced,Yes,No,Other,No
140556,0.803922,0.130505,0.008804,0.289617,0.226891,0.666667,0.233043,0.50,0.8625,High School,Self-employed,Single,Yes,No,Auto,Yes
79192,0.176471,0.371662,0.872668,0.883424,0.689076,0.333333,0.821739,1.00,0.2125,High School,Self-employed,Divorced,Yes,Yes,Home,Yes


In [225]:
X_train[encoded_cols] = encoder.transform(X_train[categorical_columns])
X_validation[encoded_cols] = encoder.transform(X_validation[categorical_columns])
test_df[encoded_cols] = encoder.transform(test_df[test_categorical_columns])

test_df

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,...,HasMortgage_Yes,HasDependents_No,HasDependents_Yes,LoanPurpose_Auto,LoanPurpose_Business,LoanPurpose_Education,LoanPurpose_Home,LoanPurpose_Other,HasCoSigner_No,HasCoSigner_Yes
0,0.725490,0.723383,0.356705,0.511840,0.949580,0.333333,0.936522,0.50,0.0625,PhD,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.745098,0.567182,0.516633,0.621129,0.453782,0.000000,0.573478,0.00,0.4125,High School,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.156863,0.467922,0.287415,0.489982,0.882353,0.666667,0.696522,0.00,0.2375,Master's,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.156863,0.355803,0.023686,0.047359,0.991597,0.000000,0.552609,0.25,0.3875,High School,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0.117647,0.108630,0.066045,0.659381,0.857143,0.666667,0.566087,1.00,0.7375,PhD,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51065,0.647059,0.625731,0.674912,0.597450,0.201681,0.000000,0.653478,0.00,0.4500,PhD,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
51066,0.215686,0.200120,0.433934,0.129326,0.428571,1.000000,0.221739,0.50,0.5000,PhD,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
51067,0.960784,0.544500,0.561903,0.785064,0.428571,0.000000,0.908261,0.75,0.8625,Bachelor's,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
51068,0.470588,0.752961,0.758933,0.342441,0.050420,0.000000,0.383913,1.00,0.2750,Bachelor's,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [226]:
# as a final lets drop the textual categorical columns so that we're left with just numeric data
X_train = X_train[numerical_columns+encoded_cols]
X_validation = X_validation[numerical_columns+encoded_cols]
test_df = test_df[test_numerical_columns+encoded_cols]

X_train.shape

(163421, 31)

In [227]:
X_train

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education_Bachelor's,...,HasMortgage_Yes,HasDependents_No,HasDependents_Yes,LoanPurpose_Auto,LoanPurpose_Business,LoanPurpose_Education,LoanPurpose_Home,LoanPurpose_Other,HasCoSigner_No,HasCoSigner_Yes
133064,0.764706,0.472537,0.269761,0.287796,0.705882,1.000000,0.497391,0.00,0.2000,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
128654,0.627451,0.011393,0.786945,0.384335,0.252101,0.666667,0.638261,0.75,0.3500,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
69670,0.431373,0.434981,0.475302,0.590164,0.689076,0.666667,0.748696,0.00,0.8875,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
95361,0.313725,0.572701,0.216124,0.353370,0.176471,1.000000,0.034783,0.25,0.7625,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
103832,0.705882,0.939363,0.747480,0.812386,0.243697,0.333333,0.259130,0.75,0.9625,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22855,0.843137,0.301580,0.147515,0.114754,0.932773,1.000000,0.371739,0.50,0.4750,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
117142,0.333333,0.127497,0.678018,0.424408,0.529412,0.000000,0.235217,1.00,0.9750,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
140556,0.803922,0.130505,0.008804,0.289617,0.226891,0.666667,0.233043,0.50,0.8625,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
79192,0.176471,0.371662,0.872668,0.883424,0.689076,0.333333,0.821739,1.00,0.2125,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [228]:
test_df

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education_Bachelor's,...,HasMortgage_Yes,HasDependents_No,HasDependents_Yes,LoanPurpose_Auto,LoanPurpose_Business,LoanPurpose_Education,LoanPurpose_Home,LoanPurpose_Other,HasCoSigner_No,HasCoSigner_Yes
0,0.725490,0.723383,0.356705,0.511840,0.949580,0.333333,0.936522,0.50,0.0625,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.745098,0.567182,0.516633,0.621129,0.453782,0.000000,0.573478,0.00,0.4125,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.156863,0.467922,0.287415,0.489982,0.882353,0.666667,0.696522,0.00,0.2375,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.156863,0.355803,0.023686,0.047359,0.991597,0.000000,0.552609,0.25,0.3875,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0.117647,0.108630,0.066045,0.659381,0.857143,0.666667,0.566087,1.00,0.7375,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51065,0.647059,0.625731,0.674912,0.597450,0.201681,0.000000,0.653478,0.00,0.4500,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
51066,0.215686,0.200120,0.433934,0.129326,0.428571,1.000000,0.221739,0.50,0.5000,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
51067,0.960784,0.544500,0.561903,0.785064,0.428571,0.000000,0.908261,0.75,0.8625,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
51068,0.470588,0.752961,0.758933,0.342441,0.050420,0.000000,0.383913,1.00,0.2750,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [229]:
from scipy.stats import pointbiserialr #

In [230]:
X_with_target.shape

(204277, 17)

In [231]:
X

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner
0,18,137576,209136,846,26,2,10.47,60,0.81,High School,Self-employed,Single,Yes,No,Business,No
1,47,57194,5970,748,30,2,19.72,36,0.73,High School,Unemployed,Divorced,No,Yes,Education,No
2,26,84328,95065,453,7,2,24.25,12,0.45,Master's,Self-employed,Married,No,No,Other,Yes
3,53,49795,229582,533,107,3,14.44,60,0.17,Bachelor's,Self-employed,Single,Yes,No,Auto,Yes
4,49,115450,22072,840,0,4,24.48,12,0.11,Bachelor's,Part-time,Single,No,Yes,Education,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204272,40,116623,161673,651,79,2,23.44,12,0.87,Bachelor's,Part-time,Divorced,No,No,Home,Yes
204273,67,62958,189499,460,77,3,9.29,36,0.11,Bachelor's,Self-employed,Single,No,No,Business,Yes
204274,62,34372,59645,524,94,3,9.72,60,0.24,PhD,Full-time,Single,Yes,No,Auto,No
204275,44,146262,198454,489,7,4,4.31,48,0.30,High School,Self-employed,Married,Yes,No,Home,No


In [232]:
raw_df.shape

(204277, 18)

In [233]:
# Calculate point biserial correlation for each numerical feature
correlations = {}

target_column = 'Default'

input_columns = raw_df.columns

input_columns


Index(['LoanID', 'Age', 'Income', 'LoanAmount', 'CreditScore',
       'MonthsEmployed', 'NumCreditLines', 'InterestRate', 'LoanTerm',
       'DTIRatio', 'Education', 'EmploymentType', 'MaritalStatus',
       'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner',
       'Default'],
      dtype='object')

In [234]:
X.shape

(204277, 16)

In [235]:
X

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner
0,18,137576,209136,846,26,2,10.47,60,0.81,High School,Self-employed,Single,Yes,No,Business,No
1,47,57194,5970,748,30,2,19.72,36,0.73,High School,Unemployed,Divorced,No,Yes,Education,No
2,26,84328,95065,453,7,2,24.25,12,0.45,Master's,Self-employed,Married,No,No,Other,Yes
3,53,49795,229582,533,107,3,14.44,60,0.17,Bachelor's,Self-employed,Single,Yes,No,Auto,Yes
4,49,115450,22072,840,0,4,24.48,12,0.11,Bachelor's,Part-time,Single,No,Yes,Education,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204272,40,116623,161673,651,79,2,23.44,12,0.87,Bachelor's,Part-time,Divorced,No,No,Home,Yes
204273,67,62958,189499,460,77,3,9.29,36,0.11,Bachelor's,Self-employed,Single,No,No,Business,Yes
204274,62,34372,59645,524,94,3,9.72,60,0.24,PhD,Full-time,Single,Yes,No,Auto,No
204275,44,146262,198454,489,7,4,4.31,48,0.30,High School,Self-employed,Married,Yes,No,Home,No


In [236]:
X_with_target.shape

(204277, 17)

In [237]:
for col in input_columns:
    if pd.api.types.is_numeric_dtype(raw_df[col]):
        corr, _ = pointbiserialr(raw_df[col], raw_df[target_column])
        correlations[col] = corr

# Display correlations
for feature, corr_value in correlations.items():
    print(f"Correlation between {feature} and {target_column}: {corr_value:.2f}")

Correlation between Age and Default: -0.17
Correlation between Income and Default: -0.10
Correlation between LoanAmount and Default: 0.09
Correlation between CreditScore and Default: -0.03
Correlation between MonthsEmployed and Default: -0.10
Correlation between NumCreditLines and Default: 0.03
Correlation between InterestRate and Default: 0.13
Correlation between LoanTerm and Default: 0.00
Correlation between DTIRatio and Default: 0.02
Correlation between Default and Default: 1.00


In [238]:
final_X_train = pd.concat([X_train, X_validation], ignore_index=True)
final_Y_train = pd.concat([Y_train, Y_validation], ignore_index=True)

In [239]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

model = RandomForestClassifier(random_state=42)
model.fit(final_X_train, final_Y_train)

# Get feature importance
feature_importances = pd.Series(model.feature_importances_, index=final_X_train.columns)
feature_importances = feature_importances.sort_values(ascending=False)

# Display the feature importance
print(feature_importances)

Income                          0.119614
InterestRate                    0.113748
LoanAmount                      0.107330
Age                             0.099019
CreditScore                     0.094274
MonthsEmployed                  0.092724
DTIRatio                        0.083324
LoanTerm                        0.038874
NumCreditLines                  0.031586
Education_High School           0.011750
MaritalStatus_Single            0.011708
MaritalStatus_Divorced          0.011689
Education_Bachelor's            0.011642
LoanPurpose_Business            0.011498
LoanPurpose_Education           0.011359
LoanPurpose_Auto                0.011306
LoanPurpose_Other               0.011201
Education_Master's              0.011018
Education_PhD                   0.010791
MaritalStatus_Married           0.010537
EmploymentType_Part-time        0.010478
HasMortgage_Yes                 0.010478
EmploymentType_Self-employed    0.010221
HasMortgage_No                  0.010202
LoanPurpose_Home

In [240]:
# Select the top 75 features based on importance
top_75_features = feature_importances.index[:26]
X_top_75 = final_X_train[top_75_features]
test_df_X_top_75 = test_df[top_75_features]

print("Top 75 features selected based on importance:")
print(top_75_features)

Top 75 features selected based on importance:
Index(['Income', 'InterestRate', 'LoanAmount', 'Age', 'CreditScore',
       'MonthsEmployed', 'DTIRatio', 'LoanTerm', 'NumCreditLines',
       'Education_High School', 'MaritalStatus_Single',
       'MaritalStatus_Divorced', 'Education_Bachelor's',
       'LoanPurpose_Business', 'LoanPurpose_Education', 'LoanPurpose_Auto',
       'LoanPurpose_Other', 'Education_Master's', 'Education_PhD',
       'MaritalStatus_Married', 'EmploymentType_Part-time', 'HasMortgage_Yes',
       'EmploymentType_Self-employed', 'HasMortgage_No', 'LoanPurpose_Home',
       'EmploymentType_Unemployed'],
      dtype='object')


In [241]:
top_5_features = ['Income', 'InterestRate', 'LoanAmount', 'Age', 'CreditScore']  # Replace with actual feature names

# Increase their importance by multiplying their values
for feature in top_5_features:
    final_X_train[feature] *= 2  # Multiply by 1.5 or any other factor you choose

In [242]:
final_X_train.shape

(204277, 31)

In [243]:
final_X_train.shape

(204277, 31)

In [None]:
from sklearn.ensemble import RandomForestClassifier

model_randomForest = RandomForestClassifier(random_state=14)

param_grid = {
    'n_estimators': [10, 20, 50, 100],  # number of trees in the forest
}

grid_search_rf = GridSearchCV(estimator=model_randomForest, param_grid=param_grid, scoring='accuracy', cv=3, n_jobs=-1, verbose=1)

grid_search_rf.fit(final_X_train, final_Y_train)

print(f"Best parameters: {grid_search_rf.best_params_}")

# Retrain the model using the best parameters
best_model_randomForest = grid_search_rf.best_estimator_


Fitting 3 folds for each of 2 candidates, totalling 6 fits
Best parameters: {'n_estimators': 20}


In [170]:
test_df_X_top_75.columns

Index(['Income', 'InterestRate', 'LoanAmount', 'Age', 'CreditScore',
       'MonthsEmployed', 'DTIRatio', 'LoanTerm', 'NumCreditLines',
       'Education_High School', 'MaritalStatus_Single',
       'MaritalStatus_Divorced', 'Education_Bachelor's',
       'LoanPurpose_Business', 'LoanPurpose_Education', 'LoanPurpose_Auto',
       'LoanPurpose_Other', 'Education_Master's', 'Education_PhD',
       'MaritalStatus_Married', 'EmploymentType_Part-time', 'HasMortgage_Yes',
       'EmploymentType_Self-employed', 'HasMortgage_No', 'LoanPurpose_Home',
       'EmploymentType_Unemployed'],
      dtype='object')

In [171]:
final_X_train.columns

Index(['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed',
       'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio',
       'Education_Bachelor's', 'Education_High School', 'Education_Master's',
       'Education_PhD', 'EmploymentType_Part-time',
       'EmploymentType_Self-employed', 'EmploymentType_Unemployed',
       'MaritalStatus_Divorced', 'MaritalStatus_Married',
       'MaritalStatus_Single', 'HasMortgage_No', 'HasMortgage_Yes',
       'LoanPurpose_Auto', 'LoanPurpose_Business', 'LoanPurpose_Education',
       'LoanPurpose_Home', 'LoanPurpose_Other'],
      dtype='object')

In [245]:
test_df_X_top_75 = test_df_X_top_75.reindex(columns=final_X_train.columns)
# now lets play with our test data
predictions1 = best_model_randomForest.predict(test_df_X_top_75)

submission_df = pd.DataFrame({
    'LoanID': test_df_loanID,
    'Default': predictions1
})

submission_df.to_csv('submission_randomForest_removed_bottom25.csv', index=False)

In [None]:
import xgboost as xgb
xgb_model = xgb.XGBClassifier(random_state=14)

param_grid = {
    'n_estimators': [250],  # number of trees in the forest
}

grid_search_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=1)

grid_search_xgb.fit(final_X_train, final_Y_train)

print(f"Best parameters: {grid_search_rf.best_params_}")

# Retrain the model using the best parameters
best_model_xgb = grid_search_xgb.best_estimator_


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best parameters: {'n_estimators': 300}


In [107]:
# now lets play with our test data
predictions = best_model_xgb.predict(test_df_X_top_75)

submission_df = pd.DataFrame({
    'LoanID': test_df_loanID,
    'Default': predictions
})

submission_df.to_csv('submission_xgb_removed_bottom25.csv', index=False)

In [215]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Initialize a single decision tree
tree_model = DecisionTreeClassifier(random_state=14)

# Set up hyperparameter grid for tuning, like max depth
param_grid = {
    'max_depth': [3, 5, 7, 10],  # Adjust the depth to control complexity
}

# Use GridSearchCV to find the best tree depth
grid_search_tree = GridSearchCV(estimator=tree_model, param_grid=param_grid, scoring='accuracy', cv=5, verbose=1)

# Fit the model
grid_search_tree.fit(final_X_train, final_Y_train)

# Get the best parameters and model
print(f"Best parameters: {grid_search_tree.best_params_}")
best_tree_model = grid_search_tree.best_estimator_

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best parameters: {'max_depth': 7}


In [216]:
# now lets play with our test data
predictions2 = best_tree_model.predict(test_df_X_top_75)

submission_df = pd.DataFrame({
    'LoanID': test_df_loanID,
    'Default': predictions2
})

submission_df.to_csv('submission_singleTree_removed_bottom25.csv', index=False)