## Import pandas, NumPy, and other dependencies

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix

In [None]:
# Read the training set from disk and preview its first rows.
df = pd.read_csv('train.csv')
df.head()



Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,DRIRC89L0T,18,137576,209136,846,26,2,10.47,60,0.81,High School,Self-employed,Single,Yes,No,Business,No,0
1,TS0FIUNHNU,47,57194,5970,748,30,2,19.72,36,0.73,High School,Unemployed,Divorced,No,Yes,Education,No,0
2,I0YR284A1V,26,84328,95065,453,7,2,24.25,12,0.45,Master's,Self-employed,Married,No,No,Other,Yes,0
3,WB1T7NQV8A,53,49795,229582,533,107,3,14.44,60,0.17,Bachelor's,Self-employed,Single,Yes,No,Auto,Yes,1
4,J6GU9M4G1Z,49,115450,22072,840,0,4,24.48,12,0.11,Bachelor's,Part-time,Single,No,Yes,Education,Yes,0


In [None]:
# Turn every categorical column of the training frame into numbers.
#
# This cell rewrites `df`, so it is meant to run once per load of train.csv:
# run on an already-encoded frame it stops with an error instead of corrupting it.


def _encode_categories(series, mapping):
    """Return `series` with each label replaced by its integer code from `mapping`.

    `Series.map` is used instead of `Series.replace` so the result does not
    rely on `replace`'s deprecated implicit downcasting of object columns.

    Raises:
        ValueError: if a non-null value of `series` has no entry in `mapping`
            (missing values are passed through untouched).
    """
    encoded = series.map(mapping)
    unknown = series[encoded.isna() & series.notna()].unique()
    if len(unknown) > 0:
        raise ValueError(f"Unmapped values in column {series.name!r}: {list(unknown)}")
    return encoded


yes_no_codes = {'Yes': 1, 'No': 0}

# Integer codes per column.
# NOTE(review): EmploymentType and MaritalStatus have no obvious ordering, yet the
# integer codes impose one on a linear model - consider one-hot encoding them too.
category_codes = {
    'HasMortgage': yes_no_codes,
    'HasDependents': yes_no_codes,
    'HasCoSigner': yes_no_codes,
    'Education': {'High School': 0, 'Bachelor\'s': 1, 'Master\'s': 2, 'PhD': 3},
    'EmploymentType': {'Unemployed': 0, 'Part-time': 1, 'Full-time': 2, 'Self-employed': 3},
    'MaritalStatus': {'Single': 0, 'Married': 1, 'Divorced': 2},
}

# LoanID is an identifier, not a feature. Rebinding (rather than inplace=True)
# leaves the originally loaded object untouched.
df = df.drop(columns=['LoanID'])

for column_name, codes in category_codes.items():
    df[column_name] = _encode_categories(df[column_name], codes)

# One-hot encode LoanPurpose, since there seems to be no natural order.
df = pd.get_dummies(df, columns=['LoanPurpose'])

# Convert the True/False dummy columns to 1/0. The list is derived from the frame
# so it always matches the categories actually present in the data.
columns_to_convert = [name for name in df.columns if name.startswith('LoanPurpose_')]
df[columns_to_convert] = df[columns_to_convert].astype(int)

  df['HasMortgage'] = df['HasMortgage'].replace({'Yes':1, 'No':0})
  df['HasDependents'] = df['HasDependents'].replace({'Yes':1, 'No':0})
  df['HasCoSigner'] = df['HasCoSigner'].replace({'Yes':1, 'No':0})
  df['Education'] = df['Education'].replace({'High School':0, 'Bachelor\'s':1, 'Master\'s':2, 'PhD':3})
  df['EmploymentType'] = df['EmploymentType'].replace({'Unemployed':0, 'Part-time':1, 'Full-time':2, 'Self-employed':3})
  df['MaritalStatus'] = df['MaritalStatus'].replace({'Single':0, 'Married':1, 'Divorced':2})


In [None]:
df.head()


Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,...,MaritalStatus,HasMortgage,HasDependents,HasCoSigner,Default,LoanPurpose_Auto,LoanPurpose_Business,LoanPurpose_Education,LoanPurpose_Home,LoanPurpose_Other
0,18,137576,209136,846,26,2,10.47,60,0.81,0,...,0,1,0,0,0,0,1,0,0,0
1,47,57194,5970,748,30,2,19.72,36,0.73,0,...,2,0,1,0,0,0,0,1,0,0
2,26,84328,95065,453,7,2,24.25,12,0.45,2,...,1,0,0,1,0,0,0,0,0,1
3,53,49795,229582,533,107,3,14.44,60,0.17,1,...,0,1,0,1,1,1,0,0,0,0
4,49,115450,22072,840,0,4,24.48,12,0.11,1,...,0,0,1,1,0,0,0,1,0,0


## Searching for the best hyperparameters for the logistic regression model

In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Separate features (X) and target variable (y)
X = df.drop('Default', axis=1)
y = df['Default']

# Split data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=114)

# Candidate hyperparameters; both solvers support both penalties.
# NOTE(review): the features are not standardized. scikit-learn documents that
# 'saga' only converges quickly on similarly scaled features - consider wrapping
# the model in a StandardScaler pipeline.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}
logreg = LogisticRegression()

# Randomized search: 10 sampled combinations, 5-fold cross-validation on the
# training split, ranked by accuracy.
random_search = RandomizedSearchCV(logreg, param_distributions=param_grid, n_iter=10, cv=5, scoring='accuracy', random_state=114)
random_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

# Score the tuned model on the held-out split, which the search never saw.
# best_estimator_ is the winning configuration refit on all of X_train
# (RandomizedSearchCV's default refit=True).
best_model = random_search.best_estimator_
y_test_pred = best_model.predict(X_test)
# Probability of the second class in classes_; assumes Default is coded 0/1.
y_test_score = best_model.predict_proba(X_test)[:, 1]

print("Held-out accuracy :", accuracy_score(y_test, y_test_pred))
print("Held-out precision:", precision_score(y_test, y_test_pred, zero_division=0))
print("Held-out recall   :", recall_score(y_test, y_test_pred, zero_division=0))
print("Held-out F1       :", f1_score(y_test, y_test_pred, zero_division=0))
print("Held-out ROC AUC  :", roc_auc_score(y_test, y_test_score))




Best Hyperparameters: {'solver': 'liblinear', 'penalty': 'l1', 'C': 100}


In [None]:

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier


# Load the test data. The raw frame is kept so LoanID can be attached to the
# predictions; the model itself never sees LoanID.
df_test1 = pd.read_csv('test.csv')
df_test = df_test1.drop(columns=['LoanID'])


# Apply the same preprocessing steps to the test data.
# `Series.map` is used instead of `Series.replace` so the result does not rely on
# `replace`'s deprecated implicit downcasting of object columns.
test_yes_no_codes = {'Yes': 1, 'No': 0}
test_category_codes = {
    'HasMortgage': test_yes_no_codes,
    'HasDependents': test_yes_no_codes,
    'HasCoSigner': test_yes_no_codes,
    'Education': {'High School': 0, 'Bachelor\'s': 1, 'Master\'s': 2, 'PhD': 3},
    'EmploymentType': {'Unemployed': 0, 'Part-time': 1, 'Full-time': 2, 'Self-employed': 3},
    'MaritalStatus': {'Single': 0, 'Married': 1, 'Divorced': 2},
}
for column_name, codes in test_category_codes.items():
    encoded = df_test[column_name].map(codes)
    # A label with no code would otherwise become NaN silently; fail loudly instead.
    unknown = df_test.loc[encoded.isna() & df_test[column_name].notna(), column_name].unique()
    if len(unknown) > 0:
        raise ValueError(f"Unmapped values in test column {column_name!r}: {list(unknown)}")
    df_test[column_name] = encoded

df_test = pd.get_dummies(df_test, columns=['LoanPurpose'])
test_dummy_columns = [name for name in df_test.columns if name.startswith('LoanPurpose_')]
df_test[test_dummy_columns] = df_test[test_dummy_columns].astype(int)

# Align the test features with the training features. get_dummies only creates
# columns for categories present in *this* file, so the column set/order can
# differ from X. A missing one-hot column legitimately means "all zeros";
# any other missing feature is a schema problem and must not be papered over.
missing_features = [
    name for name in X.columns
    if name not in df_test.columns and not name.startswith('LoanPurpose_')
]
if missing_features:
    raise ValueError(f"test.csv is missing feature columns: {missing_features}")
df_test = df_test.reindex(columns=X.columns, fill_value=0)

# Refit on the full training data with the hyperparameters chosen by the search
# (`best_params`), rather than values copied by hand from its printed output.
logreg = LogisticRegression(**best_params)
logreg.fit(X, y)
# Make predictions on the test data
y_pred = logreg.predict(df_test)


# Save LoanID and prediction
output = pd.DataFrame({'LoanID': df_test1['LoanID'], 'Default': y_pred})
output.to_csv('output_logreg.csv', index=False)
print("done")

  df_test['HasMortgage'] = df_test['HasMortgage'].replace({'Yes': 1, 'No': 0})
  df_test['HasDependents'] = df_test['HasDependents'].replace({'Yes': 1, 'No': 0})
  df_test['HasCoSigner'] = df_test['HasCoSigner'].replace({'Yes': 1, 'No': 0})
  df_test['Education'] = df_test['Education'].replace({'High School': 0, 'Bachelor\'s': 1, 'Master\'s': 2, 'PhD': 3})
  df_test['EmploymentType'] = df_test['EmploymentType'].replace({'Unemployed': 0, 'Part-time': 1, 'Full-time': 2, 'Self-employed': 3})
  df_test['MaritalStatus'] = df_test['MaritalStatus'].replace({'Single': 0, 'Married': 1, 'Divorced': 2})


done
