In [1]:
#### Dependencies ####
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [2]:
#### Load Data ####
train_loan_data = pd.read_csv('Data/train.csv').drop('Id',axis=1)
test_loan_data = pd.read_csv('Data/test.csv').drop('Id',axis=1)

In [3]:
#### Data Exploration ####
train_loan_data.head(20)
train_loan_data['Years in current job'].value_counts()

Years in current job
10+ years    2332
2 years       705
3 years       620
< 1 year      563
5 years       516
1 year        504
4 years       469
6 years       426
7 years       396
8 years       339
9 years       259
Name: count, dtype: int64

In [4]:
#### Report on Missing Values ####
def missing_values_report(data,train=False):
    features = data.columns
    if train :
        print('----Missing Values in Training Data----')
    else:
        print('----Missing Values in Test Data----')
    for feature in features:
        if data[feature].dtype in ['int64','float64']:
            missing_values = data[feature].isnull().sum()
            if missing_values > 0:
                print(f'{feature} : {missing_values}')
        else:
            missing_values = data[feature].isnull().sum() + data[data[feature] == ""].shape[0]
            if missing_values > 0:
                print(f'{feature} : {missing_values}')
        
missing_values_report(train_loan_data,train=True)
missing_values_report(test_loan_data)

----Missing Values in Training Data----
Annual Income : 1557
Years in current job : 371
Months since last delinquent : 4081
Bankruptcies : 14
Credit Score : 1557
----Missing Values in Test Data----
Annual Income : 513
Years in current job : 86
Months since last delinquent : 1358
Bankruptcies : 3
Credit Score : 513


In [5]:
#### Filling Missing Values ####
### Convert years in current job to numerical values
def convert_years_to_numerical(data,feature):
    data[feature] = data[feature].str.extract('(\d+)').astype(float)
    return data

def fill_with_median(data,features):
    for feature in features:
        data[feature].fillna(data[feature].dropna().median(),inplace=True)
    return data


train_loan_data = convert_years_to_numerical(train_loan_data,'Years in current job')
test_loan_data = convert_years_to_numerical(test_loan_data,'Years in current job')

med_test_loan_data = fill_with_median(test_loan_data,['Annual Income','Years in current job','Months since last delinquent','Bankruptcies','Credit Score'])
med_train_loan_data = fill_with_median(train_loan_data,['Annual Income','Years in current job','Months since last delinquent','Bankruptcies','Credit Score'])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[feature].fillna(data[feature].dropna().median(),inplace=True)


In [6]:
med_train_loan_data.head(20)
#missing_values_report(med_train_loan_data,train=True)
#missing_values_report(med_test_loan_data)

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,Own Home,482087.0,6.0,0.0,11.0,26.3,685960.0,1.0,32.0,1.0,debt consolidation,Short Term,99999999.0,47386.0,7914.0,749.0,0
1,Own Home,1025487.0,10.0,0.0,15.0,15.3,1181730.0,0.0,32.0,0.0,debt consolidation,Long Term,264968.0,394972.0,18373.0,737.0,1
2,Home Mortgage,751412.0,8.0,0.0,11.0,35.0,1182434.0,0.0,32.0,0.0,debt consolidation,Short Term,99999999.0,308389.0,13651.0,742.0,0
3,Own Home,805068.0,6.0,0.0,8.0,22.5,147400.0,1.0,32.0,1.0,debt consolidation,Short Term,121396.0,95855.0,11338.0,694.0,0
4,Rent,776264.0,8.0,0.0,13.0,13.6,385836.0,1.0,32.0,0.0,debt consolidation,Short Term,125840.0,93309.0,7180.0,719.0,0
5,Rent,1168386.0,7.0,0.0,12.0,14.6,366784.0,0.0,32.0,0.0,other,Long Term,337304.0,165680.0,18692.0,731.0,1
6,Home Mortgage,1511108.0,10.0,0.0,9.0,20.3,388124.0,0.0,73.0,0.0,home improvements,Short Term,99999999.0,51623.0,2317.0,745.0,0
7,Rent,1040060.0,10.0,0.0,13.0,12.0,330374.0,0.0,18.0,0.0,other,Short Term,250888.0,89015.0,19761.0,705.0,1
8,Home Mortgage,1168386.0,5.0,0.0,17.0,15.7,0.0,1.0,32.0,1.0,home improvements,Short Term,129734.0,19.0,17.0,731.0,0
9,Home Mortgage,1168386.0,1.0,0.0,10.0,24.6,511302.0,0.0,6.0,0.0,debt consolidation,Long Term,572880.0,205333.0,17613.0,731.0,1


No more missing values, great !

In [7]:
#### Encoding categorical variables ####
np.unique(med_test_loan_data['Home Ownership'])
np.unique(med_test_loan_data['Purpose'])
np.unique(med_test_loan_data['Term'])

array(['Long Term', 'Short Term'], dtype=object)

Ordinal Data, so we're using hot-one encoding

In [8]:
from sklearn.preprocessing import OneHotEncoder

# Initialize OneHotEncoder
ohenc = OneHotEncoder()
# Fit and transform the encoder
ohenc.fit(med_train_loan_data[['Home Ownership','Purpose','Term']])

med_train_pre_encoded= med_train_loan_data.drop(['Home Ownership','Purpose','Term'],axis=1)
med_test_pre_encoded = med_test_loan_data.drop(['Home Ownership','Purpose','Term'],axis=1)

train_encoded = pd.DataFrame(ohenc.transform(med_train_loan_data[['Home Ownership','Purpose','Term']]).toarray(),columns=ohenc.get_feature_names_out(['Home Ownership','Purpose','Term']))
test_encoded = pd.DataFrame(ohenc.transform(med_test_loan_data[['Home Ownership','Purpose','Term']]).toarray(),columns=ohenc.get_feature_names_out(['Home Ownership','Purpose','Term']))

med_train_loan_data = pd.concat([med_train_pre_encoded,train_encoded],axis=1)
med_test_loan_data = pd.concat([med_test_pre_encoded,test_encoded],axis=1)

In [9]:
#### Basic Model ####
occurences = med_train_loan_data['Credit Default'].value_counts()

### Choose to set 1 as positive prediction (max freq)
tp = occurences[0]
tn = 0
fp = occurences[1]
fn = 0

precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1 = 2*(precision*recall)/(precision+recall)
print("F1 Score : ",f1)

F1 Score :  0.8360363156669512


In [10]:
#### Naive Classification ####
df = pd.DataFrame({
    'Id': range(7500, 10000),
    'Credit Default': 0
})

# Write the DataFrame to a CSV file
df.to_csv('Data/NaiveSubmission.csv', index=False)

In [11]:
#### CV Log Reg ####
log_reg = LogisticRegression()
X = med_train_loan_data.drop('Credit Default',axis=1)
y = med_train_loan_data['Credit Default']
cross_val_score(log_reg,X,y,cv=5,scoring='f1').mean()

0.3266696559273545

No Improvement ... The model needs to be improved