# Case Loans

In [3]:
# Import modules

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Data 

In [4]:
## Link to data

# Base URL of the hosting repository and the file name of the loans data
ROOT = "https://raw.githubusercontent.com/kirenz/modern-statistics/main/data/"
DATA = "loans.csv"

# Read the remote CSV into the working DataFrame
df = pd.read_csv(f"{ROOT}{DATA}")

In [5]:
## Overview

# Column names, non-null counts and dtypes
df.info()

# Headline size of the dataset, framed by two separator lines
print(
    "------------------",
    f"We have {len(df.index):,} observations and {len(df.columns)} columns in our dataset.",
    "------------",
    sep="\n",
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 55 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   emp_title                         9167 non-null   object 
 1   emp_length                        9183 non-null   float64
 2   state                             10000 non-null  object 
 3   homeownership                     10000 non-null  object 
 4   annual_income                     10000 non-null  float64
 5   verified_income                   10000 non-null  object 
 6   debt_to_income                    9976 non-null   float64
 7   annual_income_joint               1495 non-null   float64
 8   verification_income_joint         1455 non-null   object 
 9   debt_to_income_joint              1495 non-null   float64
 10  delinq_2y                         10000 non-null  int64  
 11  months_since_last_delinq          4342 non-null   float64
 12  earli

## Data correction

In [8]:
# Store the state column with pandas' categorical dtype
df = df.astype({'state': 'category'})

In [9]:
# Columns that should be stored with the categorical dtype
cat_convert = ['emp_title', 'homeownership', 'verified_income']

# Cast all of them in a single call via a column -> dtype mapping
df = df.astype({col: "category" for col in cat_convert})

In [10]:
# Re-inspect the column dtypes after the categorical conversions
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 55 columns):
 #   Column                            Non-Null Count  Dtype   
---  ------                            --------------  -----   
 0   emp_title                         9167 non-null   category
 1   emp_length                        9183 non-null   float64 
 2   state                             10000 non-null  category
 3   homeownership                     10000 non-null  category
 4   annual_income                     10000 non-null  float64 
 5   verified_income                   10000 non-null  category
 6   debt_to_income                    9976 non-null   float64 
 7   annual_income_joint               1495 non-null   float64 
 8   verification_income_joint         1455 non-null   object  
 9   debt_to_income_joint              1495 non-null   float64 
 10  delinq_2y                         10000 non-null  int64   
 11  months_since_last_delinq          4342 non-null   float

In [11]:
# Ratio of annual income to total credit limit.
# A credit limit of 0 is mapped to NaN before dividing: a zero denominator
# would otherwise produce +/-inf, which distorts summary statistics and is
# typically rejected by scalers and estimators further down the pipeline.
df["metric"] = df["annual_income"] / df["total_credit_limit"].replace(0, np.nan)

In [12]:
# Names of every numeric column in the frame
list_num = list(df.select_dtypes(include="number").columns)

In [13]:
# Display the collected numeric column names
list_num

['emp_length',
 'annual_income',
 'debt_to_income',
 'annual_income_joint',
 'debt_to_income_joint',
 'delinq_2y',
 'months_since_last_delinq',
 'earliest_credit_line',
 'inquiries_last_12m',
 'total_credit_lines',
 'open_credit_lines',
 'total_credit_limit',
 'total_credit_utilized',
 'num_collections_last_12m',
 'num_historical_failed_to_pay',
 'months_since_90d_late',
 'current_accounts_delinq',
 'total_collection_amount_ever',
 'current_installment_accounts',
 'accounts_opened_24m',
 'months_since_last_credit_inquiry',
 'num_satisfactory_accounts',
 'num_accounts_120d_past_due',
 'num_accounts_30d_past_due',
 'num_active_debit_accounts',
 'total_debit_limit',
 'num_total_cc_accounts',
 'num_open_cc_accounts',
 'num_cc_carrying_balance',
 'num_mort_accounts',
 'account_never_delinq_percent',
 'tax_liens',
 'public_record_bankrupt',
 'loan_amount',
 'term',
 'interest_rate',
 'installment',
 'balance',
 'paid_total',
 'paid_principal',
 'paid_interest',
 'paid_late_fees',
 'metric']

In [14]:
# Names of every column stored with the categorical dtype
list_cat = list(df.select_dtypes(include='category').columns)
list_cat

['emp_title', 'state', 'homeownership', 'verified_income']

## Preparation for data splitting

In [15]:
# Name of the response (outcome) column
y_label = 'interest_rate'

# Every remaining column is treated as a feature
features = df.columns.drop(y_label).tolist()

In [17]:
# Feature matrix used for the train/test split
X = df.loc[:, features]

# Names of the numeric feature columns
feat_num = list(X.select_dtypes(include="number").columns)


In [18]:
# Names of the feature columns stored with the categorical dtype
feat_cat = list(X.select_dtypes(include='category').columns)

# Response vector used for the train/test split
y = df.loc[:, y_label]

## Train test split

In [21]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps
# the split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Exploration data

In [23]:
# Training frame for exploration: the training features with the response
# joined back on the row index (join returns a new frame, X_train is untouched)
df_train = X_train.join(y_train)