In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import warnings
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

In [24]:
# Read the raw credit-risk records from disk and preview the first ten rows.
df_credit = pd.read_csv(filepath_or_buffer="risk_data.csv")
df_credit.head(n=10)

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,B
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad
5,35,male,1,free,,,9055,36,education,good
6,53,male,2,own,quite rich,,2835,24,furniture/equipment,good
7,35,male,3,rent,little,moderate,6948,36,car,good
8,61,male,1,own,rich,,3059,12,radio/TV,good
9,28,male,3,own,little,moderate,5234,30,car,bad


In [25]:
# Duplicates removal: report the frame's shape on either side of the drop so
# the number of discarded rows is visible in the cell output.
shape_before = df_credit.shape
print("Shape before duplicates:", shape_before)
df_credit = df_credit.drop_duplicates(keep="first")
print("Shape after duplicates removal:", df_credit.shape)

Shape before duplicates: (1000, 10)
Shape after duplicates removal: (1000, 10)


In [26]:
# Column roles referenced by the cells below.
target = "Risk"  # label column
num = ["Age", "Credit amount", "Duration"]  # numeric feature columns
# Columns treated as categorical (presumably dummy-encoded later; not shown here).
dummies = [
    "Sex",
    "Job",
    "Housing",
    "Saving accounts",
    "Checking account",
    "Purpose",
]

In [27]:
# Data normalization
# List the distinct values of every categorical column, plus the target, so
# that inconsistent labels (padding, abbreviations, missing values) stand out.
columns_to_inspect = [*dummies, target]
for v in columns_to_inspect:
    distinct_values = df_credit[v].unique()
    print("******** Unique values of", v," are :**********\n",distinct_values)

******** Unique values of Sex  are :**********
 ['male' 'female']
******** Unique values of Job  are :**********
 [2 1 3 0]
******** Unique values of Housing  are :**********
 ['own' 'free       ' 'free' 'rent' 'own       ' 'rent       ']
******** Unique values of Saving accounts  are :**********
 [nan 'little' 'quite rich' 'rich' 'moderate']
******** Unique values of Checking account  are :**********
 ['little' 'moderate' nan 'rich']
******** Unique values of Purpose  are :**********
 ['radio/TV' 'education' 'furniture/equipment' 'car' 'business'
 'domestic appliances' 'repairs' 'vacation/others']
******** Unique values of Risk  are :**********
 ['good' 'B' 'bad' 'G']


In [28]:
# Normalize the risk variable to a binary code: 1 for "good"/"G", 0 for "bad"/"B".
mapping_risk = {"good": 1,"G": 1,"bad": 0,"B": 0}
# Values that are already encoded map to themselves, so re-running this cell
# leaves the column untouched instead of turning every entry into None.
risk_lookup = {**mapping_risk, 1: 1, 0: 0}

# Fail loudly on labels outside the mapping (missing values included) rather
# than silently writing None into the target column.
unexpected_risk = [label for label in df_credit["Risk"].unique() if label not in risk_lookup]
if unexpected_risk:
    raise ValueError(f"Unexpected values in 'Risk': {unexpected_risk}")

df_credit.loc[:,"Risk"] = df_credit.loc[:,"Risk"].apply(lambda label: risk_lookup[label])

In [29]:
# Confirm which values remain in the target column after encoding.
df_credit.loc[:, "Risk"].unique()

array([1, 0], dtype=object)

In [30]:
# Preview the first rows now that the target is encoded (head() defaults to 5 rows).
df_credit.head(n=5)

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,1
1,22,female,2,own,little,moderate,5951,48,radio/TV,0
2,49,male,1,own,little,,2096,12,education,1
3,45,male,2,free,little,little,7882,42,furniture/equipment,1
4,53,male,2,free,little,little,4870,24,car,0


In [31]:
# Remove padding whitespace from the Housing labels so that variants such as
# 'own       ' collapse into 'own'. The vectorized .str accessor leaves missing
# values as NaN, whereas calling x.rstrip() on each element raises
# AttributeError on a NaN; strip() also covers leading padding.
df_credit.loc[:,"Housing"] = df_credit.loc[:,"Housing"].str.strip()

In [32]:
# Check that the padded variants have collapsed into the clean Housing labels.
df_credit.loc[:, "Housing"].unique()

array(['own', 'free', 'rent'], dtype=object)