In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [47]:
# Everything I create, in my profile, in Python I'll try to create as equal as possible in R.

# Data from: https://www.kaggle.com/kabure/german-credit-data-with-risk

In [48]:
df = pd.read_csv('german_credit_risk_target.csv')
df.head(10)

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad
5,5,35,male,1,free,,,9055,36,education,good
6,6,53,male,2,own,quite rich,,2835,24,furniture/equipment,good
7,7,35,male,3,rent,little,moderate,6948,36,car,good
8,8,61,male,1,own,rich,,3059,12,radio/TV,good
9,9,28,male,3,own,little,moderate,5234,30,car,bad


In [49]:
# Pre-processing

df.drop(['Unnamed: 0'], 1)
df['Sex'] = df['Sex'].map({'male': 1, 'female': 0})
df['Housing'] = df['Housing'].map({'free' : 1, 'own' : 2, 'rent' : 3})
df['Saving accounts'] = df['Saving accounts'].map({'little' : 1, 'moderate' : 2, 'quite rich' : 3, 'rich' : 4})
df['Checking account'] = df['Checking account'].map({'little' : 1, 'moderate' : 2, 'rich' : 3})
df['Purpose'] = df['Purpose'].map({'radio/TV' : 1, 'education' : 2, 'furniture/equipment' : 3,
                                   'car' : 4, 'business' : 5, 'domestic appliances' : 6,
                                   'repairs' : 7, 'vacation/others' : 8})
df['Risk'] = df['Risk'].map({'bad' : 1, 'good' : 2})

df.head(10)

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,1,2,2,,1.0,1169,6,1,2
1,1,22,0,2,2,1.0,2.0,5951,48,1,1
2,2,49,1,1,2,1.0,,2096,12,2,2
3,3,45,1,2,1,1.0,1.0,7882,42,3,2
4,4,53,1,2,1,1.0,1.0,4870,24,4,1
5,5,35,1,1,1,,,9055,36,2,2
6,6,53,1,2,2,3.0,,2835,24,3,2
7,7,35,1,3,3,1.0,2.0,6948,36,4,2
8,8,61,1,1,2,4.0,,3059,12,1,2
9,9,28,1,3,2,1.0,2.0,5234,30,4,1


In [50]:
# Dealing with NaN

df.fillna(df.median(), inplace = True)

df.head(10)

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,1,2,2,1.0,1.0,1169,6,1,2
1,1,22,0,2,2,1.0,2.0,5951,48,1,1
2,2,49,1,1,2,1.0,2.0,2096,12,2,2
3,3,45,1,2,1,1.0,1.0,7882,42,3,2
4,4,53,1,2,1,1.0,1.0,4870,24,4,1
5,5,35,1,1,1,1.0,2.0,9055,36,2,2
6,6,53,1,2,2,3.0,2.0,2835,24,3,2
7,7,35,1,3,3,1.0,2.0,6948,36,4,2
8,8,61,1,1,2,4.0,2.0,3059,12,1,2
9,9,28,1,3,2,1.0,2.0,5234,30,4,1


In [51]:
# Spliting train and test samples

x = df[['Age', 'Sex', 'Job', 'Housing', 'Saving accounts', 'Checking account', 'Credit amount', 'Duration', 'Purpose']]
y = df['Risk']

X_train, X_test, y_train, y_test = train_test_split(x , y, random_state = 0)

In [52]:
# Padronizing the data

X_train_scaled = scale(X_train)
X_test_scaled = scale(X_test)

In [53]:
# Training and testing the model

lr = LogisticRegression()
lr.fit(X_train_scaled, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [54]:
y_predicted = lr.predict(X_test_scaled)

In [55]:
print('Train Accuracy: ', lr.score(X_train_scaled, y_train))
print('Test Accuracy: ', lr.score(X_test_scaled, y_test))
print('Test Confusion Matrix: ')
print(confusion_matrix(y_test, y_predicted))

Train Accuracy:  0.7333333333333333
Test Accuracy:  0.736
Test Confusion Matrix: 
[[ 21  53]
 [ 13 163]]
