# Data Science I Project

In [31]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from ucimlrepo import fetch_ucirepo

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [32]:
# Fetch the dataset through the `ucimlrepo` helper.
# The repository id is kept in a named constant so the lookup is not a bare
# magic number and can be found/changed in one place.
STATLOG_GERMAN_CREDIT_DATASET_ID = 144
statlog_german_credit_data = fetch_ucirepo(id=STATLOG_GERMAN_CREDIT_DATASET_ID)

In [33]:
# Print a short summary of the dataset from the metadata record that came
# back with the fetched dataset object.
metadata = statlog_german_credit_data.metadata

# (printed label, metadata key) pairs, emitted in this order.
SUMMARY_FIELDS = (
    ("Abstract: ", "abstract"),
    ("Area: ", "area"),
    ("Number of Features: ", "num_features"),
    ("Number of Instances: ", "num_instances"),
)

for summary_label, metadata_key in SUMMARY_FIELDS:
    print(summary_label, metadata[metadata_key])

Abstract:  This dataset classifies people described by a set of attributes as good or bad credit risks. Comes in two formats (one all numeric). Also comes with a cost matrix
Area:  Social Science
Number of Features:  20
Number of Instances:  1000


In [34]:
# Show the per-variable description table attached to the fetched dataset
# (name, role, type, demographic, description, units, missing_values).
# Left as the cell's last expression so the notebook renders it as a table.
statlog_german_credit_data.variables

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,Attribute1,Feature,Categorical,,Status of existing checking account,,no
1,Attribute2,Feature,Integer,,Duration,months,no
2,Attribute3,Feature,Categorical,,Credit history,,no
3,Attribute4,Feature,Categorical,,Purpose,,no
4,Attribute5,Feature,Integer,,Credit amount,,no
5,Attribute6,Feature,Categorical,,Savings account/bonds,,no
6,Attribute7,Feature,Categorical,Other,Present employment since,,no
7,Attribute8,Feature,Integer,,Installment rate in percentage of disposable i...,,no
8,Attribute9,Feature,Categorical,Marital Status,Personal status and sex,,no
9,Attribute10,Feature,Categorical,,Other debtors / guarantors,,no


In [35]:
# Mapping from the generic column names used in the fetched frame
# ("Attribute1" ... "Attribute20", "class") to descriptive names used in the
# rest of the notebook.
COLUMN_RENAME_DICT = {
    'Attribute1': 'CheckingAccountStatus',
    'Attribute2': 'DurationInMonths',
    'Attribute3': 'CreditHistory',
    'Attribute4': 'Purpose',
    'Attribute5': 'CreditAmount',
    'Attribute6': 'SavingsAccountBonds',
    'Attribute7': 'EmploymentSince',
    'Attribute8': 'InstallmentRate',
    'Attribute9': 'PersonalStatusSex',
    'Attribute10': 'OtherDebtorsGuarantors',
    'Attribute11': 'ResidenceSince',
    'Attribute12': 'Property',
    'Attribute13': 'Age',
    'Attribute14': 'OtherInstallmentPlans',
    'Attribute15': 'Housing',
    'Attribute16': 'ExistingCreditsCount',
    'Attribute17': 'Job',
    'Attribute18': 'PeopleLiableMaintenance',
    'Attribute19': 'Telephone',
    'Attribute20': 'ForeignWorker',
    'class': 'CreditRiskClass'
}

# Build the working frame in one step from the fetched `original` frame, so
# re-running this cell always yields the same result.
# `errors="raise"` makes the rename fail with a KeyError if any key of the
# mapping is not an existing column; by default pandas silently ignores such
# keys, which would leave generic "AttributeN" names in place unnoticed.
dataset_df = statlog_german_credit_data.data.original.rename(
    columns=COLUMN_RENAME_DICT,
    errors="raise",
)

In [36]:
# Inspect the frame after the column rename; as the cell's last expression it
# is rendered by the notebook.
dataset_df


Unnamed: 0,CheckingAccountStatus,DurationInMonths,CreditHistory,Purpose,CreditAmount,SavingsAccountBonds,EmploymentSince,InstallmentRate,PersonalStatusSex,OtherDebtorsGuarantors,...,Property,Age,OtherInstallmentPlans,Housing,ExistingCreditsCount,Job,PeopleLiableMaintenance,Telephone,ForeignWorker,CreditRiskClass
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,A14,12,A32,A42,1736,A61,A74,3,A92,A101,...,A121,31,A143,A152,1,A172,1,A191,A201,1
996,A11,30,A32,A41,3857,A61,A73,4,A91,A101,...,A122,40,A143,A152,1,A174,1,A192,A201,1
997,A14,12,A32,A43,804,A61,A75,4,A93,A101,...,A123,38,A143,A152,1,A173,1,A191,A201,1
998,A11,45,A32,A43,1845,A61,A73,4,A93,A101,...,A124,23,A143,A153,1,A173,1,A192,A201,2
