# Calculate conditional probabilities

The dataset contains 1000 observations/data subjects and 21 variables. Variables are described in "german data description.txt". In this activity, we will only use three of them:

- VAR15: Housing (A151 : rent; A152 : own; A153 : for free)
- VAR19: Telephone (A191 : none; A192 : yes, registered under the customer's name)
- VAR21: Status (target) (1 : Good; 2 : Bad)

In [1]:
import pandas as pd

# Load only the two columns needed for the telephone analysis.
# The raw file stores them as VAR19 (codes A191/A192) and VAR21 (codes 1/2);
# for better presentation the columns are renamed and the codes are swapped
# for readable labels.
#
# The mapping is keyed by column name, so each code is only replaced inside
# its own column. A flat {1: 'Good', 2: 'Bad'} mapping applied to the whole
# frame would also rewrite 1/2 in any other numeric column that later gets
# added to `usecols`.
replacements = {
    'Telephone': {'A191': 'Unknown', 'A192': 'Known'},
    'Status': {1: 'Good', 2: 'Bad'},
}

# One chain instead of repeated inplace edits: the frame is built in a single
# expression from the file on disk, so re-running the cell always yields the
# same result.
credit_data1 = (
    pd.read_csv('german.csv', usecols=['VAR19', 'VAR21'])
    .rename(columns={'VAR19': 'Telephone', 'VAR21': 'Status'})
    .replace(replacements)
)

# Preview the relabelled data (last expression of the cell, so it is displayed).
credit_data1.head(10)

Unnamed: 0,Telephone,Status
0,Known,Good
1,Unknown,Bad
2,Unknown,Good
3,Unknown,Good
4,Unknown,Bad
5,Known,Good
6,Unknown,Good
7,Known,Good
8,Unknown,Good
9,Unknown,Bad


In [4]:
# One-way frequency table: how many observations fall into each Telephone
# category. Passing a plain string as `columns` gives a single count column
# with that label.
telephone_freq = pd.crosstab(
    index=credit_data1['Telephone'],
    columns='Count',
)
telephone_freq

col_0,Count
Telephone,Unnamed: 1_level_1
Known,404
Unknown,596


In [10]:
# Same one-way table, but as proportions: normalize='columns' divides every
# entry by its column total, so the single column holds the probability of
# each Telephone category.
telephone_prob = pd.crosstab(
    index=credit_data1['Telephone'],
    columns='Prob',
    normalize='columns',
)
telephone_prob

col_0,Prob
Telephone,Unnamed: 1_level_1
Known,0.404
Unknown,0.596


In [20]:
# Two-way contingency table: Telephone categories in the rows, Status in the
# columns. margins=True appends the 'All' row and column with the totals.
telephone_cross = pd.crosstab(
    index=credit_data1['Telephone'],
    columns=credit_data1['Status'],
    margins=True,
)
telephone_cross

Status,Bad,Good,All
Telephone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Known,113,291,404
Unknown,187,409,596
All,300,700,1000


In [13]:
# Column-normalised two-way table: every Status column sums to 1, so each
# entry is the conditional probability P(Telephone | Status). The 'All'
# column carries the overall proportion of each Telephone category.
telephone_crossc = pd.crosstab(
    index=credit_data1['Telephone'],
    columns=credit_data1['Status'],
    margins=True,
    normalize='columns',
)
telephone_crossc

Status,Bad,Good,All
Telephone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Known,0.376667,0.415714,0.404
Unknown,0.623333,0.584286,0.596


In [21]:
# Swap the roles of the two variables: Status in the rows, Telephone in the
# columns. Normalising by column now gives P(Status | Telephone), and the
# 'All' column carries the overall proportion of Good/Bad customers.
telephone_crossp = pd.crosstab(
    index=credit_data1['Status'],
    columns=credit_data1['Telephone'],
    margins=True,
    normalize='columns',
)
telephone_crossp

Telephone,Known,Unknown,All
Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bad,0.279703,0.313758,0.3
Good,0.720297,0.686242,0.7


In [23]:
# Load only the two columns needed for the housing analysis.
# The raw file stores them as VAR15 (codes A151/A152/A153) and VAR21
# (codes 1/2); the columns are renamed and the codes are swapped for
# readable labels.
#
# The mapping is keyed by column name, so each code is only replaced inside
# its own column. A flat {1: 'Good', 2: 'Bad'} mapping applied to the whole
# frame would also rewrite 1/2 in any other numeric column that later gets
# added to `usecols`.
replacements = {
    'Housing': {
        'A151': 'Rent',
        'A152': 'Own',
        'A153': 'Free',
    },
    'Status': {
        1: 'Good',
        2: 'Bad',
    },
}

# One chain instead of repeated inplace edits: the frame is built in a single
# expression from the file on disk, so re-running the cell always yields the
# same result.
credit_data2 = (
    pd.read_csv('german.csv', usecols=['VAR15', 'VAR21'])
    .rename(columns={'VAR15': 'Housing', 'VAR21': 'Status'})
    .replace(replacements)
)

# Preview the relabelled data (last expression of the cell, so it is displayed).
credit_data2.head(10)

Unnamed: 0,Housing,Status
0,Own,Good
1,Own,Bad
2,Own,Good
3,Free,Good
4,Free,Bad
5,Free,Good
6,Own,Good
7,Rent,Good
8,Own,Good
9,Own,Bad


In [24]:
# Two-way contingency table: Housing categories in the rows, Status in the
# columns. margins=True appends the 'All' row and column with the totals.
housing_cross = pd.crosstab(
    index=credit_data2['Housing'],
    columns=credit_data2['Status'],
    margins=True,
)
housing_cross

Status,Bad,Good,All
Housing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Free,44,64,108
Own,186,527,713
Rent,70,109,179
All,300,700,1000


In [25]:
# Column-normalised two-way table: every Status column sums to 1, so each
# entry is the conditional probability P(Housing | Status). The 'All'
# column carries the overall proportion of each Housing category.
housing_crossc = pd.crosstab(
    index=credit_data2['Housing'],
    columns=credit_data2['Status'],
    margins=True,
    normalize='columns',
)
housing_crossc

Status,Bad,Good,All
Housing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Free,0.146667,0.091429,0.108
Own,0.62,0.752857,0.713
Rent,0.233333,0.155714,0.179


In [26]:
# Swap the roles of the two variables: Status in the rows, Housing in the
# columns. Normalising by column now gives P(Status | Housing), and the
# 'All' column carries the overall proportion of Good/Bad customers.
housing_crossp = pd.crosstab(
    index=credit_data2['Status'],
    columns=credit_data2['Housing'],
    margins=True,
    normalize='columns',
)
housing_crossp

Housing,Free,Own,Rent,All
Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bad,0.407407,0.26087,0.391061,0.3
Good,0.592593,0.73913,0.608939,0.7
