In [1]:
%load_ext autoreload
%autoreload 2

import warnings
import os
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler



In [2]:

# Read the input CSV into `data`; the trailing expression displays its row count.
data_file = r"../../InputData/Pipelines/healthcare/incomeK/before_selection_incomeK.csv"
data = pd.read_csv(filepath_or_buffer=data_file)
data.shape[0]


887

In [3]:
# Summary statistics of the numeric columns (quartiles spelled out explicitly).
print(data.describe(percentiles=[0.25, 0.5, 0.75]))

                id  num_children      income  complications
count   887.000000    887.000000  887.000000     887.000000
mean    495.503946      2.390079  162.365276       5.145434
std     289.406386      1.676768  110.010853       3.129673
min       1.000000      0.000000    8.000000       0.000000
25%     243.500000      1.000000   76.000000       2.000000
50%     493.000000      2.000000  140.000000       5.000000
75%     743.000000      4.000000  218.000000       8.000000
max    1000.000000      5.000000  447.000000      10.000000


In [4]:
# Single multipliers applied to a group count in the relax / contract cells.
relax_rate, contract_rate = 1.05, 0.95

# Multipliers iterated over by the relax sweep loops.
relax_rate_change = [
    1.1, 1.2, 1.3,
    1.4, 1.5, 1.6,
]

# Multipliers iterated over by the contract sweep loops.
contract_rate_change = [
    0.4, 0.5, 0.6,
    0.7, 0.8, 0.9,
]

# q1

In [5]:
# q1 selection: income >= 200, num_children >= 3, county in {county2, county3}.
# All three conditions are combined into one boolean mask over `data`.
data1 = data[
    data['income'].ge(200)
    & data['num_children'].ge(3)
    & data['county'].isin(["county2", "county3"])
]
len(data1)

46

In [6]:
# Persist the q1 selection (without the index column) for the cells that reload it.
data1.to_csv(path_or_buf="result_q1.csv", index=False)

## relax

In [18]:
# Reload the saved q1 result and show how many rows each race has.
data1 = pd.read_csv("result_q1.csv")
print(data1.groupby(by=['race']).size())

# race3 row count, that count scaled by relax_rate, and race3's share of all rows.
num = int(data1['race'].eq('race3').sum())
print(num, num * relax_rate, num / len(data1), sep="\n")

race
race1    16
race2    14
race3     9
dtype: int64
9
9.450000000000001
0.1956521739130435


In [14]:
# Scale whatever `num` currently holds by every relax multiplier, one value per line.
for scaled_count in (num * rate for rate in relax_rate_change):
    print(scaled_count)

9.9
10.799999999999999
11.700000000000001
12.6
13.5
14.4


In [16]:
# Row counts for every (race, age_group) combination present in data1.
print(data1.groupby(by=['race', 'age_group']).size())

# Count of race1 rows in age group1, followed by that count scaled by relax_rate.
num = int((data1['race'].eq('race1') & data1["age_group"].eq("group1")).sum())
print(num, num * relax_rate, sep="\n")

race   age_group
race1  group1       2
       group2       8
       group3       6
race2  group1       5
       group2       4
       group3       5
race3  group1       3
       group2       2
       group3       4
dtype: int64
2
2.1


## contract

In [11]:
# race2 row count in data1, then the same count scaled by contract_rate.
num = int(data1['race'].eq('race2').sum())
print(num, num * contract_rate, sep="\n")

14
13.299999999999999


In [12]:
# Scale whatever `num` currently holds by every contract multiplier, one value per line.
for scaled_count in (num * rate for rate in contract_rate_change):
    print(scaled_count)

5.6000000000000005
7.0
8.4
9.799999999999999
11.200000000000001
12.6


In [13]:
# Contract bound for age_group "group2" within the q1 result.
# The count is taken from data1 (the q1 selection), like every other q1 cell;
# counting in the full table `data` gave 298, which cannot bound a 46-row result.
num = len(data1[(data1['age_group'] == "group2")])
print(num)
print(num * contract_rate)

15
14.25


## refine

In [14]:
# race2 row count in data1, then the same count multiplied by 1.05.
num = data1.loc[data1['race'].eq('race2')].shape[0]
print(num, num * 1.05, sep="\n")

14
14.700000000000001


In [15]:
# Multiply whatever `num` currently holds by each upward factor, one value per line.
for scaled_count in (num * factor for factor in (1.02, 1.04, 1.06, 1.08, 1.10, 1.12)):
    print(scaled_count)

14.280000000000001
14.56
14.84
15.120000000000001
15.400000000000002
15.680000000000001


In [16]:

# Rows of data1 in age group2, then the same count multiplied by 0.95.
num = int(data1["age_group"].eq("group2").sum())
print(num, num * 0.95, sep="\n")


15
14.25


In [17]:
# Downward factors; each one is applied to whatever `num` currently holds.
s = [0.88, 0.90, 0.92, 0.94, 0.96, 0.98]
for scaled_count in (num * factor for factor in s):
    print(scaled_count)

13.2
13.5
13.8
14.1
14.399999999999999
14.7


# q2

In [18]:

# Read the input CSV into `data` again before the q2 selection; shows the row count.
data_file = r"../../InputData/Pipelines/healthcare/incomeK/before_selection_incomeK.csv"
data = pd.read_csv(filepath_or_buffer=data_file)
data.shape[0]


887

In [19]:
# q2 selection: income >= 150, num_children <= 4, complications <= 8,
# restricted to county2 or county4.
data2 = data[(data['income'] >= 150) & (data['num_children'] <= 4) & (data['complications'] <= 8)]
data2 = data2[(data2['county'] == "county2") | (data2['county'] == "county4")]
# Persist the selection so the q2 cells that read "result_q2.csv" also work on a
# fresh kernel; this mirrors how the q1 result is written to "result_q1.csv".
data2.to_csv("result_q2.csv", index=False)
len(data2)

89

## relax

In [20]:
# Load the saved q2 result into data1, then report the race2 row count and
# that count scaled by relax_rate.
data1 = pd.read_csv(filepath_or_buffer="result_q2.csv")
num = int(data1['race'].eq('race2').sum())
print(num, num * relax_rate, sep="\n")

33
34.65


In [21]:
# Scale whatever `num` currently holds by every relax multiplier, one value per line.
for scaled_count in (num * rate for rate in relax_rate_change):
    print(scaled_count)

36.300000000000004
39.6
42.9
46.199999999999996
49.5
52.800000000000004


In [22]:

# Count of race1 rows in age group1 within data1, then that count scaled by relax_rate.
num = int((data1['race'].eq('race1') & data1["age_group"].eq("group1")).sum())
print(num, num * relax_rate, sep="\n")

13
13.65


## contract

In [23]:
# race2 row count in data1, then the same count scaled by contract_rate.
num = int(data1['race'].eq('race2').sum())
print(num, num * contract_rate, sep="\n")

33
31.349999999999998


In [24]:
# Scale whatever `num` currently holds by every contract multiplier, one value per line.
for scaled_count in (num * rate for rate in contract_rate_change):
    print(scaled_count)

13.200000000000001
16.5
19.8
23.099999999999998
26.400000000000002
29.7


In [25]:
# Rows of data1 in age group2, then the same count scaled by contract_rate.
num = int(data1['age_group'].eq('group2').sum())
print(num, num * contract_rate, sep="\n")

32
30.4


## refine

In [26]:
# race2 row count in data1, then the same count multiplied by 1.05.
num = data1.loc[data1['race'].eq('race2')].shape[0]
print(num, num * 1.05, sep="\n")

33
34.65


In [27]:
# Multiply whatever `num` currently holds by each upward factor, one value per line.
for scaled_count in (num * factor for factor in (1.02, 1.04, 1.06, 1.08, 1.10, 1.12)):
    print(scaled_count)

33.660000000000004
34.32
34.980000000000004
35.64
36.300000000000004
36.96


In [28]:

# Rows of data1 in age group2, then the same count multiplied by 0.95.
num = int(data1["age_group"].eq("group2").sum())
print(num, num * 0.95, sep="\n")


32
30.4
