In [3]:
import os
import time

import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
# Global plot styling: 'whitegrid' theme with axes edge colour '.2',
# and the 'notebook' context with fonts scaled by 1.4.
sns.set_style(style='whitegrid', rc={'axes.edgecolor': '.2'})
sns.set_context(context='notebook', font_scale=1.4)

from fairlearn.datasets import fetch_acs_income

In [4]:
# Load the ACS income CSV; index_col=False keeps the first column as data
# instead of using it as the row index.
data = pd.read_csv(
    '../../data/acs_income.csv',
    index_col=False,
)

In [5]:
# Plain-text summary statistics (count, mean, std, min, quartiles, max).
summary_stats = data.describe()
print(summary_stats)


               AGEP           COW          SCHL           MAR          OCCP  \
count  1.664500e+06  1.664500e+06  1.664500e+06  1.664500e+06  1.664500e+06   
mean   4.341127e+01  2.077500e+00  1.861814e+01  2.521997e+00  4.180517e+03   
std    1.530203e+01  1.825338e+00  3.297826e+00  1.796720e+00  2.658717e+03   
min    1.700000e+01  1.000000e+00  1.000000e+00  1.000000e+00  1.000000e+01   
25%    3.000000e+01  1.000000e+00  1.600000e+01  1.000000e+00  2.205000e+03   
50%    4.300000e+01  1.000000e+00  1.900000e+01  1.000000e+00  4.200000e+03   
75%    5.600000e+01  3.000000e+00  2.100000e+01  5.000000e+00  5.740000e+03   
max    9.600000e+01  8.000000e+00  2.400000e+01  5.000000e+00  9.830000e+03   

               POBP          RELP          WKHP         RAC1P         PINCP  
count  1.664500e+06  1.664500e+06  1.664500e+06  1.664500e+06  1.664500e+06  
mean   6.581708e+01  2.241254e+00  3.833390e+01  1.874745e+00  5.666386e+04  
std    9.306245e+01  4.385288e+00  1.308073e+01  2.084

In [6]:
# Subset of rows with SEX == "F" and MAR == 1; print how many rows it has.
sex_is_f = data["SEX"].eq("F")
mar_is_1 = data["MAR"].eq(1)
data1 = data.loc[sex_is_f & mar_is_1]
print(data1.shape[0])

414019


In [7]:
# Subset of rows with RAC1P == 2; print how many rows it has.
rac1p_is_2 = data["RAC1P"].eq(2)
data2 = data.loc[rac1p_is_2]
print(data2.shape[0])

147573


In [8]:
# Number of rows whose WKHP is at least 23 (counted straight from the boolean mask).
print(int(data['WKHP'].ge(23).sum()))

1444173


In [9]:
# Restrict `data` (overwriting it) to rows with WKHP >= 44, SCHL >= 21
# and COW equal to 3, 4 or 5, then show the remaining row count.
keep_rows = (
    data['WKHP'].ge(44)
    & data['SCHL'].ge(21)
    & data['COW'].isin([3, 4, 5])
)
data = data[keep_rows]
len(data)


34768

In [91]:
# Multipliers applied to subset sizes in the cells below: one default per
# direction, plus the list of alternative multipliers tried for each.
relax_rate, contract_rate = 1.3, 0.8
relax_rate_change = [tenths / 10 for tenths in range(11, 17)]    # 1.1, 1.2, ..., 1.6
contract_rate_change = [tenths / 10 for tenths in range(4, 10)]  # 0.4, 0.5, ..., 0.9

# relax

In [92]:
# Rows with SEX == "F" and MAR == 1: print the subset size,
# then that size multiplied by relax_rate.
data1 = data[data["SEX"].eq("F") & data["MAR"].eq(1)]
subset_size = len(data1)
print(subset_size)
print(subset_size * relax_rate)

12009
15611.7


In [93]:
# Size of data1 multiplied by each candidate relax multiplier.
subset_size = len(data1)
for factor in relax_rate_change:
    scaled_size = subset_size * factor
    print(scaled_size)

13209.900000000001
14410.8
15611.7
16812.6
18013.5
19214.4


In [94]:
# Size of data1 multiplied by a wider set of multipliers (1.2 up to 2.2).
wider_factors = (1.2, 1.4, 1.6, 1.8, 2.0, 2.2)
subset_size = len(data1)
for factor in wider_factors:
    print(subset_size * factor)

14410.8
16812.6
19214.4
21616.2
24018.0
26419.800000000003


In [95]:
# Rows with RAC1P == 2: print the subset size,
# then that size multiplied by relax_rate.
rac1p_is_2 = data["RAC1P"] == 2
data2 = data.loc[rac1p_is_2]
subset_size = data2.shape[0]
print(subset_size)
print(subset_size * relax_rate)

2244
2917.2000000000003


# contract

In [100]:
# Rows with SEX == "M" and RELP == 1: print the subset size,
# then that size multiplied by contract_rate (one value per line).
data1 = data.loc[data["SEX"].eq("M") & data["RELP"].eq(1)]
subset_size = len(data1)
print(subset_size, subset_size * contract_rate, sep="\n")

4145
3316.0


In [101]:
# Size of data1 multiplied by each candidate contract multiplier.
subset_size = len(data1)
for factor in contract_rate_change:
    scaled_size = subset_size * factor
    print(scaled_size)

1658.0
2072.5
2487.0
2901.5
3316.0
3730.5


In [102]:
# Rows with MAR == 3 and age_bucket == "30-60": print the subset size,
# then that size multiplied by contract_rate.
mar_is_3 = data["MAR"].eq(3)
in_age_bucket = data["age_bucket"].eq("30-60")
data1 = data.loc[mar_is_3 & in_age_bucket]
subset_size = data1.shape[0]
print(subset_size)
print(subset_size * contract_rate)

2698
2158.4


# refine

In [112]:
# Rows with SEX == "F" and MAR == 1: print the subset size,
# then that size multiplied by 1.05.
matches = data["SEX"].eq("F") & data["MAR"].eq(1)
data1 = data[matches]
subset_size = len(data1)
print(subset_size)
print(subset_size * 1.05)

12009
12609.45


In [113]:
# Size of data1 multiplied by fine-grained growth multipliers.
s = [hundredths / 100 for hundredths in range(102, 113, 2)]  # 1.02, 1.04, ..., 1.12
subset_size = len(data1)
for r in s:
    scaled_size = subset_size * r
    print(scaled_size)

12249.18
12489.36
12729.54
12969.720000000001
13209.900000000001
13450.080000000002


In [110]:
# Rows with SEX == "M" and RAC1P == 6: print the subset size,
# then that size multiplied by 0.95.
matches = data["SEX"].eq("M") & data["RAC1P"].eq(6)
data1 = data.loc[matches]
subset_size = data1.shape[0]
print(subset_size, subset_size * 0.95, sep="\n")

726
689.6999999999999


In [111]:
# Size of data1 multiplied by fine-grained shrink multipliers.
s = [hundredths / 100 for hundredths in range(88, 99, 2)]  # 0.88, 0.90, ..., 0.98
subset_size = len(data1)
for r in s:
    scaled_size = subset_size * r
    print(scaled_size)

638.88
653.4
667.9200000000001
682.4399999999999
696.9599999999999
711.48


# query selectivity

In [12]:
# Reload the complete CSV into `data`. The WKHP >= 44, SCHL >= 21 and
# COW in {3, 4, 5} row filters are not applied here, so the cells that
# follow see every row of the file.
data = pd.read_csv(
    '../../data/acs_income.csv',
    index_col=False,
)


In [13]:
# WKHP quantiles at 0%, 10%, ..., 100%: compute them all first, then print.
wkhp_deciles = [(i, data['WKHP'].quantile(i / 100)) for i in range(0, 101, 10)]
for i, value in wkhp_deciles:
    print(f"{i}% percentile: {value}")

0% percentile: 1.0
10% percentile: 20.0
20% percentile: 30.0
30% percentile: 38.0
40% percentile: 40.0
50% percentile: 40.0
60% percentile: 40.0
70% percentile: 40.0
80% percentile: 45.0
90% percentile: 50.0
100% percentile: 99.0


In [14]:
# SCHL quantiles at 0%, 10%, ..., 100%, keyed by the percent point.
percent_points = range(0, 101, 10)
schl_deciles = {pct: data['SCHL'].quantile(pct / 100) for pct in percent_points}
for i, value in schl_deciles.items():
    print(f"{i}% percentile: {value}")

0% percentile: 1.0
10% percentile: 16.0
20% percentile: 16.0
30% percentile: 17.0
40% percentile: 19.0
50% percentile: 19.0
60% percentile: 20.0
70% percentile: 21.0
80% percentile: 21.0
90% percentile: 22.0
100% percentile: 24.0
