In [14]:
import os
import time

import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
# Global plot styling: 'whitegrid' style with the axes edge colour overridden
# to '.2', and the 'notebook' context with fonts scaled by 1.4.
sns.set_style(style='whitegrid', rc={'axes.edgecolor': '.2'})
sns.set_context(context='notebook', font_scale=1.4)

from fairlearn.datasets import fetch_acs_income

In [15]:
# Load the ACS income table from a path relative to this notebook.
# index_col=False: do not take the row index from a CSV column.
acs_income_csv = '../../data/acs_income.csv'
data = pd.read_csv(acs_income_csv, index_col=False)

In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) of `data`;
# as the cell's last expression, the resulting table is what gets displayed.
data.describe()


Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,RAC1P,PINCP
count,1664500.0,1664500.0,1664500.0,1664500.0,1664500.0,1664500.0,1664500.0,1664500.0,1664500.0,1664500.0
mean,43.41127,2.0775,18.61814,2.521997,4180.517,65.81708,2.241254,38.3339,1.874745,56663.86
std,15.30203,1.825338,3.297826,1.79672,2658.717,93.06245,4.385288,13.08073,2.084384,73067.45
min,17.0,1.0,1.0,1.0,10.0,1.0,0.0,1.0,1.0,104.0
25%,30.0,1.0,16.0,1.0,2205.0,18.0,0.0,35.0,1.0,20000.0
50%,43.0,1.0,19.0,1.0,4200.0,36.0,1.0,40.0,1.0,39000.0
75%,56.0,3.0,21.0,5.0,5740.0,48.0,2.0,44.0,1.0,68000.0
max,96.0,8.0,24.0,5.0,9830.0,554.0,17.0,99.0,9.0,1423000.0


In [17]:
# Subgroup size: rows with SEX == "F" and MAR == 1.
sex_is_f = data["SEX"].eq("F")
mar_is_1 = data["MAR"].eq(1)
data1 = data.loc[sex_is_f & mar_is_1]
print(data1.shape[0])

414019


In [18]:
# Subgroup size: rows with RAC1P == 2.
rac1p_is_2 = data["RAC1P"].eq(2)
data2 = data.loc[rac1p_is_2]
print(data2.shape[0])

147573


In [19]:
# Number of rows whose WKHP is at least 23: count the True entries of the
# boolean mask rather than building the filtered frame just to measure it.
wkhp_at_least_23 = data['WKHP'].ge(23)
print(int(wkhp_at_least_23.sum()))

1444173


# query

In [42]:
# Base query: keep rows with WKHP <= 40 and PINCP >= 39000 whose OCCP_bucket
# is one of the buckets listed below.
#
# NOTE: this rebinds `data`, so every later cell that reads `data` sees only
# the filtered rows until the CSV is loaded again.
selected_occp_buckets = ["CSSO", "HPTO", "PSO", "PCSO", "CEO"]

# One combined mask instead of two successive filters; `isin` replaces the
# chain of `== ... | == ...` comparisons and selects exactly the same rows.
data = data[
    (data['WKHP'] <= 40)
    & (data['PINCP'] >= 39000)
    & data['OCCP_bucket'].isin(selected_occp_buckets)
]

# Displayed as the cell output: number of rows that satisfy the query.
len(data)


105768

In [21]:
# Relaxation: multiplier applied to a subgroup's row count in the "relax"
# cells, plus the sweep of alternative multipliers.
relax_rate = 1.3
relax_rate_change = [
    1.1,
    1.2,
    1.3,
    1.4,
    1.5,
    1.6,
]

# Contraction: multiplier applied to a subgroup's row count in the "contract"
# cells, plus the sweep of alternative multipliers.
contract_rate = 0.8
contract_rate_change = [
    0.4,
    0.5,
    0.6,
    0.7,
    0.8,
    0.9,
]

# relax

In [36]:
# Subgroup: age_bucket == "60+" and RAC1P == 2.
# Prints its row count, then that count scaled by relax_rate.
age_is_60_plus = data["age_bucket"].eq("60+")
rac1p_is_2 = data["RAC1P"].eq(2)
data1 = data.loc[age_is_60_plus & rac1p_is_2]
subgroup_size = len(data1)
print(subgroup_size)
print(subgroup_size * relax_rate)

18267
23747.100000000002


In [37]:
# Row count of data1 scaled by each candidate relax rate, one per line.
base_count = len(data1)
for rate in relax_rate_change:
    print(base_count * rate)

20093.7
21920.399999999998
23747.100000000002
25573.8
27400.5
29227.2


In [38]:
# Same sweep as the relax-rate loop, over a wider set of multipliers
# (1.2 up to 2.2 in steps of 0.2).
wide_relax_rates = [1.2, 1.4, 1.6, 1.8, 2.0, 2.2]
base_count = len(data1)
for rate in wide_relax_rates:
    print(base_count * rate)

21920.399999999998
25573.8
29227.2
32880.6
36534.0
40187.4


In [39]:
# Subgroup: RELP == 2 and MAR == 3.
# Prints its row count, then that count scaled by relax_rate.
relp_is_2 = data["RELP"].eq(2)
mar_is_3 = data["MAR"].eq(3)
data2 = data.loc[relp_is_2 & mar_is_3]
subgroup_size = len(data2)
print(subgroup_size)
print(subgroup_size * relax_rate)

11083
14407.9


In [41]:
# Subgroup: SEX == 'M' and RAC1P == 9.
# Prints its row count, then that count scaled by relax_rate.
sex_is_m = data["SEX"].eq('M')
rac1p_is_9 = data["RAC1P"].eq(9)
data3 = data.loc[sex_is_m & rac1p_is_9]
subgroup_size = len(data3)
print(subgroup_size)
print(subgroup_size * relax_rate)

21068
27388.4


# contract

In [26]:
# Subgroup: SEX == "M" and RELP == 1.
# Prints its row count, then that count scaled by contract_rate.
sex_is_m = data["SEX"].eq("M")
relp_is_1 = data["RELP"].eq(1)
data1 = data.loc[sex_is_m & relp_is_1]
subgroup_size = len(data1)
print(subgroup_size)
print(subgroup_size * contract_rate)

14471
11576.800000000001


In [27]:
# Row count of data1 scaled by each candidate contract rate, one per line.
base_count = len(data1)
for rate in contract_rate_change:
    print(base_count * rate)

5788.400000000001
7235.5
8682.6
10129.699999999999
11576.800000000001
13023.9


In [28]:
# Subgroup: MAR == 3 and age_bucket == "30-60".
# Prints its row count, then that count scaled by contract_rate.
mar_is_3 = data["MAR"].eq(3)
age_is_30_60 = data["age_bucket"].eq("30-60")
data1 = data.loc[mar_is_3 & age_is_30_60]
subgroup_size = len(data1)
print(subgroup_size)
print(subgroup_size * contract_rate)

9419
7535.200000000001


# refine

In [29]:
# Subgroup: SEX == "F" and MAR == 1.
# Prints its row count, then that count increased by 5% (factor 1.05).
sex_is_f = data["SEX"].eq("F")
mar_is_1 = data["MAR"].eq(1)
data1 = data.loc[sex_is_f & mar_is_1]
subgroup_size = len(data1)
print(subgroup_size)
print(subgroup_size * 1.05)

34966
36714.3


In [30]:
# Row count of data1 scaled by each upward refinement factor
# (1.02 up to 1.12 in steps of 0.02), one per line.
s = [1.02, 1.04, 1.06, 1.08, 1.10, 1.12]
base_count = len(data1)
for factor in s:
    print(base_count * factor)

35665.32
36364.64
37063.96
37763.280000000006
38462.600000000006
39161.920000000006


In [31]:
# Subgroup: SEX == "M" and RAC1P == 6.
# Prints its row count, then that count reduced by 5% (factor 0.95).
sex_is_m = data["SEX"].eq("M")
rac1p_is_6 = data["RAC1P"].eq(6)
data1 = data.loc[sex_is_m & rac1p_is_6]
subgroup_size = len(data1)
print(subgroup_size)
print(subgroup_size * 0.95)

2674
2540.2999999999997


In [32]:
# Row count of data1 scaled by each downward refinement factor
# (0.88 up to 0.98 in steps of 0.02), one per line.
s = [0.88, 0.90, 0.92, 0.94, 0.96, 0.98]
base_count = len(data1)
for factor in s:
    print(base_count * factor)

2353.12
2406.6
2460.08
2513.56
2567.04
2620.52


# query selectivity

In [33]:
# Reload the unfiltered table: `data` was narrowed by the query cell, and the
# percentile cells below read whatever `data` is bound to here.
acs_income_csv = '../../data/acs_income.csv'
data = pd.read_csv(acs_income_csv, index_col=False)

# Disabled filter, left commented out (not executed):
# data = data[(data['WKHP'] >= 44) & (data['SCHL'] >= 21)]
# data = data[(data['COW'] == 3) | (data['COW'] == 4) | (data['COW'] == 5)]
# len(data)


In [34]:
# Deciles of WKHP: print the 0%, 10%, ..., 100% quantiles, one per line.
wkhp = data['WKHP']  # column looked up once, outside the loop
for pct in range(0, 101, 10):
    cutoff = wkhp.quantile(pct / 100)
    print(f"{pct}% percentile: {cutoff}")

0% percentile: 1.0
10% percentile: 20.0
20% percentile: 30.0
30% percentile: 38.0
40% percentile: 40.0
50% percentile: 40.0
60% percentile: 40.0
70% percentile: 40.0
80% percentile: 45.0
90% percentile: 50.0
100% percentile: 99.0


In [35]:
# Deciles of SCHL: print the 0%, 10%, ..., 100% quantiles, one per line.
schl = data['SCHL']  # column looked up once, outside the loop
for i in range(0, 101, 10):
    value = schl.quantile(i / 100)
    print(f"{i}% percentile: {value}")

0% percentile: 1.0
10% percentile: 16.0
20% percentile: 16.0
30% percentile: 17.0
40% percentile: 19.0
50% percentile: 19.0
60% percentile: 20.0
70% percentile: 21.0
80% percentile: 21.0
90% percentile: 22.0
100% percentile: 24.0
