In [2]:
import os
import time

import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
# Plot styling: seaborn 'whitegrid' style with the axes edge colour overridden
# to '.2', and the 'notebook' context with fonts scaled by 1.4.
sns.set_style('whitegrid', {'axes.edgecolor': '.2'})
sns.set_context('notebook', font_scale=1.4)

from fairlearn.datasets import fetch_acs_income

In [3]:
# Load the TX_data.csv file into `data`.
# index_col=False stops pandas from using the first CSV column as the index.
tx_csv_path = '../../data/TX_data.csv'
data = pd.read_csv(tx_csv_path, index_col=False)

In [4]:
# Summary statistics of the loaded frame, printed as plain text.
summary_stats = data.describe()
print(summary_stats)


                AGEP            COW           SCHL            MAR  \
count  135924.000000  135924.000000  135924.000000  135924.000000   
mean       42.964855       2.041023      18.249367       2.458528   
std        15.092504       1.821862       3.712312       1.770848   
min        17.000000       1.000000       1.000000       1.000000   
25%        30.000000       1.000000      16.000000       1.000000   
50%        43.000000       1.000000      19.000000       1.000000   
75%        55.000000       3.000000      21.000000       5.000000   
max        92.000000       8.000000      24.000000       5.000000   

                OCCP           POBP           RELP           WKHP  \
count  135924.000000  135924.000000  135924.000000  135924.000000   
mean     4199.161531      87.705578       2.007740      39.593037   
std      2657.135602     101.211506       4.028999      13.130812   
min        10.000000       1.000000       0.000000       1.000000   
25%      2205.000000      47.0000

In [5]:
# Rows with SEX == "F" and MAR == 1, and how many there are.
# NOTE(review): the recorded output of this cell is 0 — confirm that SEX is
# stored as the string "F" in this file rather than as some other code.
sex_is_f = data["SEX"] == "F"
mar_is_1 = data["MAR"] == 1
data1 = data[sex_is_f & mar_is_1]
print(len(data1))

0


In [6]:
# Rows whose RAC1P value equals 2, and their count.
data2 = data.loc[data["RAC1P"].eq(2)]
print(len(data2))

12529


In [7]:
# Number of rows with WKHP of at least 23.
wkhp_at_least_23 = data['WKHP'].ge(23)
print(len(data[wkhp_at_least_23]))

120324


In [8]:
# Keep only rows with WKHP >= 40 and SCHL >= 19 whose COW value is 3, 4 or 5.
# NOTE: this rebinds `data`, so every later cell that reads `data` sees only
# the filtered rows; the unfiltered frame is no longer reachable by name.
keep_rows = (
    data['WKHP'].ge(40)
    & data['SCHL'].ge(19)
    & data['COW'].isin([3, 4, 5])
)
data = data[keep_rows]
len(data)


12051

In [9]:
# Factor sweeps used by the relax / contract loops.
relax_rate_change = [1.1, 1.2, 1.3, 1.4, 1.5, 1.6]
contract_rate_change = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

# Single default factors used by the one-off relax / contract computations.
relax_rate, contract_rate = 1.3, 0.8

# Relax: scale a group's row count by a factor greater than 1

In [10]:
# Size of the SEX == "F", MAR == 1 group and that size scaled by relax_rate.
# NOTE(review): the recorded output is 0 / 0.0, i.e. the filter matched no
# rows here — confirm how SEX is encoded in this frame.
data1 = data.loc[(data["SEX"] == "F") & (data["MAR"] == 1)]
group_size = len(data1)
print(group_size)
print(group_size * relax_rate)

0
0.0


In [29]:
# Print the current data1 size scaled by each factor in relax_rate_change.
# NOTE(review): the recorded values (e.g. 51211.6 = 46556 * 1.1) imply data1
# had 46556 rows when this ran; presumably it was executed out of order —
# verify on a fresh kernel.
group_size = len(data1)
for rate in relax_rate_change:
    print(group_size * rate)

51211.600000000006
55867.2
60522.8
65178.399999999994
69834.0
74489.6


In [30]:
# Wider relax sweep: the current data1 size scaled by factors 1.2 through 2.2.
wide_relax_rates = [1.2, 1.4, 1.6, 1.8, 2.0, 2.2]
group_size = len(data1)
for rate in wide_relax_rates:
    print(group_size * rate)

55867.2
65178.399999999994
74489.6
83800.8
93112.0
102423.20000000001


In [31]:
# Size of the RAC1P == 2 group and that size scaled by relax_rate.
# NOTE(review): the recorded count (14970) is larger than the 12051 rows left
# by the WKHP/SCHL/COW filter, so `data` was presumably a different frame when
# this ran — verify on a fresh kernel.
data2 = data.loc[data["RAC1P"].eq(2)]
group_size = len(data2)
print(group_size)
print(group_size * relax_rate)

14970
19461.0


# Contract: scale a group's row count by a factor less than 1

In [32]:
# Size of the SEX == "M", RELP == 1 group and that size scaled by contract_rate.
sex_is_m = data["SEX"] == "M"
relp_is_1 = data["RELP"] == 1
data1 = data[sex_is_m & relp_is_1]
group_size = len(data1)
print(group_size)
print(group_size * contract_rate)

16052
12841.6


In [33]:
# Print the current data1 size scaled by each factor in contract_rate_change.
group_size = len(data1)
for rate in contract_rate_change:
    print(group_size * rate)

6420.8
8026.0
9631.199999999999
11236.4
12841.6
14446.800000000001


In [34]:
# Size of the MAR == 3, age_bucket == "30-60" group, and that size scaled by
# contract_rate.
# NOTE(review): no visible cell creates `age_bucket`; presumably it is a
# column of the loaded CSV — confirm.
mar_is_3 = data["MAR"] == 3
age_30_60 = data["age_bucket"] == "30-60"
data1 = data[mar_is_3 & age_30_60]
group_size = len(data1)
print(group_size)
print(group_size * contract_rate)

12104
9683.2


# Refine: scale a group's row count by a factor close to 1 (a few percent up or down)

In [35]:
# Size of the SEX == "F", MAR == 1 group and that size scaled up by 5%.
refine_up = 1.05
data1 = data.loc[(data["SEX"] == "F") & (data["MAR"] == 1)]
group_size = len(data1)
print(group_size)
print(group_size * refine_up)

46556
48883.8


In [36]:
# Upward refine sweep: the current data1 size scaled by 1.02 through 1.12.
refine_up_factors = [1.02, 1.04, 1.06, 1.08, 1.10, 1.12]
group_size = len(data1)
for factor in refine_up_factors:
    print(group_size * factor)

47487.12
48418.240000000005
49349.36
50280.48
51211.600000000006
52142.72000000001


In [37]:
# Size of the SEX == "M", RAC1P == 6 group and that size scaled down by 5%.
refine_down = 0.95
sex_is_m = data["SEX"] == "M"
rac1p_is_6 = data["RAC1P"] == 6
data1 = data[sex_is_m & rac1p_is_6]
group_size = len(data1)
print(group_size)
print(group_size * refine_down)

3672
3488.3999999999996


In [38]:
# Downward refine sweep: the current data1 size scaled by 0.88 through 0.98.
refine_down_factors = [0.88, 0.90, 0.92, 0.94, 0.96, 0.98]
group_size = len(data1)
for factor in refine_down_factors:
    print(group_size * factor)

3231.36
3304.8
3378.2400000000002
3451.68
3525.12
3598.56


# Query selectivity: decile values of the filter columns (WKHP, SCHL)

In [12]:
# Rebind `data` to the contents of acs_income.csv for the percentile cells
# that follow. index_col=False stops pandas from using the first CSV column
# as the index.
acs_csv_path = '../../data/acs_income.csv'
data = pd.read_csv(acs_csv_path, index_col=False)
# Disabled filter, kept for reference. Its WKHP / SCHL thresholds (44 / 21)
# differ from the 40 / 19 used on the TX data earlier in the notebook.
# data = data[(data['WKHP'] >= 44) & (data['SCHL'] >= 21)]
# data = data[(data['COW'] == 3) | (data['COW'] == 4) | (data['COW'] == 5)]
# len(data)


In [13]:
# Print WKHP at every 10th percentile from 0% to 100%.
wkhp = data['WKHP']
for pct in range(0, 101, 10):
    print(f"{pct}% percentile: {wkhp.quantile(pct / 100)}")

0% percentile: 1.0
10% percentile: 20.0
20% percentile: 30.0
30% percentile: 38.0
40% percentile: 40.0
50% percentile: 40.0
60% percentile: 40.0
70% percentile: 40.0
80% percentile: 45.0
90% percentile: 50.0
100% percentile: 99.0


In [None]:
# Print SCHL at every 10th percentile from 0% to 100%.
schl = data['SCHL']
for pct in range(0, 101, 10):
    print(f"{pct}% percentile: {schl.quantile(pct / 100)}")