In [None]:
import pandas as pd
import os
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Mount Google Drive at /content/drive so the data files can be read (Colab-only module).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Folder holding adult.data / adult.test; the processed CSVs are written here as well.
# NOTE(review): absolute, user-specific Drive path - adjust when running in another account.
datadir = '/content/drive/MyDrive/UCB Files/Research/DRO/Code/FairnessML/data/adult/'

In [None]:
# Column names in the order used by the raw, header-less UCI files.
col_names = ['age', 'work', 'fnlwgt', 'education', 'edu-num', 'marry', 'occupation', 'relationship', 'race', 'sex','capital-gain', 'capital-loss','hours-per-week','country','income' ]

# Both files share the same layout, so they are read with the same options.
read_opts = dict(sep = ',', index_col=False, names = col_names)
train_path = datadir + 'adult.data'
test_path = datadir + 'adult.test'
df = pd.read_csv(train_path, **read_opts)
df_test = pd.read_csv(test_path, **read_opts)

# New column order: sex first, then the numeric columns, then the categorical
# columns, with the income label last. Note that col_names is rebound to this
# reordered list as well.
col_reorder = col_names = ['sex','age', 'fnlwgt', 'edu-num', 'capital-gain', 'capital-loss','hours-per-week', 'work', 'education' , 'marry', 'occupation', 'relationship', 'race','country','income' ]
df, df_test = df[col_reorder], df_test[col_reorder]

print('Training Set Size:',df.shape)
print('Test Set Size:', df_test.shape)


Training Set Size: (32561, 15)
Test Set Size: (16282, 15)


In [None]:
# Stack the train and test frames so cleaning and encoding are applied to both at once.
# join='inner' keeps only the columns present in both frames.
# NOTE: df is rebound here, so running this cell a second time appends df_test again.
df = pd.concat([df,df_test], join='inner')
df.shape

(48843, 15)

In [None]:
# Clean the combined frame in one chain:
#   * sex becomes 1 for ' Male' and 0 for anything else,
#   * ' ?' entries are turned into NaN and every row containing a NaN is dropped,
#   * age is cast to int and the index is rebuilt as 0..n-1.
df = (
    df.assign(sex=(df['sex'] == ' Male').astype(int))
      .replace(' ?', np.nan)
      .dropna()
      .assign(age=lambda frame: frame['age'].astype(int))
      .reset_index(drop=True)
)

df.head(20)

Unnamed: 0,sex,age,fnlwgt,edu-num,capital-gain,capital-loss,hours-per-week,work,education,marry,occupation,relationship,race,country,income
0,1,39,77516.0,13.0,2174.0,0.0,40.0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,United-States,<=50K
1,1,50,83311.0,13.0,0.0,0.0,13.0,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,United-States,<=50K
2,1,38,215646.0,9.0,0.0,0.0,40.0,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,United-States,<=50K
3,1,53,234721.0,7.0,0.0,0.0,40.0,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,United-States,<=50K
4,0,28,338409.0,13.0,0.0,0.0,40.0,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Cuba,<=50K
5,0,37,284582.0,14.0,0.0,0.0,40.0,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,United-States,<=50K
6,0,49,160187.0,5.0,0.0,0.0,16.0,Private,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Jamaica,<=50K
7,1,52,209642.0,9.0,0.0,0.0,45.0,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,United-States,>50K
8,0,31,45781.0,14.0,14084.0,0.0,50.0,Private,Masters,Never-married,Prof-specialty,Not-in-family,White,United-States,>50K
9,1,42,159449.0,13.0,5178.0,0.0,40.0,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,United-States,>50K


In [None]:
# Split the columns into the continuous features and everything else.
# No scaling is done in this cell; the continuous columns are rescaled in later cells.
cont_vars = ['age', 'fnlwgt', 'edu-num', 'capital-gain', 'capital-loss', 'hours-per-week']
# Every remaining column (sex, the categorical columns and the income label),
# listed in col_names order. A set difference would give the same members but
# in an order that can change from one Python session to the next, because
# string hashing is randomized, making the column order of df_bin unstable.
binary_vars = [col for col in col_names if col not in cont_vars]
df_cont = df[cont_vars]
df_bin = df[binary_vars]
df_cont.head(20)

Unnamed: 0,age,fnlwgt,edu-num,capital-gain,capital-loss,hours-per-week
0,39,77516.0,13.0,2174.0,0.0,40.0
1,50,83311.0,13.0,0.0,0.0,13.0
2,38,215646.0,9.0,0.0,0.0,40.0
3,53,234721.0,7.0,0.0,0.0,40.0
4,28,338409.0,13.0,0.0,0.0,40.0
5,37,284582.0,14.0,0.0,0.0,40.0
6,49,160187.0,5.0,0.0,0.0,16.0
7,52,209642.0,9.0,0.0,0.0,45.0
8,31,45781.0,14.0,14084.0,0.0,50.0
9,42,159449.0,13.0,5178.0,0.0,40.0


In [None]:
#Create Dummy Variables and response y
# Response: 1 when income is above 50K, 0 otherwise.
# Rows that originate from adult.test spell the label with a trailing period
# (' >50K.'), whereas adult.data uses ' >50K'. Both spellings are accepted here;
# comparing against ' >50K' alone would silently label every test-set row as 0.
is_high_income = (df['income'] == ' >50K') | (df['income'] == ' >50K.')
y = is_high_income.astype(int)
# Feature frame: everything except the label column.
train_df = df.drop(['income'],axis = 1)
train_df.head(5)

Unnamed: 0,sex,age,fnlwgt,edu-num,capital-gain,capital-loss,hours-per-week,work,education,marry,occupation,relationship,race,country
0,1,39,77516.0,13.0,2174.0,0.0,40.0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,United-States
1,1,50,83311.0,13.0,0.0,0.0,13.0,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,United-States
2,1,38,215646.0,9.0,0.0,0.0,40.0,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,United-States
3,1,53,234721.0,7.0,0.0,0.0,40.0,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,United-States
4,0,28,338409.0,13.0,0.0,0.0,40.0,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Cuba


In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first level
# of each categorical column.
# dtype=int pins the indicators to 0/1 integers. pandas >= 2.0 defaults to bool
# indicators, which would write True/False into the saved CSV and can turn the
# mixed int/bool frame into an object array under to_numpy().
df_dummies = pd.get_dummies(train_df,drop_first = True, dtype=int)

In [None]:
# Remove the continuous columns so df_dummies holds only sex and the indicator
# columns; the continuous features live in df_cont.
df_dummies = df_dummies.drop(columns=cont_vars)
df_dummies.head(5)

Unnamed: 0,sex,work_ Local-gov,work_ Private,work_ Self-emp-inc,work_ Self-emp-not-inc,work_ State-gov,work_ Without-pay,education_ 11th,education_ 12th,education_ 1st-4th,...,country_ Portugal,country_ Puerto-Rico,country_ Scotland,country_ South,country_ Taiwan,country_ Thailand,country_ Trinadad&Tobago,country_ United-States,country_ Vietnam,country_ Yugoslavia
0,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Sanity check: number of labels (should equal the number of rows kept after cleaning).
y.shape

(45222,)

In [None]:

# Build the polynomial-feature version of the data set and save it to disk.
# For each degree: expand the continuous columns, min-max scale the expansion,
# join it with the indicator columns, attach the response and write a CSV.
deg_list = [4]
np_cont = df_cont.to_numpy()
for deg in deg_list:
  poly = PolynomialFeatures(degree=deg)
  scaler = preprocessing.MinMaxScaler()
  deg_data = poly.fit_transform(np_cont)
  # Rescale every expanded column except column 0 to the range [0,1];
  # column 0 is left untouched by the scaler.
  deg_data[:, 1:] = scaler.fit_transform(deg_data[:, 1:])
  print('deg_data_shape:',deg_data.shape)
  # Put the expanded features next to the indicator columns (axis=1), round to 6 decimals.
  poly_frame = pd.DataFrame(deg_data)
  df_new = pd.concat([df_dummies, poly_frame], axis=1).round(decimals=6)
  df_new['y'] = y
  out_path = datadir+'adult_processed_poly={}.csv'.format(deg)
  df_new.to_csv(out_path)


deg_data_shape: (45222, 210)


In [None]:
# Share of entries in the processed matrix that are exactly zero.
df_arr = df_new.to_numpy()
n, d = df_arr.shape
(df_arr == 0).sum() / (n * d)

0.7198865074785726

In [None]:
# Install CVXPY and the MOSEK solver package used by the optimisation cells below.
# NOTE(review): versions are not pinned, and MOSEK presumably needs a license
# to be available in this runtime - confirm before relying on the solve cell.
!pip install cvxpy
!pip install Mosek

In [None]:
import cvxpy as cp

In [None]:
# n: number of training samples to draw for the optimisation problem.
n = 10000 #number of samples to use
# Array version of the indicator frame (sex plus the one-hot columns).
df_np = df_dummies.to_numpy()
df_np.shape

(30162, 92)

In [None]:
# Build the design matrix and label vector, then draw the training subsample.
#
# df_dummies (and therefore df_np) is derived from train_df, which has the
# 'income' column removed, so the last column of df_np is a country indicator,
# not the label. Every column of df_np is a feature (column 0 is sex) and the
# labels are taken from the response y that was built alongside train_df.
n_pool = 30000  # only the first 30000 rows are eligible for the split
X = df_np[:n_pool]
# np.asarray accepts both the original Series and an already converted array,
# so running this cell again gives the same result.
y = np.asarray(y)[:n_pool]
# An integer train_size requests exactly n training rows, which matches the
# length-n weight vector p; a float fraction is floored and can come out as n-1.
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size = n, shuffle = True, random_state = 77)

In [None]:
# Ingredients of the optimisation problem.
np.random.seed(1)
c = 0.05                   # right-hand side used in the constraints
p = np.full(n, 1.0 / n)    # uniform weight 1/n for each of the n samples
d = X_train.shape[1]       # number of feature columns
theta = cp.Variable(d)     # coefficient vector to optimise

In [None]:
# Objective: p-weighted sum of  logistic(s_i) - (1 - y_i) * s_i  with linear score
# s = X_train @ theta, i.e. a weighted logistic-regression loss (cp.logistic(s) is log(1 + exp(s))).
# NOTE(review): the linear term uses (1 - y_train) rather than y_train, so a
# larger score favours the y == 0 class - confirm this sign convention is intended.
obj = -p @ (cp.multiply(1-y_train, X_train @ theta) -cp.logistic(X_train @ theta))

In [None]:
# Mean-centre the first feature column (sex) over the training sample.
sex_column = X_train[:, 0]
sex_normalized = sex_column - sex_column.mean()

In [None]:
# Constrain the p-weighted product of the centred sex column and the linear
# score to lie in [-c, c], then minimise the objective with MOSEK.
weighted_cov = p @ cp.multiply(sex_normalized, X_train @ theta)
constraints = [weighted_cov <= c, -weighted_cov <= c]
problem = cp.Problem(cp.Minimize(obj), constraints)
problem.solve(solver=cp.MOSEK, verbose = True)

                                     CVXPY                                     
                                     v1.2.3                                    
(CVXPY) Jan 15 11:08:58 PM: Your problem has 97 variables, 2 constraints, and 0 parameters.
(CVXPY) Jan 15 11:08:58 PM: It is compliant with the following grammars: DCP, DQCP
(CVXPY) Jan 15 11:08:58 PM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Jan 15 11:08:58 PM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
-------------------------------------------------------------------------------
                                  Compilation                                  
-------------------------------------------------------------------------------
(CVXPY) Jan 15 11:08:58 PM: Compiling problem (target solver=MOSEK).
(CVXPY) Jan 15 11:08:58 PM: Reduction chain: Dcp2Cone -> CvxAttr2Constr -> ConeMatrixStuffing -

Error: ignored

In [None]:
# Objective value reported by the solve above.
problem.value

In [None]:
# Display the training design matrix for a quick visual check.
X_train

In [None]:
#TODO: Add polynomial terms to our data (placeholder - this cell has no code yet)



In [None]:
# Rescale the continuous columns of df in place:
#   * capital-gain and capital-loss are divided by their own maximum value,
#   * the other continuous columns are standardised (subtract the mean, divide by the std).
for col in ['capital-gain', 'capital-loss']:
  df[col] = df[col] / df[col].max()
std_cols = ['age', 'fnlwgt', 'edu-num', 'hours-per-week']
scaler = preprocessing.StandardScaler()
df[std_cols] = scaler.fit_transform(df[std_cols])

99999
