In [19]:
import pandas as pd 
import numpy as np 
from pathlib import Path

from imblearn.under_sampling import RandomUnderSampler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, cross_validate
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split



In [5]:
outputBasePath = Path("../../data/results/ACSIncome_USA_2018_binned_imbalanced_16645/").resolve()
populationPath = outputBasePath / "kAnon/k10/output_sample.csv"
postSamplePath = outputBasePath / "SSample/k10/B(0.5)/B(0.5)_sample.csv"
settingsPath = outputBasePath / "kAnon/k10/settings.csv"

population_df = pd.read_csv(populationPath, delimiter=';')
sample_df = pd.read_csv(postSamplePath, delimiter=';')

In [6]:
categorical, numerical, default_target = ['COW', 'SCHL', 'MAR', 'OCCP', 'POBP', 'RELP', 'SEX', 'RAC1P'], ['AGEP','WKHP'], 'PINCP'
verbose = True

In [7]:
class RangeTransformer(BaseEstimator, TransformerMixin):

    @staticmethod
    def transform_column(value):
        if isinstance(value, str) and '*' not in value:
            tmp = value[1:-1].split(', ')
            min, max = int(tmp[0]), int(tmp[1])
            return (min + max) / 2
        else:
            return value

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        for column in X:
            unique = X[column].unique()
            if len(unique) == 1 and '*' in unique:
                # column is suppressed
                X[column] = 0
            elif X[column].dtype == 'O':
                X[column] = X[column].transform(self.transform_column)
        return X
    
class SuppressedFilter(BaseEstimator, TransformerMixin):
    def __init__(self, qid):
        self.qid = qid

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        if self.qid is not None:
            X = X.copy()
            groups = X.groupby(self.qid)
            key = tuple(['*'] * len(self.qid))
            if key in groups.groups:
                suppressed_rows = groups.get_group(name=key).index
                X.drop(suppressed_rows, inplace=True)
        return X

In [8]:
def read_attributes(settingsPath: str, attributeType: str) -> list:
  settings = pd.read_csv(settingsPath, delimiter=';')
  attributes = settings[attributeType].to_list()[0][1:-1].split(', ')
  return attributes

In [9]:
def drop_suppressed(df, qid):
    if qid is not None:
        suppressed_rows = df.groupby(qid).get_group(name=tuple(['*'] * len(qid))).index
        df.drop(suppressed_rows, inplace=True)
    return df.infer_objects()

In [10]:
def get_pipelines2(qid_list):
  pipeline = Pipeline([
    ('filter_suppressed', SuppressedFilter(qid_list)),
    ('range_to_mean', RangeTransformer()), 
    ('scaler', StandardScaler())
    ])
  return pipeline


In [11]:
def get_pipelines(mlbalance):
    numerical_pipe_1 = Pipeline([('range_to_mean', RangeTransformer()), ('scaler', StandardScaler())])
    # pipelines
    if not mlbalance:
        pipe = Pipeline([
            ('preprocessing', ColumnTransformer(
                transformers=[
                    ("cat", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical),
                    ("num", numerical_pipe_1, numerical)],
                remainder='passthrough',
                verbose=verbose)),
        ], verbose=verbose)
    else:
        # let ml balance the dataset
        pipe = Pipeline([
            ('sampler', RandomUnderSampler()),
            ('preprocessing', ColumnTransformer(
                transformers=[
                    ("cat", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical),
                    ("num", numerical_pipe_1, numerical)],
                remainder='passthrough',
                verbose=verbose)),
        ], verbose=verbose)

    numerical_pipe_2 = Pipeline([('range_to_mean', RangeTransformer()), ('scaler', StandardScaler())])
    baseline_pipe = Pipeline([
        ('preprocessing', ColumnTransformer(
            transformers=[("cat", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical),
                          ("num", numerical_pipe_2, numerical)],
            remainder='passthrough',
            verbose=verbose)),
        ('model', DummyClassifier())
    ], verbose=verbose)

    return pipe, baseline_pipe

In [12]:
qid_list = read_attributes(settingsPath, "QID")
pipe, baselinePipe = get_pipelines(False)
print(pipe)

drop_suppressed(population_df, qid_list)
X = population_df.drop(['PINCP'], axis=1)
y = population_df['PINCP']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=110)

print(qid_list)
X_train

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat',
                                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                 unknown_value=-1),
                                                  ['COW', 'SCHL', 'MAR', 'OCCP',
                                                   'POBP', 'RELP', 'SEX',
                                                   'RAC1P']),
                                                 ('num',
                                                  Pipeline(steps=[('range_to_mean',
                                                                   RangeTransformer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['A

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P
12125,"[40, 80[",Employee of a private for-profit company or bu...,Higher education,Married,Nontechnical,North America,Reference person,4.0,*,White
9974,"[0, 40[",Employee of a private for-profit company or bu...,Secondary education,Never married or under 15 years old,Other,North America,Biological son or daughter,20.0,*,White
2457,"[0, 40[",Employee of a private for-profit company or bu...,Higher education,Never married or under 15 years old,Nontechnical,North America,Reference person,20.0,*,White
7309,"[40, 80[",Employee of a private for-profit company or bu...,Higher education,Married,Technical,North America,Reference person,45.0,*,White
3282,"[40, 80[",State government employee,Secondary education,Married,Nontechnical,North America,Husband/wife,24.0,*,White
...,...,...,...,...,...,...,...,...,...,...
7747,"[40, 80[",Employee of a private for-profit company or bu...,Higher education,Married,Other,North America,Husband/wife,50.0,*,White
11264,"[40, 80[","Local government employee (city, county, etc.)",Higher education,Divorced,Nontechnical,North America,Housemate or roommate,40.0,*,Black
2480,"[0, 40[",Employee of a private for-profit company or bu...,Higher education,Married,Nontechnical,North America,Reference person,50.0,*,White
11832,"[40, 80[",Employee of a private for-profit company or bu...,Higher education,Married,Technical,North America,Husband/wife,40.0,*,White


In [13]:
X_train = pipe.fit_transform(X_train, y_train)

[ColumnTransformer] ........... (1 of 2) Processing cat, total=   0.0s
[ColumnTransformer] ........... (2 of 2) Processing num, total=   0.0s
[Pipeline] ..... (step 1 of 1) Processing preprocessing, total=   0.1s


In [14]:
X_train_df = pd.DataFrame(X_train)
X_train_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,0.0,1.0,0.0,2.0,13.0,0.0,4.0,0.861438,-2.620308
1,0.0,2.0,2.0,1.0,2.0,1.0,0.0,4.0,-1.160849,-1.404502
2,0.0,0.0,2.0,0.0,2.0,13.0,0.0,4.0,-1.160849,-1.404502
3,0.0,0.0,1.0,2.0,2.0,13.0,0.0,4.0,0.861438,0.495196
4,6.0,2.0,1.0,0.0,2.0,7.0,0.0,4.0,0.861438,-1.100550
...,...,...,...,...,...,...,...,...,...,...
11207,0.0,0.0,1.0,1.0,2.0,7.0,0.0,4.0,0.861438,0.875135
11208,3.0,0.0,0.0,0.0,2.0,6.0,0.0,2.0,0.861438,0.115256
11209,0.0,0.0,1.0,0.0,2.0,13.0,0.0,4.0,-1.160849,0.875135
11210,0.0,0.0,1.0,2.0,2.0,7.0,0.0,4.0,0.861438,0.115256


In [15]:
model = GradientBoostingClassifier()
model.fit(X_train, y_train)

In [20]:
y_pred = model.predict(pipe.fit_transform(X_test))
print(classification_report(y_test,y_pred))

[ColumnTransformer] ........... (1 of 2) Processing cat, total=   0.0s
[ColumnTransformer] ........... (2 of 2) Processing num, total=   0.0s
[Pipeline] ..... (step 1 of 1) Processing preprocessing, total=   0.0s
                precision    recall  f1-score   support

     [0-20000[       0.68      0.67      0.67       694
  [100000-inf[       0.57      0.06      0.11       372
[20000-100000[       0.73      0.88      0.80      1737

      accuracy                           0.72      2803
     macro avg       0.66      0.54      0.53      2803
  weighted avg       0.70      0.72      0.68      2803

