ToC:
- prior as constraint
- 



In [None]:

model = data x constraints 

P(theta | D ) =  P(D | theta) P(theta)

In [6]:
from sklego.datasets import load_arrests

In [37]:
df = load_arrests(give_pandas=True).assign(
    colour=lambda d: d['colour'] == 'Black',
    sex=lambda d: d['sex'] == 'Female',
    age=lambda d: d['age'] < 25,
)

X, y = df.drop(columns='released'), df['released']

df.head()

Unnamed: 0,released,colour,year,age,sex,employed,citizen,checks
0,Yes,False,2002,True,False,Yes,Yes,3
1,No,True,1999,True,False,Yes,Yes,3
2,Yes,False,2000,True,False,Yes,Yes,3
3,No,True,2000,False,False,Yes,Yes,1
4,Yes,True,1999,False,True,Yes,Yes,1


In [104]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklego.common import as_list
from sklearn.utils.validation import check_is_fitted

class ColumnDropper(BaseEstimator, TransformerMixin):

    def __init__(self, columns: list):
        # if the columns parameter is not a list, make it into a list
        self.columns = columns

    def fit(self, X, y=None):
        """
        Checks 1) if input is a DataFrame, and 2) if column names are in this DataFrame
        :param X: ``pd.DataFrame`` on which we apply the column selection
        :param y: ``pd.Series`` labels for X. unused for column selection
        :returns: ``ColumnSelector`` object.
        """
        self.columns = as_list(self.columns)
        self._check_X_for_type(X)
        self._check_column_names(X)
        self.feature_names_ = list(X.drop(columns=self.columns).columns)
        self._check_column_length()
        return self

    def transform(self, X):
        """Returns a pandas DataFrame with only the specified columns
        :param X: ``pd.DataFrame`` on which we apply the column selection
        :returns: ``pd.DataFrame`` with only the selected columns
        """
        check_is_fitted(self, ['feature_names_'])
        self._check_X_for_type(X)
        if self.columns:
            return X.drop(columns=self.columns)
        return X

    def get_feature_names(self):
        return self.feature_names_

    def _check_column_length(self):
        """Check if all columns are droped"""
        if len(self.feature_names_) == 0:
            raise ValueError(f"Dropping {self.columns} would result in an empty output DataFrame")

    def _check_column_names(self, X):
        """Check if one or more of the columns provided doesn't exist in the input DataFrame"""
        non_existent_columns = set(self.columns).difference(X.columns)
        if len(non_existent_columns) > 0:
            raise KeyError(f'{list(non_existent_columns)} column(s) not in DataFrame')

    @staticmethod
    def _check_X_for_type(X):
        """Checks if input of the Selector is of the required dtype"""
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Provided variable X is not of type pandas.DataFrame")
            
            
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns: list):
        # if the columns parameter is not a list, make it into a list
        self.columns = columns

    def fit(self, X, y=None):
        """
        Checks 1) if input is a DataFrame, and 2) if column names are in this DataFrame
        :param X: ``pd.DataFrame`` on which we apply the column selection
        :param y: ``pd.Series`` labels for X. unused for column selection
        :returns: ``ColumnSelector`` object.
        """
        self.columns = as_list(self.columns)
        self._check_X_for_type(X)
        self._check_column_length()
        self._check_column_names(X)
        return self

    def transform(self, X):
        """Returns a pandas DataFrame with only the specified columns
        :param X: ``pd.DataFrame`` on which we apply the column selection
        :returns: ``pd.DataFrame`` with only the selected columns
        """
        self._check_X_for_type(X)
        if self.columns:
            return X[self.columns]
        return X

    def get_feature_names(self):
        return self.columns

    def _check_column_length(self):
        """Check if no column is selected"""
        if len(self.columns) == 0:
            raise ValueError("Expected columns to be at least of length 1, found length of 0 instead")

    def _check_column_names(self, X):
        """Check if one or more of the columns provided doesn't exist in the input DataFrame"""
        non_existent_columns = set(self.columns).difference(X.columns)
        if len(non_existent_columns) > 0:
            raise KeyError(f'{list(non_existent_columns)} column(s) not in DataFrame')

    @staticmethod
    def _check_X_for_type(X):
        """Checks if input of the Selector is of the required dtype"""
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Provided variable X is not of type pandas.DataFrame")

In [128]:
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklego.metrics import equal_opportunity_score
from sklego.preprocessing import PandasTypeSelector
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklego.linear_model import EqualOpportunityClassifier


categorical_pipeline = make_pipeline(
    PandasTypeSelector('object'),
    OneHotEncoder(sparse=False, drop='first'),
)
numerical_pipeline = make_pipeline(
    PandasTypeSelector('number'),
    StandardScaler()
)
pipeline = make_pipeline(
    ColumnDropper(['colour', 'age', 'sex']),
    make_union(
        categorical_pipeline,
        numerical_pipeline,
    ),
    LogisticRegression(class_weight='balanced')
    
)

eq_op_pipeline = make_pipeline(
    make_union(
        ColumnSelector(['colour', 'age', 'sex']),
        categorical_pipeline,
        numerical_pipeline,
    ),
    EqualOpportunityClassifier(covariance_threshold=0.9, positive_target='Yes', sensitive_cols=[0, 1, 2])
    
)



In [129]:
from sklearn.model_selection import cross_validate, StratifiedKFold, GridSearchCV
from sklearn.metrics import precision_score, recall_score, make_scorer
gs = GridSearchCV(
    pipeline, 
    param_grid={},
    scoring = {
        'eq_op_colour': equal_opportunity_score('colour', positive_target='Yes'),
        'eq_op_age': equal_opportunity_score('age', positive_target='Yes'),
        'eq_op_sex': equal_opportunity_score('sex', positive_target='Yes'),
        'precision': make_scorer(precision_score, pos_label='Yes'),
        'recall': make_scorer(recall_score, pos_label='Yes')
    },
    cv=StratifiedKFold(5),
    refit='precision'
)

gs.fit(X, y)
# fitted_pipeline = gs.best_estimator_

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('columndropper',
                                        ColumnDropper(columns=['colour', 'age',
                                                               'sex'])),
                                       ('featureunion',
                                        FeatureUnion(n_jobs=None,
                                                     transformer_list=[('pipeline-1',
                                                                        Pipeline(memory=None,
                                                                                 steps=[('pandastypeselector',
                                                                                         PandasTypeSelector(exclude=None,
                                                                                                     

In [None]:
gs.best_estimator_[-1].coef_

In [130]:
gs.best_estimator_[-1].coef_

array([[ 0.79136998,  0.75373455, -0.01010115, -0.59511747]])

In [131]:
pd.DataFrame(gs.cv_results_).filter(like='mean_')

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_eq_op_colour,mean_test_eq_op_age,mean_test_eq_op_sex,mean_test_precision,mean_test_recall
0,0.038643,0.058397,0.698671,0.786174,0.830984,0.91879,0.634526


In [None]:
fitted_pipeline

In [65]:
(fitted_pipeline.predict(X) == 'Yes').mean()

0.9747416762342136

In [60]:
fitted_pipeline

(Pipeline(memory=None,
          steps=[('columndropper',
                  ColumnDropper(columns=['colour', 'age', 'sex'])),
                 ('featureunion',
                  FeatureUnion(n_jobs=None,
                               transformer_list=[('pipeline-1',
                                                  Pipeline(memory=None,
                                                           steps=[('pandastypeselector',
                                                                   PandasTypeSelector(exclude=None,
                                                                                      include='object')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(categories='auto',
                                                                                 drop=None,
                                                                                 dtype=<c

In [13]:
df.groupby(['released', 'colour']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,year,age,sex,employed,citizen,checks
released,colour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
No,Black,333,333,333,333,333,333
No,White,559,559,559,559,559,559
Yes,Black,955,955,955,955,955,955
Yes,White,3379,3379,3379,3379,3379,3379


In [14]:
df.groupby(['released', 'sex']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,colour,year,age,employed,citizen,checks
released,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
No,Female,63,63,63,63,63,63
No,Male,829,829,829,829,829,829
Yes,Female,380,380,380,380,380,380
Yes,Male,3954,3954,3954,3954,3954,3954


In [10]:
df['year'].value_counts()a

2000    1270
2001    1211
1999    1099
1998     877
1997     492
2002     277
Name: year, dtype: int64

In [11]:
df['sex'].value_counts()

Male      4783
Female     443
Name: sex, dtype: int64

In [12]:
df['colour'].value_counts()

White    3938
Black    1288
Name: colour, dtype: int64