In [None]:
# Team: Tethical Machine Learning
# Team members: Mellanie Martin, Wyatt Pigeon, Koby Grah

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

class SelectColumns(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only a chosen subset of columns.

    Parameters
    ----------
    columns
        Column label(s) used to index the input in ``transform``
        (``xs[columns]``).
    """

    def __init__(self, columns):
        # Stored unmodified under the constructor argument's name so the
        # parameter can be addressed from a search grid as
        # ``<step>__columns``.
        self.columns = columns

    def fit(self, xs, ys=None, **params):
        """Do nothing and return ``self``; this transformer has no state to learn.

        ``ys`` is optional so the transformer can also be fitted without
        labels (for example through ``fit_transform(xs)``).
        """
        return self

    def transform(self, xs):
        """Return ``xs`` restricted to ``self.columns``."""
        return xs[self.columns]

if __name__ == "__main__":
    # Grid-search a gradient-boosting regressor over the bg / cals / insulin
    # columns and print the best 5-fold cross-validated R-squared.

    # NOTE(review): nothing in this file names the label column (the original
    # passed undefined `xs` / `ys` to the search). "bg+1:00" is an assumption
    # -- TODO confirm against the dataset before trusting the scores.
    target_column = "bg+1:00"

    data = pd.read_csv("Data/test.csv")
    if target_column not in data.columns:
        # NOTE(review): a file named test.csv may be unlabelled; if so, point
        # read_csv at the labelled training file instead.
        raise KeyError(
            f"target column {target_column!r} not found in Data/test.csv; "
            "a labelled dataset is required to fit and score the model"
        )

    # Drop unlabelled rows before imputing so the blanket fillna(0) below is
    # applied to features only and cannot invent zero-valued targets.
    data = data.dropna(subset=[target_column])
    ys = data[target_column]

    # One-hot encode categorical columns, then zero-fill missing features.
    xs = pd.get_dummies(data.drop(columns=[target_column]), drop_first=True)
    xs = xs.fillna(0)

    # filter(like=...) is a substring match on column labels. It runs on the
    # feature frame so a label whose name contains "bg" cannot leak in.
    bg = xs.filter(like='bg').columns
    cals = xs.filter(like='cals').columns
    insulin = xs.filter(like='insulin').columns
    # SelectColumns indexes with xs[columns], so it needs one flat list of
    # labels (not a list of Index objects); dict.fromkeys drops any label
    # matched by more than one filter while keeping order.
    feature_columns = list(dict.fromkeys([*bg, *cals, *insulin]))

    # Fit on sqrt(y) and square the predictions back. The two functions are
    # only inverses of each other for non-negative targets.
    regressor = TransformedTargetRegressor(
        regressor=GradientBoostingRegressor(criterion='squared_error'),
        func=np.sqrt,
        inverse_func=np.square,
    )
    pipe = Pipeline([
        ('column_select', SelectColumns(feature_columns)),
        ('gradientBoost', regressor),
    ])

    # Hyper-parameters of the gradient booster itself; the prefix they need
    # depends on whether the booster is wrapped, hence the two grids below.
    booster_grid = {
        'loss': ["squared_error", "absolute_error"],
        'min_samples_split': list(range(2, 5)),
        'max_depth': list(range(1, 10)),
    }
    grid = [
        # Transformed target: the booster is the wrapper's `regressor`.
        {
            'column_select__columns': [feature_columns],
            **{
                f'gradientBoost__regressor__{name}': values
                for name, values in booster_grid.items()
            },
        },
        # No transformation: the step itself is swapped for a bare booster.
        {
            'column_select__columns': [feature_columns],
            'gradientBoost': [
                GradientBoostingRegressor(criterion='squared_error'),
            ],
            **{
                f'gradientBoost__{name}': values
                for name, values in booster_grid.items()
            },
        },
    ]

    search = GridSearchCV(pipe, grid, cv=5, scoring="r2", n_jobs=-1)
    search.fit(xs, ys)
    print("\nGradient Boosting:")
    print(f"R-squared: {search.best_score_}")




Notes:
1. The final predicted value is a float, so this is a regression problem.
2. Potential models to look at: Kernel Ridge, AdaBoost, Gradient Boosting.
3. A pipeline with a minimum of 3 stages is needed, and we need to use GridSearchCV.