<a href="https://colab.research.google.com/github/Kiron-Ang/DFHL/blob/main/cortisol_depression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Print Python version and install/update/import libraries!
!python -V

!pip install -U scikit-learn > output.txt
import sklearn
print("scikit-learn", sklearn.__version__)

!pip install -U polars > output.txt
import polars
print("polars", polars.__version__)

Python 3.10.12
scikit-learn 1.5.2
polars 1.9.0


In [2]:
# Load the cleansed CSV into a Polars DataFrame and, in the same expression,
# discard every row that has a null in any column. Afterwards report the
# resulting shape and the column names.
cortisol = (
    polars.read_csv("cortisol - cleansed.csv")
    .drop_nulls()
)
print("cortisol", cortisol.shape)
print("cortisol contains", cortisol.columns)

cortisol (21587, 14)
cortisol contains ['record_id', 'person_id', 'name', 'cortisol', 'month', 'day', 'year', 'collection', 'awakening', 'collection_minutes', 'awakening_minutes', 'difference_minutes', 'age', 'gender']


In [3]:
# Split the cleaned data into the model inputs X (a DataFrame) and the
# target y (a Series).
#
# The input columns are picked with select() and an explicit list. Indexing
# with several bare names inside [] passes a tuple, which is ambiguous with
# polars' df[rows, columns] form and stops working as column selection when
# exactly two names are given.
feature_columns = [
    "month",
    "collection_minutes",
    "awakening_minutes",
    "difference_minutes",
    "age",
    "gender",
]
X = cortisol.select(feature_columns)
print("X", X.shape)
print("X contains", X.columns)
print(X)

# A single string key returns that column as a Series.
y = cortisol["cortisol"]
print("y", y.shape)
print("y contains", y.name)
print(y)

X (21587, 6)
X contains ['month', 'collection_minutes', 'awakening_minutes', 'difference_minutes', 'age', 'gender']
shape: (21_587, 6)
┌───────┬────────────────────┬───────────────────┬────────────────────┬─────┬────────┐
│ month ┆ collection_minutes ┆ awakening_minutes ┆ difference_minutes ┆ age ┆ gender │
│ ---   ┆ ---                ┆ ---               ┆ ---                ┆ --- ┆ ---    │
│ i64   ┆ i64                ┆ i64               ┆ i64                ┆ i64 ┆ i64    │
╞═══════╪════════════════════╪═══════════════════╪════════════════════╪═════╪════════╡
│ 7     ┆ 450                ┆ 390               ┆ 60                 ┆ 1   ┆ 0      │
│ 8     ┆ 490                ┆ 420               ┆ 70                 ┆ 1   ┆ 0      │
│ 7     ┆ 465                ┆ 390               ┆ 75                 ┆ 1   ┆ 0      │
│ 7     ┆ 480                ┆ 375               ┆ 105                ┆ 1   ┆ 0      │
│ 7     ┆ 490                ┆ 360               ┆ 130                ┆ 1   ┆ 0   

In [5]:
# Build the baseline model: a one-step pipeline around a histogram-based
# gradient boosting regressor with a fixed seed.
import sklearn.ensemble
import sklearn.pipeline

# The estimator is created under its own name first so its settings are easy
# to read, then wrapped by make_pipeline(), which names the step after the
# lower-cased class name.
hgb_regressor = sklearn.ensemble.HistGradientBoostingRegressor(
    random_state=900,
    scoring="r2",
    validation_fraction=0.2,
)
pipeline = sklearn.pipeline.make_pipeline(hgb_regressor)

print(pipeline)

Pipeline(steps=[('histgradientboostingregressor',
                 HistGradientBoostingRegressor(random_state=900, scoring='r2',
                                               validation_fraction=0.2))])


In [8]:
# Exhaustive grid search over learning rate, number of boosting iterations
# and tree size for the pipeline's regressor step.
import sklearn.model_selection

# Keys follow the "<step name>__<parameter>" convention so that each value
# list is routed to the regressor inside the pipeline.
param_grid = {
    "histgradientboostingregressor__learning_rate": [0.001, 0.01, 0.1, 1],
    "histgradientboostingregressor__max_iter": [1, 10, 100, 1000],
    "histgradientboostingregressor__max_leaf_nodes": [10, 100, 1000, 10000],
}

# No cv or scoring argument is given, so GridSearchCV's defaults apply.
# NOTE(review): the data has a person_id column that is not used for
# splitting; if a person contributes several rows, those rows can end up in
# both the training and validation folds -- confirm whether grouped
# cross-validation is needed.
gridsearchcv = sklearn.model_selection.GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
).fit(X, y)

# The refit pipeline with the best-scoring parameter combination.
best_estimator = gridsearchcv.best_estimator_
print(best_estimator)

Pipeline(steps=[('histgradientboostingregressor',
                 HistGradientBoostingRegressor(learning_rate=0.01,
                                               max_iter=1000, max_leaf_nodes=10,
                                               random_state=900, scoring='r2',
                                               validation_fraction=0.2))])


In [9]:
# Rebuild the pipeline with the parameter values reported by the grid search
# (learning_rate, max_iter, max_leaf_nodes); the remaining settings are the
# same as in the baseline pipeline.
import sklearn.ensemble
import sklearn.pipeline

tuned_regressor = sklearn.ensemble.HistGradientBoostingRegressor(
    # values chosen by the grid search
    learning_rate=0.01,
    max_iter=1000,
    max_leaf_nodes=10,
    # fixed settings
    random_state=900,
    scoring="r2",
    validation_fraction=0.2,
)
pipeline = sklearn.pipeline.make_pipeline(tuned_regressor)

print(pipeline)

Pipeline(steps=[('histgradientboostingregressor',
                 HistGradientBoostingRegressor(learning_rate=0.01,
                                               max_iter=1000, max_leaf_nodes=10,
                                               random_state=900, scoring='r2',
                                               validation_fraction=0.2))])


In [None]:
# TODO: cross_validate() to determine model performance


In [None]:
# TODO: fit() and predict() to forecast/extrapolate cortisol measurements