<a href="https://colab.research.google.com/github/Kiron-Ang/DFHL/blob/main/cortisol_bdi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Print Python version and install/update/import libraries!
!python -V

!pip install -U scikit-learn > output.txt
import sklearn
print("scikit-learn", sklearn.__version__)

!pip install -U matplotlib > output.txt
import matplotlib
print("matplotlib", matplotlib.__version__)

!pip install -U seaborn > output.txt
import seaborn
print("seaborn", seaborn.__version__)

!pip install -U polars > output.txt
import polars
print("polars", polars.__version__)

# Load the cortisol and BDI tables from their CSV files, discard every row
# that contains a null, and report the shape of what remains.
cortisol = polars.read_csv("cortisol.csv").drop_nulls()
print("cortisol", cortisol.shape)

bdi = polars.read_csv("bdi.csv").drop_nulls()
print("bdi", bdi.shape)

Python 3.10.12
scikit-learn 1.5.2
matplotlib 3.7.1
seaborn 0.13.2
polars 1.8.2
cortisol (21363, 7)
bdi (403, 7)


In [28]:
# Split the cortisol table into the model inputs X (a DataFrame) and the
# regression target y (a Series).
# The target is selected by name rather than by position, so a change in the
# CSV column order cannot silently turn a different column into the target.
TARGET_COLUMN = "cortisol_micrograms_per_deciliter"
assert TARGET_COLUMN in cortisol.columns, f"missing target column {TARGET_COLUMN!r}"

# Every column except the target becomes an input feature.
X = cortisol.drop(TARGET_COLUMN)
print("X", X.shape)
print("X contains", X.columns)
y = cortisol[TARGET_COLUMN]
print("y", y.shape)
print("y contains", y.name)

X (21363, 6)
X contains ['sample_month', 'gender', 'age', 'collection_time_minutes', 'awakening_time_minutes', 'minutes_awake']
y (21363,)
y contains cortisol_micrograms_per_deciliter


In [81]:
# Build the model as a scikit-learn pipeline. The regressor is the only step
# at present; keeping it inside a pipeline means any preprocessing step added
# later is fitted within each cross-validation fold, which is how the pipeline
# guards against data leakage.
# NOTE: sklearn.preprocessing and sklearn.neighbors are not used in this cell;
# they are kept in case other cells rely on them.
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.neighbors
import sklearn.ensemble

# Hyperparameters for the histogram-based gradient boosting regressor.
# random_state is fixed so repeated runs build the same model.
regressor_settings = {
    "loss": "squared_error",
    "learning_rate": 0.2,
    "max_iter": 1000,
    "max_leaf_nodes": 20,
    "scoring": "r2",
    "random_state": 1234,
}
regressor = sklearn.ensemble.HistGradientBoostingRegressor(**regressor_settings)

pipeline = sklearn.pipeline.make_pipeline(regressor)

print(pipeline)

Pipeline(steps=[('histgradientboostingregressor',
                 HistGradientBoostingRegressor(learning_rate=0.2, max_iter=1000,
                                               max_leaf_nodes=20,
                                               random_state=1234,
                                               scoring='r2'))])


In [82]:
import sklearn.model_selection

# Evaluate the pipeline with 10-fold cross-validation. cross_validate is used
# instead of cross_val_score because it can score several metrics in a single
# run; the per-fold scores are stored under "test_<metric>" keys.
metrics = ("r2", "neg_root_mean_squared_error")
cross_validate_results = sklearn.model_selection.cross_validate(
    estimator = pipeline,
    X = X,
    y = y,
    scoring = metrics,
    cv = 10,
    verbose = 3,
)

# Average each metric over the folds.
mean_r2 = cross_validate_results["test_r2"].mean()
mean_neg_rmse = cross_validate_results["test_neg_root_mean_squared_error"].mean()

print("Mean of R2 scores:", mean_r2)
print("Mean of Negative RMSE values:", mean_neg_rmse)

[CV] END  neg_root_mean_squared_error: (test=-0.142) r2: (test=0.325) total time=   0.1s
[CV] END  neg_root_mean_squared_error: (test=-0.130) r2: (test=0.417) total time=   0.2s
[CV] END  neg_root_mean_squared_error: (test=-0.131) r2: (test=0.389) total time=   0.1s
[CV] END  neg_root_mean_squared_error: (test=-0.099) r2: (test=0.523) total time=   0.3s
[CV] END  neg_root_mean_squared_error: (test=-0.112) r2: (test=0.466) total time=   0.2s
[CV] END  neg_root_mean_squared_error: (test=-0.101) r2: (test=0.515) total time=   0.2s
[CV] END  neg_root_mean_squared_error: (test=-0.112) r2: (test=0.404) total time=   0.3s
[CV] END  neg_root_mean_squared_error: (test=-0.095) r2: (test=0.530) total time=   0.2s
[CV] END  neg_root_mean_squared_error: (test=-0.116) r2: (test=0.413) total time=   0.2s
[CV] END  neg_root_mean_squared_error: (test=-0.115) r2: (test=0.364) total time=   0.2s
Mean of R2 scores: 0.43456162259688097
Mean of Negative RMSE values: -0.11515924574056949
