<a href="https://colab.research.google.com/github/Kiron-Ang/DFHL/blob/main/cortisol_bdi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# Print Python version and install/update/import libraries!
!python -V

!pip install -U scikit-learn > output.txt
import sklearn
print("scikit-learn", sklearn.__version__)

!pip install -U matplotlib > output.txt
import matplotlib
print("matplotlib", matplotlib.__version__)

!pip install -U seaborn > output.txt
import seaborn
print("seaborn", seaborn.__version__)

!pip install -U polars > output.txt
import polars
print("polars", polars.__version__)

# Read in one CSV file with cortisol data
cortisol = polars.read_csv("cortisol.csv")
cortisol = cortisol.drop_nulls()
print("cortisol", cortisol.shape)
print("cortisol contains", cortisol.columns)

Python 3.10.12
scikit-learn 1.5.2
matplotlib 3.7.1
seaborn 0.13.2
polars 1.8.2
cortisol (21363, 7)
cortisol contains ['cortisol_micrograms_per_deciliter', 'sample_month', 'gender', 'age', 'collection_time_minutes', 'awakening_time_minutes', 'minutes_awake']


In [2]:
# Split the data into the input features X (a DataFrame) and the target y
# (a Series). The target is selected by name rather than by position, so a
# change in the CSV's column order cannot silently swap the target with a
# feature. The remaining columns keep their original order in X.
TARGET_COLUMN = "cortisol_micrograms_per_deciliter"

X = cortisol.drop(TARGET_COLUMN)
print("X", X.shape)
print("X contains", X.columns)
y = cortisol[TARGET_COLUMN]
print("y", y.shape)
print("y contains", y.name)

X (21363, 6)
X contains ['sample_month', 'gender', 'age', 'collection_time_minutes', 'awakening_time_minutes', 'minutes_awake']
y (21363,)
y contains cortisol_micrograms_per_deciliter


In [8]:
# Wrap the model in a scikit-learn pipeline to eliminate data leakage:
# any step added to the pipeline later is fitted together with the model.
# At the moment the regressor is the pipeline's only step.
import sklearn.ensemble
import sklearn.pipeline

# Histogram-based gradient boosting with a pinned random_state
regressor = sklearn.ensemble.HistGradientBoostingRegressor(
    learning_rate = 0.3,
    max_iter = 1000,
    max_leaf_nodes = 20,
    scoring = "r2",
    random_state = 900,
)
pipeline = sklearn.pipeline.make_pipeline(regressor)

print(pipeline)

Pipeline(steps=[('histgradientboostingregressor',
                 HistGradientBoostingRegressor(learning_rate=0.3, max_iter=1000,
                                               max_leaf_nodes=20,
                                               random_state=900,
                                               scoring='r2'))])


In [12]:
import sklearn.model_selection

# cross_validate is used rather than cross_val_score because it can
# evaluate several metrics in one run and returns the test scores of
# each metric under its own "test_<metric>" key
# NOTE(review): scikit-learn documents that an integer cv with a regressor
# uses an unshuffled KFold -- confirm the rows of X are not ordered in a
# way that biases the folds.
METRICS = ("r2", "neg_median_absolute_error")
cross_validate_results = sklearn.model_selection.cross_validate(
    pipeline,
    X,
    y,
    scoring = METRICS,
    cv = 10,
    verbose = 3,
)

r2_scores = cross_validate_results["test_r2"]
neg_median_abs_errors = cross_validate_results["test_neg_median_absolute_error"]

print("Mean of R2 scores:", r2_scores.mean())
# The error metric is stored negated, so flip the sign for reporting
print("Mean of Median Absolute Error values:", -neg_median_abs_errors.mean())

[CV] END  neg_median_absolute_error: (test=-0.054) r2: (test=0.318) total time=   0.2s
[CV] END  neg_median_absolute_error: (test=-0.050) r2: (test=0.420) total time=   0.2s
[CV] END  neg_median_absolute_error: (test=-0.057) r2: (test=0.350) total time=   0.1s
[CV] END  neg_median_absolute_error: (test=-0.047) r2: (test=0.530) total time=   0.1s
[CV] END  neg_median_absolute_error: (test=-0.046) r2: (test=0.461) total time=   0.1s
[CV] END  neg_median_absolute_error: (test=-0.047) r2: (test=0.514) total time=   0.2s
[CV] END  neg_median_absolute_error: (test=-0.047) r2: (test=0.411) total time=   0.1s
[CV] END  neg_median_absolute_error: (test=-0.046) r2: (test=0.535) total time=   0.2s
[CV] END  neg_median_absolute_error: (test=-0.049) r2: (test=0.410) total time=   0.1s
[CV] END  neg_median_absolute_error: (test=-0.053) r2: (test=0.362) total time=   0.2s
Mean of R2 scores: 0.43109365187288373
Mean of Median Absolute Error values: 0.04964318661179198


In [15]:
# Load the two CSV files holding hypothetical and real information of
# villagers; rows containing nulls are dropped as each file is read
predict1 = polars.read_csv("predict1.csv").drop_nulls()
print("predict1", predict1.shape)
print("predict1 contains", predict1.columns)

predict2 = polars.read_csv("predict2.csv").drop_nulls()
print("predict2", predict2.shape)
print("predict2 contains", predict2.columns)

predict1 (403, 6)
predict1 contains ['sample_month', 'gender', 'age', 'collection_time_minutes', 'awakening_time_minutes', 'minutes_awake']
predict2 (403, 6)
predict2 contains ['sample_month', 'gender', 'age', 'collection_time_minutes', 'awakening_time_minutes', 'minutes_awake']


In [18]:
# Fit the pipeline on the full cortisol data, then print the prediction
# for every row of predict1, one value per line
pipeline.fit(X, y)
predictions = pipeline.predict(predict1)
print(*predictions, sep = "\n")

0.32188428419209886
0.32216865337533046
0.32216865337533046
0.32216865337533046
0.32188428419209886
0.31583834840683533
0.31583834840683533
0.3442784769942033
0.3442784769942033
0.29433479718223965
0.3455893611809033
0.3455893611809033
0.3455893611809033
0.32773519156555936
0.32773519156555936
0.27648062756689573
0.32773519156555936
0.27648062756689573
0.27648062756689573
0.27648062756689573
0.32773519156555936
0.27648062756689573
0.27648062756689573
0.32773519156555936
0.32773519156555936
0.32773519156555936
0.27648062756689573
0.32773519156555936
0.27648062756689573
0.27648062756689573
0.32773519156555936
0.24674535431632885
0.2979999183149925
0.2979999183149925
0.24674535431632885
0.24674535431632885
0.2979999183149925
0.2979999183149925
0.24674535431632885
0.24674535431632885
0.23913268525346348
0.23913268525346348
0.23913268525346348
0.2903872492521271
0.23913268525346348
0.2903872492521271
0.23913268525346348
0.2903872492521271
0.23913268525346348
0.23913268525346348
0.2391326852

In [19]:
# Print the prediction for every row of predict2, one value per line.
# This relies on the pipeline having been fitted already and replaces the
# previous contents of `predictions`.
predictions = pipeline.predict(predict2)
print(*predictions, sep = "\n")

0.0684189612781259
0.06705851789086324
0.06705851789086324
0.06705851789086324
0.0684189612781259
0.0607282129223681
0.0607282129223681
0.06958701436790928
0.06958701436790928
0.07199935102121048
0.07063890763394783
0.07063890763394783
0.07063890763394783
0.07262505608109217
0.07262505608109217
0.07398549946835482
0.07262505608109217
0.07398549946835482
0.07398549946835482
0.07398549946835482
0.07262505608109217
0.07398549946835482
0.07398549946835482
0.07262505608109217
0.07262505608109217
0.07262505608109217
0.07398549946835482
0.07262505608109217
0.07398549946835482
0.07398549946835482
0.07262505608109217
0.07027092735715544
0.06891048396989279
0.06891048396989279
0.07027092735715544
0.07027092735715544
0.06891048396989279
0.06891048396989279
0.07027092735715544
0.07027092735715544
0.06265825829429005
0.06265825829429005
0.06265825829429005
0.06129781490702739
0.06265825829429005
0.06129781490702739
0.06265825829429005
0.06129781490702739
0.06265825829429005
0.06265825829429005
0.06