In [1]:
# from FeatureCloud.app.engine.app import AppState, app_state, Role
import time
import os
import logging
# from data_fetcher import DataFetcher, ValidationDataFetcher
# from random_forest import randomForestA, randomForestB

from neo4j import GraphDatabase, Query, Record
from neo4j.exceptions import ServiceUnavailable
from pandas import DataFrame
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

# from utils import read_config,write_output,ResultRow,CSVResultsBuilder

# from FeatureCloud.app.engine.app import AppState, app_state

In [2]:
NEO4J_URI = "neo4j://83.229.84.12"
NEO4J_USERNAME = "tumaiReadonly"
NEO4J_PASSWORD = "MAKEATHON2024"
NEO4J_DB = "graph2.db"

OUTPUT_DIR = "data"

In [69]:
class Subject:
    def __init__(self, subjectId, disease, icd10):
        self.subjectId = subjectId
        self.disease = disease
        self.hasIcd10 = icd10 != None
        self.icd10 = None if not self.hasIcd10 else icd10.replace("ICD10CM:", "")
        self.isControl = disease == "control"
        self.isSick = not self.isControl
        self.icdFirstLetter = self.icd10[0] if self.hasIcd10 else ("CTL" if self.isControl else "NC")
        self.phenotypes = []
        self.subjectMetrics = {}

    def __repr__(self):
        return f"Subject(subjectId={self.subjectId}, disease={self.disease}, icd10={self.icd10})"
    
class ValidationSubject:
    def __init__(self, subjectId):
        self.subjectId = subjectId
        self.phenotypes = []

    def __repr__(self):
        return f"ValidationSubject(subjectId={self.subjectId})"

class Phenotypes:
    def __init__(self, subjectId, phenotypes):
        self.subjectId = subjectId
        self.phenotypes = phenotypes

    def __repr__(self):
        return f"Phenotypes(subjectId={self.subjectId}, phenotypes={self.phenotypes})"
    
def get_phenotypes(session, subjectId):
    query = """MATCH (a:Biological_sample {subjectid:\"""" + subjectId + """\"})-[:HAS_PHENOTYPE]->(p:Phenotype) 
RETURN a.subjectid as subjectId, collect(p.id) as phenotypes"""
    data = session.run(query).data()
    if len(data) == 0:
        return Phenotypes(subjectId=subjectId, phenotypes=[])
    return Phenotypes(**data[0])

class SubjectMetrics:
    def __init__(self, subjectId, subjectMetrics):
        self.subjectId = subjectId
        self.subjectMetrics = subjectMetrics

    def __repr__(self):
        return f"SubjectMetrics(subjectId={self.subjectId}, numProteins={self.numProteins}, avgProteinScore={self.avgProteinScore}, minProteinScore={self.minProteinScore}, maxProteinScore={self.maxProteinScore}, sumProteinScore={self.sumProteinScore}, numGenes={self.numGenes}, avgGeneScore={self.avgGeneScore}, minGeneScore={self.minGeneScore}, maxGeneScore={self.maxGeneScore}, sumGeneScore={self.sumGeneScore}, numPhenotypes={self.numPhenotypes})"
    
def get_subject_metrics(session, subjectId):
    query = """MATCH (bs:Biological_sample {subjectid:\"""" + subjectId + """\"})
MATCH (bs)-[r_protein:HAS_PROTEIN]->()
WITH bs, 
     COUNT(DISTINCT r_protein) AS numProteins,
     AVG(r_protein.score) AS avgProteinScore,
     MIN(r_protein.score) AS minProteinScore,
     MAX(r_protein.score) AS maxProteinScore,
     SUM(r_protein.score) AS sumProteinScore
MATCH (bs)-[r_damage:HAS_DAMAGE]->()
WITH bs, numProteins, avgProteinScore, minProteinScore, maxProteinScore, sumProteinScore,
     COUNT(DISTINCT r_damage) AS numGenes,
     AVG(r_damage.score) AS avgGeneScore,
     MIN(r_damage.score) AS minGeneScore,
     MAX(r_damage.score) AS maxGeneScore,
     SUM(r_damage.score) AS sumGeneScore
MATCH (bs)-[r_phenotype:HAS_PHENOTYPE]->()
WITH bs, numProteins, avgProteinScore, minProteinScore, maxProteinScore, sumProteinScore, numGenes, avgGeneScore, minGeneScore, maxGeneScore, sumGeneScore,
     COUNT(DISTINCT r_phenotype) AS numPhenotypes
RETURN bs.subjectid AS subjectId,
       {numProteins: numProteins,
        avgProteinScore: avgProteinScore,
        minProteinScore: minProteinScore,
        maxProteinScore: maxProteinScore,
        sumProteinScore: sumProteinScore,
        numGenes: numGenes,
        avgGeneScore: avgGeneScore,
        minGeneScore: minGeneScore,
        maxGeneScore: maxGeneScore,
        sumGeneScore: sumGeneScore,
        numPhenotypes: numPhenotypes} as subjectMetrics"""
    data = session.run(query).data()
    if len(data) == 0:
        return SubjectMetrics(subjectMetrics={})
    return SubjectMetrics(**data[0])

class DataFetcher:
    def __init__(self, session):
        self.session = session
        self.subjects = self.fetch()
    
    def fetch(self):
        subjects = self.get_subjects(self.session)
        for subject in subjects:
            subject.phenotypes = get_phenotypes(self.session, subject.subjectId).phenotypes
            subject.subjectMetrics = get_subject_metrics(self.session, subject.subjectId).subjectMetrics
        return subjects

    def get_subjects(self, session):
    #     query = """MATCH (b:Biological_sample)-->(d:Disease)
    # WITH *, [s in d.synonyms WHERE s STARTS WITH "ICD10CM" | s] as ICD10
    # RETURN b.subjectid as subjectId, d.name as disease, ICD10[0] as icd10"""
        query = """MATCH (b:Biological_sample)
                OPTIONAL MATCH (b)-->(d:Disease)
                WITH b, d, CASE WHEN d IS NOT NULL THEN [s in d.synonyms WHERE s STARTS WITH "ICD10CM" | s] ELSE [] END as ICD10
                RETURN b.subjectid as subjectId, CASE WHEN d IS NOT NULL THEN d.name ELSE "control" END as disease, ICD10[0] as icd10"""   
        data = session.run(query).data()
        subjects = [Subject(**record) for record in data]
        return subjects

class ValidationDataFetcher:
    def __init__(self, session):
        self.session = session
        self.subjects = self.fetch()
    
    def fetch(self):
        subjects = self.get_subjects(self.session)
        for subject in subjects:
            subject.phenotypes = get_phenotypes(self.session, subject.subjectId).phenotypes
            subject.subjectMetrics = get_subject_metrics(self.session, subject.subjectId).subjectMetrics

        return subjects

    def get_subjects(self, session):
        query = """MATCH (b:Biological_sample)
    WHERE NOT (b)-[:HAS_DISEASE]-(:Disease)
    RETURN b.subjectid as subjectId"""     
        data = session.run(query).data()
        subjects = [ValidationSubject(**record) for record in data]
        return subjects

# Modelling

In [58]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

"""
df format:
+------------+-----------------+---------------------+
| subjectId  | isSick (0/1)    | phenotypes (list)   |
+------------+-----------------+---------------------+
testdf format:
+------------+-------------------+
| subjectId  | phenotypes (list) |
+------------+-------------------+
"""
def randomForestA(df, testdf):
  # One-hot encode the phenotypes
  mlb = MultiLabelBinarizer()
  encoded_phenotypes = pd.DataFrame(mlb.fit_transform(df['phenotypes']), columns=mlb.classes_, index=df.index)
  df = df.drop('phenotypes', axis=1).join(encoded_phenotypes)

  # Convert boolean to int
  df['isSick'] = df['isSick'].astype(int)

  # Split the data into a training set and a test set
  X_train, y_train = df.drop(['isSick', 'icdFirstLetter'], axis=1), df['isSick']

  # Train a Random Forest classifier
  clf = RandomForestClassifier(n_estimators=100, random_state=42)
  clf.fit(X_train, y_train)

  # Make predictions on the test set
  encoded_test_data = pd.DataFrame(mlb.transform(testdf['phenotypes']), columns=mlb.classes_, index=testdf.index)
  testdf = testdf.drop('phenotypes', axis=1).join(encoded_test_data)
  y_pred = clf.predict(testdf)

  # Print a classification report
  #logger.info(f"Results Task A {classification_report(y_test, y_pred)}")

  # Create a DataFrame with subjectId and y_pred
  results_df = pd.DataFrame({
    'subjectId': testdf['subjectId'],
    'disease': y_pred
  })

  return results_df

def randomForestB(df, testdf):
  # Remove the control subjects and no ICD10 code subjects
  df = df[~df['icdFirstLetter'].isin(['CTL','NC'])]
  # One-hot encode the phenotypes
  mlb = MultiLabelBinarizer()
  encoded_phenotypes = pd.DataFrame(mlb.fit_transform(df['phenotypes']), columns=mlb.classes_, index=df.index)
  df = df.drop('phenotypes', axis=1).join(encoded_phenotypes)

  # Convert boolean to int
  df['isSick'] = df['isSick'].astype(int)

  # Split the data into a training set and a test set
  X_train, y_train = df.drop(['isSick', 'icdFirstLetter'], axis=1), df['icdFirstLetter']

  # Train a Random Forest classifier
  clf = RandomForestClassifier(n_estimators=100, random_state=42)
  clf.fit(X_train, y_train)

  # Make predictions on the test set
  encoded_test_data = pd.DataFrame(mlb.transform(testdf['phenotypes']), columns=mlb.classes_, index=testdf.index)
  testdf = testdf.drop('phenotypes', axis=1).join(encoded_test_data)
  y_pred = clf.predict(testdf)

  # Print a classification report
  #logger.info(f"Results Task A {classification_report(y_test, y_pred)}")

  # Create a DataFrame with subjectId and y_pred
  results_df = pd.DataFrame({
    'subjectId': testdf['subjectId'],
    'disease': y_pred
  })


  return results_df

In [112]:
OUTPUT_DIR = "data"
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

with driver.session(database=NEO4J_DB) as session:
            # Result Builder
            logger.info("Fetching data from Neo4j: ...")
            fetcher = DataFetcher(session)
            logger.info("Fetching data from Neo4j: Done")
            
            logger.info("Fetching validation data from Neo4j: ...")
            validationFetcher = ValidationDataFetcher(session)
            logger.info("Fetching validation data from Neo4j: Done")
            
data = [vars(obj) for obj in fetcher.subjects]
df = pd.DataFrame(data)
df = df[['subjectId', 'isSick', 'icdFirstLetter', 'phenotypes','subjectMetrics']]

testdata = [vars(obj) for obj in validationFetcher.subjects]
testdf = pd.DataFrame(testdata)
testdf = testdf[['subjectId', 'phenotypes','subjectMetrics']]

# resultA = randomForestA(df, testdf)
# logger.info(f"Results Task A: {resultA}")
# resultA.to_csv(f"{OUTPUT_DIR}/results_task_A.csv", index=False)

# resultB = randomForestB(df, testdf)
# logger.info(f"Results Task B: {resultB}")
# resultB.to_csv(f"{OUTPUT_DIR}/results_task_B.csv", index=False)

# Close the driver connection
driver.close()

INFO:__main__:Fetching data from Neo4j: ...
INFO:__main__:Fetching data from Neo4j: Done
INFO:__main__:Fetching validation data from Neo4j: ...
INFO:__main__:Fetching validation data from Neo4j: Done


In [113]:
# mlb = MultiLabelBinarizer()
# encoded_phenotypes = pd.DataFrame(mlb.fit_transform(df['phenotypes']), columns=mlb.classes_, index=df.index)
# df = df.drop('phenotypes', axis=1).join(encoded_phenotypes)

df.drop("phenotypes", inplace=True, axis=1) 

# Expand the dictionaries into separate columns
expanded_df = df['subjectMetrics'].apply(pd.Series)
# Join the expanded DataFrame with the original DataFrame
df = pd.concat([df.drop(['subjectMetrics'], axis=1), expanded_df], axis=1)
df

Unnamed: 0,subjectId,isSick,icdFirstLetter,minProteinScore,sumGeneScore,maxProteinScore,minGeneScore,avgProteinScore,avgGeneScore,numPhenotypes,maxGeneScore,numGenes,sumProteinScore,numProteins
0,10006,True,NC,1.421489,48.780277,19.959806,6.916941,10.284833,9.756055,73.0,13.352960,5.0,411.393301,40.0
1,10006,True,D,1.421489,48.780277,19.959806,6.916941,10.284833,9.756055,73.0,13.352960,5.0,411.393301,40.0
2,10011,False,CTL,2.454149,24.826939,18.806843,6.453478,10.469729,12.413470,46.0,18.373461,2.0,460.668063,44.0
3,10013,False,CTL,1.013870,16.145850,19.974438,16.145850,9.504722,16.145850,31.0,16.145850,1.0,399.198305,42.0
4,10017,True,NC,1.081303,86.700504,18.996982,2.207838,9.108670,12.385786,43.0,18.552647,7.0,418.998829,46.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,44212,True,D,1.672232,66.882241,19.898810,8.350957,10.939106,13.376448,74.0,17.117089,5.0,492.259786,45.0
167,44212,True,J,1.672232,66.882241,19.898810,8.350957,10.939106,13.376448,74.0,17.117089,5.0,492.259786,45.0
168,44212,True,NC,1.672232,66.882241,19.898810,8.350957,10.939106,13.376448,74.0,17.117089,5.0,492.259786,45.0
169,44222,False,CTL,1.546461,79.015097,19.954694,2.991419,10.637237,9.876887,26.0,18.835869,8.0,574.410807,54.0


In [114]:

# Convert boolean to int
df['isSick'] = df['isSick'].astype(int)
df.isSick.value_counts()

isSick
1    138
0     33
Name: count, dtype: int64

In [128]:

# Remove the control subjects and no ICD10 code subjects
# df = df[~df['icdFirstLetter'].isin(['NC'])]

# Remove duplicate subjects
df = df.drop_duplicates(subset=['subjectId'], keep='first')#.reset_index(drop=True)
# df = df.drop_duplicates()

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(df.drop(['isSick', 'icdFirstLetter'], axis=1), df['isSick'], test_size=0.2, random_state=42)
# X_train, y_train = df.drop(['isSick', 'icdFirstLetter'], axis=1), df['isSick']

In [116]:
df

Unnamed: 0,subjectId,isSick,icdFirstLetter,minProteinScore,sumGeneScore,maxProteinScore,minGeneScore,avgProteinScore,avgGeneScore,numPhenotypes,maxGeneScore,numGenes,sumProteinScore,numProteins
0,10006,1,NC,1.421489,48.780277,19.959806,6.916941,10.284833,9.756055,73.0,13.352960,5.0,411.393301,40.0
2,10011,0,CTL,2.454149,24.826939,18.806843,6.453478,10.469729,12.413470,46.0,18.373461,2.0,460.668063,44.0
3,10013,0,CTL,1.013870,16.145850,19.974438,16.145850,9.504722,16.145850,31.0,16.145850,1.0,399.198305,42.0
4,10017,1,NC,1.081303,86.700504,18.996982,2.207838,9.108670,12.385786,43.0,18.552647,7.0,418.998829,46.0
5,10019,1,J,1.840337,91.855458,19.984401,4.347068,10.259853,11.481932,61.0,18.826241,8.0,523.252490,51.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,44083,0,CTL,1.048651,93.139765,19.816916,3.306982,11.270304,9.313977,47.0,14.404929,10.0,574.785502,51.0
165,44154,1,NC,1.037814,55.358057,19.700540,2.167607,9.774397,13.839514,40.0,18.989707,4.0,606.012620,62.0
166,44212,1,D,1.672232,66.882241,19.898810,8.350957,10.939106,13.376448,74.0,17.117089,5.0,492.259786,45.0
169,44222,0,CTL,1.546461,79.015097,19.954694,2.991419,10.637237,9.876887,26.0,18.835869,8.0,574.410807,54.0


In [117]:
df.isSick.value_counts()

isSick
1    67
0    33
Name: count, dtype: int64

In [118]:
X_train

Unnamed: 0,subjectId,minProteinScore,sumGeneScore,maxProteinScore,minGeneScore,avgProteinScore,avgGeneScore,numPhenotypes,maxGeneScore,numGenes,sumProteinScore,numProteins
82,40304,1.002688,50.458710,19.767743,2.625830,9.924919,7.208387,46.0,13.472010,7.0,545.870556,55.0
150,43798,1.091013,74.205008,19.844212,5.104727,10.329445,12.367501,56.0,17.177646,6.0,495.813358,48.0
36,10074,1.043464,103.716615,19.012476,6.803698,10.243303,12.964577,28.0,17.633333,8.0,409.732140,40.0
62,10117,1.590377,70.469673,18.946089,4.793417,10.419203,11.744946,75.0,18.730705,6.0,531.379344,51.0
118,42066,1.196341,51.302923,19.459701,1.336072,10.078094,8.550487,18.0,18.555658,6.0,524.060864,52.0
...,...,...,...,...,...,...,...,...,...,...,...,...
97,40601,1.280661,74.767949,19.870943,10.648187,9.863071,14.953590,32.0,19.049671,5.0,443.838206,45.0
121,42135,1.506459,18.175845,19.655046,18.175845,9.962626,18.175845,72.0,18.175845,1.0,448.318159,45.0
21,10042,1.105995,50.186074,19.525514,6.514164,9.952516,10.037215,42.0,14.841579,5.0,547.388370,55.0
159,43881,1.159927,30.408335,19.704340,3.091496,10.331551,10.136112,57.0,19.147586,3.0,702.545454,68.0


In [119]:
y_train

82     1
150    1
36     0
62     1
118    0
      ..
97     1
121    1
21     0
159    1
76     1
Name: isSick, Length: 80, dtype: int64

In [120]:
X_test

Unnamed: 0,subjectId,minProteinScore,sumGeneScore,maxProteinScore,minGeneScore,avgProteinScore,avgGeneScore,numPhenotypes,maxGeneScore,numGenes,sumProteinScore,numProteins
144,42458,1.296069,46.409462,19.864944,3.111947,10.377044,9.281892,24.0,15.461336,5.0,446.212877,43.0
80,40277,1.088545,75.499846,19.595922,7.9025,9.831637,15.099969,21.0,19.335538,5.0,550.571673,56.0
119,42075,1.006855,53.662126,19.526848,8.666488,10.142552,13.415531,78.0,17.465269,4.0,436.129754,43.0
66,10124,1.677427,104.172135,18.216292,8.442071,10.293495,14.881734,69.0,17.781632,7.0,473.500757,46.0
65,10120,1.474158,46.97137,19.016267,12.265479,10.197943,15.657123,74.0,19.707136,3.0,530.293032,52.0
58,10111,1.413594,46.496818,19.875273,6.527706,11.612175,11.624205,50.0,18.489121,4.0,603.833106,52.0
31,10064,1.04876,66.649988,19.735617,1.468478,10.176369,9.521427,35.0,18.288977,7.0,559.700293,55.0
138,42367,1.059922,41.180032,19.849491,1.14187,10.809915,10.295008,63.0,16.743124,4.0,508.065996,47.0
13,10035,1.185555,84.884097,19.998401,6.551054,9.803031,12.1263,17.0,19.389959,7.0,519.560646,53.0
0,10006,1.421489,48.780277,19.959806,6.916941,10.284833,9.756055,73.0,13.35296,5.0,411.393301,40.0


In [121]:

# Train a Random Forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train.drop(columns=["subjectId"]), y_train)

# # Make predictions on the test set
# encoded_test_data = pd.DataFrame(mlb.transform(testdf['phenotypes']), columns=mlb.classes_, index=testdf.index)
# testdf = testdf.drop('phenotypes', axis=1).join(encoded_test_data)
# # Expand the dictionaries into separate columns
# expanded_testdf = testdf['subjectMetrics'].apply(pd.Series)
# # Join the expanded DataFrame with the original DataFrame
# testdf = pd.concat([testdf.drop(['subjectMetrics'], axis=1), expanded_df], axis=1)
# y_pred = clf.predict(testdf)
y_pred = clf.predict(X_test.drop(columns=["subjectId"]))


# Print a classification report
# logger.info(f"Results Task A {classification_report(y_test, y_pred)}")
print(f"Results Task A {classification_report(y_test, y_pred)}")

# Create a DataFrame with subjectId and y_pred
results_df = pd.DataFrame({
'subjectId': X_test['subjectId'],
'disease': y_pred
})

Results Task A               precision    recall  f1-score   support

           0       0.50      0.17      0.25         6
           1       0.72      0.93      0.81        14

    accuracy                           0.70        20
   macro avg       0.61      0.55      0.53        20
weighted avg       0.66      0.70      0.64        20



In [130]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV

from neo4j import GraphDatabase
from pandas import DataFrame
import csv
from neo4j.debug import watch
import os

import warnings
from sklearn.exceptions import ConvergenceWarning
# Ignore ConvergenceWarning
warnings.filterwarnings('ignore', category=ConvergenceWarning)

# Define classifiers
classifiers = {
    'RandomForestClassifier': RandomForestClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'SVC': SVC(),
    'LogisticRegression': LogisticRegression()
}

# Define parameter grids for each classifier
param_grids = {
    'RandomForestClassifier': {
        'classifier__n_estimators': [10, 50, 100, 200],
        'classifier__max_depth': [None, 10, 20, 30],
        'classifier__min_samples_split': [2, 5, 10]
    },
    'GradientBoostingClassifier': {
        'classifier__n_estimators': [10, 50, 100, 200],
        'classifier__learning_rate': [0.1, 0.01, 0.001],
        'classifier__max_depth': [3, 10, 20]
    },
    'DecisionTreeClassifier': {
        'classifier__max_depth': [None, 10, 20, 30],
        'classifier__min_samples_split': [2, 5, 10]
    },
    'KNeighborsClassifier': {
        'classifier__n_neighbors': [3, 5, 7, 10],
        'classifier__weights': ['uniform', 'distance'],
        'classifier__p': [1, 2]
    },
    'SVC': {
        'classifier__C': [0.1, 1, 10, 100],
        'classifier__kernel': ['linear', 'rbf'],
        'classifier__gamma': ['scale', 'auto']
    },
    'LogisticRegression': {
        'classifier__C': [0.1, 1, 10, 100],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
    }
}


# Initialize DataFrame to store metrics
metrics_df = pd.DataFrame(columns=['precision', 'recall', 'f1', 'roc_auc'])

# Train each model, make predictions, and store metrics
for name, classifier in classifiers.items():
    model = Pipeline([
        ('classifier', classifier)
    ])
    
    param_grid = param_grids[name]
    grid_search = GridSearchCV(model, param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    
    # Print the confusion matrix
    print(f"\n{name}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    # Print the best parameters
    print("Best parameters:", grid_search.best_params_)

    print("Test Values:", y_test)
    print("Predictions:", y_pred)
    
    

    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    
    metrics_df.loc[name] = [precision, recall, f1, roc_auc]

print("\nMetrics DataFrame:")
print(metrics_df)


RandomForestClassifier
Confusion Matrix:
[[ 3  3]
 [ 1 13]]
Best parameters: {'classifier__max_depth': 10, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 50}
Test Values: 144    0
80     0
119    1
66     1
65     1
58     1
31     0
138    1
13     1
0      1
26     0
47     1
123    0
50     0
155    1
5      1
129    1
131    1
16     1
48     1
Name: isSick, dtype: int64
Predictions: [0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 0]

GradientBoostingClassifier
Confusion Matrix:
[[ 1  5]
 [ 2 12]]
Best parameters: {'classifier__learning_rate': 0.01, 'classifier__max_depth': 3, 'classifier__n_estimators': 50}
Test Values: 144    0
80     0
119    1
66     1
65     1
58     1
31     0
138    1
13     1
0      1
26     0
47     1
123    0
50     0
155    1
5      1
129    1
131    1
16     1
48     1
Name: isSick, dtype: int64
Predictions: [1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 0]

DecisionTreeClassifier
Confusion Matrix:
[[1 5]
 [5 9]]
Best parameters: {'classifier__max_dep




LogisticRegression
Confusion Matrix:
[[ 4  2]
 [ 4 10]]
Best parameters: {'classifier__C': 0.1, 'classifier__penalty': 'l2', 'classifier__solver': 'newton-cg'}
Test Values: 144    0
80     0
119    1
66     1
65     1
58     1
31     0
138    1
13     1
0      1
26     0
47     1
123    0
50     0
155    1
5      1
129    1
131    1
16     1
48     1
Name: isSick, dtype: int64
Predictions: [0 1 1 1 1 1 0 1 0 1 0 0 0 1 0 1 1 1 1 0]

Metrics DataFrame:
                            precision    recall        f1   roc_auc
RandomForestClassifier       0.812500  0.928571  0.866667  0.714286
GradientBoostingClassifier   0.705882  0.857143  0.774194  0.511905
DecisionTreeClassifier       0.642857  0.642857  0.642857  0.404762
KNeighborsClassifier         0.625000  0.714286  0.666667  0.357143
SVC                          0.705882  0.857143  0.774194  0.511905
LogisticRegression           0.833333  0.714286  0.769231  0.690476


60 fits failed out of a total of 200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/home/jadam/miniconda3/envs/mi4peoplec4r/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/jadam/miniconda3/envs/mi4peoplec4r/lib/python3.12/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jadam/miniconda3/envs/mi4peoplec4r/lib/python3.12/site-packages/sklearn/pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_para