In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Load the data.
# The expression table is read with its first column as the index and is
# transposed below, so its columns are expected to be sample IDs that match the
# 'X' column of the annotation table.
gene_expression_df = pd.read_csv('serial_norm_gene_exp_df.csv', index_col=0)  # Gene expression data
patient_annotation_df = pd.read_csv('serial_samples_annotation.csv')          # Patient annotation data

# NOTE: no mean-expression pre-filter (e.g. "top 500 genes") is applied in this
# cell; every gene present in the expression file is kept as a candidate feature.

# Step 1: Transpose gene data to have samples as rows and genes as columns
genes_transposed = gene_expression_df.T
genes_transposed.index.name = 'X'  # Name the index so it can be matched against the annotation 'X' column

# Step 2: Merge the gene expression data with the annotations.
# The inner join silently drops any sample that is missing from either table.
combined_df = patient_annotation_df.merge(genes_transposed, on='X', how='inner')
if combined_df.empty:
    # Fail here with a clear message instead of a cryptic sklearn error later.
    raise ValueError(
        "Merging annotations with gene expression on 'X' produced no rows; "
        "check that sample IDs match between the two files."
    )

# Step 3: Prepare features (gene expressions) and target (Diagnosis) for modeling.
# All annotation columns listed here are removed so only expression values remain
# (assumes the annotation table has no other non-gene columns — TODO confirm).
X = combined_df.drop(columns=['X', 'Delirium', 'Diagnosis', 'Steroids', 'Late_del', 'Subject', 'Day'])  # Feature matrix
y = combined_df['Diagnosis']  # Target variable


from sklearn.feature_selection import SelectKBest, f_classif
# Univariate feature selection: keep the 200 features with the highest ANOVA
# F-value between feature and class label.
# The selector is fitted once and then reused for the transform (the previous
# code fitted it twice, via .fit() and again via .fit_transform()).
select_features = SelectKBest(f_classif, k=200).fit(X, y)

# CAUTION: the selector has seen the labels of every sample. Cross-validating a
# model on X_selected would leak label information into the held-out folds; for
# an unbiased estimate put SelectKBest inside a Pipeline that is passed to the
# cross-validation routine. The nested-CV cells visible in this notebook use X,
# not X_selected.
X_selected = select_features.transform(X)



In [17]:
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.svm import SVC

In [18]:
# Hyperparameter grid for the SVM.
# `gamma` only parameterises the RBF kernel here; the linear kernel ignores it.
# A single crossed grid would therefore fit the same linear model once per gamma
# value (18 candidates, only 12 distinct). One sub-grid per kernel removes the
# redundant fits without changing the set of distinct models searched.
param_grid = [
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': [0.001, 0.01, 0.1]},
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
]

# Set up inner and outer cross-validation strategies
# NOTE(review): the annotation table has 'Subject' and 'Day' columns, which
# suggests several samples per subject. If so, plain KFold can put samples from
# one subject into both the training and the test fold and inflate the accuracy;
# consider a group-aware splitter on Subject — confirm against the study design.
inner_cv = KFold(n_splits=3, shuffle=True, random_state=1)  # Inner CV for hyperparameter tuning
outer_cv = KFold(n_splits=5, shuffle=True, random_state=1)  # Outer CV for model evaluation

# Initialize the model
model = SVC()

# Use GridSearchCV for hyperparameter tuning within each outer fold
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=inner_cv, scoring='accuracy')

# Perform nested cross-validation: each outer training split is tuned with the
# inner CV, and the tuned model is scored on the corresponding outer test split.
nested_scores = cross_val_score(grid_search, X, y, cv=outer_cv, scoring='accuracy')

# Print results (mean and population standard deviation over the 5 outer folds)
print(f"Nested CV Accuracy: {nested_scores.mean():.3f} ± {nested_scores.std():.3f}")

Nested CV Accuracy: 0.895 ± 0.009


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
import numpy as np



# Search space for logistic regression. Only C varies; in scikit-learn C is the
# *inverse* regularization strength, so smaller values regularize more strongly.
# Penalty and solver are each pinned to a single value.
param_grid = dict(
    C=[0.1, 1, 10],
    penalty=['l2'],
    solver=['lbfgs'],
)

# Splitters for nested cross-validation: the 5-fold outer loop estimates
# generalization accuracy, the 3-fold inner loop picks hyperparameters.
# NOTE(review): the 'Subject' / 'Day' annotation columns suggest repeated samples
# per subject; if so, a group-aware splitter may be needed — confirm.
outer_cv = KFold(n_splits=5, shuffle=True, random_state=1)
inner_cv = KFold(n_splits=3, shuffle=True, random_state=1)

# Base estimator; max_iter is raised to 1000 for the lbfgs solver.
log_reg = LogisticRegression(max_iter=1000)

# Inner loop: grid search over param_grid, scored by accuracy.
grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    scoring='accuracy',
    cv=inner_cv,
)

# Outer loop: one accuracy value per outer fold.
nested_scores = cross_val_score(
    grid_search,
    X,
    y,
    scoring='accuracy',
    cv=outer_cv,
)

# Report mean and standard deviation across the outer folds.
print(f"Nested CV Accuracy: {nested_scores.mean():.3f} ± {nested_scores.std():.3f}")


Nested CV Accuracy: 0.864 ± 0.022


In [20]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest search space: 3 x 3 x 3 x 3 = 81 candidate settings.
# (Copy-paste leftovers were removed from this cell: an SVM parameter grid that
# was overwritten before use, duplicate inner/outer CV definitions, and an
# unused, unseeded RandomForestClassifier instance.)
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20],     # Maximum depth of each tree (None = unlimited)
    'min_samples_split': [2, 5, 10], # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]    # Minimum samples required to be at a leaf node
}

# Set up inner and outer cross-validation strategies
# NOTE(review): the 'Subject' / 'Day' annotation columns suggest repeated samples
# per subject; if so, a group-aware splitter may be needed — confirm.
inner_cv = KFold(n_splits=3, shuffle=True, random_state=1)  # for hyperparameter tuning
outer_cv = KFold(n_splits=5, shuffle=True, random_state=1)  # for model evaluation

# Initialize the Random Forest model; the fixed seed makes the forest reproducible
rf_model = RandomForestClassifier(random_state=1)

# Use GridSearchCV for hyperparameter tuning within each outer fold
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=inner_cv, scoring='accuracy')

# Perform nested cross-validation: tune on each outer training split with the
# inner CV, then score the tuned forest on the matching outer test split.
nested_scores = cross_val_score(grid_search, X, y, cv=outer_cv, scoring='accuracy')

# Print the results (mean and standard deviation over the 5 outer folds)
print(f"Nested CV Accuracy: {nested_scores.mean():.3f} ± {nested_scores.std():.3f}")

Nested CV Accuracy: 0.735 ± 0.025


In [15]:
from sklearn.neighbors import KNeighborsClassifier

# NOTE: this cell needs X, y, KFold, GridSearchCV and cross_val_score from the
# earlier cells; run the notebook top to bottom (its recorded execution count is
# lower than that of the cells it depends on).

# KNN search space. The neighbour counts cover every integer from 1 to 21; the
# previous hand-typed list repeated 18 and skipped 19.
param_grid = {
    'n_neighbors': list(range(1, 22)),    # 1, 2, ..., 21
    'weights': ['uniform', 'distance'],   # Use either uniform or distance-based weights
    'metric': ['euclidean', 'manhattan']  # Distance metrics to consider
}

# Set up inner and outer cross-validation strategies
# NOTE(review): the 'Subject' / 'Day' annotation columns suggest repeated samples
# per subject; if so, a group-aware splitter may be needed — confirm.
inner_cv = KFold(n_splits=3, shuffle=True, random_state=1)  # for hyperparameter tuning
outer_cv = KFold(n_splits=5, shuffle=True, random_state=1)  # for model evaluation

# Initialize the K-Nearest Neighbors model
knn_model = KNeighborsClassifier()

# Use GridSearchCV for hyperparameter tuning within each outer fold
grid_search = GridSearchCV(estimator=knn_model, param_grid=param_grid, cv=inner_cv, scoring='accuracy')

# Perform nested cross-validation: tune on each outer training split with the
# inner CV, then score the tuned model on the matching outer test split.
nested_scores = cross_val_score(grid_search, X, y, cv=outer_cv, scoring='accuracy')

# Print the results (mean and standard deviation over the 5 outer folds)
print(f"Nested CV Accuracy: {nested_scores.mean():.3f} ± {nested_scores.std():.3f}")


Nested CV Accuracy: 0.794 ± 0.061
