In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import plotly.express as px
import plotly.graph_objects as go
from sklearn.metrics import accuracy_score
from itertools import cycle
from sklearn.svm import SVC

# Demo table (train.tsv, tab-separated); its 'diagnosis' column is the
# classification target further down.
data_dir = '/media/garlicseed/data2/Schizophrenia'
data = pd.read_csv(f'{data_dir}/train.tsv', sep='\t')

# One pickled feature table per measure under feature_train/. The files are
# read in the order listed and bound to the same module-level names as before.
g1, g2, GV, Curv, Thickness, GD = (
    pd.read_pickle(f'{data_dir}/feature_train/{measure}_feature.pkl')
    for measure in (
        'G1',
        'G2',
        'GV',
        'Gaussian_Curvature',
        'Thickness',
        'Geodesic_distance',
    )
)

# Put all measures side by side (column-wise) in a single table.
comprehensive_measures = pd.concat(
    [g1, g2, GV, Curv, Thickness, GD],
    axis=1,
)
# Joining the demo columns onto the features is currently disabled:
# features = pd.concat([data[['group']], topo_measures], axis=1)
# Label table for the Schaefer-400 parcellation; 'StructName' holds the
# structure names that are shortened below.
annot = pd.read_csv("/media/garlicseed/data2/Schizophrenia/integration/material/schaefer_400_label.csv")

# Regex pattern -> short form, applied to every structure name.
abbreviations = {
    'LH_': 'L_',
    'RH_': 'R_',
    'DorsAttn': 'DA',
    'Default': 'DF',
    'SalVentAttn': 'SVA',
    'Limbic': 'LB',
    'Cont': 'CON',
    'SomMot': 'SM',
    'Vis': 'VS',
}

# First remove the literal "7Networks_" text, then apply the abbreviations.
annot['StructName'] = (
    annot['StructName']
    .str.replace("7Networks_", "", regex=False)
    .replace(abbreviations, regex=True)
)
roi_labels = annot['StructName'].values

# The model matrix is the combined feature table itself: X is another name for
# the same DataFrame object, not a copy.
X = comprehensive_measures

# Extract each named level of the column index so the metadata stays available
# after the columns are flattened below.
region_labels, networks, modalities, types, region_x = (
    X.columns.get_level_values(level_name)
    for level_name in ('Region_Label', 'Network', 'Modality', 'label', 'region')
)
column_info = dict(
    region_labels=region_labels,
    networks=networks,
    modalities=modalities,
    types=types,
    regions=region_x,
)

# Replace the column index with the 'Region_Label' level only. Because X
# aliases comprehensive_measures, that table's columns change as well.
X.columns = region_labels

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline

# Encode the diagnosis labels as integers. `encoder` stays fitted so that
# encoder.classes_ / encoder.inverse_transform can map predictions back to the
# original labels later on.
y1 = data['diagnosis'].astype('category')
encoder = LabelEncoder()
y = encoder.fit_transform(y1)

# Hold out 20% of the samples, stratified on the encoded labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=3, stratify=y
)

# Standardize with statistics learned from the training split only. The test
# split is transformed with the already-fitted scaler; re-fitting on it would
# scale the test features with their own mean/std, making them inconsistent
# with the training features and leaking test-set information.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
# Rebuilding from the scaled array keeps the column names; the row index
# becomes a fresh default index.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = scaler.transform(X_test)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_train.columns)

In [None]:
#grid_search
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Candidate models, keyed by the same names used in `param_grids`. Each entry
# is a Pipeline with a 'scaler' step followed by a 'classifier' step, so
# hyper-parameters are addressed as 'classifier__<name>'.
models_dict = {
    "RandomForest": Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(
            random_state=42,
            class_weight='balanced_subsample',
            n_jobs=-1,
        )),
    ]),
    "GradientBoosting": Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', GradientBoostingClassifier(random_state=42)),
    ]),
    "LightGBM": Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LGBMClassifier(
            random_state=42,
            verbose=-1,
            n_jobs=-1,
            # LightGBM has no `colsample_bylevel` parameter (it was ignored,
            # and verbose=-1 hid the warning); `colsample_bytree` is its
            # column-subsampling option.
            colsample_bytree=0.6,
            # Row subsampling only takes effect when subsample_freq > 0, so
            # enable bagging at every iteration to make subsample=0.7 active.
            subsample=0.7,
            subsample_freq=1,
        )),
    ]),
    "CatBoost": Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', CatBoostClassifier(
            verbose=0,
            random_state=42,
            thread_count=-1,
            colsample_bylevel=0.6,
            subsample=0.7,
        )),
    ]),
    "SVM": Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', SVC(
            probability=True,
            class_weight='balanced',
            random_state=42,
        )),
    ]),
}

# Hyper-parameter grids, keyed by the same model names as the pipelines. Each
# parameter is prefixed with 'classifier__' because it targets the pipeline
# step named 'classifier'. A value is either a single grid (dict) or a list of
# grids; GridSearchCV accepts both forms.
param_grids = {
    "RandomForest": {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 3, 5, 10, 20, 30],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4],
        'classifier__max_features': ['sqrt', 'log2', None],
    },
    "GradientBoosting": {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7],
        'classifier__subsample': [0.8, 1.0],
    },
    "LightGBM": {
        'classifier__n_estimators': [50, 100, 200, 300],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__max_depth': [-1, 10, 20],
        'classifier__num_leaves': [31, 50, 100],
    },
    "CatBoost": {
        'classifier__iterations': [100, 200, 500],
        'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
        'classifier__depth': [3, 6, 10],
    },
    # The linear kernel ignores `gamma`, so crossing it with the gamma values
    # would refit identical models once per gamma. One sub-grid per kernel
    # searches the same distinct models without the duplicate fits.
    "SVM": [
        {
            'classifier__kernel': ['linear'],
            'classifier__C': [0.001, 0.01, 0.1, 1],
        },
        {
            'classifier__kernel': ['rbf'],
            'classifier__C': [0.001, 0.01, 0.1, 1],
            'classifier__gamma': [0.0001, 0.001, 0.01, 0.1],
        },
    ],
}

# Settings shared by every search: 10-fold cross-validation scored on
# accuracy, progress output enabled, 20 parallel jobs.
search_settings = dict(cv=10, scoring='accuracy', verbose=1, n_jobs=20)

# Build one GridSearchCV per model, stored under "<model name>_gridsearch".
gridsearch_dict = {}
for model_name in models_dict:
    print(f"Creating GridSearchCV for {model_name}...")
    pipeline = models_dict[model_name]
    param_grid = param_grids[model_name]
    gridsearch_dict[f"{model_name}_gridsearch"] = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        **search_settings,
    )

# Run the searches one after another and report the best configuration found.
# NOTE(review): the pipelines already start with a StandardScaler, so fitting
# on the pre-scaled training frame standardizes the data twice - confirm that
# this is intended.
for model_name in gridsearch_dict:
    gridsearch = gridsearch_dict[model_name]
    print(f"Training {model_name}...")
    # Swap X_train_scaled / y_train for your own training set if needed.
    gridsearch.fit(X_train_scaled, y_train)
    print(f"Best parameters for {model_name}: {gridsearch.best_params_}")
    print(f"Best accuracy for {model_name}: {gridsearch.best_score_}")

    