# Import all necessary packages to notebook

In [1]:
from xgboost import XGBClassifier
import os
from sklearn.metrics import accuracy_score
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
import optuna
import random 
from IPython.display import clear_output
random.seed(100)

### Basic information about compounds and the mouse list 

In [2]:
# Numeric drug codes (1-based) in the order they are enumerated below.
DrugIds = dict(
    enumerate(
        ['Sal', 'SalT', 'Amph', 'Raclo', 'SCH', 'CLZ1', 'CLZ3'],
        start=1,
    )
)

# For every test compound: [compound, name of the condition it is compared against].
# The dict key is the compound itself; insertion order is the processing order.
drg_combinations = {
    compound: [compound, comparator]
    for compound, comparator in (
        ('Amph', 'Sal'),
        ('Raclo', 'SalT'),
        ('SCH', 'Sal'),
        ('CLZ1', 'Sal'),
        ('CLZ3', 'Sal'),
    )
}

# The master workbook is looked up in the current working directory.
ExcelPath = os.getcwd()
MasterExcel = 'SIFT connectivity data with final exclusion.xlsx'

# Load one worksheet of the connectivity workbook into a DataFrame.
sheet_name = 'GGC_PreInjBaseline'
df_conn = pd.read_excel(
    io=os.path.join(ExcelPath, MasterExcel),
    sheet_name=sheet_name,
)

### Definitions used to run classification with the parameters provided by Optuna

In [3]:
def calculate_accuracy(n_repetition, metaCol, df_drg_filt, final_intervals, MouseList, time_points, clf, params, n_test_mice=2):
    """
    Calculate mean train/test accuracy of a classifier for several time points.

    For every time point the rows whose 'times' value falls in that time
    point's interval list are selected. Then, `n_repetition` times, a random
    subset of mice is held out as test set, `clf` is fitted on the remaining
    mice of `MouseList` and scored on both partitions. The per-repetition
    accuracies are averaged.

    Parameters:
    n_repetition (int): Number of random train/test splits per time point.
    metaCol (str or list): Column label(s) dropped from the feature matrix
        (everything that is left is used as features).
    df_drg_filt (DataFrame): Data with at least the columns 'times',
        'MouseID' and 'DrugID' plus the feature columns.
    final_intervals (dict): Maps each time point key to the collection of
        'times' values that belong to it.
    MouseList (list): MouseIDs eligible for the train/test split.
    time_points (list): Keys of `final_intervals` to evaluate.
    clf: Classifier object exposing `fit(X, y)` and `predict(X)`.
    params (dict): Parameters of the classifier; only recorded in the
        output (as 'params_<name>' columns), not applied to `clf` here.
    n_test_mice (int): Number of mice held out for testing in each
        repetition. Defaults to 2.

    Returns:
    DataFrame: Single-row DataFrame with the 'params_*' columns followed by
        'test_accu_<time point>' and 'train_accu_<time point>' columns.

    Raises:
    ValueError: If a held-out mouse carries a 'DrugID' that does not occur
        in the training partition (raised by the label encoder).
    """

    temp_df = pd.DataFrame([params])
    temp_df.columns = ['params_' + col for col in temp_df.columns]

    for tbin in time_points:
        intervals = final_intervals[tbin]
        df_tim_filt = df_drg_filt[df_drg_filt['times'].isin(intervals)]

        test_accuracy = []
        train_accuracy = []

        for _ in range(n_repetition):
            # Split by mouse (not by row) so no animal contributes to both partitions.
            testList = random.sample(MouseList, n_test_mice)
            trainList = [mouse for mouse in MouseList if mouse not in testList]

            df_train = df_tim_filt[df_tim_filt['MouseID'].isin(trainList)]
            df_test = df_tim_filt[df_tim_filt['MouseID'].isin(testList)]

            X_train = df_train.drop(metaCol, axis=1)
            X_test = df_test.drop(metaCol, axis=1)

            # Fit the encoder on the training labels only and reuse that mapping
            # for the test labels; re-fitting on the test partition could assign
            # different integer codes to the same DrugID.
            label_encoder = LabelEncoder()
            y_train = label_encoder.fit_transform(df_train['DrugID'])
            y_test = label_encoder.transform(df_test['DrugID'])

            clf.fit(X_train, y_train)
            y_train_pred = clf.predict(X_train)
            y_test_pred = clf.predict(X_test)

            train_accuracy.append(accuracy_score(y_train, y_train_pred))
            test_accuracy.append(accuracy_score(y_test, y_test_pred))

        temp_df[f'test_accu_{tbin}'] = np.mean(test_accuracy)
        temp_df[f'train_accu_{tbin}'] = np.mean(train_accuracy)

    return temp_df

## Create the Optuna objective function here; parameters can be added as required

In [4]:
# Define objective function outside the loop
def objective(trial, df_drg_filt, MouseList):
    """
    Objective function for multi-objective hyperparameter optimization.

    Samples one hyperparameter set from `trial`, evaluates an XGBClassifier
    with `calculate_accuracy` and appends the resulting row (parameters,
    per-time-point accuracies and their average test accuracy) to the
    module-level `results_df`.

    Reads the module-level names `n_repetition`, `metaCol` and
    `final_intervals`, and reads/rebinds the module-level `results_df`;
    all of them must exist before the first call.

    Parameters:
    trial: Instance of optuna's Trial object.
    df_drg_filt (DataFrame): Filtered DataFrame containing drug data.
    MouseList (list): List of MouseIDs.

    Returns:
    tuple: Mean test accuracies, one per entry of `time_points`
        (30, 40, 50 and 60 min), in that order.
    """

    global results_df

    # Define the hyperparameters to tune. The key order is kept stable because
    # it determines the column order of the saved results.
    # NOTE(review): 'metric', 'boosting_type' and 'num_boost_round' look like
    # LightGBM / native-API names; XGBClassifier presumably ignores them (its
    # number of trees is 'n_estimators'). They are left unchanged here because
    # the keys define the 'params_*' columns of the results - confirm and rename
    # together with any code that consumes those columns.
    params = {
        'objective': 'binary:logistic',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'verbosity': 0,
        'n_jobs': -1,
        'tree_method': 'gpu_hist',
        'seed': 42,
        # Registered with optuna under its real name (was 'n_estimators').
        'max_leaves': trial.suggest_int('max_leaves', 5, 30),
        'max_depth': trial.suggest_int('max_depth', 1, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0, step=0.1),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.01, 2.0, log=True),
        'num_boost_round': trial.suggest_int('num_boost_round', 5, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0, step=0.1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0, step=0.1),
    }

    # Create ml model
    clf = XGBClassifier(**params)

    # List of the time bins to be optimized; one returned objective value per entry.
    time_points = ['30_min', '40_min', '50_min', '60_min']

    temp_df = calculate_accuracy(n_repetition, metaCol, df_drg_filt, final_intervals, MouseList, time_points, clf, params)

    test_accuracies = tuple(temp_df[f'test_accu_{tbin}'].iloc[0] for tbin in time_points)
    temp_df['avg_test'] = np.mean(test_accuracies)

    temp_df = temp_df.reset_index(drop=True)
    if results_df.empty:
        results_df = temp_df
    else:
        results_df = pd.concat([results_df, temp_df], axis=0, ignore_index=True)

    return test_accuracies


## Run Optuna in a loop over all drug combinations

In [1]:
## Set FeqSubSpace to True to run a separate optuna study for every frequency band in freqBands.
## Feature columns of a band are selected by the substring f'_{band}' in the column name, so the
## band names listed here are written WITHOUT the leading underscore (it is added by the code).
## Set FeqSubSpace to False to run all frequency features together.
FeqSubSpace = False
freqBands = ['Delta', 'Theta', 'Beta', 'LoGamma', 'MidGamma', 'HiGamma']
n_repetition = 10
n_optuna_trials = 5000  ## Total number of trials to run with optuna (5000 is used in the manuscript; change as required)

metaCol = ['times', 'MouseID', 'ExptDay', 'DrugID', 'DrugName']

if FeqSubSpace:
    excelName = f'XGB SIFT freq-subspace optuna results for {sheet_name}.xlsx'
else:
    excelName = f'XGB SIFT all freq together optuna results for {sheet_name}.xlsx'

interval_bin_dur = 10  ## Mins
total_expt_dur   = 60  ## Mins
trial_dur = 20  ## Secs

## Number of trial_dur-long trials inside one interval bin (10 min / 20 s = 30 trials)
trials_per_bin = (interval_bin_dur * 60) // trial_dur

## Bin end points in minutes, and trial end times in seconds over the whole experiment
time_index1 = list(range(interval_bin_dur, total_expt_dur + 1, interval_bin_dur))
time_index2 = list(range(trial_dur, (total_expt_dur * 60) + 1, trial_dur))

## Map '<bin end>_min' to the trial times (in seconds) that fall inside that bin only
final_intervals = {}
for idx, x in enumerate(time_index1):
    final_intervals[f'{x}_min'] = time_index2[idx * trials_per_bin : (idx + 1) * trials_per_bin]

## One (sheet suffix, band) task per study; a single task without band when all features are used
study_tasks = [(f'_{freq}', freq) for freq in freqBands] if FeqSubSpace else [('', None)]

## The context manager closes (and thereby saves) the workbook even if a study raises
with pd.ExcelWriter(excelName, engine='xlsxwriter') as writer:

    # Loop through all specified drug combinations
    for drg_cmb, drug_pair in drg_combinations.items():
        if 'Amph' in drg_cmb:
            MouseList = [530, 574, 591, 303, 304, 305, 336, 340]
        else:
            MouseList = [530, 574, 591, 518, 587, 303, 304, 305, 336, 340]
        print(MouseList)

        ## Filter the data frame for the specific pair of this iteration
        df_pair = df_conn[df_conn['DrugName'].isin(drug_pair)]

        for sheet_suffix, freq in study_tasks:
            ## clear the notebook cell output (helps to reduce workload)
            clear_output(wait=False)

            if freq is None:
                df_drg_filt = df_pair
            else:
                ## Keep the meta columns plus every column whose name contains this band
                freq_col = [col for col in df_pair.columns if f'_{freq}' in col]
                df_drg_filt = df_pair[metaCol + freq_col]

            ## Sheet name to save optuna results
            optuna_sheet_name = f'{drug_pair[0]} vs {drug_pair[1]}{sheet_suffix}'

            ## Empty dataframe for results from study (filled by objective via the global name)
            results_df = pd.DataFrame()

            # One direction per value returned by objective (four time points)
            study = optuna.create_study(directions=['maximize', 'maximize', 'maximize', 'maximize'])

            # Run optimization
            study.optimize(lambda trial: objective(trial, df_drg_filt, MouseList), n_trials=n_optuna_trials)

            ## Save results to specific sheet
            results_df.to_excel(writer, sheet_name=optuna_sheet_name, index=False)


## Read the Excel results from Optuna and save the top 5 parameter combinations to JSON, which can be used to run classification

In [60]:
import json

ExcelPath   = "D:\\GluA1_WT_Pharmacology_analysis\\LGBM optuna\\" 
ResultExcel = 'XGB SIFT all freq together optuna results for GGC_PreInjBaseline.xlsx'

# The output name depends only on the workbook name, so build it once up front
# (it is needed even when the workbook contains no usable sheet).
json_file_name = f"Top5 parameters for {os.path.splitext(ResultExcel)[0]}.json"

top5_parameters_dict = {}

# Open the workbook once and parse every sheet from the same handle;
# the context manager closes the file afterwards.
with pd.ExcelFile(os.path.join(ExcelPath, ResultExcel)) as xlsx:
    # Get the sheet names
    sheet_names = xlsx.sheet_names

    for sheet in sheet_names:
        result_df = xlsx.parse(sheet_name=sheet)

        if result_df.empty or 'avg_test' not in result_df.columns:
            print(f'no optuna results found in {sheet}, sheet skipped')
            continue

        ## get a list of all col names which hold classifier parameters
        params_col = [col for col in result_df.columns if col.startswith('params_')]
        sorted_df = result_df.sort_values(by='avg_test', ascending=False).reset_index(drop=True)

        # Sanity check: the best average test accuracy must be in the first row
        if sorted_df['avg_test'].max() == sorted_df.iloc[0]['avg_test']:
            top5_params = sorted_df.iloc[0:5][params_col]
            # Strip the 'params_' prefix so the keys are the plain parameter names
            top5_params.columns = [col[len('params_'):] for col in top5_params.columns]
            top5_parameters_dict[sheet] = top5_params.to_dict(orient='index')
        else:
            print(f'df is not sorted out for {sheet} so check code again')

with open(json_file_name, 'w', encoding='utf-8') as json_file:
    json.dump(top5_parameters_dict, json_file, indent=4)
    

    