In [1]:
import pandas as pd
import numpy as np
import os
import json
from sklearn.preprocessing import OneHotEncoder
import joblib
import xgboost as xgb
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.utils import class_weight


In [2]:
# Root folder of the project: the preprocessed inputs are read from it and every
# artifact saved by this notebook is written beneath it. The trailing separator
# matters because later cells build paths by plain string concatenation.
folder_path = 'E:\\Case Comp\\NEST\\Training\\'

In [3]:
def load_parquet_files(directory):
    """
    Loads all Parquet files within a given directory into a dictionary,
    where the keys are the file names (without the ``.parquet`` extension)
    and the values are the corresponding DataFrames.

    Only the trailing ``.parquet`` suffix is removed from a file name, so a
    file such as ``scores.v2.parquet`` is stored under ``"scores.v2"`` instead
    of being truncated at its first dot (which could make two different files
    collide on the same key).

    Args:
        directory: Path to the directory containing the Parquet files.

    Returns:
        A dictionary where keys are file names (without extension)
        and values are DataFrames read from the corresponding files.
        Entries of the directory that do not end in ``.parquet`` are ignored.
    """
    suffix = ".parquet"

    dataframes = {}
    # Sorted so the load order does not depend on how the OS enumerates the folder.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(suffix):
            continue
        file_name = filename[: -len(suffix)]  # strip only the extension
        dataframes[file_name] = pd.read_parquet(os.path.join(directory, filename))
    return dataframes

# Training subsets: read every Parquet file in the preprocessed training folder,
# then bind the six expected subsets to their own names (a missing file raises KeyError).
directory = folder_path + "preprocessed_train\\"
dataframes = load_parquet_files(directory)

(
    model_interventional_non_oncology,
    model_interventional_oncology,
    model_interventional_other,
    model_observational_non_oncology,
    model_observational_oncology,
    model_observational_other,
) = (
    dataframes['model_interventional_non_oncology'],
    dataframes['model_interventional_oncology'],
    dataframes['model_interventional_other'],
    dataframes['model_observational_non_oncology'],
    dataframes['model_observational_oncology'],
    dataframes['model_observational_other'],
)

# Test subsets: same file stems, read from the preprocessed test folder.
# `directory` and `dataframes` are deliberately rebound to the test versions.
directory = folder_path + "preprocessed_test\\"
dataframes = load_parquet_files(directory)

(
    test_model_interventional_non_oncology,
    test_model_interventional_oncology,
    test_model_interventional_other,
    test_model_observational_non_oncology,
    test_model_observational_oncology,
    test_model_observational_other,
) = (
    dataframes['model_interventional_non_oncology'],
    dataframes['model_interventional_oncology'],
    dataframes['model_interventional_other'],
    dataframes['model_observational_non_oncology'],
    dataframes['model_observational_oncology'],
    dataframes['model_observational_other'],
)

In [4]:
# Mark each row with the split it was loaded from, so train and test rows can be
# told apart after everything is concatenated into one table.
for train_frame in (
    model_interventional_non_oncology,
    model_interventional_oncology,
    model_interventional_other,
    model_observational_non_oncology,
    model_observational_oncology,
    model_observational_other,
):
    train_frame['data_split'] = 'train'

for test_frame in (
    test_model_interventional_non_oncology,
    test_model_interventional_oncology,
    test_model_interventional_other,
    test_model_observational_non_oncology,
    test_model_observational_oncology,
    test_model_observational_other,
):
    test_frame['data_split'] = 'test'

In [5]:
# Mark each row with the name of the subset it belongs to. The train and test
# frames of a subset receive the same 'data_type' label.
for subset_name, train_frame, test_frame in (
    ('model_interventional_non_oncology',
     model_interventional_non_oncology, test_model_interventional_non_oncology),
    ('model_interventional_oncology',
     model_interventional_oncology, test_model_interventional_oncology),
    ('model_interventional_other',
     model_interventional_other, test_model_interventional_other),
    ('model_observational_non_oncology',
     model_observational_non_oncology, test_model_observational_non_oncology),
    ('model_observational_oncology',
     model_observational_oncology, test_model_observational_oncology),
    ('model_observational_other',
     model_observational_other, test_model_observational_other),
):
    train_frame['data_type'] = subset_name
    test_frame['data_type'] = subset_name

In [6]:
# Stack the six training frames followed by the six test frames into one table
# with a fresh 0..n-1 row index.
frames_to_stack = [
    model_interventional_non_oncology,
    model_interventional_oncology,
    model_interventional_other,
    model_observational_non_oncology,
    model_observational_oncology,
    model_observational_other,
    test_model_interventional_non_oncology,
    test_model_interventional_oncology,
    test_model_interventional_other,
    test_model_observational_non_oncology,
    test_model_observational_oncology,
    test_model_observational_other,
]
combined_data = pd.concat(frames_to_stack, ignore_index=True)

# Keep only the columns whose name does not begin with 'Unnamed'.
is_unnamed = combined_data.columns.str.startswith('Unnamed')
combined_data = combined_data.loc[:, ~is_unnamed]

combined_data.head()

Unnamed: 0,NCT Number,Study Status,Sex,Enrollment,Funder Type,Study Type,Start Month,Start Quarter,Condition Category,Conditions_Category,...,Brief Summary_Sentence_Count,Brief Summary_Avg_Word_Length,Brief Summary_Sentiment_Score,Total Outcome Measures_Word_Count,Total Outcome Measures_Char_Count,Total Outcome Measures_Sentence_Count,Total Outcome Measures_Avg_Word_Length,Total Outcome Measures_Sentiment_Score,data_split,data_type
0,NCT00421603,Completed,ALL,81.0,OTHER,INTERVENTIONAL,2,1,Other Rare or Unclassified,Non-Oncology,...,5.0,5.737374,0.341667,122.0,628.0,5.0,5.147541,0.025,train,model_interventional_non_oncology
1,NCT01340534,Completed,FEMALE,370.0,OTHER,INTERVENTIONAL,10,4,Other Rare or Unclassified,Non-Oncology,...,1.0,5.0,0.0,182.0,994.0,6.0,5.461538,0.5,train,model_interventional_non_oncology
2,NCT04166370,Not_Completed,FEMALE,0.0,OTHER,INTERVENTIONAL,-1,-1,Other Rare or Unclassified,Non-Oncology,...,13.0,6.003236,0.079242,69.0,381.0,1.0,5.521739,-0.3125,train,model_interventional_non_oncology
3,NCT04670601,Completed,ALL,24.0,INDUSTRY,INTERVENTIONAL,-1,-1,Low-Risk Non-Oncology,Non-Oncology,...,2.0,6.292683,0.0,266.0,1259.0,2.0,4.733083,0.086726,train,model_interventional_non_oncology
4,NCT02158065,Completed,ALL,370.0,OTHER,INTERVENTIONAL,5,2,Other Rare or Unclassified,Non-Oncology,...,1.0,5.625,-0.083333,262.0,1437.0,1.0,5.484733,-0.038889,train,model_interventional_non_oncology


In [7]:
# Show every column name of the combined table as a plain Python list.
combined_data.columns.tolist()

['NCT Number',
 'Study Status',
 'Sex',
 'Enrollment',
 'Funder Type',
 'Study Type',
 'Start Month',
 'Start Quarter',
 'Condition Category',
 'Conditions_Category',
 'Locations',
 'Num_Collaborators',
 'Collaborator_Type',
 'Condition_Category_old',
 'Sponsor Type',
 'CHILD',
 'ADULT',
 'OLDER_ADULT',
 'Masking',
 'Observational Model',
 'Time Perspective',
 'Masking Details',
 'Allocation_',
 'Allocation_NA',
 'Allocation_NON_RANDOMIZED',
 'Allocation_RANDOMIZED',
 'Allocation_Unknown',
 'Intervention Model_',
 'Intervention Model_CROSSOVER',
 'Intervention Model_FACTORIAL',
 'Intervention Model_PARALLEL',
 'Intervention Model_SEQUENTIAL',
 'Intervention Model_SINGLE_GROUP',
 'Intervention Model_Unknown',
 'Masking Level_DOUBLE',
 'Masking Level_NONE',
 'Masking Level_QUADRUPLE',
 'Masking Level_SINGLE',
 'Masking Level_TRIPLE',
 'Primary Purpose_',
 'Primary Purpose_BASIC_SCIENCE',
 'Primary Purpose_DEVICE_FEASIBILITY',
 'Primary Purpose_DIAGNOSTIC',
 'Primary Purpose_ECT',
 'Prima

In [8]:
# Distribution of the raw status labels, inspected before the target is binarized.
raw_status = combined_data['Study Status']
raw_status.value_counts()

Study Status
Completed                  221243
COMPLETED                   55132
Not_Completed               36334
TERMINATED                   5970
WITHDRAWN                    2909
SUSPENDED                     299
ACTIVE_NOT_RECRUITING          15
RECRUITING                     11
ENROLLING_BY_INVITATION         1
NOT_YET_RECRUITING              1
Name: count, dtype: int64

In [9]:
# Binarize the target: 1 when 'Study Status' is 'Completed' or 'COMPLETED'
# (both spellings occur in the raw data), 0 for every other value.
#
# The dtype guard makes the cell safe to re-run: once the column holds 0/1
# integers, mapping it again would match neither spelling and silently turn
# every label into 0.
study_status = combined_data['Study Status']
if not pd.api.types.is_numeric_dtype(study_status):
    combined_data['Study Status'] = (
        study_status.isin(['Completed', 'COMPLETED']).astype('int64')
    )
combined_data['Study Status'].value_counts()

Study Status
1    276375
0     45540
Name: count, dtype: int64

In [10]:
# Drop the 'Locations' column from the combined table.
# errors='ignore' keeps the cell re-runnable after the column is already gone.
combined_data = combined_data.drop(columns=['Locations'], errors='ignore')

In [11]:
# Columns that are never treated as features: the trial identifier, the target,
# and the two bookkeeping tags added earlier in this notebook.
non_feature_columns = ['NCT Number', 'Study Status', 'data_split', 'data_type']

# Stored as numbers, but handled as categories from here on.
calendar_columns = ['Start Month', 'Start Quarter']

# Split the remaining columns by dtype (object/category vs. numeric).
object_like_columns = combined_data.select_dtypes(include=['object', 'category']).columns.tolist()
number_like_columns = combined_data.select_dtypes(include=['number']).columns.tolist()

categorical_columns = [
    name for name in object_like_columns if name not in non_feature_columns
] + calendar_columns
numerical_columns = [
    name for name in number_like_columns
    if name not in non_feature_columns and name not in calendar_columns
]

# Show the resulting split for a quick sanity check.
print("Categorical Columns:", categorical_columns)
print("Numerical Columns:", numerical_columns)

Categorical Columns: ['Sex', 'Funder Type', 'Study Type', 'Condition Category', 'Conditions_Category', 'Collaborator_Type', 'Condition_Category_old', 'Sponsor Type', 'Masking', 'Observational Model', 'Time Perspective', 'Masking Details', 'Country', 'Development Category', 'Start Month', 'Start Quarter']
Numerical Columns: ['Enrollment', 'Num_Collaborators', 'CHILD', 'ADULT', 'OLDER_ADULT', 'Study Title_Word_Count', 'Study Title_Char_Count', 'Study Title_Sentence_Count', 'Study Title_Avg_Word_Length', 'Study Title_Sentiment_Score', 'Brief Summary_Word_Count', 'Brief Summary_Char_Count', 'Brief Summary_Sentence_Count', 'Brief Summary_Avg_Word_Length', 'Brief Summary_Sentiment_Score', 'Total Outcome Measures_Word_Count', 'Total Outcome Measures_Char_Count', 'Total Outcome Measures_Sentence_Count', 'Total Outcome Measures_Avg_Word_Length', 'Total Outcome Measures_Sentiment_Score']


In [12]:
# Country values that get their own 0/1 indicator column (hard-coded list).
top_5_countries = ['United States', 'Unknown', 'France', 'India', 'China']

# One indicator column per listed value: 1 where 'Country' equals it, else 0.
# Each new column is named after the value itself.
country_values = combined_data['Country']
for country in top_5_countries:
    combined_data[country] = country_values.eq(country).astype('int64')

# The raw column is no longer needed once the indicators exist.
combined_data = combined_data.drop(columns=['Country'])

combined_data.head()

Unnamed: 0,NCT Number,Study Status,Sex,Enrollment,Funder Type,Study Type,Start Month,Start Quarter,Condition Category,Conditions_Category,...,Total Outcome Measures_Sentence_Count,Total Outcome Measures_Avg_Word_Length,Total Outcome Measures_Sentiment_Score,data_split,data_type,United States,Unknown,France,India,China
0,NCT00421603,1,ALL,81.0,OTHER,INTERVENTIONAL,2,1,Other Rare or Unclassified,Non-Oncology,...,5.0,5.147541,0.025,train,model_interventional_non_oncology,1,0,0,0,0
1,NCT01340534,1,FEMALE,370.0,OTHER,INTERVENTIONAL,10,4,Other Rare or Unclassified,Non-Oncology,...,6.0,5.461538,0.5,train,model_interventional_non_oncology,0,0,0,0,0
2,NCT04166370,0,FEMALE,0.0,OTHER,INTERVENTIONAL,-1,-1,Other Rare or Unclassified,Non-Oncology,...,1.0,5.521739,-0.3125,train,model_interventional_non_oncology,0,0,0,0,0
3,NCT04670601,1,ALL,24.0,INDUSTRY,INTERVENTIONAL,-1,-1,Low-Risk Non-Oncology,Non-Oncology,...,2.0,4.733083,0.086726,train,model_interventional_non_oncology,0,0,0,0,0
4,NCT02158065,1,ALL,370.0,OTHER,INTERVENTIONAL,5,2,Other Rare or Unclassified,Non-Oncology,...,1.0,5.484733,-0.038889,train,model_interventional_non_oncology,0,0,0,0,0


In [13]:
# 'Country' has been replaced by indicator columns, so remove it (together with
# the identifier, target and bookkeeping columns) from the categorical list.
not_categorical = ['NCT Number', 'Study Status', 'data_split', 'data_type', 'Country']
categorical_columns = [col for col in categorical_columns if col not in not_categorical]

# Register the country indicator columns as numeric columns. Only names that are
# not already listed are added, so re-running this cell does not append duplicates.
country_indicator_columns = ['United States', 'Unknown', 'France', 'India', 'China']
numerical_columns.extend(
    [col for col in country_indicator_columns if col not in numerical_columns]
)

In [14]:
# One-hot encode the categorical columns. drop='first' omits the first category
# of every column; sparse_output=False returns a dense NumPy array.
# NOTE(review): the encoder is fitted on the train and test rows together,
# because combined_data holds both splits — confirm this is intended.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_columns = encoder.fit_transform(combined_data[categorical_columns])

# Wrap the encoded array in a DataFrame that carries combined_data's own index.
# Without the explicit index the frame would get a fresh 0..n-1 index, and the
# axis=1 concat below (which aligns on index labels) would only line rows up
# correctly if combined_data happened to use exactly that index.
encoded_df = pd.DataFrame(
    encoded_columns,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=combined_data.index,
)

# Replace the original categorical columns with their encoded counterparts.
combined_data_encoded = pd.concat(
    [combined_data.drop(columns=categorical_columns), encoded_df],
    axis=1,
)

# Persist the fitted encoder so the same encoding can be re-applied later.
joblib.dump(encoder, folder_path + 'one_hot_encoder.pkl')

# Preview the encoded table.
combined_data_encoded.head()

Unnamed: 0,NCT Number,Study Status,Enrollment,Num_Collaborators,CHILD,ADULT,OLDER_ADULT,Allocation_,Allocation_NA,Allocation_NON_RANDOMIZED,...,Start Month_7,Start Month_8,Start Month_9,Start Month_10,Start Month_11,Start Month_12,Start Quarter_1,Start Quarter_2,Start Quarter_3,Start Quarter_4
0,NCT00421603,1,81.0,1,0,1,0,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,NCT01340534,1,370.0,0,1,1,1,False,False,False,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,NCT04166370,0,0.0,1,0,1,1,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,NCT04670601,1,24.0,1,0,1,0,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,NCT02158065,1,370.0,6,0,1,1,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [15]:
# Snapshot the column names of the encoded table, in order. The list covers every
# column of combined_data_encoded, i.e. it also contains 'NCT Number',
# 'Study Status', 'data_split' and 'data_type', not only model inputs.
final_feature_names = combined_data_encoded.columns.tolist()

# Write the names next to the other artifacts as a JSON array.
feature_names_path = folder_path + 'one_hot_encoded_feature_names.json'
with open(feature_names_path, 'w') as names_file:
    json.dump(final_feature_names, names_file)

In [16]:
# Row count per subset (train and test rows together).
combined_data_encoded['data_type'].value_counts()

data_type
model_interventional_non_oncology    151938
model_interventional_other            66629
model_observational_non_oncology      37899
model_interventional_oncology         37368
model_observational_other             20329
model_observational_oncology           7752
Name: count, dtype: int64

In [17]:
# Print the first row vertically, one "column: value" pair per line, which is
# easier to scan than a very wide table.
first_row_position = 0
for column_name in combined_data_encoded.columns:
    first_value = combined_data_encoded[column_name].iloc[first_row_position]
    print(f"{column_name}: {first_value}")

NCT Number: NCT00421603
Study Status: 1
Enrollment: 81.0
Num_Collaborators: 1
CHILD: 0
ADULT: 1
OLDER_ADULT: 0
Allocation_: False
Allocation_NA: False
Allocation_NON_RANDOMIZED: False
Allocation_RANDOMIZED: True
Allocation_Unknown: False
Intervention Model_: False
Intervention Model_CROSSOVER: False
Intervention Model_FACTORIAL: False
Intervention Model_PARALLEL: True
Intervention Model_SEQUENTIAL: False
Intervention Model_SINGLE_GROUP: False
Intervention Model_Unknown: False
Masking Level_DOUBLE: False
Masking Level_NONE: False
Masking Level_QUADRUPLE: False
Masking Level_SINGLE: False
Masking Level_TRIPLE: True
Primary Purpose_: False
Primary Purpose_BASIC_SCIENCE: False
Primary Purpose_DEVICE_FEASIBILITY: False
Primary Purpose_DIAGNOSTIC: False
Primary Purpose_ECT: False
Primary Purpose_HEALTH_SERVICES_RESEARCH: False
Primary Purpose_OTHER: False
Primary Purpose_PREVENTION: False
Primary Purpose_SCREENING: False
Primary Purpose_SUPPORTIVE_CARE: False
Primary Purpose_TREATMENT: True


In [18]:
import os
import json
import shap
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.utils import class_weight
from sklearn.metrics import accuracy_score


# One model is trained per subset; these labels match the 'data_type' column.
data_types = [
    'model_interventional_non_oncology',
    'model_interventional_oncology',
    'model_interventional_other',
    'model_observational_non_oncology',
    'model_observational_oncology',
    'model_observational_other',
]

# Accumulators filled by the training loop:
#   combined_results - per-subset prediction frames (test and train)
#   model_params     - per-subset training parameters, feature list and file names
combined_results = []
model_params = {}

# Folder that receives the saved models, feature lists and SHAP plots
# (created on first use, reused afterwards).
output_dir = os.path.join(folder_path, 'models/')
os.makedirs(output_dir, exist_ok=True)

def train_xgboost_model(X_train, y_train, num_boost_round=100):
    """Train a binary XGBoost booster using balanced per-sample class weights.

    Each training row is weighted with the 'balanced' class weight of its own
    label, so under-represented classes contribute as much to the loss as
    frequent ones.

    Args:
        X_train: Feature matrix accepted by ``xgb.DMatrix``.
        y_train: 1-D array-like of class labels, one per row of ``X_train``.
        num_boost_round: Number of boosting rounds (defaults to 100).

    Returns:
        A tuple ``(model, params)``: the trained booster and the parameter
        dictionary it was trained with.
    """
    labels = np.asarray(y_train).ravel()

    # `classes` holds the distinct labels; `class_index[i]` is the position of
    # labels[i] inside `classes`. Looking weights up by that position (instead
    # of assuming the labels are exactly 0..k-1) keeps the mapping correct for
    # any label set, e.g. a subset that contains only label 1.
    classes, class_index = np.unique(labels, return_inverse=True)
    class_weights = np.asarray(
        class_weight.compute_class_weight('balanced', classes=classes, y=labels)
    )
    sample_weights = class_weights[class_index]

    # Create DMatrix using sample weights
    dtrain = xgb.DMatrix(X_train, label=y_train, weight=sample_weights)

    # Set up the parameters
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'max_depth': 3,
        'eta': 0.1
    }

    # Train the model
    model = xgb.train(params, dtrain, num_boost_round=num_boost_round)

    return model, params

# Train, save and evaluate one model per subset.
for data_type in data_types:
    # Rows of this subset, separated into their train and test parts.
    in_subset = combined_data_encoded['data_type'] == data_type
    train_data = combined_data_encoded[in_subset & (combined_data_encoded['data_split'] == 'train')]
    test_data = combined_data_encoded[in_subset & (combined_data_encoded['data_split'] == 'test')]

    # Target, bookkeeping and identifier columns, plus 'Enrollment', are kept
    # out of the feature matrix.
    excluded_columns = ['Study Status', 'data_split', 'data_type', 'NCT Number', 'Enrollment']
    X_train, y_train = train_data.drop(columns=excluded_columns), train_data['Study Status']
    X_test, y_test = test_data.drop(columns=excluded_columns), test_data['Study Status']

    # Fit the class-weighted booster for this subset.
    model, params = train_xgboost_model(X_train, y_train)

    # Persist the booster itself.
    model_filename = f"{data_type}_xgboost_model.json"
    model.save_model(os.path.join(output_dir, model_filename))

    # Persist the ordered list of feature columns the booster was trained on.
    feature_filename = f"{data_type}_features.json"
    features = X_train.columns.tolist()
    with open(os.path.join(output_dir, feature_filename), 'w') as f:
        json.dump(features, f)

    # Remember everything needed to reload this model later.
    model_params[data_type] = dict(
        params=params,
        features=features,
        model_filename=model_filename,
        feature_filename=feature_filename,
    )

    # Test-set scores: the booster's output is used as the class-1 probability,
    # its rounded value as the predicted class.
    dtest = xgb.DMatrix(X_test)
    y_pred_test = model.predict(dtest)
    y_pred_class_test = np.round(y_pred_test)
    y_pred_proba_test = np.column_stack((1 - y_pred_test, y_pred_test))

    # Same scoring on the training rows.
    dtrain = xgb.DMatrix(X_train)
    y_pred_train = model.predict(dtrain)
    y_pred_class_train = np.round(y_pred_train)
    y_pred_proba_train = np.column_stack((1 - y_pred_train, y_pred_train))

    # Per-row prediction tables, keyed by NCT Number and tagged with split and subset.
    final_predictions_test = pd.DataFrame({
        "true_label": y_test,
        "predicted_class": y_pred_class_test,
        "prob_class_0": y_pred_proba_test[:, 0],
        "prob_class_1": y_pred_proba_test[:, 1],
        "NCT Number": test_data["NCT Number"],
        "data_split": "test",
        "data_type": data_type,
    })
    final_predictions_train = pd.DataFrame({
        "true_label": y_train,
        "predicted_class": y_pred_class_train,
        "prob_class_0": y_pred_proba_train[:, 0],
        "prob_class_1": y_pred_proba_train[:, 1],
        "NCT Number": train_data["NCT Number"],
        "data_split": "train",
        "data_type": data_type,
    })

    # Test rows are stored before train rows for every subset.
    combined_results.extend([final_predictions_test, final_predictions_train])

    # Per-feature contributions on the test rows; the last column returned by
    # pred_contribs is dropped so the array has one column per feature.
    contributions = model.predict(dtest, pred_contribs=True)
    shap_values = contributions[:, :-1]

    # Draw the SHAP summary plot off-screen, write it to disk and release the figure.
    shap_filename = os.path.join(output_dir, f"shap_summary_{data_type}.png")
    plt.figure()
    shap.summary_plot(shap_values, X_test, feature_names=X_test.columns, show=False)
    plt.savefig(shap_filename)
    plt.close()

    print(f"SHAP summary plot saved for {data_type} at {shap_filename}")

    # Test accuracy with a 0.5 decision threshold.
    accuracy = accuracy_score(y_test, (y_pred_test > 0.5).astype(int))
    print(f"Accuracy ({data_type}): {accuracy}")

# Merge every per-subset prediction table and write a single CSV.
combined_predictions = pd.concat(combined_results, ignore_index=True)
combined_predictions.to_csv(os.path.join(folder_path, 'combined_predictions.csv'), index=False)

# Write the collected parameters / feature lists / file names for later inference.
with open(os.path.join(output_dir, 'model_params.json'), 'w') as f:
    json.dump(model_params, f)

print("✅ Combined predictions, models, and SHAP plots have been saved.")


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


SHAP summary plot saved for model_interventional_non_oncology at E:\Case Comp\NEST\Training\models/shap_summary_model_interventional_non_oncology.png
Accuracy (model_interventional_non_oncology): 0.5831981728509483
SHAP summary plot saved for model_interventional_oncology at E:\Case Comp\NEST\Training\models/shap_summary_model_interventional_oncology.png
Accuracy (model_interventional_oncology): 0.6123588039867109
SHAP summary plot saved for model_interventional_other at E:\Case Comp\NEST\Training\models/shap_summary_model_interventional_other.png
Accuracy (model_interventional_other): 0.5992469879518072
SHAP summary plot saved for model_observational_non_oncology at E:\Case Comp\NEST\Training\models/shap_summary_model_observational_non_oncology.png
Accuracy (model_observational_non_oncology): 0.6432997676219985
SHAP summary plot saved for model_observational_oncology at E:\Case Comp\NEST\Training\models/shap_summary_model_observational_oncology.png
Accuracy (model_observational_oncolo

In [19]:
# # List of data types for creating separate models
# data_types = ['model_interventional_non_oncology', 'model_interventional_oncology', 'model_interventional_other',
#               'model_observational_non_oncology', 'model_observational_oncology', 'model_observational_other']

# # Initialize an empty list to store results
# combined_results = []
# model_params = {}  # Dictionary to store models and features info

# def train_xgboost_model(X_train, y_train):
#     """Trains an XGBoost classifier model with class weights."""

#     # Calculate class weights
#     class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
#     weight_dict = {i: w * 1 if i == 0 else w for i, w in enumerate(class_weights)}  # Multiplier is 1, so class 0 currently gets no extra weight
#     sample_weights = np.array([weight_dict[t] for t in y_train])

#     # Create DMatrix using sample weights
#     dtrain = xgb.DMatrix(X_train, label=y_train, weight=sample_weights)  # Use sample weights in DMatrix

#     # Set up the parameters
#     params = {
#         'objective': 'binary:logistic',
#         'eval_metric': 'logloss',
#         'max_depth': 3,
#         'eta': 0.1
#     }

#     # Train the model
#     model = xgb.train(params, dtrain, num_boost_round=100)

#     return model, params

# # Create a directory to store the models and feature sets if it doesn't exist
# output_dir = folder_path+ 'models\\'
# os.makedirs(output_dir, exist_ok=True)

# # Loop over each data type
# for data_type in data_types:
#     # Filter data by data_type and data_split (train data)
#     train_data = combined_data_encoded[(combined_data_encoded['data_type'] == data_type) & (combined_data_encoded['data_split'] == 'train')]
#     test_data = combined_data_encoded[(combined_data_encoded['data_type'] == data_type) & (combined_data_encoded['data_split'] == 'test')]

#     # Define features (X) and target (y) for train data
#     X_train = train_data.drop(columns=['Study Status', 'data_split', 'data_type', 'NCT Number', 'Enrollment'])
#     y_train = train_data['Study Status']

#     # Define features (X) and target (y) for test data
#     X_test = test_data.drop(columns=['Study Status', 'data_split', 'data_type', 'NCT Number', 'Enrollment'])
#     y_test = test_data['Study Status']

#     # Train the XGBoost model with class balancing using the custom training function
#     model, params = train_xgboost_model(X_train, y_train)

#     # Save the trained model
#     model_filename = f"{data_type}_xgboost_model.json"
#     model.save_model(os.path.join(output_dir, model_filename))

#     # Save the features used for the model
#     feature_filename = f"{data_type}_features.json"
#     features = list(X_train.columns)
#     with open(os.path.join(output_dir, feature_filename), 'w') as f:
#         json.dump(features, f)

#     # Store the model parameters and feature information
#     model_params[data_type] = {
#         'params': params,
#         'features': features,
#         'model_filename': model_filename,
#         'feature_filename': feature_filename
#     }

#     # Predict on the test set (get class labels)
#     y_pred_test = model.predict(xgb.DMatrix(X_test))
#     y_pred_class_test = np.round(y_pred_test)  # Convert probabilities to class labels (0 or 1)

#     # Get predicted probabilities for both classes
#     y_pred_proba_test = model.predict(xgb.DMatrix(X_test))

#     # If the model outputs probabilities for a single class, we need to adjust for binary classification
#     if y_pred_proba_test.ndim == 1:  # Only one probability returned
#         y_pred_proba_test = np.vstack([1 - y_pred_proba_test, y_pred_proba_test]).T  # Create a 2D array: class 0 and class 1 probabilities

#     # Predict on the train set (get class labels)
#     y_pred_train = model.predict(xgb.DMatrix(X_train))
#     y_pred_class_train = np.round(y_pred_train)  # Convert probabilities to class labels (0 or 1)

#     # Get predicted probabilities for both classes for the train set
#     y_pred_proba_train = model.predict(xgb.DMatrix(X_train))

#     # If the model outputs probabilities for a single class, we need to adjust for binary classification
#     if y_pred_proba_train.ndim == 1:  # Only one probability returned
#         y_pred_proba_train = np.vstack([1 - y_pred_proba_train, y_pred_proba_train]).T  # Create a 2D array: class 0 and class 1 probabilities

#     # Prepare final predictions for the test set with NCT Number
#     final_predictions_test = pd.DataFrame()
#     final_predictions_test['true_label'] = y_test
#     final_predictions_test['predicted_class'] = y_pred_class_test
#     final_predictions_test['prob_class_0'] = y_pred_proba_test[:, 0]  # Probability of class 0
#     final_predictions_test['prob_class_1'] = y_pred_proba_test[:, 1]  # Probability of class 1
#     final_predictions_test['NCT Number'] = test_data['NCT Number']
#     final_predictions_test['data_split'] = 'test'  # Add data_split marker
#     final_predictions_test['data_type'] = data_type  # Add data_type marker

#     # Prepare final predictions for the train set with NCT Number
#     final_predictions_train = pd.DataFrame()
#     final_predictions_train['true_label'] = y_train
#     final_predictions_train['predicted_class'] = y_pred_class_train
#     final_predictions_train['prob_class_0'] = y_pred_proba_train[:, 0]  # Probability of class 0
#     final_predictions_train['prob_class_1'] = y_pred_proba_train[:, 1]  # Probability of class 1
#     final_predictions_train['NCT Number'] = train_data['NCT Number']
#     final_predictions_train['data_split'] = 'train'  # Add data_split marker
#     final_predictions_train['data_type'] = data_type  # Add data_type marker

#     # Append both train and test predictions into the combined results
#     combined_results.append(final_predictions_test)
#     combined_results.append(final_predictions_train)

# # Combine all results into a single DataFrame
# combined_predictions = pd.concat(combined_results, ignore_index=True)

# # Save the combined predictions to a single CSV file
# combined_predictions.to_csv(folder_path +'combined_predictions.csv', index=False)

# # Save the model parameters (used for inference) to a JSON file
# with open(folder_path + 'models\\model_params.json', 'w') as f:
#     json.dump(model_params, f)

# print("Combined predictions and models have been saved.")