In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib

# Load the dataset
df = pd.read_csv('clinical_trial_sites.csv')

# Separate features and target
X = df.drop(columns=['site_id', 'site_rating'])  # Features: Remove site_id and site_rating
y = df['site_rating']  # Target: site_rating

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps for numeric and categorical data
numeric_features = ['num_doctors', 'num_nurses', 'avg_patient_age', 'avg_patient_income',
                    'trial_success_rate', 'patient_enrollment_rate', 'bed_count', 'years_operational']
categorical_features = ['region', 'site_size', 'previous_experience', 'site_type', 'training_quality',
                        'site_location', 'site_capacity']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Combine preprocessing and RandomForestClassifier into a single pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(random_state=42))])

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'classifier__n_estimators': [300, 400, 500],
    'classifier__max_depth': [10, 20, 30, 40],
    'classifier__min_samples_split': [5, 10, 20],
    'classifier__min_samples_leaf': [3, 6, 9]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best model from GridSearchCV
best_model = grid_search.best_estimator_

# Evaluate on test set
y_pred = best_model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Save the trained model as a joblib file
joblib.dump(best_model, 'your_trained_model.pkl')
print("Model saved as 'your_trained_model.pkl'")

Classification Report:
              precision    recall  f1-score   support

     Average       0.24      0.20      0.22        59
   Excellent       0.25      0.18      0.21        50
        Good       0.26      0.23      0.24        48
        Poor       0.24      0.40      0.30        43

    accuracy                           0.24       200
   macro avg       0.25      0.25      0.24       200
weighted avg       0.25      0.24      0.24       200

Model saved as 'your_trained_model.pkl'


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib

# Load the dataset
df = pd.read_csv('clinical_trial_sites.csv')

# Separate features and target
X = df.drop(columns=['site_id', 'site_rating'])
y = df['site_rating']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps for numeric and categorical data
numeric_features = ['num_doctors', 'num_nurses', 'avg_patient_age', 'avg_patient_income',
                    'trial_success_rate', 'patient_enrollment_rate', 'bed_count', 'years_operational']
categorical_features = ['region', 'site_size', 'previous_experience', 'site_type', 'training_quality',
                        'site_location', 'site_capacity']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Combine preprocessing and RandomForestClassifier into a single pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(random_state=42))])

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'classifier__n_estimators': [300, 400, 500],
    'classifier__max_depth': [10, 20, 30, 40],
    'classifier__min_samples_split': [5, 10, 20],
    'classifier__min_samples_leaf': [3, 6, 9]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best model from GridSearchCV
best_model = grid_search.best_estimator_

# Evaluate on test set
y_pred = best_model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Save the trained model as a joblib file
joblib.dump(best_model, 'your_trained_model.pkl')
print("Model saved as 'your_trained_model.pkl'")

Classification Report:
              precision    recall  f1-score   support

     Average       0.24      0.20      0.22        59
   Excellent       0.25      0.18      0.21        50
        Good       0.26      0.23      0.24        48
        Poor       0.24      0.40      0.30        43

    accuracy                           0.24       200
   macro avg       0.25      0.25      0.24       200
weighted avg       0.25      0.24      0.24       200

Model saved as 'your_trained_model.pkl'


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib
from scipy.stats import randint, uniform

# Load the dataset
df = pd.read_csv('clinical_trial_sites.csv')

# Separate features and target
X = df.drop(columns=['site_id', 'site_rating'])
y = df['site_rating']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps for numeric and categorical data
numeric_features = ['num_doctors', 'num_nurses', 'avg_patient_age', 'avg_patient_income',
                    'trial_success_rate', 'patient_enrollment_rate', 'bed_count', 'years_operational']
categorical_features = ['region', 'site_size', 'previous_experience', 'site_type', 'training_quality',
                        'site_location', 'site_capacity']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Combine preprocessing and GradientBoostingClassifier into a single pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', GradientBoostingClassifier(random_state=42))])

# Hyperparameter tuning using RandomizedSearchCV
param_dist = {
    'classifier__n_estimators': randint(100, 1000),
    'classifier__learning_rate': uniform(0.01, 0.3),
    'classifier__max_depth': randint(3, 10),
    'classifier__min_samples_split': randint(2, 20),
    'classifier__min_samples_leaf': randint(1, 20),
    'classifier__subsample': uniform(0.5, 1.0)
}

random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=100, cv=5, n_jobs=-1, scoring='accuracy', random_state=42)
random_search.fit(X_train, y_train)

# Best model from RandomizedSearchCV
best_model = random_search.best_estimator_

# Evaluate on test set
y_pred = best_model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Save the trained model as a joblib file
joblib.dump(best_model, 'your_trained_model.pkl')
print("Model saved as 'your_trained_model.pkl'")

255 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\imrkm\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\imrkm\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\imrkm\anaconda3\Lib\site-packages\sklearn\ensemble\_gb.py", line 420, in fit
    self._validate_params()
  File "C:\Users\imrkm\anaconda3\Lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File 

Classification Report:
              precision    recall  f1-score   support

     Average       0.32      0.25      0.28        59
   Excellent       0.36      0.32      0.34        50
        Good       0.19      0.23      0.21        48
        Poor       0.22      0.26      0.24        43

    accuracy                           0.27       200
   macro avg       0.27      0.26      0.27       200
weighted avg       0.28      0.27      0.27       200

Model saved as 'your_trained_model.pkl'


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
import joblib
from scipy.stats import randint, uniform

# Load the dataset
df = pd.read_csv('clinical_trial_sites.csv')

# Separate features and target
X = df.drop(columns=['site_id', 'site_rating'])
y = df['site_rating']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps for numeric and categorical data
numeric_features = ['num_doctors', 'num_nurses', 'avg_patient_age', 'avg_patient_income',
                    'trial_success_rate', 'patient_enrollment_rate', 'bed_count', 'years_operational']
categorical_features = ['region', 'site_size', 'previous_experience', 'site_type', 'training_quality',
                        'site_location', 'site_capacity']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Combine preprocessing and SMOTE in a pipeline
pipeline = ImbPipeline(steps=[('preprocessor', preprocessor),
                              ('smote', SMOTE(random_state=42)),
                              ('classifier', GradientBoostingClassifier(random_state=42))])

# Hyperparameter tuning using RandomizedSearchCV
param_dist = {
    'classifier__n_estimators': randint(100, 1000),
    'classifier__learning_rate': uniform(0.01, 0.3),
    'classifier__max_depth': randint(3, 10),
    'classifier__min_samples_split': randint(2, 20),
    'classifier__min_samples_leaf': randint(1, 20),
    'classifier__subsample': uniform(0.5, 1.0)
}

random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=100, cv=5, n_jobs=-1, scoring='accuracy', random_state=42)
random_search.fit(X_train, y_train)

# Best model from RandomizedSearchCV
best_model = random_search.best_estimator_

# Evaluate on test set
y_pred = best_model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Save the trained model as a joblib file
joblib.dump(best_model, 'your_trained_model.pkl')
print("Model saved as 'your_trained_model.pkl'")

255 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\imrkm\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\imrkm\anaconda3\Lib\site-packages\imblearn\pipeline.py", line 297, in fit
    self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  File "C:\Users\imrkm\anaconda3\Lib\site-packages\sklearn\ensemble\_gb.py", line 420, in fit
    self._validate_params()
  File "C:\Users\imrkm\anaconda3\Lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  Fil

Classification Report:
              precision    recall  f1-score   support

     Average       0.32      0.24      0.27        59
   Excellent       0.29      0.30      0.30        50
        Good       0.14      0.17      0.15        48
        Poor       0.25      0.28      0.26        43

    accuracy                           0.24       200
   macro avg       0.25      0.25      0.25       200
weighted avg       0.25      0.24      0.25       200

Model saved as 'your_trained_model.pkl'


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
import joblib
from scipy.stats import randint, uniform
import streamlit as st

# Load the dataset
df = pd.read_csv('clinical_trial_sites.csv')

# Separate features and target
X = df.drop(columns=['site_id', 'site_rating'])
y = df['site_rating']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps for numeric and categorical data
numeric_features = ['num_doctors', 'num_nurses', 'avg_patient_age', 'avg_patient_income',
                    'trial_success_rate', 'patient_enrollment_rate', 'bed_count', 'years_operational']
categorical_features = ['region', 'site_size', 'previous_experience', 'site_type', 'training_quality',
                        'site_location', 'site_capacity']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Combine preprocessing and SMOTE in a pipeline
smote = SMOTE(random_state=42)

X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

X_train_res, y_train_res = smote.fit_resample(X_train_preprocessed, y_train)

# Define classifiers
clf1 = GradientBoostingClassifier(random_state=42)
clf2 = RandomForestClassifier(random_state=42)
clf3 = LogisticRegression(max_iter=1000, random_state=42)

# Define the VotingClassifier with hard voting
voting_clf = VotingClassifier(estimators=[
    ('gb', clf1),
    ('rf', clf2),
    ('lr', clf3)
], voting='hard')

# Hyperparameter tuning using RandomizedSearchCV for each classifier
param_dist_gb = {
    'n_estimators': randint(100, 1000),
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': randint(3, 10),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'subsample': uniform(0.5, 1.0)
}

param_dist_rf = {
    'n_estimators': randint(100, 1000),
    'max_depth': randint(3, 20),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'bootstrap': [True, False]
}

param_dist_lr = {
    'C': uniform(0.01, 10)
}

random_search_gb = RandomizedSearchCV(clf1, param_distributions=param_dist_gb, n_iter=100, cv=5, n_jobs=-1, scoring='accuracy', random_state=42)
random_search_rf = RandomizedSearchCV(clf2, param_distributions=param_dist_rf, n_iter=100, cv=5, n_jobs=-1, scoring='accuracy', random_state=42)
random_search_lr = RandomizedSearchCV(clf3, param_distributions=param_dist_lr, n_iter=100, cv=5, n_jobs=-1, scoring='accuracy', random_state=42)

random_search_gb.fit(X_train_res, y_train_res)
random_search_rf.fit(X_train_res, y_train_res)
random_search_lr.fit(X_train_res, y_train_res)

# Best estimators from RandomizedSearchCV
best_gb = random_search_gb.best_estimator_
best_rf = random_search_rf.best_estimator_
best_lr = random_search_lr.best_estimator_

# Voting classifier with the best estimators
voting_clf = VotingClassifier(estimators=[
    ('gb', best_gb),
    ('rf', best_rf),
    ('lr', best_lr)
], voting='hard')

# Train the voting classifier
voting_clf.fit(X_train_res, y_train_res)

# Evaluate on test set
y_pred = voting_clf.predict(X_test_preprocessed)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Save the trained model as a joblib file
joblib.dump(voting_clf, 'your_trained_model.pkl')
print("Model saved as 'your_trained_model.pkl'")

# Streamlit app to predict clinical sites with great success rate
st.title("Clinical Trial Site Success Prediction")

uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])
if uploaded_file is not None:
    df_user = pd.read_csv(uploaded_file)
    X_user = df_user.drop(columns=['site_id'])
    X_user_preprocessed = preprocessor.transform(X_user)
    predictions = voting_clf.predict(X_user_preprocessed)
    df_user['Predicted_Site_Rating'] = predictions
    st.write(df_user)
    st.download_button("Download Predictions", df_user.to_csv(index=False), "predictions.csv")

255 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\imrkm\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\imrkm\anaconda3\Lib\site-packages\sklearn\ensemble\_gb.py", line 420, in fit
    self._validate_params()
  File "C:\Users\imrkm\anaconda3\Lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\imrkm\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterE

Classification Report:
              precision    recall  f1-score   support

     Average       0.33      0.37      0.35        59
   Excellent       0.27      0.28      0.28        50
        Good       0.19      0.17      0.18        48
        Poor       0.26      0.23      0.24        43

    accuracy                           0.27       200
   macro avg       0.26      0.26      0.26       200
weighted avg       0.27      0.27      0.27       200



2024-07-13 13:33:44.006 
  command:

    streamlit run C:\Users\imrkm\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


Model saved as 'your_trained_model.pkl'


In [12]:
df.head(4)

Unnamed: 0,site_id,region,site_size,num_doctors,num_nurses,avg_patient_age,avg_patient_income,trial_success_rate,patient_enrollment_rate,previous_experience,site_rating,accessibility_score,site_type,bed_count,years_operational,training_quality,site_location,site_capacity
0,1,West,Small,21,43.0,52,71647.0,0.59612,0.673184,Low,Good,3,Research,260,6,Good,Urban,Low
1,2,Central,Small,30,,68,,0.537986,0.645624,Low,Poor,5,Research,221,11,Average,Rural,Medium
2,3,East,Large,12,12.0,30,85838.0,0.625967,0.671237,High,Excellent,5,Teaching,281,1,Poor,Urban,Low
3,4,Central,Medium,44,21.0,65,59539.0,0.538658,0.869171,High,Excellent,10,Teaching,55,25,Average,Suburban,High


In [22]:
print(df.dtypes)

site_id                      int64
region                      object
site_size                   object
num_doctors                  int64
num_nurses                 float64
avg_patient_age              int64
avg_patient_income         float64
trial_success_rate         float64
patient_enrollment_rate    float64
previous_experience         object
site_rating                 object
accessibility_score          int64
site_type                   object
bed_count                    int64
years_operational            int64
training_quality            object
site_location               object
site_capacity               object
dtype: object


In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import joblib

# Load the dataset
df = pd.read_csv('clinical_trial_sites.csv')

# Separate features and target
X = df.drop(columns=['site_id', 'trial_success_rate'])  # Features: Remove site_id and trial_success_rate
y = df['trial_success_rate']  # Target: trial_success_rate

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify numeric and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Define transformers for numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute with most frequent value
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define models for bagging and boosting
models = {
    'RandomForest': RandomForestRegressor(random_state=42),
    'GradientBoosting': GradientBoostingRegressor(random_state=42),
    'AdaBoost': AdaBoostRegressor(random_state=42),
    'Bagging': BaggingRegressor(base_estimator=LinearRegression(), random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'LightGBM': LGBMRegressor(random_state=42)
}

# Define parameter grids for RandomizedSearchCV
param_grids = {
    'RandomForest': {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 10, 20, 30],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4]
    },
    'GradientBoosting': {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7]
    },
    'AdaBoost': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__learning_rate': [0.01, 0.1, 1]
    },
    'Bagging': {
        'classifier__n_estimators': [10, 50, 100],
        'classifier__max_samples': [0.5, 1.0],
        'classifier__max_features': [0.5, 1.0]
    },
    'XGBoost': {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7]
    },
    'LightGBM': {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__num_leaves': [31, 40, 50]
    }
}

best_models = {}

# Train and tune models
for name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', model)])
    param_grid = param_grids[name]
    search = RandomizedSearchCV(pipeline, param_distributions=param_grid, n_iter=10, scoring='neg_mean_squared_error', cv=3, random_state=42)
    search.fit(X_train, y_train)
    best_models[name] = search.best_estimator_

# Evaluate models
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    print(f"{name} RMSE: {rmse}")

# Save the best model
best_model = min(best_models, key=lambda name: mean_squared_error(y_test, best_models[name].predict(X_test), squared=False))
joblib.dump(best_models[best_model], 'best_model.pkl')
print(f"Best model saved as 'best_model.pkl'")



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000146 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 730
[LightGBM] [Info] Number of data points in the train set: 533, number of used features: 36
[LightGBM] [Info] Start training from score 0.696056
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000118 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 731
[LightGBM] [Info] Number of data points in the train set: 533, number of used features: 36
[LightGBM] [Info] Start training from score 0.699713


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000201 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 731
[LightGBM] [Info] Number of data points in the train set: 534, number of used features: 36
[LightGBM] [Info] Start training from score 0.698884
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000152 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 730
[LightGBM] [Info] Number of data points in the train set: 533, number of used features: 36
[LightGBM] [Info] Start training from score 0.696056
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000125 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 731
[LightGBM] [Info] Number of data points in the train set: 533, number of used features: 36
[LightGBM] [Info] Start training fro

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000126 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 731
[LightGBM] [Info] Number of data points in the train set: 534, number of used features: 36
[LightGBM] [Info] Start training from score 0.698884
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000099 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 730
[LightGBM] [Info] Number of data points in the train set: 533, number of used features: 36
[LightGBM] [Info] Start training from score 0.696056
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000104 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 731
[LightGBM] [Info] Number of data points in the train set: 533, number of used features: 36
[LightGBM] [Info] Start training fro

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000095 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 731
[LightGBM] [Info] Number of data points in the train set: 534, number of used features: 36
[LightGBM] [Info] Start training from score 0.698884
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000098 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 730
[LightGBM] [Info] Number of data points in the train set: 533, number of used features: 36
[LightGBM] [Info] Start training from score 0.696056
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000130 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 731
[LightGBM] [Info] Number of data points in the train set: 533, number of used features: 36
[LightGBM] [Info] Start training fro

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000149 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 731
[LightGBM] [Info] Number of data points in the train set: 534, number of used features: 36
[LightGBM] [Info] Start training from score 0.698884
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000107 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 730
[LightGBM] [Info] Number of data points in the train set: 533, number of used features: 36
[LightGBM] [Info] Start training from score 0.696056
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 731
[LightGBM] [Info] Number of data points in the train set: 533, number of used features: 36
[LightGBM] [Info] Start training fro

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000119 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 731
[LightGBM] [Info] Number of data points in the train set: 533, number of used features: 36
[LightGBM] [Info] Start training from score 0.699713
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000120 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 731
[LightGBM] [Info] Number of data points in the train set: 534, number of used features: 36
[LightGBM] [Info] Start training from score 0.698884
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000104 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 730
[LightGBM] [Info] Number of data points in the train set: 533, number of used features: 36
[LightGBM] [Info] Start training fro

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000100 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 731
[LightGBM] [Info] Number of data points in the train set: 533, number of used features: 36
[LightGBM] [Info] Start training from score 0.699713
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000157 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 731
[LightGBM] [Info] Number of data points in the train set: 534, number of used features: 36
[LightGBM] [Info] Start training from score 0.698884


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000168 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 730
[LightGBM] [Info] Number of data points in the train set: 533, number of used features: 36
[LightGBM] [Info] Start training from score 0.696056
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 731
[LightGBM] [Info] Number of data points in the train set: 533, number of used features: 36
[LightGBM] [Info] Start training from score 0.699713


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000094 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 731
[LightGBM] [Info] Number of data points in the train set: 534, number of used features: 36
[LightGBM] [Info] Start training from score 0.698884
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000123 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 730
[LightGBM] [Info] Number of data points in the train set: 533, number of used features: 36
[LightGBM] [Info] Start training from score 0.696056


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000118 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 731
[LightGBM] [Info] Number of data points in the train set: 533, number of used features: 36
[LightGBM] [Info] Start training from score 0.699713
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000100 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 731
[LightGBM] [Info] Number of data points in the train set: 534, number of used features: 36
[LightGBM] [Info] Start training from score 0.698884
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000102 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 730
[LightGBM] [Info] Number of data points in the train set: 533, number of used features: 36
[LightGBM] [Info] Start training fro

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000123 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 731
[LightGBM] [Info] Number of data points in the train set: 533, number of used features: 36
[LightGBM] [Info] Start training from score 0.699713
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000114 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 731
[LightGBM] [Info] Number of data points in the train set: 534, number of used features: 36
[LightGBM] [Info] Start training from score 0.698884


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000168 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 925
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 36
[LightGBM] [Info] Start training from score 0.698218
RandomForest RMSE: 0.1141579946344068
GradientBoosting RMSE: 0.11293883049279202
AdaBoost RMSE: 0.11231097760697803
Bagging RMSE: 0.11446040724136708
XGBoost RMSE: 0.11225186182322239
LightGBM RMSE: 0.11382466109461885


Best model saved as 'best_model.pkl'
