In [37]:
import sys
import os
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor


In [38]:
# Make the project's local `src` directory importable.
# The path is resolved against the current working directory, so the notebook
# has to be started from the directory that contains `src`.
# The membership check keeps re-runs of this cell from appending the same
# entry to sys.path over and over.
src_dir = os.path.join(os.getcwd(), "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

from utils.visualization import plot_model_predictions, plot_model_errors
from utils.model_io import save_model
from models.linear_regression import fine_tune_linear_model
from models.random_forest import fine_tune_random_forest
from models.xgboost import fine_tune_xgboost
from models.train import run_regression_model


In [49]:
# Step 1: Read the processed dataset from disk (path is relative to the
# current working directory).
data_path = "data/processed/structured_data_12_4.csv"
df = pd.read_csv(data_path)

# Step 2: Split the frame into the regression target and the predictors.
# The target itself and three further columns are excluded from the features;
# 'Unnamed: 0' is presumably a row index written out with the CSV — confirm.
y = df['log_copy_number']
X = df.drop(
    columns=[
        'copy_number',
        'log_copy_number',
        'time_to_threshold',
        'Unnamed: 0',
    ]
)

In [50]:
# Display the feature matrix to check which columns remain after the drop.
X

Unnamed: 0,n_spots,max_n_spots,bulk_fluorescence,avg_spot_size,percent_area,max_spot_change,max_area_change,max_fluorescence_change
0,38,104,10.472729,0.000739,0.028094,30,0.011298,4.752852
1,27,40,12.495401,0.000356,0.009606,9,0.003808,7.474452
2,21,44,22.607191,0.000290,0.006097,11,0.003915,11.586981
3,75,142,17.086556,0.000303,0.022758,37,0.018607,9.947778
4,63,119,74.832584,0.000835,0.052581,32,0.014258,16.669493
...,...,...,...,...,...,...,...,...
89,1252,1330,362.481914,0.000399,0.499634,998,0.426684,111.010662
90,1223,1297,376.729894,0.000439,0.536888,1014,0.397206,115.842561
91,1380,1451,311.095399,0.000387,0.533936,1015,0.452014,100.562599
92,1353,1364,475.089333,0.000408,0.552328,1068,0.480369,135.813390


In [51]:
# Step 3: Tune each candidate model, starting with the linear one.
# The helper's result is unpacked as a pair; only the second item is kept
# (presumably the tuned estimator — see models.linear_regression).
print("Optimizing Linear Regression...")
_, linear_model = fine_tune_linear_model(
    X,
    y,
    model_type='linear',
)


Optimizing Linear Regression...


In [52]:
# Tune the random forest, then rebuild it from the tuned parameters and fit
# it on the full X / y. The tuned parameters are splatted into the
# constructor as keyword arguments; the seed is fixed at 42.
print("Optimizing Random Forest...")
best_rf_params = fine_tune_random_forest(X, y)
rf_model = RandomForestRegressor(
    random_state=42,
    **best_rf_params,
)
rf_model.fit(X, y)


Optimizing Random Forest...
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV Score (Negative MSE): -1.0829319123699808


In [53]:
# Tune XGBoost, then build the regressor from the tuned parameters and fit it
# on the full X / y.
# NOTE(review): the recorded run of this cell stopped with
# "AttributeError: 'super' object has no attribute '__sklearn_tags__'".
# That looks like an xgboost / scikit-learn version incompatibility rather
# than a problem in this cell — confirm the installed versions.
print("Optimizing XGBoost...")
best_xgb_params = fine_tune_xgboost(X, y)
xgb_model = XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    **best_xgb_params,
)
xgb_model.fit(X, y)

Optimizing XGBoost...




AttributeError: 'super' object has no attribute '__sklearn_tags__'

In [54]:
# Step 4: Score each tuned model.
# run_regression_model (imported from models.train) is unpacked into three
# values per model, named here as average train MSE, average test MSE and a
# predictions frame.
print("Evaluating Linear Regression...")
(
    lr_avg_train_mse,
    lr_avg_test_mse,
    lr_predictions_df,
) = run_regression_model(X, y, linear_model)

print("Evaluating Random Forest...")
(
    rf_avg_train_mse,
    rf_avg_test_mse,
    rf_predictions_df,
) = run_regression_model(X, y, rf_model)

# XGBoost evaluation is currently switched off:
# print("Evaluating XGBoost...")
# xgb_avg_train_mse, xgb_avg_test_mse, xgb_predictions_df = run_regression_model(X, y, xgb_model)


Evaluating Linear Regression...
Evaluating Random Forest...


In [55]:
# Step 5: Report the metrics and save a prediction plot for every model.
# Each entry maps a display name to the tuple
# (avg train MSE, avg test MSE, predictions frame, model instance).
models_results = {}
models_results["Linear Regression"] = (
    lr_avg_train_mse, lr_avg_test_mse, lr_predictions_df, linear_model,
)
models_results["Random Forest"] = (
    rf_avg_train_mse, rf_avg_test_mse, rf_predictions_df, rf_model,
)
# The XGBoost entry is switched off:
# models_results["XGBoost"] = (xgb_avg_train_mse, xgb_avg_test_mse, xgb_predictions_df, xgb_model)

# Output directory for the plots; created on demand, reused if present.
models_dir = "result_3"
os.makedirs(models_dir, exist_ok=True)

for model_name, model_entry in models_results.items():
    avg_train_mse, avg_test_mse, predictions_df, model = model_entry

    print(f"{model_name} Metrics:")
    print(f"  Average Train MSE: {avg_train_mse}")
    print(f"  Average Test MSE: {avg_test_mse}")

    # Hand the model to the plotting helper, which writes into models_dir.
    print(f"Plotting and saving predictions for {model_name}...")
    plot_model_predictions(X, y, model, output_dir=models_dir)


Linear Regression Metrics:
  Average Train MSE: 0.3414556781780052
  Average Test MSE: 0.4783961559230929
Plotting and saving predictions for Linear Regression...
Average Train MSE: 0.3414556781780052
Average Test MSE: 0.4783961559230929
Plot saved to result_3\LinearRegression_predictions.png
Random Forest Metrics:
  Average Train MSE: 0.022661136359702662
  Average Test MSE: 0.16417638464573756
Plotting and saving predictions for Random Forest...
Average Train MSE: 0.022661136359702662
Average Test MSE: 0.16417638464573756
Plot saved to result_3\RandomForestRegressor_predictions.png


In [None]:
# Step 6: Persist the model with the lowest average test MSE.
# Each models_results value is a tuple whose index 1 is the test MSE and
# whose index 3 is the model instance.
best_model_name, best_entry = min(
    models_results.items(),
    key=lambda item: item[1][1],
)
best_model = best_entry[3]

# File name: the display name with spaces turned into underscores, lower-cased.
model_file_stem = best_model_name.replace(' ', '_').lower()
model_path = os.path.join(models_dir, f"{model_file_stem}.pkl")
save_model(best_model, model_path)

print(f"The best model '{best_model_name}' has been saved to {model_path}.")

In [45]:
# Tabulate the linear model's coefficients next to their feature names.
# The names come from the fitted estimator when it exposes
# `feature_names_in_` (scikit-learn sets it when fitting on a DataFrame), so
# the labels match what the model was trained on even if X has been
# redefined since. If the attribute is missing, fall back to the current
# X.columns.
linear_feature_names = getattr(linear_model, "feature_names_in_", X.columns)

# Sorted by signed weight, largest first, so strongly negative coefficients
# end up at the bottom of the table.
linear_weights = pd.DataFrame({
    "Feature": linear_feature_names,
    "Weight": linear_model.coef_
}).sort_values(by="Weight", ascending=False)

print("Linear Regression Weights:")
print(linear_weights)

Linear Regression Weights:
                   Feature      Weight
7          max_area_change    1.732991
5             percent_area    0.439701
0               Unnamed: 0    0.047884
3        bulk_fluorescence    0.000583
2              max_n_spots    0.000228
1                  n_spots   -0.000325
6          max_spot_change   -0.000572
8  max_fluorescence_change   -0.002074
4            avg_spot_size -200.344815


In [46]:
# Tabulate the random forest's feature importances next to their feature names.
# The names come from the fitted estimator when it exposes
# `feature_names_in_` (scikit-learn sets it when fitting on a DataFrame), so
# the labels match what the model was trained on even if X has been
# redefined since. If the attribute is missing, fall back to the current
# X.columns.
rf_feature_names = getattr(rf_model, "feature_names_in_", X.columns)

# Most important feature first.
rf_feature_importances = pd.DataFrame({
    "Feature": rf_feature_names,
    "Importance": rf_model.feature_importances_
}).sort_values(by="Importance", ascending=False)

print("Random Forest Feature Importances:")
print(rf_feature_importances)

Random Forest Feature Importances:
                   Feature  Importance
0               Unnamed: 0    0.967326
1                  n_spots    0.009251
2              max_n_spots    0.006769
6          max_spot_change    0.004770
7          max_area_change    0.003757
4            avg_spot_size    0.003399
3        bulk_fluorescence    0.001876
5             percent_area    0.001844
8  max_fluorescence_change    0.001008


np.float64(1.0)