In [2]:
import pickle

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split


In [6]:
# Read the CSV into a DataFrame and preview its first rows.
# The path is relative, so the file must sit in the kernel's working directory.
csv_path = 'tvol_lidarLiveP.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Status,origin,zmax,zmean,zsd,zskew,zkurt,pzabovezmean,pzabove2,zq5,...,zpcum1,zpcum2,zpcum3,zpcum4,zpcum5,zpcum6,zpcum7,zpcum8,zpcum9,Target
0,1,0,0.482936,0.393575,0.640931,0.417088,0.000799,0.653559,0.543409,0.0,...,0.228816,0.217799,0.216589,0.214684,0.208958,0.211244,0.275474,0.485716,0.702998,395.09503
1,1,0,0.619445,0.744484,0.669192,0.315335,0.035721,0.972526,0.83226,0.0,...,0.066177,0.098509,0.116252,0.120479,0.113527,0.11218,0.112576,0.120256,0.182446,846.770732
2,1,0,0.590965,0.552324,0.719417,0.387989,0.00511,0.749769,0.73854,0.0,...,0.217498,0.280967,0.292151,0.288404,0.281392,0.280213,0.282977,0.416297,0.870808,500.279343
3,1,0,0.585318,0.735132,0.211343,0.251785,0.194261,0.687752,0.986705,0.679823,...,0.001473,0.0,0.0,0.0,0.010708,0.162536,0.361051,0.722317,0.873967,526.744001
4,1,0,0.297815,0.381137,0.203122,0.251367,0.095055,0.959945,0.893755,0.0,...,0.050555,0.047751,0.05285,0.05643,0.053567,0.11009,0.504414,0.914992,0.984998,327.607715


In [8]:
# Feature matrix: every column except the target and the 'Unnamed: 0' column.
# NOTE(review): 'Unnamed: 0' holds 0, 1, 2, ... in the preview above, so it looks
# like a row index written out by an earlier CSV export rather than a real
# measurement -- confirm. Keeping it would let the models fit on row order.
# errors='ignore' keeps this cell working if the CSV is later re-exported
# without that column; a missing 'Target' still raises a KeyError.
X = df.drop(columns='Target').drop(columns='Unnamed: 0', errors='ignore')

# Dependent variable the regressors are trained to predict.
y = df['Target']


In [14]:
# Candidate regressors, keyed by the display name used in the printed results
# and in the pickle file name.
# The stochastic estimators are seeded so that Restart & Run All reproduces the
# same cross-validation scores and therefore the same model selection.
RANDOM_STATE = 42

models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=RANDOM_STATE),
    "XGBoost": xgb.XGBRegressor(objective='reg:squarederror', random_state=RANDOM_STATE)
}


In [18]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, r2_score

def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`.

    Computed as the square root of scikit-learn's `mean_squared_error`.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Per-model arrays of per-fold scores, keyed by model name.
# RMSE scores are stored NEGATED: make_scorer(..., greater_is_better=False)
# flips the sign so that "larger is better" holds for every scorer. Negate them
# again before reporting or comparing.
results_rmse = {}
results_r2 = {}

# Both metrics are evaluated in a single cross_validate call, so each model is
# fitted once per fold (instead of twice) and the RMSE and R2 values describe
# the same fitted models.
cv_scoring = {
    "rmse": make_scorer(rmse, greater_is_better=False),
    "r2": "r2",
}

# Perform 5-fold cross-validation for every candidate model
for name, model in models.items():
    cv_scores = cross_validate(model, X, y, cv=5, scoring=cv_scoring)

    # cross_validate prefixes each scorer name with "test_"
    rmse_scores = cv_scores["test_rmse"]
    r2_scores = cv_scores["test_r2"]
    results_rmse[name] = rmse_scores
    results_r2[name] = r2_scores

    # Output the results
    print(f"{name} RMSE: Mean = {np.mean(-rmse_scores):.3f}, Std Dev = {np.std(-rmse_scores):.3f}")
    print(f"{name} R2: Mean = {np.mean(r2_scores):.3f}, Std Dev = {np.std(r2_scores):.3f}")


Linear Regression RMSE: Mean = 109.908, Std Dev = 24.338
Linear Regression R2: Mean = 0.490, Std Dev = 0.125
Random Forest RMSE: Mean = 91.937, Std Dev = 15.869
Random Forest R2: Mean = 0.597, Std Dev = 0.241
XGBoost RMSE: Mean = 97.862, Std Dev = 17.459
XGBoost R2: Mean = 0.533, Std Dev = 0.286


In [20]:
import pickle

# Mean cross-validated RMSE per model. The stored scores are negated, so flip
# the sign back before averaging.
mean_rmse_by_model = {
    model_name: np.mean(-results_rmse[model_name]) for model_name in models
}

# Pick the model with the lowest mean RMSE (the first one wins on ties).
best_model_name, best_model_score = None, float('inf')
for candidate_name, candidate_rmse in mean_rmse_by_model.items():
    if candidate_rmse < best_model_score:
        best_model_name, best_model_score = candidate_name, candidate_rmse

# Refit the winning estimator on all available rows.
winner = models[best_model_name]
best_model = winner.fit(X, y)

# Persist the fitted model; the file name is derived from the model's display name.
pickle_path = f"{best_model_name}_Baseline.pkl"
with open(pickle_path, 'wb') as file:
    pickle.dump(best_model, file)

print(f"Saved the best model based on RMSE: {best_model_name}")


Saved the best model based on RMSE: Random Forest
