In [1]:
import contextlib
import sys

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from skopt import BayesSearchCV

In [None]:
def train_model(X, y, test_year=2019, random_state=42):
    """Tune and fit a random-forest regressor with a year-based hold-out split.

    Rows whose ``X['date']`` falls in ``test_year`` form the test set; every
    other row is used for training. Hyper-parameters are tuned with
    ``BayesSearchCV`` on the training rows only.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing a datetime ``'date'`` column (accessed via
        ``.dt``) plus the feature columns.
    y : pandas.DataFrame
        Target frame containing a ``'date'`` column plus the target columns.
        It is filtered with a boolean mask built from ``X``, so it must share
        ``X``'s index.
    test_year : int, default 2019
        Calendar year held out as the test set.
    random_state : int, default 42
        Seed used for both the forest and the Bayesian search.

    Returns
    -------
    tuple
        ``(best_model, predictions, y_test, X_test, dates)`` where
        ``best_model`` is the refitted best estimator, ``predictions`` is its
        output on the test features, ``y_test`` / ``X_test`` are NumPy arrays
        with the ``'date'`` column removed, and ``dates`` is a one-column
        DataFrame holding the test rows' dates.

    Raises
    ------
    ValueError
        If the split would leave the training set or the test set empty.
    """
    # Build the split mask once instead of recomputing it for every subset.
    is_test = X['date'].dt.year == test_year
    if not is_test.any():
        raise ValueError(f"No rows with date in year {test_year}; test set would be empty")
    if is_test.all():
        raise ValueError(f"All rows have date in year {test_year}; training set would be empty")

    y_test = y[is_test].copy()
    y_train = y[~is_test].copy()

    # Keep the test dates so callers can re-attach them to the predictions.
    dates = y_test[['date']].copy()

    print('Checkpoint 0 passed')

    # The estimator only sees numeric arrays; 'date' is dropped everywhere.
    X_train = np.array(X[~is_test].drop(columns=['date']))
    X_test = np.array(X[is_test].drop(columns=['date']))
    y_train = np.array(y_train.drop(columns=['date']))
    y_test = np.array(y_test.drop(columns=['date']))

    # NOTE(review): some lists mix None/str/float categories -- confirm the
    # installed scikit-optimize version accepts them as categorical spaces.
    param_grid = {
        'n_estimators': [100, 1000, 2000],
        'max_features': [None, 'sqrt', 'log2', 0.5],
        'max_depth': [None, 10, 20, 30, 40, 50, 60],
        'min_samples_split': [2, 5, 10, 20],
        'min_samples_leaf': [1, 2, 4, 8],
        'bootstrap': [True, False]
    }
    # Seed the forest as well; seeding only the search leaves fits random.
    model = RandomForestRegressor(random_state=random_state)
    print('Checkpoint 1 passed')
    bayes_search = BayesSearchCV(
        estimator=model,
        search_spaces=param_grid,
        n_iter=20,  # Number of iterations for optimization
        cv=10,
        # scikit-learn scorers are "greater is better", so MSE is exposed as
        # 'neg_mean_squared_error'; 'mean_squared_error' is not a valid name.
        scoring='neg_mean_squared_error',
        random_state=random_state
    )
    print('Checkpoint 2 passed')

    bayes_search.fit(X_train, y_train)
    # With the default refit=True the search has already refitted the best
    # configuration on the full training set, so no second fit is needed.
    best_model = bayes_search.best_estimator_

    print("Model fitting done. Prediction initiated")

    predictions = best_model.predict(X_test)
    print("Model predicting done")
    return best_model, predictions, y_test, X_test, dates

In [None]:
def _to_dated_frame(values, dates, target_columns):
    """Wrap a model-output array in a DataFrame with 'date' as the first column.

    ``values`` is labelled with ``target_columns`` and the dates are attached
    positionally, so the result does not depend on any index alignment or on
    where 'date' sits in the original target frame.
    """
    frame = pd.DataFrame(values, columns=target_columns)
    frame.insert(0, 'date', dates['date'].to_numpy())
    return frame


# redirect_stdout restores the previous stream on exit, including when an
# exception is raised. Assigning sys.stdout by hand and resetting it to
# sys.__stdout__ leaves stdout pointing at a closed file after an error, and
# does not give back the notebook's own output stream.
with open('rfr_regressor.txt', 'w') as file, contextlib.redirect_stdout(file):
    print("Data processing started")

    df = pd.read_csv(r'smps_output_combined.csv')
    df['date'] = pd.to_datetime(df['date'])
    # Drop leftover index columns written by an earlier to_csv, then any
    # row with missing values.
    df = df.drop(df.columns[df.columns.str.contains('Unnamed')], axis=1)
    df = df.dropna()

    X = df[['date', "pressure", "RH", "temperature", "SWD"]].copy()
    y = df.drop(columns=["pressure", "RH", "temperature", "SWD", "pm10", "CS"])

    print("Data processing done. Model generation started")

    best_model, predictions, y_test, X_test, dates = train_model(X=X, y=y)

    # train_model drops 'date' before converting y to an array, so the array
    # columns follow y's remaining columns in order.
    target_columns = [col for col in y.columns if col != 'date']

    print('Checkpoint 4 passed')

    predictions_df = _to_dated_frame(predictions, dates, target_columns)
    # File name corrected from the typo 'redictions_df.csv'.
    predictions_df.to_csv(r'predictions_df.csv')

    print('Checkpoint 5 passed')

    y_test_df = _to_dated_frame(y_test, dates, target_columns)
    y_test_df.to_csv(r'y_test_df.csv')

    print('Checkpoint 6 passed')
    print("Outputting done")

    mse = mean_squared_error(y_test, predictions)
    print(f"Mean Squared Error: {mse}")