In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
import sys
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from imblearn.over_sampling import SMOTE
import joblib

In [2]:
def train_model(X_train, y_train, X_test, y_test):
    """Fit the fixed-hyperparameter random forest and predict on the test features.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and targets. Both are converted with ``np.array``
        before fitting.
    X_test, y_test : array-like
        Held-out features and targets. Both are converted with ``np.array``.
        ``y_test`` is not used for fitting or scoring here; it is only
        converted and handed back to the caller.

    Returns
    -------
    tuple
        ``(best_model, predictions, y_test, X_test)``: the fitted
        ``RandomForestRegressor``, the result of ``best_model.predict(X_test)``,
        and the NumPy-array versions of ``y_test`` and ``X_test``.
    """
    print('Checkpoint 0 passed')

    # Work on plain NumPy arrays from here on; any DataFrame column labels
    # are dropped by this conversion.
    X_train = np.array(X_train)
    X_test = np.array(X_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)

    best_model = RandomForestRegressor(
        bootstrap=False,
        max_depth=20,
        max_features=0.5,
        min_samples_leaf=2,
        min_samples_split=2,
        n_estimators=2000,
    )
    print("Model generation done. Model fitting initiated")
    print('Checkpoint 1 passed')
    best_model.fit(X_train, y_train)

    print("Model fitting done. Prediction initiated")

    predictions = best_model.predict(X_test)
    print("Model predicting done")

    # Optional persistence step, left disabled as in the original notebook:
    # joblib.dump(best_model, 'best_model.pkl')
    # print("Model saved to best_model.pkl")

    return best_model, predictions, y_test, X_test

In [None]:
# End-to-end run: load the combined CSV, hold out 2019 as the test year,
# oversample the training rows with SMOTE keyed on 'day.type', fit the model
# via train_model(), then write predictions / ground truth to CSV.
# All print() output is redirected to 'rfr_regressor_test.txt' while this runs.
with open('rfr_regressor_test.txt', 'w') as file:
    # Remember the stream that is active right now so it can be put back
    # afterwards. Restoring sys.__stdout__ instead would not return output to
    # the notebook, whose stdout is generally a different object.
    previous_stdout = sys.stdout
    sys.stdout = file
    try:
        print("Data processing started")

        df = pd.read_csv(r'smps_output_combined_test.csv')
        df['date'] = pd.to_datetime(df['date'])
        # Drop the 'Unnamed...' columns, then any row with missing values.
        df = df.drop(df.columns[df.columns.str.contains('Unnamed')], axis=1)
        df = df.dropna()

        # X keeps 'date' (for the year split) and 'day.type' (SMOTE class label).
        X = df[['date', "pressure", "RH", "temperature", "SWD", 'day.type']].copy()
        # y is every remaining column except the listed ones; it still carries 'date'.
        y = df.drop(columns=["pressure", "RH", "temperature", "SWD", "pm10", "CS", 'day.type'])

        # 2019 rows form the test set, everything else is training data.
        is_test_year = X['date'].dt.year == 2019
        X_test = X[is_test_year].copy()
        y_test = y[is_test_year].copy()
        X_train = X[~is_test_year].copy()
        y_train = y[~is_test_year].copy()

        # Keep the test dates aside so they can be re-attached to the outputs.
        dates = y_test[['date']].copy()
        y_test = y_test.drop(columns=['date'])
        X_test = X_test.drop(columns=['date', 'day.type'])

        # Re-join training features and targets so SMOTE resamples them together.
        # NOTE(review): joining on 'date' assumes dates are unique in the
        # training rows; duplicated dates would pair unrelated rows — confirm.
        C = pd.merge(X_train, y_train, on='date', how='outer')
        Z = C['day.type']
        C = C.drop(columns=['day.type', 'date'])
        # NOTE(review): no random_state is passed, so the resampled set can
        # differ between runs.
        smote = SMOTE(sampling_strategy='auto')
        C, Z = smote.fit_resample(C, Z)

        # Split the resampled table back into features and targets.
        feature_columns = ["pressure", "RH", "temperature", "SWD"]
        X_train = C[feature_columns].copy()
        y_train = C.drop(columns=feature_columns)

        print("Data processing done. Model generation started")

        best_model, predictions, y_test, X_test = train_model(
            X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test
        )

        # Positional index so the dates line up row-by-row with the
        # array-backed frames built below.
        dates = dates.reset_index(drop=True)

        print('Checkpoint 4 passed')

        # A frame built from an array already has a 0..n-1 index, so no
        # reset_index is needed before attaching the dates.
        predictions_df = pd.DataFrame(predictions)
        predictions_df['date'] = dates['date']
        predictions_df.to_csv('predictions_df_test.csv')

        print('Checkpoint 4.1 passed')

        print('Checkpoint 5 passed')

        y_test_df = pd.DataFrame(y_test)
        y_test_df['date'] = dates['date']
        y_test_df.to_csv('y_test_df_test.csv')

        print('Checkpoint 5.1 passed')

        print('Checkpoint 6 passed')
        print("Outputting done")

        mse = mean_squared_error(y_test, predictions)
        print(f"Mean Squared Error: {mse}")
    finally:
        # Always undo the redirection, even if a step above raised, so later
        # cells do not print into a closed file.
        sys.stdout = previous_stdout