In [None]:

import pandas as pd
import numpy as np
import joblib
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import gc
from sklearn.multioutput import MultiOutputRegressor

In [None]:
def train_model(X, y, X_test, y_test):
    """Tune, fit and average a random-forest and an XGBoost multi-output regressor.

    Each base regressor is tuned with ``RandomizedSearchCV`` (3 sampled
    candidates, 3-fold CV, negative-MSE scoring). The best estimator of each
    search is wrapped in ``MultiOutputRegressor``, refit on the full training
    data, and the two sets of test predictions are averaged element-wise.

    Parameters
    ----------
    X, y : array-like
        Training features and training targets; converted with ``np.array``.
    X_test : array-like
        Test features; converted with ``np.array``.
    y_test : pandas.DataFrame
        Test targets. A ``'date'`` column, if present, is split off and
        returned separately rather than being treated as a target.

    Returns
    -------
    tuple
        ``(rf_multi, predictions, y_test, X_test, dates)``:

        * ``rf_multi`` -- the fitted random-forest ``MultiOutputRegressor``.
          NOTE: ``predictions`` are the RF/XGBoost average, so this model on
          its own does not reproduce them.
        * ``predictions`` -- averaged predictions for ``X_test``.
        * ``y_test`` / ``X_test`` -- the test targets (without ``'date'``) and
          test features as NumPy arrays.
        * ``dates`` -- one-column DataFrame with the ``'date'`` values, indexed
          0..n-1 so that it lines up row-for-row with the returned arrays, or
          ``None`` when ``y_test`` has no ``'date'`` column.

    Raises
    ------
    RuntimeError
        If any step fails; the original exception is attached as ``__cause__``.
    """
    try:
        if 'date' in y_test.columns:
            # Everything else returned from here is a positional NumPy array,
            # so drop the (possibly gappy, e.g. after dropna) source index.
            # Otherwise assigning these dates into a fresh DataFrame would be
            # index-aligned and silently shift or blank out rows.
            dates = y_test[['date']].reset_index(drop=True)
        else:
            dates = None
        y_test = y_test.drop(columns=['date'], errors='ignore')

        X_train = np.array(X)
        y_train = np.array(y)
        X_test = np.array(X_test)
        y_test = np.array(y_test)

        rf_param_dist = {
            'max_depth': [10, 20, None],
            'max_features': ['sqrt', 0.5],
            'min_samples_split': [2, 5],
            'n_estimators': [100, 500]
        }

        xgb_param_dist = {
            'n_estimators': [100, 500],
            'learning_rate': [0.1, 0.2],
            'max_depth': [3, 6],
            'subsample': [0.7, 1.0]
        }

        # Seed the forest like the XGBoost model and both searches so that a
        # rerun on the same data gives the same model and predictions.
        rf = RandomForestRegressor(random_state=42)
        xgb = XGBRegressor(objective="reg:squarederror", random_state=42)

        rf_search = RandomizedSearchCV(
            estimator=rf,
            param_distributions=rf_param_dist,
            n_iter=3,
            scoring='neg_mean_squared_error',
            cv=3,
            verbose=1,
            random_state=42,
            n_jobs=-1
        )
        xgb_search = RandomizedSearchCV(
            estimator=xgb,
            param_distributions=xgb_param_dist,
            n_iter=3,
            scoring='neg_mean_squared_error',
            cv=3,
            verbose=1,
            random_state=42,
            n_jobs=-1
        )

        rf_search.fit(X_train, y_train)
        xgb_search.fit(X_train, y_train)

        best_rf = rf_search.best_estimator_
        best_xgb = xgb_search.best_estimator_

        # MultiOutputRegressor refits a clone of the tuned estimator per target.
        rf_multi = MultiOutputRegressor(best_rf)
        xgb_multi = MultiOutputRegressor(best_xgb)

        # Release search-time temporaries before the second round of fitting.
        gc.collect()

        rf_multi.fit(X_train, y_train)
        xgb_multi.fit(X_train, y_train)

        rf_predictions = rf_multi.predict(X_test)
        xgb_predictions = xgb_multi.predict(X_test)

        gc.collect()

        # Simple unweighted ensemble of the two models.
        predictions = (rf_predictions + xgb_predictions) / 2

        return rf_multi, predictions, y_test, X_test, dates

    except Exception as e:
        # Chain explicitly so the root cause stays attached to the wrapper.
        raise RuntimeError(f"Training failed: {str(e)}") from e

In [None]:
print("1. Data processing started")

# --- Training data -----------------------------------------------------------
df = pd.read_csv('smps_output_combined(backup).csv')
df['date'] = pd.to_datetime(df['date'])
# Drop index-like columns whose name contains "Unnamed".
df = df.drop(columns=df.columns[df.columns.str.contains('Unnamed')])
df = df.dropna()
print("2. Training data cleaned")

# SMOTE is used only to rebalance rows across the 'day.type' classes; the
# resampled labels themselves are discarded below and every non-feature
# column becomes a regression target.
y = df['day.type']
X = df.drop(columns=['day.type', 'date'])

# Fixed seed so the synthetic rows (and everything downstream) are reproducible.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X, y = smote.fit_resample(X, y)
print(f"3. SMOTE applied. New shape: {X.shape}")

features = ["p", "RH", "t", "SRAD"]
y = X.drop(columns=features)   # targets: every resampled column except the features
X = X[features].copy()

# --- Test data ---------------------------------------------------------------
df_test = pd.read_csv('test_set(backup).csv')
df_test['date'] = pd.to_datetime(df_test['date'])
df_test = df_test.drop(columns=df_test.columns[df_test.columns.str.contains('Unnamed')])
df_test = df_test.dropna()
X_test = df_test[features].copy()
# Select the targets by name, in training order, instead of "everything except
# the features": that keeps out extra columns such as 'day.type' and guarantees
# the arrays returned by train_model match y.columns column-for-column.
y_test = df_test[['date'] + list(y.columns)]
print("5. Test data processed", flush=True)

# --- Train and persist -------------------------------------------------------
print("6. Starting model training", flush=True)
best_model, predictions, y_test_arr, X_test_arr, dates = train_model(X, y, X_test, y_test)
# train_model returns the random-forest multi-output model as its first element;
# the averaged predictions also involve an XGBoost model that is not saved here.
joblib.dump(best_model, 'best_model.pkl')
print("7. Model saved", flush=True)

# The prediction/target arrays are positional, whereas `dates` may still carry
# the filtered index of df_test. Reset it so the column assignments below line
# up row-for-row instead of being index-aligned (which would shift dates or
# produce NaT whenever dropna() removed rows).
test_dates = pd.to_datetime(dates['date']).reset_index(drop=True)

pred_df = pd.DataFrame(predictions, columns=y.columns)
pred_df['date'] = test_dates
pred_df = pred_df[['date'] + list(y.columns)]
pred_df.to_csv('predictions_df.csv', index=False)
print("9. Predictions saved", flush=True)

y_test_df = pd.DataFrame(y_test_arr, columns=y.columns)
y_test_df['date'] = test_dates
y_test_df = y_test_df[['date'] + list(y.columns)]
y_test_df.to_csv('y_test_df.csv', index=False)
print("10. Test values saved", flush=True)

mse = mean_squared_error(y_test_df.drop(columns='date'), pred_df.drop(columns='date'))
print(f"\n11. Final MSE: {mse:.4f}", flush=True)