In [None]:
import pandas as pd

# Load the cleaned dataset; every later cell works from this 'df'.
df = pd.read_pickle(filepath_or_buffer='df_cleaned.pkl')

# Quick sanity check: show the first five rows (the default for head()).
print(df.head(n=5))


In [None]:
# Summarise the loaded frame (column names, dtypes, non-null counts, memory use).
df.info()

In [None]:
# Part 1: pip install (Run this in a Jupyter Notebook cell)

!pip install pandas scikit-learn numpy xgboost

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def prepare_covid_data(df):
    """Prepares the COVID-19 DataFrame for time-series forecasting.

    Adds a lagged 7-day change in ``total_cases`` computed separately for each
    ``location``, copies ``new_cases_smoothed_per_million`` into ``target``,
    derives calendar features from ``date`` and drops the cumulative-case
    columns and every other ``new_cases*`` column.

    Args:
        df: DataFrame with ``location``, ``date`` (datetime dtype),
            ``total_cases``, ``total_cases_per_million`` and
            ``new_cases_smoothed_per_million`` columns. The engineered columns
            are written onto this object, so pass a copy to keep the caller's
            frame untouched. Rows are used in their existing order
            (presumably chronological within each location - confirm upstream).

    Returns:
        A DataFrame with the engineered columns added and the leaky columns
        removed.
    """
    print("Preparing COVID-19 data...")

    # 7-day change in total cases, lagged by one row and computed per location:
    # value[i] = total_cases[i-1] - total_cases[i-8] within the same location.
    # Both shifts go through the groupby so the difference never mixes rows of
    # two different locations (a plain .diff() on the shifted Series would).
    cases_by_location = df.groupby('location')['total_cases']
    df['total_cases_change_7'] = (
        cases_by_location.shift(1) - cases_by_location.shift(8)
    ).fillna(0)

    # Use the original target variable.
    df['target'] = df['new_cases_smoothed_per_million']

    # Extract time-based features.
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['week'] = df['date'].dt.isocalendar().week.astype(int)
    df['day'] = df['date'].dt.day
    df['dayofweek'] = df['date'].dt.dayofweek

    # Safety net only: the change column was already filled with 0 above, so
    # this normally removes nothing.
    df = df.dropna(subset=['total_cases_change_7'])

    # Drop total cases and total cases per million to prevent data leakage.
    df = df.drop(columns=['total_cases', 'total_cases_per_million'])

    # Drop all new cases features except target.
    columns_to_drop = [col for col in df.columns if 'new_cases' in col and col != 'new_cases_smoothed_per_million']
    df = df.drop(columns=columns_to_drop)

    print("COVID-19 data prepared.")
    return df

# Assuming 'df' is already loaded in your Jupyter Notebook.

# Ensure the 'date' column is in datetime format.
df['date'] = pd.to_datetime(df['date'])

print("Starting data preparation...")
# Bind the engineered frame to its own name and leave the raw frame in 'df'.
# prepare_covid_data drops 'total_cases', so rebinding 'df' would make this cell
# fail on a second run and break any later cell that prepares 'df' again.
df_prepared = prepare_covid_data(df.copy())
print("Data preparation complete.")

print("Starting data scaling...")
# Feature matrix: every column except the identifiers and the target
# (the target is present as both 'target' and 'new_cases_smoothed_per_million').
X = df_prepared.drop(['target', 'date', 'location', 'new_cases_smoothed_per_million'], axis=1)
y = df_prepared['target']

# Scale the data.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print("Data scaling complete.")

print("Starting feature selection with Lasso...")
# A feature counts as "selected" by a linear model when its coefficient is non-zero.
lasso = Lasso(alpha=0.01).fit(X_scaled, y)
lasso_selected = (np.abs(lasso.coef_) > 0).astype(int)
print("Lasso feature selection complete.")

print("Starting feature selection with Ridge...")
# NOTE(review): Ridge shrinks coefficients but rarely makes them exactly zero,
# so this flag is likely 1 for almost every feature - confirm it is intended.
ridge = Ridge(alpha=0.01).fit(X_scaled, y)
ridge_selected = (np.abs(ridge.coef_) > 0).astype(int)
print("Ridge feature selection complete.")

print("Starting feature selection with Gradient Boosting...")
# Fixed seed so the selected feature set is the same on every run.
gb = GradientBoostingRegressor(random_state=42).fit(X_scaled, y)
gb_selected = (gb.feature_importances_ > 0).astype(int)
print("Gradient Boosting feature selection complete.")

print("Starting feature selection with Random Forest...")
rf = RandomForestRegressor(random_state=42).fit(X_scaled, y)
rf_selected = (rf.feature_importances_ > 0).astype(int)
print("Random Forest feature selection complete.")

print("Creating feature selection DataFrame...")
selection_df = pd.DataFrame({
    'Feature': X.columns,
    'Lasso': lasso_selected,
    'GradientBoost': gb_selected,
    'RandomForest': rf_selected,
    'Ridge': ridge_selected
})

# Number of methods (0-4) that kept each feature.
selection_df['Sum'] = selection_df[['Lasso', 'GradientBoost', 'RandomForest', 'Ridge']].sum(axis=1)

print(selection_df)
print("Feature selection DataFrame created.")

# Save feature selection table to a text file.
selection_df.to_csv('feature_selection_table.txt', sep='\t', index=False)
print("Feature selection table saved to 'feature_selection_table.txt'")

print("Selecting final variables...")
# Keep only the features that all four methods agreed on.
final_var = selection_df[selection_df['Sum'] >= 4]['Feature'].tolist()
df_model = df_prepared[final_var].copy()
df_model['target'] = df_prepared['target'].copy()

print("Final variables selected.")
print("Final DataFrame info:")
df_model.info()
print("Process complete.")

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle

def prepare_covid_data(df):
    """Prepares the COVID-19 DataFrame for time-series forecasting.

    Adds a lagged 7-day change in ``total_cases`` computed separately for each
    ``location``, copies ``new_cases_smoothed_per_million`` into ``target``,
    derives calendar features from ``date`` and drops the cumulative-case
    columns and every other ``new_cases*`` column.

    Args:
        df: DataFrame with ``location``, ``date`` (datetime dtype),
            ``total_cases``, ``total_cases_per_million`` and
            ``new_cases_smoothed_per_million`` columns. The engineered columns
            are written onto this object, so pass a copy to keep the caller's
            frame untouched. Rows are used in their existing order
            (presumably chronological within each location - confirm upstream).

    Returns:
        A DataFrame with the engineered columns added and the leaky columns
        removed.
    """
    print("Preparing COVID-19 data...")

    # 7-day change in total cases, lagged by one row and computed per location:
    # value[i] = total_cases[i-1] - total_cases[i-8] within the same location.
    # Both shifts go through the groupby so the difference never mixes rows of
    # two different locations (a plain .diff() on the shifted Series would).
    lagged_cases = df.groupby('location')['total_cases']
    df['total_cases_change_7'] = (
        lagged_cases.shift(1) - lagged_cases.shift(8)
    ).fillna(0)

    # Use the original target variable.
    df['target'] = df['new_cases_smoothed_per_million']

    # Extract time-based features.
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['week'] = df['date'].dt.isocalendar().week.astype(int)
    df['day'] = df['date'].dt.day
    df['dayofweek'] = df['date'].dt.dayofweek

    # Safety net only: the change column was already filled with 0 above, so
    # this normally removes nothing.
    df = df.dropna(subset=['total_cases_change_7'])

    # Drop total cases and total cases per million to prevent data leakage.
    df = df.drop(columns=['total_cases', 'total_cases_per_million'])

    # Drop all new cases features except target.
    columns_to_drop = [col for col in df.columns if 'new_cases' in col and col != 'new_cases_smoothed_per_million']
    df = df.drop(columns=columns_to_drop)

    print("COVID-19 data prepared.")
    return df

# Assuming 'df' is already loaded in your Jupyter Notebook.

# Ensure the 'date' column is in datetime format.
df['date'] = pd.to_datetime(df['date'])

print("Starting data preparation...")
# NOTE: this rebinds 'df' to the prepared frame, which no longer has
# 'total_cases'. prepare_covid_data needs that column, so reload the raw data
# before running this cell a second time.
df = prepare_covid_data(df.copy())
print("Data preparation complete.")

print("Starting data scaling...")
# Feature matrix: every column except the identifiers and the target
# (the target is present as both 'target' and 'new_cases_smoothed_per_million').
X = df.drop(['target', 'date', 'location', 'new_cases_smoothed_per_million'], axis=1)
y = df['target']

# Scale the data.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print("Data scaling complete.")

print("Starting feature selection with Lasso...")
# A feature counts as "selected" by a linear model when its coefficient is non-zero.
lasso = Lasso(alpha=0.01).fit(X_scaled, y)
lasso_selected = (np.abs(lasso.coef_) > 0).astype(int)
print("Lasso feature selection complete.")

print("Starting feature selection with Ridge...")
# NOTE(review): Ridge shrinks coefficients but rarely makes them exactly zero,
# so this flag is likely 1 for almost every feature - confirm it is intended.
ridge = Ridge(alpha=0.01).fit(X_scaled, y)
ridge_selected = (np.abs(ridge.coef_) > 0).astype(int)
print("Ridge feature selection complete.")

print("Starting feature selection with Gradient Boosting...")
# Fixed seed so the selected feature set (and the pickled frame) is reproducible.
gb = GradientBoostingRegressor(random_state=42).fit(X_scaled, y)
gb_selected = (gb.feature_importances_ > 0).astype(int)
print("Gradient Boosting feature selection complete.")

print("Starting feature selection with Random Forest...")
rf = RandomForestRegressor(random_state=42).fit(X_scaled, y)
rf_selected = (rf.feature_importances_ > 0).astype(int)
print("Random Forest feature selection complete.")

print("Creating feature selection DataFrame...")
selection_df = pd.DataFrame({
    'Feature': X.columns,
    'Lasso': lasso_selected,
    'GradientBoost': gb_selected,
    'RandomForest': rf_selected,
    'Ridge': ridge_selected
})

# Number of methods (0-4) that kept each feature.
selection_df['Sum'] = selection_df[['Lasso', 'GradientBoost', 'RandomForest', 'Ridge']].sum(axis=1)

print(selection_df)
print("Feature selection DataFrame created.")

# Save feature selection table to a text file.
selection_df.to_csv('feature_selection_table.txt', sep='\t', index=False)
print("Feature selection table saved to 'feature_selection_table.txt'")

print("Selecting final variables...")
# Keep only the features that all four methods agreed on, then re-attach the
# target and the identifier columns needed for splitting downstream.
final_var = selection_df[selection_df['Sum'] >= 4]['Feature'].tolist()
df_model = df[final_var].copy()
df_model['target'] = df['target'].copy()
df_model['date'] = df['date'].copy()
df_model['location'] = df['location'].copy()
df_model['continent'] = df['continent'].copy()

print("Final variables selected.")
print("Final DataFrame info:")
df_model.info()

print("Saving df_model to pickle file...")
# Save df_model to a pickle file.
with open('df_model.pkl', 'wb') as f:
    pickle.dump(df_model, f)
print("df_model saved to 'df_model.pkl'")

print("Process complete.")

In [None]:
# Remove the country-name column from the modelling frame; errors='ignore'
# makes this a no-op when the column is already gone.
# NOTE(review): the train_and_evaluate_models defined in the next cell
# label-encodes df['location'] and will raise KeyError on a frame without it -
# confirm which variant is meant to run after this cell.
df_model = df_model.drop(columns=['location'], errors='ignore')
df_model.info()

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import io
from sklearn.dummy import DummyRegressor # Added import

def rmsle(y_true, y_pred):
    """Return the Root Mean Squared Logarithmic Error of two array-likes.

    Both inputs are converted to NumPy arrays and mapped through ``log1p``;
    the result is the square root of the mean squared difference.
    """
    actual_log = np.log1p(np.array(y_true))
    predicted_log = np.log1p(np.array(y_pred))
    squared_gap = (actual_log - predicted_log) ** 2
    return np.sqrt(np.mean(squared_gap))

def evaluate_model(model, X, y, name, results, dataset_type):
    """Evaluates the model and stores the metrics.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix passed to ``model.predict``.
        y: True target values for ``X``.
        name: Model label used as the first-level key in ``results``.
        results: Dict updated in place as ``results[name][dataset_type]``.
        dataset_type: Split label (the callers pass 'train', 'val' or 'test').

    Returns:
        A two-line text summary of the metrics, ending with a newline.
    """
    y_pred = model.predict(X)
    # RMSLE applies log1p, which is NaN for values below -1. Unconstrained
    # regressors can predict negative values, so clip them to 0 for this
    # metric only; the other metrics use the raw predictions.
    y_pred_clipped = np.maximum(y_pred, 0)

    mse = mean_squared_error(y, y_pred)
    mae = mean_absolute_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    rmse = math.sqrt(mse)
    rmsle_val = rmsle(y, y_pred_clipped)

    if name not in results:
        results[name] = {}

    results[name][dataset_type] = {
        'MSE': mse, 'MAE': mae, 'R2': r2, "RMSE": rmse, "RMSLE": rmsle_val
    }
    print(f"  Evaluated {name} on {dataset_type}.")
    return f"  Evaluated {name} on {dataset_type}.\n    MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}, RMSE: {rmse:.4f}, RMSLE: {rmsle_val:.4f}\n"

def train_and_evaluate_models(df, start_year, start_month, end_year, end_month):
    """Trains and evaluates multiple regression models, with stratified location split.

    Rows are restricted to the inclusive window from (start_year, start_month)
    to (end_year, end_month). Locations are then split 70/15/15 into
    train/val/test, stratified by continent, so that no location appears in
    more than one split.

    Args:
        df: Modelling frame with 'year', 'month', 'week', 'location',
            'continent' and 'target' columns plus the feature columns.
            'continent' must be castable to int.
        start_year, start_month: First month of the window (inclusive).
        end_year, end_month: Last month of the window (inclusive).

    Returns:
        Dict mapping model name to the fitted estimator, or None when no rows
        fall inside the window. A text summary is also written to
        'model_summary.txt'.
    """

    print("Starting model training and evaluation...")

    # Debugging: Print min/max year and month values
    print("Min Year:", df['year'].min(), "Max Year:", df['year'].max())
    print("Min Month:", df['month'].min(), "Max Month:", df['month'].max())

    # Filter data for given period. Comparing a single month index
    # (year * 12 + month) gives a true inclusive range: it keeps whole years
    # lying between start and end, and for start_year == end_year it keeps only
    # the months between start_month and end_month.
    month_index = df['year'] * 12 + df['month']
    in_period = month_index.between(start_year * 12 + start_month, end_year * 12 + end_month)
    df = df[in_period].copy()

    print(f"Data filtered for year {start_year} to {end_year} and month {start_month} to {end_month}. Shape:", df.shape)

    # Check for empty DataFrame after filtering.
    if df.empty:
        print("Error: DataFrame is empty after filtering. Adjust date range.")
        return None  # Or raise an exception, or return a default value

    # Extract week based feature.
    df['week'] = df['week'].astype(int)

    # Location encoding
    label_encoder = LabelEncoder()
    df['location'] = label_encoder.fit_transform(df['location'])
    df['continent'] = df['continent'].astype(int)

    # Stratified location split (KEEP DATE COLUMN DURING SPLIT).
    # One continent label per location, used as the stratification key.
    unique_locations = df['location'].unique()
    continent_by_location = df.groupby('location')['continent'].first()
    train_locations, temp_locations = train_test_split(unique_locations, test_size=0.3, random_state=42, stratify=continent_by_location.loc[unique_locations])
    val_locations, test_locations = train_test_split(temp_locations, test_size=0.5, random_state=42, stratify=continent_by_location.loc[temp_locations])
    print("Split locations into train/val/test.")

    train_df = df[df['location'].isin(train_locations)]
    val_df = df[df['location'].isin(val_locations)]
    test_df = df[df['location'].isin(test_locations)]
    print("Split data into train/val/test. Train shape:", train_df.shape, "Val shape:", val_df.shape, "Test shape:", test_df.shape)

    # Use the selected features from df_model: everything except the target,
    # the identifiers and the columns used only for filtering/splitting.
    selected_features = [col for col in df.columns if col not in ['target', 'location', 'continent', 'year', 'month','date','iso_code']]

    X_train = train_df[selected_features]
    y_train = train_df['target']
    X_val = val_df[selected_features]
    y_val = val_df['target']
    X_test = test_df[selected_features]
    y_test = test_df['target']

    print("Prepared X and y train/val/test.")

    # Scale features (fit on train only, then apply to val/test).
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)
    print("Scaled features.")

    # Define models
    models = {
        'Dummy Regressor (Mean)': DummyRegressor(strategy='mean'),
        'Linear Regression': LinearRegression(),
        'Random Forest Regressor': RandomForestRegressor(random_state=42),
        'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42),
        'SVR': SVR(),
        'K-Nearest Neighbors Regressor': KNeighborsRegressor(),
        'XGBoost Regressor': xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
    }

    results = {}
    summary_text = "Model Comparison:\n"

    # Train and evaluate models
    print("Starting model training loop...")
    for name, model in models.items():
        print(f"  Training {name}...")
        model.fit(X_train_scaled, y_train)

        summary_text += evaluate_model(model, X_train_scaled, y_train, name, results, 'train')
        summary_text += evaluate_model(model, X_val_scaled, y_val, name, results, 'val')
        summary_text += evaluate_model(model, X_test_scaled, y_test, name, results, 'test')

    # Display results
    print("\nModel Comparison:")
    print(summary_text)

    # Save summary to local file
    with open('model_summary.txt', 'w') as f:
        f.write(summary_text)
    print("Model summary saved to local file: model_summary.txt")

    print("Model training and evaluation complete.")
    return models

# Example run on the 'df_model' frame built in the earlier cells.

# Evaluation window: November 2021 through April 2022.
start_year, start_month = 2021, 11
end_year, end_month = 2022, 4

# Pass a copy so the function's column edits do not touch df_model itself.
trained_models = train_and_evaluate_models(
    df_model.copy(), start_year, start_month, end_year, end_month
)

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import io
from sklearn.dummy import DummyRegressor

def rmsle(y_true, y_pred):
    """Compute the Root Mean Squared Logarithmic Error.

    Inputs may be any array-like; each is turned into a NumPy array and
    passed through ``log1p`` before the differences are squared and averaged.
    """
    truth, guess = np.array(y_true), np.array(y_pred)
    gaps = np.log1p(truth) - np.log1p(guess)
    mean_squared_gap = np.mean(gaps ** 2)
    return np.sqrt(mean_squared_gap)

def evaluate_model(model, X, y, name, results, dataset_type):
    """Score a fitted model on one split and record the metrics.

    MSE, MAE, R2 and RMSE use the raw predictions; RMSLE uses predictions
    clipped at 0. The metrics are stored in ``results[name][dataset_type]``
    and a two-line text summary is returned.
    """
    predictions = model.predict(X)
    # Negative predictions are floored at 0 for the logarithmic metric only.
    non_negative_predictions = np.maximum(predictions, 0)

    mse = mean_squared_error(y, predictions)
    rmse = math.sqrt(mse)
    mae = mean_absolute_error(y, predictions)
    r2 = r2_score(y, predictions)
    rmsle_val = rmsle(y, non_negative_predictions)

    # Create the per-model bucket on first use, then file the metrics under the split.
    per_model = results.setdefault(name, {})
    per_model[dataset_type] = {
        'MSE': mse, 'MAE': mae, 'R2': r2, "RMSE": rmse, "RMSLE": rmsle_val
    }

    print(f"  Evaluated {name} on {dataset_type}.")
    return f"  Evaluated {name} on {dataset_type}.\n    MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}, RMSE: {rmse:.4f}, RMSLE: {rmsle_val:.4f}\n"

def train_and_evaluate_models(df, start_year, start_month, end_year, end_month):
    """Trains and evaluates multiple regression models, with stratified iso_code split.

    Rows are restricted to the inclusive window from (start_year, start_month)
    to (end_year, end_month). Countries (iso_code) are then split 70/15/15
    into train/val/test, stratified by continent, so that no country appears
    in more than one split.

    Args:
        df: Modelling frame with 'year', 'month', 'week', 'iso_code',
            'continent' and 'target' columns plus the feature columns.
            'continent' must be castable to int.
        start_year, start_month: First month of the window (inclusive).
        end_year, end_month: Last month of the window (inclusive).

    Returns:
        Dict mapping model name to the fitted estimator, or None when no rows
        fall inside the window. A text summary is also written to
        'model_summary.txt'.
    """

    print("Starting model training and evaluation...")

    # Debugging: Print min/max year and month values
    print("Min Year:", df['year'].min(), "Max Year:", df['year'].max())
    print("Min Month:", df['month'].min(), "Max Month:", df['month'].max())

    # Filter data for given period. Comparing a single month index
    # (year * 12 + month) gives a true inclusive range: it keeps whole years
    # lying between start and end, and for start_year == end_year it keeps only
    # the months between start_month and end_month.
    month_index = df['year'] * 12 + df['month']
    in_period = month_index.between(start_year * 12 + start_month, end_year * 12 + end_month)
    df = df[in_period].copy()

    print(f"Data filtered for year {start_year} to {end_year} and month {start_month} to {end_month}. Shape:", df.shape)

    # Check for empty DataFrame after filtering.
    if df.empty:
        print("Error: DataFrame is empty after filtering. Adjust date range.")
        return None

    # Extract week based feature.
    df['week'] = df['week'].astype(int)

    # iso_code encoding
    label_encoder = LabelEncoder()
    df['iso_code'] = label_encoder.fit_transform(df['iso_code'])
    df['continent'] = df['continent'].astype(int)

    # Stratified iso_code split.
    # One continent label per country, used as the stratification key.
    unique_iso_codes = df['iso_code'].unique()
    continent_by_iso = df.groupby('iso_code')['continent'].first()
    train_iso_codes, temp_iso_codes = train_test_split(unique_iso_codes, test_size=0.3, random_state=42, stratify=continent_by_iso.loc[unique_iso_codes])
    val_iso_codes, test_iso_codes = train_test_split(temp_iso_codes, test_size=0.5, random_state=42, stratify=continent_by_iso.loc[temp_iso_codes])
    print("Split iso_codes into train/val/test.")

    train_df = df[df['iso_code'].isin(train_iso_codes)]
    val_df = df[df['iso_code'].isin(val_iso_codes)]
    test_df = df[df['iso_code'].isin(test_iso_codes)]
    print("Split data into train/val/test. Train shape:", train_df.shape, "Val shape:", val_df.shape, "Test shape:", test_df.shape)

    # Use the selected features: everything except the target, the identifiers
    # and the columns used only for filtering/splitting.
    selected_features = [col for col in df.columns if col not in ['target', 'iso_code', 'continent', 'year', 'month', 'date', 'location']]

    X_train = train_df[selected_features]
    y_train = train_df['target']
    X_val = val_df[selected_features]
    y_val = val_df['target']
    X_test = test_df[selected_features]
    y_test = test_df['target']

    print("Prepared X and y train/val/test.")

    # Scale features (fit on train only, then apply to val/test).
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)
    print("Scaled features.")

    # Define models
    models = {
        'Dummy Regressor (Mean)': DummyRegressor(strategy='mean'),
        'Linear Regression': LinearRegression(),
        'Random Forest Regressor': RandomForestRegressor(random_state=42),
        'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42),
        'SVR': SVR(),
        'K-Nearest Neighbors Regressor': KNeighborsRegressor(),
        'XGBoost Regressor': xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
    }

    results = {}
    summary_text = "Model Comparison:\n"

    # Train and evaluate models
    print("Starting model training loop...")
    for name, model in models.items():
        print(f"  Training {name}...")
        model.fit(X_train_scaled, y_train)

        summary_text += evaluate_model(model, X_train_scaled, y_train, name, results, 'train')
        summary_text += evaluate_model(model, X_val_scaled, y_val, name, results, 'val')
        summary_text += evaluate_model(model, X_test_scaled, y_test, name, results, 'test')

    # Display results
    print("\nModel Comparison:")
    print(summary_text)

    # Save summary to local file
    with open('model_summary.txt', 'w') as f:
        f.write(summary_text)
    print("Model summary saved to local file: model_summary.txt")

    print("Model training and evaluation complete.")
    return models

# Example run on the 'df_model' frame built in the earlier cells.

# Evaluation window: November 2021 through April 2022.
start_year, start_month = 2021, 11
end_year, end_month = 2022, 4

# Pass a copy so the function's column edits do not touch df_model itself.
trained_models = train_and_evaluate_models(
    df_model.copy(), start_year, start_month, end_year, end_month
)

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import math
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.dummy import DummyRegressor

def rmsle(y_true, y_pred):
    """Root Mean Squared Logarithmic Error between ``y_true`` and ``y_pred``.

    Each argument is converted to a NumPy array and transformed with
    ``log1p``; the function returns the square root of the mean of the
    squared differences.
    """
    log_true = np.log1p(np.array(y_true))
    log_pred = np.log1p(np.array(y_pred))
    residual = log_true - log_pred
    return np.sqrt(np.mean(residual ** 2))

def evaluate_model(model, X, y, name, results, dataset_type):
    """Compute regression metrics for one model on one split.

    Raw predictions feed MSE, MAE, R2 and RMSE; predictions floored at 0 feed
    RMSLE. The metric dict is written to ``results[name][dataset_type]`` and
    a two-line text summary is returned.
    """
    raw_pred = model.predict(X)
    floored_pred = np.maximum(raw_pred, 0)  # negative values are clipped for RMSLE only

    mse = mean_squared_error(y, raw_pred)
    mae = mean_absolute_error(y, raw_pred)
    r2 = r2_score(y, raw_pred)
    rmse = math.sqrt(mse)
    rmsle_val = rmsle(y, floored_pred)

    split_metrics = {
        'MSE': mse, 'MAE': mae, 'R2': r2, "RMSE": rmse, "RMSLE": rmsle_val
    }
    # setdefault creates the per-model dict the first time a model is seen.
    results.setdefault(name, {})[dataset_type] = split_metrics

    print(f"  Evaluated {name} on {dataset_type}.")
    return f"  Evaluated {name} on {dataset_type}.\n    MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}, RMSE: {rmse:.4f}, RMSLE: {rmsle_val:.4f}\n"

def train_and_evaluate_models_cv(df, start_year, start_month, end_year, end_month, n_splits=5, test_size=0.15, val_size=0.15, output_file="results.txt"):
    """Trains and evaluates multiple regression models with K-fold cross-validation over countries.

    Rows are restricted to the window from (start_year, start_month) to
    (end_year, end_month). The unique iso_codes are split into ``n_splits``
    folds; in each fold the held-out countries are divided into validation
    and test sets in the ratio ``val_size`` : ``test_size``. Progress and
    metrics are printed and written to ``output_file``.

    Args:
        df: Modelling frame with 'year', 'month', 'week', 'iso_code',
            'continent' and 'target' columns plus the feature columns.
        start_year, start_month: First month of the window (inclusive).
        end_year, end_month: Last month of the window (inclusive).
        n_splits: Number of cross-validation folds.
        test_size, val_size: Relative sizes used to divide each held-out fold.
        output_file: Path of the text report to write.

    Returns:
        ``{model_name: {dataset_type: {metric: mean over folds}}}``, or None
        when no rows fall inside the window.
    """

    with open(output_file, "w") as f:
        f.write("Starting model training and evaluation with cross-validation...\n")
        print("Starting model training and evaluation with cross-validation...")

        # Filter data for given period.
        df_filtered = pd.DataFrame()
        if start_year == end_year:
            df_filtered = df[((df['year'] == start_year) & (df['month'] >= start_month) & (df['month'] <= end_month))].copy()
        else:
            # Tail of the first year, head of the last year, and any full years between.
            df_filtered = pd.concat([
                df[((df['year'] == start_year) & (df['month'] >= start_month))].copy(),
                df[((df['year'] == end_year) & (df['month'] <= end_month))].copy(),
                df[((df['year'] > start_year) & (df['year'] < end_year))].copy()
            ])
        df = df_filtered

        f.write(f"Data filtered for year {start_year} to {end_year} and month {start_month} to {end_month}. Shape: {df.shape}\n")
        print(f"Data filtered for year {start_year} to {end_year} and month {start_month} to {end_month}. Shape:", df.shape)

        if df.empty:
            f.write("Error: DataFrame is empty after filtering. Adjust date range.\n")
            print("Error: DataFrame is empty after filtering. Adjust date range.")
            return None

        df['week'] = df['week'].astype(int)
        label_encoder = LabelEncoder()
        df['iso_code'] = label_encoder.fit_transform(df['iso_code'])
        df['continent'] = df['continent'].astype(int)

        # Folds are built over countries, not rows, so a country never appears
        # in more than one of train/val/test within a fold.
        unique_iso_codes = df['iso_code'].unique()
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
        results_cv = {}

        for fold, (train_index, val_test_index) in enumerate(kf.split(unique_iso_codes)):
            f.write(f"\nFold {fold + 1}/{n_splits}\n")
            print(f"\nFold {fold + 1}/{n_splits}")
            train_iso_codes = unique_iso_codes[train_index]
            val_test_iso_codes = unique_iso_codes[val_test_index]

            # Split val_test into val and test
            val_iso_codes, test_iso_codes = train_test_split(val_test_iso_codes, test_size=val_size / (val_size + test_size), random_state=42)

            train_df = df[df['iso_code'].isin(train_iso_codes)]
            val_df = df[df['iso_code'].isin(val_iso_codes)]
            test_df = df[df['iso_code'].isin(test_iso_codes)]

            selected_features = [col for col in df.columns if col not in ['target', 'iso_code', 'continent', 'year', 'month', 'date', 'location']]

            X_train, y_train = train_df[selected_features], train_df['target']
            X_val, y_val = val_df[selected_features], val_df['target']
            X_test, y_test = test_df[selected_features], test_df['target']

            # Fit the scaler on the training countries only.
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_val_scaled = scaler.transform(X_val)
            X_test_scaled = scaler.transform(X_test)

            models = {
                'Dummy Regressor (Mean)': DummyRegressor(strategy='mean'),
                'Linear Regression': LinearRegression(),
                'Random Forest Regressor': RandomForestRegressor(random_state=42),
                'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42),
                'SVR': SVR(),
                'K-Nearest Neighbors Regressor': KNeighborsRegressor(),
                'XGBoost Regressor': xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
            }

            results = {}
            summary_text = f"Fold {fold + 1}:\n"

            for name, model in models.items():
                model.fit(X_train_scaled, y_train)
                summary_text += evaluate_model(model, X_train_scaled, y_train, name, results, 'train')
                summary_text += evaluate_model(model, X_val_scaled, y_val, name, results, 'val')
                summary_text += evaluate_model(model, X_test_scaled, y_test, name, results, 'test')

            f.write(summary_text)
            print(summary_text, end='')
            results_cv[fold] = results

        # Average results across folds: first gather every fold's value per
        # (model, dataset, metric), then take the mean of each list.
        metric_values = {}
        for fold_results in results_cv.values():
            for model_name, per_dataset in fold_results.items():
                for dataset_type, metrics in per_dataset.items():
                    collected = metric_values.setdefault(model_name, {}).setdefault(dataset_type, {})
                    for metric_name, value in metrics.items():
                        collected.setdefault(metric_name, []).append(value)

        average_results = {
            model_name: {
                dataset_type: {
                    metric_name: float(np.mean(values))
                    for metric_name, values in collected.items()
                }
                for dataset_type, collected in per_dataset.items()
            }
            for model_name, per_dataset in metric_values.items()
        }

        f.write("\nAverage Model Comparison:\n")
        print("\nAverage Model Comparison:")
        for model_name, avg_results in average_results.items():
            f.write(f"\n{model_name}:\n")
            print(f"\n{model_name}:")
            for dataset_type, metrics in avg_results.items():
                f.write(f"  {dataset_type}: {metrics}\n")
                print(f"  {dataset_type}: {metrics}")

        return average_results

# Load your data from df_model.pkl
try:
    df_model = pd.read_pickle("df_model.pkl")
except FileNotFoundError:
    print("Error: df_model.pkl file not found.")
    # Re-raise instead of calling exit(): in a notebook exit() ends the whole
    # session, whereas the exception stops this cell and shows the cause.
    raise

# Call the function with df_model and specify the output file
start_year = 2021
start_month = 11
end_year = 2022
end_month = 4

average_results = train_and_evaluate_models_cv(df_model.copy(), start_year, start_month, end_year, end_month, output_file="my_results.txt")

if average_results is not None:
    print("Results saved to my_results.txt")

Starting model training and evaluation with cross-validation...
Data filtered for year 2021 to 2022 and month 11 to 4. Shape: (44102, 28)

Fold 1/5
  Evaluated Dummy Regressor (Mean) on train.
  Evaluated Dummy Regressor (Mean) on val.
  Evaluated Dummy Regressor (Mean) on test.
  Evaluated Linear Regression on train.
  Evaluated Linear Regression on val.
  Evaluated Linear Regression on test.
  Evaluated Random Forest Regressor on train.
  Evaluated Random Forest Regressor on val.
  Evaluated Random Forest Regressor on test.
  Evaluated Gradient Boosting Regressor on train.
  Evaluated Gradient Boosting Regressor on val.
  Evaluated Gradient Boosting Regressor on test.
  Evaluated SVR on train.
  Evaluated SVR on val.
  Evaluated SVR on test.
  Evaluated K-Nearest Neighbors Regressor on train.
  Evaluated K-Nearest Neighbors Regressor on val.
  Evaluated K-Nearest Neighbors Regressor on test.
  Evaluated XGBoost Regressor on train.
  Evaluated XGBoost Regressor on val.
  Evaluated XGB