In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import numpy as np

# Install openpyxl if it's not already installed
!pip install openpyxl



In [None]:
# Read the train and test sheets from the Excel workbook.
#
# Both frames are reset to None first so that a failed load can never leave
# stale DataFrames from an earlier run of this cell in the kernel namespace.
df_train, df_test = None, None
try:
    # Passing a list of sheet names makes read_excel return a dict of
    # DataFrames keyed by sheet name, so the workbook is parsed only once.
    _sheets = pd.read_excel(
        "gdp_base.xlsx",
        sheet_name=["df_current_train", "df_current_test"],
    )
except FileNotFoundError:
    print("Error: gdp_base.xlsx not found. Please make sure the file is in the same directory as your notebook or provide the correct path.")
    # Re-raise: continuing with df_train/df_test set to None only defers the
    # failure to the next cell, where it surfaces as an unrelated TypeError.
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise
else:
    df_train = _sheets["df_current_train"]
    df_test = _sheets["df_current_test"]

In [None]:
# Fail fast with a clear message if an earlier load step left the frames as
# None, instead of hitting "'NoneType' object is not subscriptable" below.
if df_train is None or df_test is None:
    raise RuntimeError(
        "df_train/df_test are not loaded; load gdp_base.xlsx successfully before running this cell."
    )

# Ensure 'date' is in datetime format. .assign returns a new frame, so this
# stays warning-free when the cell is re-run on an already-filtered frame.
df_train = df_train.assign(date=pd.to_datetime(df_train['date']))
df_test = df_test.assign(date=pd.to_datetime(df_test['date']))

# Drop rows where the target 'y' is NaN
df_train = df_train.dropna(subset=['y'])
df_test = df_test.dropna(subset=['y'])

# Drop feature columns that contain any NaN; 'date' and 'y' are always kept
df_train = df_train.loc[:, df_train.columns.isin(['date', 'y']) | df_train.notna().all()]
df_test = df_test.loc[:, df_test.columns.isin(['date', 'y']) | df_test.notna().all()]

# Feature selection: use every column other than 'date' and 'y' that survived
# the NaN filter in BOTH frames. The two frames are filtered independently, so
# a column kept in train may have been dropped from test; selecting it from
# df_test would raise a KeyError. Order follows the train columns.
features = [
    col for col in df_train.columns
    if col not in ('date', 'y') and col in df_test.columns
]
if not features:
    raise ValueError(
        "No feature columns are NaN-free in both the train and test sheets."
    )

X_train = df_train[features]
y_train = df_train['y']
X_test = df_test[features]
y_test = df_test['y']

# Hold out 20% of the training rows as a validation set (seeded split)
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Candidate regressors, keyed by the label used in the printed report
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Support Vector Machine": SVR(),
}

# Fit each candidate on the training split, then report MSE and RMSE on the
# validation split followed by the test set.
for name, model in models.items():
    print(f"Evaluating {name}...")
    model.fit(X_train, y_train)

    # --- validation split ---
    y_val_pred = model.predict(X_val)
    val_mse = mean_squared_error(y_val, y_val_pred)
    val_rmse = np.sqrt(val_mse)
    print(
        f"{name} Validation Mean Squared Error: {val_mse}",
        f"{name} Validation Root Mean Squared Error: {val_rmse}",
        sep="\n",
    )

    # --- test set ---
    y_test_pred = model.predict(X_test)
    test_mse = mean_squared_error(y_test, y_test_pred)
    test_rmse = np.sqrt(test_mse)
    print(
        f"{name} Test Mean Squared Error: {test_mse}",
        f"{name} Test Root Mean Squared Error: {test_rmse}\n",  # trailing blank line separates models
        sep="\n",
    )


Evaluating Linear Regression...
Linear Regression Validation Mean Squared Error: 8.356758612351275
Linear Regression Validation Root Mean Squared Error: 2.8908058759368944
Linear Regression Test Mean Squared Error: 2.2347821647460187
Linear Regression Test Root Mean Squared Error: 1.4949187819898506

Evaluating Random Forest...
Random Forest Validation Mean Squared Error: 14.353429100000005
Random Forest Validation Root Mean Squared Error: 3.7885919680007776
Random Forest Test Mean Squared Error: 0.12771942857142857
Random Forest Test Root Mean Squared Error: 0.357378550799329

Evaluating Support Vector Machine...
Support Vector Machine Validation Mean Squared Error: 11.194374701823062
Support Vector Machine Validation Root Mean Squared Error: 3.345799560915606
Support Vector Machine Test Mean Squared Error: 0.11185510614493374
Support Vector Machine Test Root Mean Squared Error: 0.33444746395350905

