**Basics:**
Decision trees iterate over each feature and its possible split points and select the split that minimizes the loss function. Each resulting region is then split again in the same way, recursively, until a stopping criterion is reached (e.g. the maximum tree depth or the minimum number of observations per region).

To prevent overfitting, use cost complexity pruning: a complexity parameter alpha penalizes the tree for being too complex, so that larger values of alpha lead to smaller trees.

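Use bagging (fitting many trees on bootstrap resamples of the training data and averaging them) to decrease variance, since individual trees have high variance and low bias. The error of the bagged model can be estimated with the out-of-bag (OOB) error, because each tree is fitted without the observations that were left out of its bootstrap sample.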

Random Forests create multiple decision trees using different subsets of the data and random subsets of features, introducing variation to each tree. The model aggregates their predictions by averaging the result of each tree. 

In [16]:
#import dependencies
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree

# Project-local import of the prepared feature table.
# NOTE(review): the recorded output of this cell is an ImportError, i.e.
# `importing_rf.py` does not currently expose a name `features` — confirm the
# exported name before relying on this cell.
from importing_rf import features 

# Preview the imported object (presumably a pandas DataFrame — TODO confirm).
features.head()

ImportError: cannot import name 'features' from 'importing_rf' (/Users/rickenmann/Documents/GitHub/DSF-Project/importing_rf.py)

In [4]:
#import and define the features and target


#target = df["Gewicht in Tonnen"]
#feature1 = df[""]
#

# For a time-series random forest, the features and the target must be lagged so that each row only uses values from earlier time steps, which preserves the temporal order.

Unnamed: 0,Date,Day,Workday,Total Pedestrians,Location,Temperature mean,Temperature max,Temperature min,Precipitation in mm,Snow amount in cm
0,2024-03-24,7,0,420.0,St. Gallen,1.7,3.8,-0.2,5.2,3.0
1,2024-03-25,1,1,1766.0,St. Gallen,4.7,9.3,0.4,0.0,0.0
2,2024-03-26,2,1,1701.0,St. Gallen,9.2,15.3,2.8,0.0,0.0
3,2024-03-28,4,1,1669.0,St. Gallen,5.3,9.9,0.7,1.3,0.0
4,2024-03-29,5,1,665.0,St. Gallen,15.0,20.0,8.1,0.0,0.0


In [None]:
def create_lagged_features(df, target_column, lags):
    """Build a frame of lagged copies of every column plus the unlagged target.

    For each lag ``k`` in ``1..lags`` every column ``c`` of ``df`` (including
    the target column itself) is shifted down by ``k`` rows and stored as
    ``f'{c}_lag_{k}'``. The current, unshifted value of ``target_column`` is
    appended as the last column under its original name.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; its row order is taken as the temporal order.
    target_column : str
        Name of the column in ``df`` to keep unlagged.
    lags : int
        Number of lag steps. For ``lags < 1`` only the target column is
        returned.

    Returns
    -------
    pandas.DataFrame
        Frame sharing the index of ``df``. The first ``lags`` rows contain
        NaN in the lagged columns because no earlier values exist for them.

    Raises
    ------
    KeyError
        If ``target_column`` is not a column of ``df``.
    """
    # Look the target up first so a wrong column name fails before any work.
    target = df[[target_column]]

    # Shift the whole frame once per lag instead of inserting columns one by
    # one into an empty frame, which fragments it and gets slow for wide data.
    shifted_frames = [
        df.shift(lag).add_suffix(f'_lag_{lag}')
        for lag in range(1, lags + 1)
    ]

    # Lagged blocks first (lag 1, lag 2, ...), the unlagged target last.
    return pd.concat(shifted_frames + [target], axis=1)

# Number of lag steps to generate (3 for now; the optimal number of lags
# should be tested afterwards).
lags = 3

# Build the lagged feature frame and, in the same step, discard the leading
# rows that contain NaN because no earlier observations exist for them.
# NOTE(review): `data` is not defined in the visible cells — confirm where the
# input frame with a 'target' column is loaded.
lagged_data = create_lagged_features(data, 'target', lags).dropna()



In [None]:

from sklearn.model_selection import TimeSeriesSplit

# Feature matrix and target for the models.
# NOTE(review): assumes the lagged frame built above is the modelling table and
# that 'target' is its only unlagged column — confirm against the data loading.
X = lagged_data.drop(columns=['target'])
y = lagged_data['target']

# Define the number of splits for rolling cross-validation
n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)

# Number of trees in the random forest (100 is scikit-learn's default).
B = 100


def _rolling_cv_mse(make_model, X, y, splitter):
    """Fit a fresh model on every fold of ``splitter`` and collect the MSEs.

    Parameters
    ----------
    make_model : callable
        Zero-argument factory returning an unfitted estimator that provides
        ``fit(X, y)`` and ``predict(X)``.
    X : pandas.DataFrame
        Feature matrix, indexed positionally via ``.iloc``.
    y : pandas.Series
        Target values aligned row-by-row with ``X``.
    splitter : object
        Cross-validation splitter whose ``split(X)`` yields
        ``(train_index, test_index)`` pairs.

    Returns
    -------
    tuple
        ``(train_mse_list, test_mse_list, last_model)`` where ``last_model``
        is the estimator fitted on the final fold (``None`` if the splitter
        yields no folds).
    """
    train_mse_list, test_mse_list = [], []
    model = None
    for train_index, test_index in splitter.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # A new estimator per fold so no state carries over between folds.
        model = make_model()
        model.fit(X_train, y_train)

        train_mse_list.append(mean_squared_error(y_train, model.predict(X_train)))
        test_mse_list.append(mean_squared_error(y_test, model.predict(X_test)))
    return train_mse_list, test_mse_list, model


# --- Decision tree -----------------------------------------------------------
mse_train_list, mse_test_list, tree_regressor = _rolling_cv_mse(
    lambda: DecisionTreeRegressor(random_state=42), X, y, tscv
)

# Calculate the average mean squared error for train and test sets
avg_mse_train = np.mean(mse_train_list)
avg_mse_test = np.mean(mse_test_list)

print(f'Average Mean Squared Error for the decision tree (train): {avg_mse_train}')
print(f'Average Mean Squared Error for the decision tree (test): {avg_mse_test}')

# Plot the decision tree fitted on the last fold. The feature names are passed
# as a plain list because some scikit-learn releases reject a pandas Index.
plt.figure(figsize=(20,10))
plot_tree(tree_regressor, filled=True, feature_names=list(X.columns), rounded=True)
plt.show()

# --- Random forest -----------------------------------------------------------
mse_train_rf_list, mse_test_rf_list, rf_regressor = _rolling_cv_mse(
    lambda: RandomForestRegressor(n_estimators=B, random_state=42), X, y, tscv
)

# Calculate the average mean squared error for train and test sets
avg_mse_train_rf = np.mean(mse_train_rf_list)
avg_mse_test_rf = np.mean(mse_test_rf_list)

print(f'Average Mean Squared Error for the random forest (train): {avg_mse_train_rf}')
print(f'Average Mean Squared Error for the random forest (test): {avg_mse_test_rf}')