## Random Forest:

In [1]:
# Imports
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from preprocessing import data_preprocessing

In [2]:
# Predefined colour palettes for charts. Every shorter palette is a prefix of
# the 4-colour palette; slicing yields independent list objects.
color_palette_4 = ['#763DFF', '#FF3D65', '#C6FF3D', '#3DFFD7']    # 4 colours per chart
color_palette_3 = color_palette_4[:3]    # 3 colours per chart
color_palette_2 = color_palette_4[:2]    # 2 colours per chart
color_palette_1 = color_palette_4[:1]    # 1 colour per chart

# Predefined font sizes for chart titles and axis labels
fontsize_title, fontsize_axes = 20, 15

In [3]:
def get_tuned_model_random_forest():
    """Tune a RandomForestRegressor via grid search and persist the best model.

    Loads the data through ``data_preprocessing()``, fits a 5-fold
    ``GridSearchCV`` on a 70 % training split of ``merge_train`` and writes the
    refitted best estimator to ``random_forest_model.pkl``. The best estimator
    and the best parameter combination are printed.

    Returns:
        The best estimator found by the grid search (``best_estimator_``).
    """
    # Prepare the data (only the training frame is needed here)
    merge_train, _ = data_preprocessing()

    # Features: everything except the time components and the target
    X = merge_train.drop(['Date', 'Year', 'Month', 'Week', 'Weekly_Sales'], axis=1)
    y = merge_train['Weekly_Sales']  # target

    # Hold out 30 % of the rows; sales_forecast() rebuilds this exact split
    # (same test_size / random_state) to evaluate the saved model.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.3, random_state=101)

    # Hyperparameter grid explored by the search
    param_grid = {
        'n_estimators': [50, 100, 200, 300],
        'max_depth': [10, 15, 20, 25, None],
        'min_samples_split': [2, 5, 10],
        'max_features': ['sqrt', 'log2', None],
        'max_leaf_nodes': [None, 10, 50, 100]
    }

    # Seed the forest so that the search result and the persisted model are
    # reproducible between runs (an unseeded forest differs on every fit).
    grid_search = GridSearchCV(
        estimator=RandomForestRegressor(random_state=101),
        param_grid=param_grid,
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Persist the best model and report what was selected
    tuned_model = grid_search.best_estimator_
    joblib.dump(tuned_model, 'random_forest_model.pkl')
    print(tuned_model)
    print(grid_search.best_params_)

    return tuned_model

    
    

In [19]:
# The grid search fits several hundred parameter combinations with 5-fold CV,
# so only run it when no saved model exists yet. Delete
# 'random_forest_model.pkl' to force a retrain (e.g. after changing the
# parameter grid or the training data).
if not Path('random_forest_model.pkl').exists():
    get_tuned_model_random_forest()

In [17]:
def visualizing_forecasts(y_test_future):
    """Plot mean historical vs. mean predicted weekly sales per date.

    Args:
        y_test_future: DataFrame with a ``Weekly_Sales`` column holding the
            predictions, one row per row of ``merge_test`` and in the same
            row order.

    Returns:
        A copy of the predictions indexed by ``Date``; the input frame is not
        modified.

    Raises:
        ValueError: If the number of predictions differs from the number of
            rows in ``merge_test``.
    """
    # Load the preprocessed frames
    merge_train, merge_test = data_preprocessing()

    # Dates belonging to the predicted rows
    future_dates = merge_test['Date']
    if len(future_dates) != len(y_test_future):
        raise ValueError(
            f'Expected {len(future_dates)} predictions (one per merge_test row), '
            f'got {len(y_test_future)}.'
        )

    # Attach the dates by position. Assigning the Series itself would align on
    # index labels, which silently yields missing or shifted dates whenever
    # merge_test does not carry the same index as the predictions.
    y_test_future_with_date = y_test_future.copy()
    y_test_future_with_date['Date'] = future_dates.to_numpy()

    # Historical values for the chart
    historical_values_for_diagram = merge_train[['Date', 'Weekly_Sales']]

    # Line chart
    sns.set_style('darkgrid')
    fig1, ax1 = plt.subplots(figsize=(15, 6))

    # Mean historical sales per date
    historical_values_for_diagram.groupby('Date')['Weekly_Sales'].mean().plot(label='Historisch', ax=ax1)
    # Mean predicted sales per date
    y_test_future_with_date.groupby('Date')['Weekly_Sales'].mean().plot(color='orange', label='Prognostiziert', ax=ax1)

    ax1.legend(loc='best', fontsize=12)
    ax1.set_ylabel('Sales', fontsize=12)
    ax1.set_xlabel('Date', fontsize=12)
    ax1.set_title('Historische vs. Prognostizierte Sales', fontsize=14)

    plt.show()

    # Tabular view of the forecast, indexed by date
    table1 = y_test_future_with_date.set_index('Date')
    return table1

    




In [6]:
# Evaluate model performance on the training and the test split
def evaluate_model(tuned_model, X_train, y_train, X_test, y_test):
    """Compute MSE, RMSE, MAE and R² for the training and the test split.

    Args:
        tuned_model: Fitted estimator exposing ``predict``.
        X_train, y_train: Features and target of the training split.
        X_test, y_test: Features and target of the test split.

    Returns:
        An 8-tuple ``(mse_train, rmse_train, mae_train, r2_train,
        mse_test, rmse_test, mae_test, r2_test)``.
    """
    def _metrics(features, actual):
        # Predict once per split and derive all four metrics from it
        predicted = tuned_model.predict(features)
        mse = mean_squared_error(actual, predicted)
        return (
            mse,
            np.sqrt(mse),
            mean_absolute_error(actual, predicted),
            r2_score(actual, predicted),
        )

    train_scores = _metrics(X_train, y_train)
    test_scores = _metrics(X_test, y_test)
    return (*train_scores, *test_scores)

In [13]:
def sales_forecast():
    """Evaluate the saved model and forecast weekly sales for ``merge_test``.

    Loads ``random_forest_model.pkl``, rebuilds the 70/30 split of
    ``merge_train`` (``random_state=101``) to compute train/test metrics,
    predicts ``Weekly_Sales`` for every row of ``merge_test``, plots the
    forecast via ``visualizing_forecasts`` and prints the evaluation results.

    Returns:
        DataFrame with a single ``Weekly_Sales`` column holding the
        predictions, one row per row of ``merge_test``.
    """
    # Load the preprocessed frames
    merge_train, merge_test = data_preprocessing()

    # Load the tuned model.
    # NOTE(review): joblib.load unpickles the file and can execute arbitrary
    # code; only load model files produced by this project.
    tuned_model = joblib.load('random_forest_model.pkl')

    # Split the training data into train and test: all original features
    # without the target and without the time components.
    X = merge_train.drop(['Date', 'Year', 'Month', 'Week', 'Weekly_Sales'], axis=1)
    y = merge_train['Weekly_Sales']     # target only
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

    # Metrics used for the over-/underfitting check below
    mse_train, rmse_train, mae_train, r2_train, mse_test, rmse_test, mae_test, r2_test = evaluate_model(tuned_model, X_train, y_train, X_test, y_test)

    # Forecast for the future rows. Select exactly the training feature
    # columns, in training order, so the model input cannot drift from the
    # features built above (a missing column fails loudly with a KeyError).
    X_test_future = merge_test[X.columns]

    predictions = tuned_model.predict(X_test_future)
    y_test_future = pd.DataFrame({'Weekly_Sales': predictions})

    # Visualise historical data and forecasts
    visualizing_forecasts(y_test_future)

    # Report model performance
    print('\nEvaluation Results:')
    print(f'Training MSE: {mse_train:.4f}, RMSE: {rmse_train:.4f}, MAE: {mae_train:.4f}, R²: {r2_train:.4f}')
    print(f'Test MSE: {mse_test:.4f}, RMSE: {rmse_test:.4f}, MAE: {mae_test:.4f}, R²: {r2_test:.4f}')

    # Heuristic: a test RMSE more than 10 % above the train RMSE is reported as
    # overfitting; a train RMSE above the test RMSE is reported as underfitting.
    if rmse_test > rmse_train and (rmse_test - rmse_train) > 0.1 * rmse_train:
        print('\nWarning: Potential overfitting. Test RMSE is significantly higher than Train RMSE.')
    elif rmse_train > rmse_test:
        print('\nWarning: Potential underfitting. Train RMSE is higher than Test RMSE.')
    else:
        print('\nThe model seems to be well-balanced.')

    return y_test_future


 
    

In [18]:
# Entry point: run the forecast using the model file saved on disk.
# Uncomment the next line to re-run the hyperparameter search first.
#tuned_model = get_tuned_model_random_forest() # Random Forest
y_test_future = sales_forecast()




Evaluation Results:
Training MSE: 3837604.2181, RMSE: 1958.9804, MAE: 727.5964, R²: 0.9925
Test MSE: 31860302.6035, RMSE: 5644.4931, MAE: 1968.5644, R²: 0.9391



  plt.show()
