# In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import joblib


# Train a random-forest sales regressor on the project dataset and persist the
# fitted preprocessing + model pipeline to "sales_model.pkl".
data = pd.read_csv("../data/main_data/project.csv")

# Normalise the festival flag to 0/1 by truthiness.
# NOTE(review): is_festival is not part of the feature set below -- confirm
# whether it was meant to be a model input.
data["is_festival"] = data["is_festival"].apply(lambda flag: 1 if flag else 0)

# Average spend per guest. Division by zero guests produces +/-inf (or NaN for
# 0/0 and missing values); those rows are excluded from training further down.
# NOTE(review): this feature is computed from the target ("sales") and is fed
# to the model together with "guests", so the model can reconstruct the target
# from its own inputs (target leakage). It is kept here so the persisted
# model's input schema stays unchanged; consider replacing it with a value
# that is known at prediction time (e.g. a historical average).
data["sale_per_guest"] = data["sales"] / data["guests"]


numeric_features = ["guests", "total_staff_level", "staff_count", "sale_per_guest"]
categorical_features = ["weekday"]

# Rows whose ratio is inf/NaN cannot be scaled (scikit-learn's input
# validation rejects infinite values), so they are left out of the training
# matrix. `data` itself is left untouched.
trainable_rows = np.isfinite(data["sale_per_guest"])

# Column order: weekday first, then the numeric features.
X = data.loc[trainable_rows, categorical_features + numeric_features]
y = data.loc[trainable_rows, "sales"]


# Standardise numeric columns; one-hot encode the weekday and ignore weekday
# values at prediction time that were not seen during fitting.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)


# Preprocessing and regressor are bundled so the saved artifact accepts raw
# feature columns directly. random_state is fixed for reproducible fits.
sale_pre_model = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ("RandomForest", RandomForestRegressor(n_estimators=100, random_state=42)),
])


sale_pre_model.fit(X, y)

# Written relative to the current working directory.
joblib.dump(sale_pre_model, "sales_model.pkl")