In [1]:
import pandas as pd

import numpy as np
import copy
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline

from my_splitter import MySplitter
from my_transformer import MyTransformer
from recursive_pipeline import RecursivePipeline

ID_SIZE = 20

In [2]:
import os

if os.getcwd() != "/root/restaurants":
    os.chdir("..")
os.getcwd()

'/root/restaurants'

In [3]:
from tsfresh.examples.robot_execution_failures import download_robot_execution_failures, \
    load_robot_execution_failures
download_robot_execution_failures()
timeseries, y = load_robot_execution_failures()

In [None]:
timeseries

In [None]:
data.sort_values("date")

In [None]:
data.groupby("store_id")["date"].max()

In [None]:
pd.merge(data.groupby("store_id")["date"].max(), data, on=["date", "store_id"], how="left")

In [3]:
data = pd.read_csv("data/processed/data_clear.csv")
data["date"] = pd.to_datetime(data["date"])
data.head()

Unnamed: 0,store_id,date,day_of_week,holiday_flg,genre_name,area_name,latitude,longitude,year,month,day,visitors
0,air_00a91d42b08b08d9,2016-07-01,Friday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2016,7,1,24
1,air_0241aa3964b7f861,2016-01-03,Sunday,1,Izakaya,Tōkyō-to Taitō-ku Higashiueno,35.712607,139.779996,2016,1,3,28
2,air_034a3d5b40d5b1b1,2016-07-01,Friday,0,Cafe/Sweets,Ōsaka-fu Ōsaka-shi Ōhiraki,34.692337,135.472229,2016,7,1,9
3,air_036d4f1ee7285390,2016-07-01,Friday,0,Cafe/Sweets,Hyōgo-ken Takarazuka-shi Tōyōchō,34.799767,135.360073,2016,7,1,10
4,air_04341b588bde96cd,2016-01-01,Friday,1,Izakaya,Tōkyō-to Nerima-ku Toyotamakita,35.735623,139.651658,2016,1,1,12


In [None]:
data.drop(columns=["visitors", "area_name", "genre_name"]).info()

In [5]:
data = data.drop(columns=["latitude", "longitude"])

In [None]:
from tsfresh import extract_features
extracted_features = extract_features(data.drop(columns=["visitors", "area_name", "genre_name", "day_of_week"]), column_id="store_id", column_sort="date")

In [None]:
extracted_features


In [None]:
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute

impute(extracted_features)
features_filtered = select_features(extracted_features, data["visitors"])

In [4]:
data = data.drop(columns=["latitude", "longitude"])

In [5]:
splitter = MySplitter()
train_index, test_index = splitter.split_data(data, 1)

data_train, data_test = data.loc[train_index], data.loc[test_index]

In [13]:
pipeline = RecursivePipeline(
    pipeline=Pipeline(
        steps=[
            ("transformer", MyTransformer()),
            (
                "model",
                XGBRegressor(
                    objective="reg:squaredlogerror",
                    # eval_metric = "rmsle",
                    random_state=42,
                    enable_categorical=True,
                ),
            ),
        ]
    )
)


param_grid = {
    # "pipeline__model__n_estimators": [1500],
    # "pipeline__model__learning_rate": [0.005],
    # "pipeline__model__max_depth": [1],
    # "pipeline__model__subsample": [0.5],
    # "pipeline__model__colsample_bytree": [0.5],
    # "pipeline__model__grow_policy": ["depthwise", "lossguide"],
    "pipeline__model__booster": ["gbtree"],
}


validation_size = 39  # new_submission["date"].nunique()
cv = MySplitter(test_size=validation_size, n_splits=3)
rscv = GridSearchCV(
    estimator=pipeline,
    cv=cv,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_log_error",
    n_jobs=1,
    verbose=5,
)

# air_c8a657c8c5c93d69
X = data_train.drop(columns=["visitors"]).reset_index(drop=True)
y = data_train["visitors"].reset_index(drop=True)

rscv.fit(X, y)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3] END ..pipeline__model__booster=gbtree;, score=-0.813 total time=  16.3s
[CV 2/3] END ..pipeline__model__booster=gbtree;, score=-0.811 total time=  16.8s
[CV 3/3] END ..pipeline__model__booster=gbtree;, score=-0.808 total time=  19.0s


In [15]:
X_test = data_test.drop(columns=["visitors"]).reset_index(drop=True)
y_test = data_test["visitors"].reset_index(drop=True)
pred = rscv.predict(X_test)

In [16]:
predicted_data = data_test[["store_id", "date", "visitors"]].copy()
predicted_data.loc[:, "predicted_visitors"] = pred

In [17]:
from sklearn.metrics import root_mean_squared_log_error
scores = predicted_data.groupby(by=["store_id"]).apply(lambda group: root_mean_squared_log_error(group["visitors"], group["predicted_visitors"]), include_groups=False).sort_values()
best = scores.head()
worst = scores.tail()

In [26]:
root_mean_squared_log_error(predicted_data["visitors"], predicted_data["predicted_visitors"]).mean()


0.8153307274455137

In [27]:
scores.median()

0.8138869181490398

In [20]:
import plotly.graph_objects as go
import pandas as pd


def show_predictions(data, predictions, scores):
    for id in scores.index:
        temp_train = data[data["store_id"] == id]
        temp_pred = predictions[predictions["store_id"] == id]

        fig = go.Figure()

        fig.add_trace(
            go.Scatter(
                x=temp_train["date"],
                y=temp_train["visitors"],
                mode="lines+markers",
                name="Training Visitors",
                line=dict(color="blue")
            )
        )

        fig.add_trace(
            go.Scatter(
                x=temp_pred["date"],
                y=temp_pred["visitors"],
                mode="lines+markers",
                name="Test Visitors",
                line=dict(color="green")
            )
        )

        fig.add_trace(
            go.Scatter(
                x=temp_pred["date"],
                y=temp_pred["predicted_visitors"],
                mode="lines+markers",
                name="Predicted Visitors",
                line=dict(color="red")
            )
        )

        fig.update_layout(
            title=f"store_id: {id}  score: {scores[id]}",
            xaxis_title="Date",
            yaxis_title="Number of Visitors",
            legend_title="Legend",
            template="plotly_white",
            width=900,
            height=500
        )

        fig.show()


In [21]:
show_predictions(data, predicted_data, worst)

In [22]:
show_predictions(data, predicted_data, best)

In [None]:
# feature_importances = rscv.best_estimator_.pipeline.named_steps["model"].feature_importances_
# rscv.best_estimator_.pipeline.named_steps["model"]
feature_importances = rscv.best_estimator_.pipeline.named_steps["model"].get_booster().get_score(
    # importance_type='gain' # the average gain across all splits the feature is used in.
    importance_type='weight' # the number of times a feature is used to split the data across all trees.
    # importance_type='cover' # the average coverage across all splits the feature is used in.
    # importance_type='total_gain'
    # importance_type='total_cover'
    )
feature_importances = dict(sorted(feature_importances.items(), key=lambda item: item[1], reverse=True))
feature_importances

{'visitors_1_lag': 436.0,
 'area_name': 435.0,
 'visitors_14_lag': 408.0,
 'visitors_7_lag': 400.0,
 'store_id': 359.0,
 'sin_day': 356.0,
 'cos_day': 314.0,
 'genre_name': 207.0,
 'cos_month': 193.0,
 'sin_day_of_week': 150.0,
 'sin_month': 145.0,
 'visitors_365_lag': 108.0,
 'cos_day_of_week': 86.0,
 'year': 31.0,
 'holiday_flg': 20.0}

In [None]:
# importance_types = ["gain", "weight", "cover", "total_gain", "total_cover"]

In [28]:
def show_feature_importances():
    importance_types = ["gain", "weight", "cover", "total_gain", "total_cover"]

    for type in importance_types:
        feature_importances = rscv.best_estimator_.pipeline.named_steps["model"].get_booster().get_score(importance_type=type)
        feature_importances = dict(sorted(feature_importances.items(), key=lambda item: item[1], reverse=True))
        feature_importances

        fig = go.Figure()

        fig.add_trace(
            go.Bar(
            x=list(feature_importances.keys()),
            y=list(feature_importances.values()),
            # mode="lines+markers",
            # name=,
            # line=dict(color="blue")
            )   
        )

        fig.update_layout(
            title=type,
            # xaxis_title="Date",
            # yaxis_title="Number of Visitors",
            # legend_title="Legend",
            # template="plotly_white",
            # width=900,
            # height=500
        )


        fig.show()

show_feature_importances()

In [17]:
show_predictions(data, predicted_data, worst)

In [None]:
store_info = pd.read_csv("data/raw/air_store_info.csv")
store_info = store_info.rename(
    columns={
        "air_store_id": "store_id",
        "air_genre_name": "genre_name",
        "air_area_name": "area_name",
    }
)

date_info = pd.read_csv("data/raw/date_info.csv")
date_info = date_info.rename(columns={"calendar_date": "date"})
date_info["date"] = date_info["date"].astype("string")

submission = pd.read_csv("data/raw/sample_submission.csv")

new_submission = copy.deepcopy(submission)
new_submission["store_id"] = new_submission["id"].str[:ID_SIZE]
new_submission["date"] = new_submission["id"].str[ID_SIZE + 1 :]

new_submission = pd.merge(new_submission, date_info, on="date")
new_submission = pd.merge(new_submission, store_info, on="store_id")

new_submission["date"] = pd.to_datetime(new_submission["date"])
new_submission["year"] = new_submission["date"].dt.year
new_submission["month"] = new_submission["date"].dt.month
new_submission["day"] = new_submission["date"].dt.day

new_submission_ = new_submission.drop(columns=["id", "visitors"])

# predictions = rscv.predict(new_submission_.drop(columns=["latitude", "longitude"]))
# predictions

In [None]:
data_test = data_test.sort_values(["store_id", "date"])
data_test

Unnamed: 0,store_id,date,day_of_week,holiday_flg,genre_name,area_name,year,month,day,visitors
168955,air_00a91d42b08b08d9,2017-03-15,Wednesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,2017,3,15,59
169381,air_00a91d42b08b08d9,2017-03-16,Thursday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,2017,3,16,29
169805,air_00a91d42b08b08d9,2017-03-17,Friday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,2017,3,17,34
170229,air_00a91d42b08b08d9,2017-03-18,Saturday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,2017,3,18,53
170865,air_00a91d42b08b08d9,2017-03-21,Tuesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,2017,3,21,6
...,...,...,...,...,...,...,...,...,...,...
205404,air_fff68b929994bfbd,2017-04-18,Tuesday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,2017,4,18,48
205608,air_fff68b929994bfbd,2017-04-19,Wednesday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,2017,4,19,38
205812,air_fff68b929994bfbd,2017-04-20,Thursday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,2017,4,20,15
206016,air_fff68b929994bfbd,2017-04-21,Friday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,2017,4,21,2


In [105]:
# Пример full_df (с полными комбинациями id и date)
full_df = pd.DataFrame({
    'id': [1, 1, 1, 2, 2],
    'date': ['2024-11-01', '2024-11-02', '2024-11-03', '2024-11-01', '2024-11-02'],
    'value': [10.0, 12.0, 15.0, 20.0, 22.0]
})

# Пример df (оригинальный)
df = pd.DataFrame({
    'id': [1, 1, 2],
    'date': ['2024-11-01', '2024-11-03', '2024-11-02'],
    'value': [10, 15, 20]
})

# Выполняем внутреннее объединение
result_df = pd.merge(full_df, df, on=['id', 'date'], how='inner')

print(result_df)


   id        date  value_x  value_y
0   1  2024-11-01     10.0       10
1   1  2024-11-03     15.0       15
2   2  2024-11-02     22.0       20


In [8]:
store_info = pd.read_csv("data/raw/air_store_info.csv")
store_info = store_info.rename(
    columns={
        "air_store_id": "store_id",
        "air_genre_name": "genre_name",
        "air_area_name": "area_name",
    }
)

date_info = pd.read_csv("data/raw/date_info.csv")
date_info = date_info.rename(columns={"calendar_date": "date"})
date_info["date"] = date_info["date"].astype("string")

data_test['date'] = pd.to_datetime(data_test['date'])

# Получаем уникальные значения id и создаем полный список дат
ids = data_test['store_id'].unique()
date_range = pd.date_range(start=data_test['date'].min(), end=data_test['date'].max(), freq='D')
id_date_combinations = pd.MultiIndex.from_product([ids, date_range], names=['store_id', 'date'])
full_df = pd.DataFrame(index=id_date_combinations).reset_index()

date_info["date"] = pd.to_datetime(date_info["date"])
full_df = pd.merge(full_df, date_info, on="date")
full_df = pd.merge(full_df, store_info, on="store_id")

full_df["date"] = pd.to_datetime(full_df["date"])
full_df["year"] = full_df["date"].dt.year
full_df["month"] = full_df["date"].dt.month
full_df["day"] = full_df["date"].dt.day
full_df["visitors"] = 5
full_df = full_df[data_test.columns]
full_df["visitors"] = 5
# full_df = pd.merge(full_df, store_info, on="store_id")
full_df

mask = full_df[['store_id', 'date']].apply(tuple, axis=1).isin(data_test[['store_id', 'date']].apply(tuple, axis=1))
result_df = full_df[mask]

result_df
# data_test

Unnamed: 0,store_id,date,day_of_week,holiday_flg,genre_name,area_name,year,month,day,visitors
0,air_1c0b150f9e696a5f,2017-03-15,Wednesday,0,Okonomiyaki/Monja/Teppanyaki,Tōkyō-to Shibuya-ku Shibuya,2017,3,15,5
1,air_1c0b150f9e696a5f,2017-03-16,Thursday,0,Okonomiyaki/Monja/Teppanyaki,Tōkyō-to Shibuya-ku Shibuya,2017,3,16,5
2,air_1c0b150f9e696a5f,2017-03-17,Friday,0,Okonomiyaki/Monja/Teppanyaki,Tōkyō-to Shibuya-ku Shibuya,2017,3,17,5
3,air_1c0b150f9e696a5f,2017-03-18,Saturday,0,Okonomiyaki/Monja/Teppanyaki,Tōkyō-to Shibuya-ku Shibuya,2017,3,18,5
4,air_1c0b150f9e696a5f,2017-03-19,Sunday,0,Okonomiyaki/Monja/Teppanyaki,Tōkyō-to Shibuya-ku Shibuya,2017,3,19,5
...,...,...,...,...,...,...,...,...,...,...
32207,air_ef47430bcd6f6a89,2017-04-16,Sunday,0,Cafe/Sweets,Fukuoka-ken Fukuoka-shi Daimyō,2017,4,16,5
32208,air_ef47430bcd6f6a89,2017-04-17,Monday,0,Cafe/Sweets,Fukuoka-ken Fukuoka-shi Daimyō,2017,4,17,5
32211,air_ef47430bcd6f6a89,2017-04-20,Thursday,0,Cafe/Sweets,Fukuoka-ken Fukuoka-shi Daimyō,2017,4,20,5
32212,air_ef47430bcd6f6a89,2017-04-21,Friday,0,Cafe/Sweets,Fukuoka-ken Fukuoka-shi Daimyō,2017,4,21,5


In [124]:
def predict(X=None):
        X = copy.deepcopy(X)

        ids = X['store_id'].unique()
        date_range = pd.date_range(start=X['date'].min(), end=X['date'].max(), freq='D')
        id_date_combinations = pd.MultiIndex.from_product([ids, date_range], names=['store_id', 'date'])
        full_df = pd.DataFrame(index=id_date_combinations).reset_index()

        full_df = pd.merge(full_df, date_info, on="date")
        date_info["date"] = pd.to_datetime(date_info["date"])
        full_df = pd.merge(full_df, store_info, on="store_id")

        full_df["year"] = full_df["date"].dt.year
        full_df["date"] = pd.to_datetime(full_df["date"])
        full_df["month"] = full_df["date"].dt.month
        full_df["day"] = full_df["date"].dt.day
        print(X.columns)
        full_df = full_df[X.columns]

        # full_df["visitors"] = 5

        cols = full_df.columns
        predictions = []

        for name, group in full_df.sort_values("date").groupby(by=["date"], group_keys=False)[cols]:
            predictions.append(group["date"].dt.day)

        predictions = np.concatenate(predictions)
        predictions[predictions < 0] = 0

        full_df = full_df.sort_values("date")
        full_df["pred"] = predictions
        full_df = full_df.sort_index()

        mask = full_df[['store_id', 'date']].apply(tuple, axis=1).isin(data_test[['store_id', 'date']].apply(tuple, axis=1))
        result_df = full_df[mask]

        return result_df["pred"].values

In [125]:
data_test

Unnamed: 0,store_id,date,day_of_week,holiday_flg,genre_name,area_name,year,month,day,visitors
168955,air_00a91d42b08b08d9,2017-03-15,Wednesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,2017,3,15,59
169381,air_00a91d42b08b08d9,2017-03-16,Thursday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,2017,3,16,29
169805,air_00a91d42b08b08d9,2017-03-17,Friday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,2017,3,17,34
170229,air_00a91d42b08b08d9,2017-03-18,Saturday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,2017,3,18,53
170865,air_00a91d42b08b08d9,2017-03-21,Tuesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,2017,3,21,6
...,...,...,...,...,...,...,...,...,...,...
205404,air_fff68b929994bfbd,2017-04-18,Tuesday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,2017,4,18,48
205608,air_fff68b929994bfbd,2017-04-19,Wednesday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,2017,4,19,38
205812,air_fff68b929994bfbd,2017-04-20,Thursday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,2017,4,20,15
206016,air_fff68b929994bfbd,2017-04-21,Friday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,2017,4,21,2


In [126]:
predict(data_test.drop(columns=["visitors"]))

Index(['store_id', 'date', 'day_of_week', 'holiday_flg', 'genre_name',
       'area_name', 'year', 'month', 'day'],
      dtype='object')


array([15, 16, 17, ..., 20, 21, 22], dtype=int32)

In [None]:
new_submission["visitors"] = predictions

In [None]:
predictions

In [None]:
new_submission[["id", "visitors"]].to_csv("data/submissions/lag.csv", index=False)

In [None]:
submission["visitors"] = predictions
# submission.to_csv("data/submissions/xgb1500_1_005_5_5_no_transformer.csv", index=False)