## Import

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ignore warnings when graphs are plotted
import warnings

warnings.filterwarnings("ignore")

## Read data

In [7]:
# Read csv files
data = pd.read_csv("../data/train.csv", index_col=0).reset_index(drop=True)

# Transform city names to english format
cities_rus_to_eng = {
    "Москва": "Moscow",
    "Санкт-Петербург": "St.Petersburg",
    "Краснодар": "Krasnodar",
    "Самара": "Samara",
    "Нижний Новгород": "Nizhny.Novgorod",
    "Ростов-на-Дону": "Rostov-on-Don",
    "Волгоград": "Volgograd",
    "Воронеж": "Voronezh",
    "Казань": "Kazan",
    "Екатеринбург": "Yekaterinburg",
}
data = data.replace(cities_rus_to_eng)

# Transform weather description to english format
weather_rus_to_eng = {
    "переменная облачность, небольшой дождь": "partly cloudy, light rain",
    "переменная облачность": "partly cloudy",
    "облачно, небольшой дождь": "cloudy, light rain",
    "дождь, гроза": "rain, thunderstorm",
    "облачно, без существенных осадков": "cloudy, no significant precipitation",
    "переменная облачность, дождь": "partly cloudy, rain",
    "дождь": "rain",
    "облачно": "cloudy",
    "ясно": "clear",
    "облачно, небольшой снег": "cloudy, light snow",
    "переменная облачность, небольшие осадки": "partly cloudy, light precipitation",
    "облачно, небольшие осадки": "cloudy, slight precipitation",
    "снег": "snow",
    "метель": "blizzard",
    "осадки": "rainfall",
    "переменная облачность, небольшой снег": "partly cloudy, light snow",
}
data = data.replace(weather_rus_to_eng)

In [8]:
# check data sample
data.head()

Unnamed: 0,date,city_name,store_id,category_id,product_id,price,weather_desc,humidity,temperature,pressure,sales
0,2021-07-29,Moscow,1,1,1,4.79,"partly cloudy, light rain",61.9375,23.1875,741.0,26
1,2021-07-30,Moscow,1,1,1,4.79,"partly cloudy, light rain",70.25,22.1875,740.3125,37
2,2021-07-31,Moscow,1,1,1,4.79,partly cloudy,52.625,21.8125,741.625,25
3,2021-08-01,Moscow,1,1,1,4.79,"cloudy, light rain",87.4375,20.0625,743.3125,26
4,2021-08-02,Moscow,1,1,1,4.79,partly cloudy,66.1875,23.4375,739.625,22


## Data transformation

### Feature engineering

In [9]:
# categorical time features
data["date"] = pd.to_datetime(data.date)
data["day_of_week"] = data["date"].dt.dayofweek + 1
data["weekend"] = data["date"].dt.dayofweek > 4
data["dayofyear"] = data["date"].dt.dayofyear
data["day"] = data["date"].dt.day
data["month"] = data["date"].dt.month
data["is_month_start"] = data["date"].dt.is_month_start
data["is_month_end"] = data["date"].dt.is_month_end
data["year"] = data["date"].dt.year
data["quarter"] = data["date"].dt.quarter
data["season"] = np.where(
    data["date"].dt.month.isin([1, 2, 12]),
    "winter",
    np.where(
        data["date"].dt.month.isin([3, 4, 5]),
        "spring",
        np.where(data["date"].dt.month.isin([6, 7, 8]), "summer", "autumn"),
    ),
)

In [10]:
# lag features for sales
for i in [7, 14, 21]:
    data[f"sales_lag_{int(i/7)}_week"] = data.groupby(
        [
            "store_id",
            "product_id",
        ],
        as_index=False,
    )["sales"].shift(i)

data["median_lag"] = data[[f"sales_lag_{int(i/7)}_week" for i in [7, 14, 21]]].median(
    axis=1
)

In [11]:
data.isna().sum()

date                    0
city_name               0
store_id                0
category_id             0
product_id              0
price                   0
weather_desc            0
humidity                0
temperature             0
pressure                0
sales                   0
day_of_week             0
weekend                 0
dayofyear               0
day                     0
month                   0
is_month_start          0
is_month_end            0
year                    0
quarter                 0
season                  0
sales_lag_1_week    24836
sales_lag_2_week    49672
sales_lag_3_week    74508
median_lag          24836
dtype: int64

In [12]:
data.dropna(axis=0, inplace=True)

In [None]:
# one hot encoding
# skip for CatBoost

### Spliting

In [13]:
test_cut_date = str(data.date.max() - pd.DateOffset(days=6))[:10]
eval_cut_date = str(data.date.max() - pd.DateOffset(days=13))[:10]
train_df = (
    data[(data.date < eval_cut_date)]
    # .drop(columns=['date'])
    .reset_index(drop=True)
)

eval_df = (
    data[(data.date >= eval_cut_date) & (data.date < test_cut_date)].reset_index(
        drop=True
    )
    # .drop(columns=['date'])
)
test_df = (
    data[(data.date >= test_cut_date)].reset_index(drop=True)
    # .drop(columns=['date'])
)

In [14]:
print("Train df shape:\t", train_df.shape)
print("Eval df shape:\t", eval_df.shape)
print("Test df shape:\t", test_df.shape)

Train df shape:	 (542496, 25)
Eval df shape:	 (24836, 25)
Test df shape:	 (24836, 25)


### removing outliers
check for anomalies in training data

In [15]:
def zscore(window):

    def zscore_(x):
        r = x.rolling(window=window)
        m = r.mean().shift(1)
        s = r.std(ddof=0).shift(1)
        z = ((x - m) / s).round(2)
        return z

    zscore_.__name__ = "zscore_"
    return zscore_

In [16]:
# in train_df only

train_df = train_df.assign(
    zscore=train_df.groupby(["store_id", "product_id"]).sales.transform(zscore(21))
)

In [17]:
train_df = train_df[
    # high sales can be associated with promo and marketing features
    # (train_df.zscore<=3)|
    # low sales can be more of anomaly
    (train_df.zscore >= -2)
]

train_df.drop(columns="zscore", inplace=True)

In [18]:
print("Train df shape:\t", train_df.shape)
print("Eval df shape:\t", eval_df.shape)
print("Test df shape:\t", test_df.shape)

Train df shape:	 (465238, 25)
Eval df shape:	 (24836, 25)
Test df shape:	 (24836, 25)


In [22]:
train_df.to_parquet("../data/train_df.parquet")
eval_df.to_parquet("../data/eval_df.parquet")
test_df.to_parquet("../data/test_df.parquet")