In [1]:
from google.colab import drive
drive.mount('/content/drive')
import os
!pip install catboost
try:
    os.chdir("./drive/My Drive/Кванториум/Big-challenges/Большие вызовы")
except: pass

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
holidays = pd.read_csv("holidays.csv")
arr = []
for element in holidays.values:
    year = element[0]
    for month, days in enumerate(element[1:13], start=1):
        days = days.replace("*", "").replace("+", "").split(",")
        for day in days:
            arr.append([pd.to_datetime(f"{year}-{month}-{day}"), "Выходные"])

df_holidays = pd.DataFrame(arr, columns=["ds", "holiday"])
df_holidays.head()

Unnamed: 0,ds,holiday
0,1999-01-01,Выходные
1,1999-01-02,Выходные
2,1999-01-03,Выходные
3,1999-01-04,Выходные
4,1999-01-06,Выходные


In [3]:
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter("ignore")

df = pd.read_excel("Sample - Superstore.xls")

In [4]:
df.head(3)

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,California,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714


# Preprocessing

In [5]:
class Data_Preproccesing():
    def __init__(self, df, holidays):
        self.df = df
        self.holidays = holidays

    def date_to_data(self, drop=False, name="date", suffix=""):
        frame, holidays = self.df, self.holidays
        assert name in frame, "Нужна колонка с датой"
        date = frame[name].dt

        info = pd.concat([date.days_in_month,
                          date.month,
                          date.quarter], axis=1)

        info.columns = ["days_in_month", "month", "quarter"]

        temp = pd.concat([date.isocalendar(),
                          info], axis=1)

        temp.columns = temp.columns + suffix
        frame = pd.concat([frame, temp], axis=1)
        if "holiday" not in frame:
            frame = frame.merge(holidays, left_on=name, right_on="ds", how="left") \
                .drop(["ds"], axis=1)
            frame["holiday"] = frame["holiday"].apply(lambda x: 1 if type(x) == str else 0)
        if drop:
            return frame.drop([name], axis=1)
        return frame

    def fit(self, **kwargs):
        return self.date_to_data(**kwargs)

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


class DataSet():
    def __init__(self, df, names, target):
        self.df = df
        self.target = target
        self.names = names

    def get_means(self, frame, names: list) -> list:
        arr = []
        for name in names:
            means = frame.groupby(by=name) \
                .aggregate(["median", "min", "mean",
                            "max", "count", "sum", "std", "var"])[self.target]
            means.columns += f"_{name}"
            arr.append(means)
        return arr

    def split_data(self, names: list, test_size: float, shuffle: bool):
        X_train, X_test = train_test_split(self.df,
                                           test_size=test_size,
                                           random_state=42,
                                           shuffle=shuffle)
        self.means = self.get_means(names=names,
                                    frame=X_train)

    def concat_means(self, frame) -> list:
        for name, mean in zip(self.names, self.means):
            frame = frame.merge(mean, on=name, how="left")
        return frame

    def cleaning_frame(self):
        for column in self.df.drop(["Sales"], axis=1):
            try:
                self.df[column] = self.df[column].astype(np.int)
            except:
                self.df[column] = self.df[column].astype(np.str)

        data_categorical = self.df.select_dtypes(include=['object'])
        data_numeric = self.df.select_dtypes(exclude=['object'])

        for column in data_categorical:
            data_categorical[column] = LabelEncoder() \
                .fit_transform(data_categorical[column])

        self.df = pd.concat([data_numeric, data_categorical], axis=1)

    def fit(self, **kwargs):
        test_size = (1 - kwargs["val_size"]) * kwargs["test_size"]
        shuffle = kwargs["shuffle"]

        self.split_data(self.names, test_size, shuffle)
        self.df = self.concat_means(self.df)
        self.df = self.df.drop(kwargs["trash"], axis=1).dropna(axis=1)
        self.cleaning_frame()

        X, y = self.df.drop([self.target], axis=1), self.df[self.target]

        X_rest, X_val, y_rest, y_val = train_test_split(X, y, test_size=kwargs["val_size"],
                                                        random_state=42, shuffle=shuffle)
        X_train, X_test, y_train, y_test = train_test_split(X_rest, y_rest, test_size=kwargs["test_size"],
                                                            random_state=42, shuffle=shuffle)

        n = len(X)
        print("X_train: {:.2f} X_val: {:.2f} X_test: {:.2f}".format(len(X_train) / n, len(X_val) / n, len(X_test) / n))

        # TODO
        X_train = X_train[list(X_test)]
        X_val = X_val[list(X_test)]

        return X_train, X_test, X_val, y_train, y_test, y_val

In [7]:
data_agg = ["Customer ID", "Product ID", ["Customer ID", "Product ID"], ["Postal Code", "Product ID"],
            "Ship Mode", "Category", "Sub-Category", ["Category", "Sub-Category"], "Order Date", "Ship Date",
            "Region", "Postal Code", "State", "City", "Segment", "Quantity",
            ["Region", "State", "City"], ["Ship Mode", "Segment"], "Discount", "Profit",
            ["Ship Mode", "Segment", "Quantity"], ["Region", "State"], ["Region", "State", "City", "Postal Code"],
            ["Region", "City"], ["Region", "State", "City", "Quantity"],
            ["Customer ID", "Product ID", "Postal Code"], ["Discount", "Profit"],
            ["Discount", "Profit", "Category", "Sub-Category"], ["Discount", "Profit", "Category"],
            ["Discount", "Profit", "Sub-Category"], "week_Order", "day_Order", ["week_Order", "day_Order"],
            "days_in_month_Order", ["week_Order", "day_Order", "days_in_month_Order"],
            ["day_Order", "days_in_month_Order"], ["week_Order", "days_in_month_Order"]]

df_new = Data_Preproccesing(df, df_holidays).fit(name="Order Date",
                                             suffix="_Order", drop=False)

In [8]:
trash = ["Country", "Row ID",
         "Order ID", "Customer ID",
         "Product ID", "Order Date", "Ship Date"]

X_train, X_test, X_val, y_train, y_test, y_val = DataSet(df_new.sort_values(by="Order Date"),
                                                         data_agg, "Sales").fit(val_size=0.1,
                                                                                test_size=0.2,
                                                                                shuffle=False,
                                                                                trash=trash)

X_train: 0.72 X_val: 0.10 X_test: 0.18


In [9]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(iterations=1000,
                          random_state=42,
                          verbose=False,
                          task_type="GPU",
                          loss_function="RMSE",
                          grow_policy='Depthwise',
                          depth=9)

In [10]:
model.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x7efce01d4290>

In [11]:
from sklearn.metrics import (r2_score,
                             mean_squared_error,
                             mean_absolute_error)

def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [12]:
# валидация

predictions = pd.DataFrame(model.predict(X_val), columns=["pred"])
predictions[predictions["pred"] < 0] = 1
predictions = predictions["pred"].values

print(f"R2: {r2_score(y_val, predictions)}")
print(f"MAE: {mean_absolute_error(y_val, predictions)}")
print(f"MSE: {mean_squared_error(y_val, predictions)}")
print(f"MAPE: {mean_absolute_percentage_error(y_val, predictions)}")

R2: 0.5737609839724256
MAE: 79.59260049584296
MSE: 127430.010101284
MAPE: 54.08974493384938


In [13]:
from scipy.optimize import minimize
from sklearn.metrics import mean_absolute_error


def f(coef, arg):
    score = mean_absolute_error(arg[0], arg[1] * coef[0])
    return score

minimized = minimize(fun=f, x0=[0],
                     args=[y_val, predictions],
                     bounds=[(0, 2)],
                     options = {'eps': 0.0001})


print("Минимум: {:.4f}".format(minimized.fun))
print("x = {:.4f}".format(*minimized.x))

Минимум: 77.5756
x = 0.9148


In [14]:
# тест с учетом минимизации

predictions = model.predict(X_test) * minimized.x[0]
predictions = pd.DataFrame(predictions, columns=["pred"])
predictions[predictions["pred"] < 0] = 1
predictions = predictions["pred"].to_list()

print(f"R2: {r2_score(y_test, predictions)}")
print(f"MAE: {mean_absolute_error(y_test, predictions)}")
print(f"MSE: {mean_squared_error(y_test, predictions)}")
print(f"MAPE: {mean_absolute_percentage_error(y_test, predictions)}")

R2: 0.7726211018545112
MAE: 65.30004000085913
MSE: 63497.261927082276
MAPE: 50.2438138056412
