In [21]:
import pandas as pd
import numpy as np

import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

from category_encoders import BinaryEncoder, cat_boost
from sklearn import metrics, model_selection

from scipy import stats

from sklearn.decomposition import PCA
from sklearn.svm import SVC

import builtins


from sklearn.model_selection import cross_val_score

import optuna

import torch
from torch import nn, optim

# Helpfull functions

In [2]:
def type_casting(df_in):
    df = df_in.copy()
    cat_features = ['store_id','region_id','status_id']
    float_feats = ['profit','order_price','max_price','min_price','avg_price','planned_prep_time']
    int_feats = ['delivery_distance','products_count','unique_products_sold_by_store']
    df[cat_features] = df[cat_features].astype('object')
    df[float_feats] = df[float_feats].astype('float')
    df[int_feats] = df[int_feats].astype('int')
    return df

In [3]:
def boxplots_top_stores(df, columns, num_stores=5):
    top_stores = df.value_counts('store_id').sort_values(ascending=False)[:num_stores].index

    _, axes = plt.subplots(nrows=len(columns), 
                        ncols=num_stores+2, figsize=(num_stores*5, len(columns) * 3))
    for idx, feature in enumerate(columns):
        sns.histplot(df[feature], 
                    stat='density', 
                    kde=True,
                    bins=35, ax=axes[idx, 0])
        sns.boxplot(x=feature, data=df, ax=axes[idx, 1]).set_xlabel(feature + ' for all stores')
        for shifted_idx, store_id in enumerate(top_stores, 2):
            sns.boxplot(x=feature, 
                        data=df[df['store_id']==store_id], 
                        ax=axes[idx, shifted_idx]).set_xlabel(feature + ' for ' + str(store_id))
    plt.tight_layout()

In [4]:
def sort_dates(df_in, date_columns):
    df = df_in.copy()
    mask_any_na_dates = df[date_columns].isna().any(axis=1)
    df.loc[~mask_any_na_dates, date_columns] = np.sort(df.loc[~mask_any_na_dates, date_columns].values, axis=1)
    return df

In [5]:
def rm_outliers(df_in, features, quantiles: np.array, verbose=1):
    assert len(features) == len(quantiles)
    df = df_in.copy()
    if verbose:
        print('Shape before removing outliers: ', df.shape)
    IQR = stats.iqr(df[features], axis=0, nan_policy='omit')

    mask_non_outliers = df.groupby("store_id")[features]\
        .apply(lambda x : (x>=(np.diag(x.quantile(quantiles[:,0])) - 1.5 * IQR)) 
                        & (x<=(np.diag(x.quantile(quantiles[:,1])) + 1.5 * IQR))\
              ).all(axis=1).values
    df = df[mask_non_outliers]
    if verbose:
        print('Shape after removing outliers: ', df.shape)
    return df

In [6]:
def calc_statistic(frame, method):
    match method:
        case 'mode':
            return frame.mode()
        case 'mean':
            return frame.mean()
        case 'median':
            return frame.median()
        case _:
            raise NameError("Method is not exist") 

In [7]:
def impute_dates_by_store(x_in, method):
    x = x_in.copy()
    # calculate statistics value for recover null observation in regarding the neighbour date feature
    diff_date = x['order_pickup'] - x['order_ready']
    diff_timestamp_statistic = calc_statistic(diff_date[diff_date > pd.to_timedelta("0 days")], method)    

    # From EDA, this column hasn't null rows, but to be sure this code will be left 
    # order_pickup = order_ready + method(order_pickup - order_ready)
    mask_null_order_pickup = x['order_pickup'].isnull() 
    x.loc[mask_null_order_pickup, 'order_pickup'] = \
        x.loc[mask_null_order_pickup, 'order_ready'] + diff_timestamp_statistic
    
    # order_ready = order_pickup - method(order_pickup - order_ready)
    mask_null_order_ready = x['order_ready'].isnull()
    x.loc[mask_null_order_ready, 'order_ready'] = \
        x.loc[mask_null_order_ready, 'order_pickup'] - diff_timestamp_statistic

    # New feature that relfects waiting time for the order to be accepted for processing
    x['order_delay'] = x['order_start_prepare'] - x['date_create']
    diff_timestamp_statistic = calc_statistic(x['order_delay'][x['order_delay'] > pd.to_timedelta("0 days")], method)    
    x['order_delay'] = x['order_delay'].dt.total_seconds() / 60

    # order_start_prepare = date_create + method(order_start_prepare - date_create)
    mask_null_order_start_prepare = x['order_start_prepare'].isnull()
    x.loc[mask_null_order_start_prepare, 'order_start_prepare'] = \
        x.loc[mask_null_order_start_prepare, 'date_create'] + diff_timestamp_statistic
    
    # date_create = order_start_prepare - method(order_start_prepare - date_create)
    mask_null_date_create = x['date_create'].isnull()
    x.loc[mask_null_date_create, 'date_create'] = \
        x.loc[mask_null_date_create, 'order_start_prepare'] - diff_timestamp_statistic

    return x

In [8]:
def impute_dates(df_in, method='mode'):
    df = df_in.copy()
    # Restore dates within a certain store_id by method of differences neighboor columns 
    # (order_pickup - order_ready) & (order_start_prepare - date_create)
    df.loc[:,['date_create','order_start_prepare','order_ready','order_pickup','order_delay']] = df.groupby("store_id")\
            .apply(lambda x: impute_dates_by_store(x, method))\
            .reset_index(allow_duplicates=True)\
            .set_index('level_1')[['date_create','order_start_prepare','order_ready','order_pickup','order_delay']]
    return df

In [9]:
def extract_target(df_in, q_target=0.75, test_subset=False, verbose=1):
    df = df_in.copy()
    df['actual_prep_time'] = (df['order_ready'] - df['order_start_prepare']).dt.total_seconds() / 60
    
    mask_null_prep_time = df['actual_prep_time'].isnull() 
    if not test_subset:
        df['diff_time'] = df['actual_prep_time'] - df['planned_prep_time']
        
        mask_neg_prep_time = df['actual_prep_time'] < 0
        IQR = stats.iqr(df['diff_time'], nan_policy='omit')
        mask_outliers_in_diff_time = df['diff_time'] > (df['diff_time'].quantile(q_target) + 1.5 * IQR)

        if verbose:
            print("Nan's in actual_prep_time: ", mask_null_prep_time.sum(),
                "\nNegatives: ", mask_neg_prep_time.sum(),
                '\nOutliers in time difference: ', mask_outliers_in_diff_time.sum())
        
        df.drop(columns='diff_time', inplace=True)
        df.dropna(subset=['date_create','planned_prep_time'], inplace=True)
        mask_anomaly = mask_neg_prep_time | mask_null_prep_time | mask_outliers_in_diff_time
    else:
        mask_anomaly = mask_null_prep_time
    
    if verbose:
        print("All anomalyes: ", df.loc[mask_anomaly].shape[0])
    df = df.loc[~mask_anomaly]
    df['on_time'] = np.where(abs(df['planned_prep_time'] - df['actual_prep_time']) <= 5, 1, 0)
    return df.drop(columns=['order_ready','order_pickup','actual_prep_time', 'on_time']), df['on_time']


In [10]:
def encode_cyclical_feat(df_in: pd.DataFrame, col: pd.Series, col_name: str):
    df = df_in.copy()
    df[col_name + '_sin'] = np.sin(2 * np.pi * col/col.max())
    df[col_name + '_cos'] = np.cos(2 * np.pi * col/col.max())
    return df

In [11]:
def extract_date_feat(df_in: pd.DataFrame, data_features: list, drop=True, encode_cyclical=True):
    df = df_in.copy()
    if encode_cyclical:
        for col_name in data_features:
            df = encode_cyclical_feat(df, df[col_name].dt.month, col_name + '_month')
            df = encode_cyclical_feat(df, df[col_name].dt.day, col_name + '_day')
            df = encode_cyclical_feat(df, df[col_name].dt.weekday, col_name + '_weekday')
            df = encode_cyclical_feat(df, df[col_name].dt.hour, col_name + '_hour')
            df = encode_cyclical_feat(df, df[col_name].dt.minute, col_name + '_minute')
    else:
        for col_name in data_features:
            df[col_name + '_month'] = df[col_name].dt.month
            df[col_name + '_day'] = df[col_name].dt.day
            df[col_name + '_weekday'] = df[col_name].dt.weekday
            df[col_name + '_hour'] = df[col_name].dt.hour
            df[col_name + '_minute'] = df[col_name].dt.minute
            
    if drop:
        df = df.drop(columns=data_features)
    return df

In [12]:
def simple_imputer(X_train, X_test, y_train, y_test, restore_numerical_nans=True, 
                   imputing_method_for_numerical='mode', 
                   imputing_method_for_simple_imputer='most_frequent', verbose=1):
    na_columns_train = X_train.columns[X_train.isna().any()]
    na_columns_test = X_test.columns[X_test.isna().any()]

    if verbose: 
        print("Numbers of NaN values\n\tTrain:\n", X_train[na_columns_train].isna().sum())
        print("\n\tTest:\n", X_test[na_columns_test].isna().sum())

    if restore_numerical_nans:
        X_train.loc[:,na_columns_train] = X_train.groupby("store_id")\
            [na_columns_train].transform(lambda x: x.fillna(calc_statistic(x, imputing_method_for_numerical)))
        X_test.loc[:,na_columns_test] = X_test.groupby("store_id")\
            [na_columns_test].transform(lambda x: x.fillna(calc_statistic(x, imputing_method_for_numerical)))

        ####################################### Imputting #####################################
        col_imputing = X_test.columns[X_test.isna().any()].union(X_train.columns[X_train.isna().any()])
        if verbose:
            print("Remaining numbers of NaN values\n\tTrain:\n", X_train[col_imputing].isna().sum())
            print("\n\tTest:\n", X_test[col_imputing].isna().sum())

        imputer = SimpleImputer(strategy=imputing_method_for_simple_imputer)
        X_train[col_imputing] = pd.DataFrame(imputer.fit_transform(X_train[col_imputing]), 
                                            columns=col_imputing, index=X_train.index)
        X_test[col_imputing] = pd.DataFrame(imputer.transform(X_test[col_imputing]), 
                                            columns=col_imputing, index=X_test.index)
    else:
        index_train_na, index_test_na = X_train[na_columns_train].isnull().any(axis=1), \
                                        X_test[na_columns_test].isnull().any(axis=1)    
        X_train, y_train, X_test, y_test = X_train[~index_train_na], y_train[~index_train_na], \
                                        X_test[~index_test_na], y_test[~index_test_na]
    return X_train, X_test, y_train, y_test

In [13]:
def scores(estimator, X_test, y_test, X_train, y_train, cv=5, cv_scoring=['f1'], threshold=0.5, plot=True, ax=None):
    pass

In [14]:
from os.path import join, exists
from pathlib import Path

COLORMAP = [[0.0, '#3f7f93'],
            [0.1, '#6397a7'],
            [0.2, '#88b1bd'],
            [0.3, '#acc9d2'],
            [0.4, '#d1e2e7'],
            [0.5, '#f2f2f2'],
            [0.6, '#f6cdd0'],
            [0.7, '#efa8ad'],
            [0.8, '#e8848b'],
            [0.9, '#e15e68'],
            [1.0, '#da3b46']]

PLOT_THEME='plotly_dark'
# PLOT_THEME='none'
VERBOSE=False
SHOW_PLOTS=False
APPEND_TO_EXISTS=False
DROP_OLD_COLUMNS=True
TARGET='Добыча воды за 2 ч ,м3, лаг -1'
TIME_AXIS='YY-MM-DD HH:00'
INSERT_NEARBY=True
LOG=(False, False) # log_x=LOG[0], log_y=LOG[1]
TEXTFONT_SIZE=10
FILEPATH=None

ROOT = Path('/home/prog3/innopolis/ML/assignment/')
DATA = join(ROOT, 'content')
PLOTS = join(ROOT, 'plots')
MODELS = join(ROOT, 'models')

In [15]:
from statsmodels.nonparametric.smoothers_lowess import lowess
from phik.report import plot_correlation_matrix
from scipy import signal

from sklearn import preprocessing


# slice функция по индексу для словарей
slice = lambda d, start=0, stop=None, step=1: dict(itertools.islice(d.items(), start, stop, step))


class Dataset:

    #region base
    def __init__(self, df: pd.DataFrame=None, name='dataset', verbose=VERBOSE, features_names=None, targets_names=None):
        self.verbose = verbose
        self.name = name
        if isinstance(df, pd.DataFrame):
            self.df = df.copy()
            self.original = df.copy()

        self.features_names = features_names
        self.targets_names = targets_names
        

    def __getitem__(self, index):
        return self.data.loc[index]
        

    def load(self, path, dropna=True, parse_dates=None):
        print("Чтение датасета...") if self.verbose else None
        match path.split('.')[-1]:
            case 'xlsx':
                self.df: pd.DataFrame = pd.read_excel(path, parse_dates=parse_dates)
            case 'csv':
                self.df: pd.DataFrame = pd.read_csv(path, parse_dates=parse_dates)
        
        self.original = self.df.copy()
        self.df.columns = [col.replace('|', ',') for col in self.df]
        if dropna:
            print(f"Удаленые записи\n{self.df.isna().sum()}") if self.verbose else None
            self.df = self.df.dropna().reset_index(drop=True)
        print("Завершено успешно") if self.verbose else None

    def __iter__(self):
        return self.cols.__iter__()
    
    def astype(self, columns, type):
        columns = self.columns(columns)
        for col in columns:
            self.df[col] = self.df[col].astype(type)
    
    @property
    def cols(self):
        return self.df.columns
    
    @property
    def data(self):
        return self.df.loc[:, self.features_names] \
            if self.features_names else self.df.iloc[:,1:-1]

    
    @property
    def target(self):
        return self.df.loc[:, self.targets_names] \
            if self.targets_names else self.df.iloc[:,-1]
    

    def columns(self, pattern: list|str):
        match type(pattern):
            case builtins.list | pd.Index:
                return [col for col in pattern if col in self.cols]
            case builtins.str:
                return [col for col in self.df if re.search(pattern, col)]
            case _:
                if pattern==None:
                    return self.cols
                else:
                    raise TypeError("Передан некорректный параметр 'pattern'")
                
    def drop(self, columns):
        columns = self.columns(columns)
        self.df.drop(columns, axis=1, inplace=True)
    
    def save(self, filename=None, extension='csv'):
        match extension:
            case 'csv':
                self.df.to_csv(filename, index=False)
            case 'xslx':
                self.df.to_excel(filename, index=False)

    def __insert_or_replace(self, column, insert, method, data):
        if insert:
            index = self.cols.get_loc(column) + 1
            new_col = column \
                    if column.endswith(method) \
                    else f'smoothed,{column},{method}'
            print('Вставка нового столбца: ', new_col) if self.verbose else None
            try:
                self.df.insert(index, new_col, data)
            except ValueError as err:
                print(err) if self.verbose else None
                self.df.loc[:, new_col] = data
        else:
            self.df.loc[:, column] = data

    #endregion

    #region time processing
    def parse_datetime(self, parse_date=True, drop_date=DROP_OLD_COLUMNS, 
                       parse_time=True, drop_time=DROP_OLD_COLUMNS, 
                       replace_time_0_to_24=False):
        """
                Расчленяет фичу "Дата" на 3 колонки: ['Год','Месяц','День']
                и преобразует фичу "Время" (datetime) в фичу "Час" (int)
            parse_date: bool (default True) - флаг на разрешение расчленения фичи "Дата"
            drop_date: bool (default True) - флаг на удаление фичи "Дата" после расчленения
            parse_time: bool (default True) - флаг на разрешение преобразования фичи "Время"
            drop_time: bool (default True) - флаг на удаление фичи "Время" после преобразования
            replace_time_0_to_24: bool (default False) - флаг замены 0 (полночь) на 24
        """
        print("Парсинг Даты и Времени...") if self.verbose else None
        if parse_date:
            loc=self.cols.get_loc('Дата')
            self.df.insert(loc, 'День', self.df['Дата'].dt.day)
            self.df.insert(loc, 'Месяц', self.df['Дата'].dt.month)
            self.df.insert(loc, 'Год', self.df['Дата'].dt.year)
            
        if parse_time:
            self.df.insert(self.cols.get_loc('Время'), 'Час', self.df['Время'].apply(lambda x: x.hour))
            if replace_time_0_to_24:
                self.df.loc[self.df['Час'] == 0, 'Час'] = 24
        if drop_date:
            self.df.drop(columns='Дата', inplace=True)
        if drop_time:
            self.df.drop(columns='Время', inplace=True)

    def set_lag(self, lag=0, drop=DROP_OLD_COLUMNS, target='Добыча воды за 2 ч ,м3'):
        """
                Добавления смещения значений по оси 0 (по строкам)
                Отрительные значения -> смещение вверх, положительные -> вниз
            lag: (default 0) int - значение смещение (лага, шага) по строкам
            drop: bool (default False) - флаг на удаление смещаемой фичи (оригинальной)
            target: str (default Добыча воды за 2 ч ,м3) - название фичи для сдвига
        """
        print(f"Смещение признака '{target}' на lag {lag}") if self.verbose else None
        if lag:
            self.df = pd.concat([self.df, 
                                self.df[target].shift(lag).
                                rename(f"{target}, лаг {lag}")], axis=1).dropna().reset_index(drop=True)
            if drop:
                self.df.drop(columns=target, inplace=True)
            
            print(f"'{target}' смещено на {lag}") if self.verbose else None
        else:
            print('Смещение установлено в 0. Пропускаем...') if self.verbose else None

    def filter_by_hours(self, hours: list=[8, 20]):
        """
                Фильтрует датасет по `hours` часам
                Необходим признак 'Час'
                *Метод parse_datetime достает фичу 'Час'*
            hours: list (default [8, 20]) - int значения часов для фильтрации
        """
        print(f"Фильтрация по {hours} часам") if self.verbose else None
        if 'Час' in self.cols:
            self.df = self.df[self.df['Час'].isin(hours)].reset_index(drop=True)
            print(self.df['Час'].value_counts()) if self.verbose else None
        else:
            print("Признак 'Час' не найден")

    def convert_datetime(self, drop=DROP_OLD_COLUMNS):
        datetimes = ['Год', 'Месяц','День','Час']
        self.df.insert(0, TIME_AXIS, self.df[datetimes].apply(
            lambda x: f'{x.iloc[0]:04}-{x.iloc[1]:02}-{x.iloc[2]:02} {x.iloc[3]:02}:00', 
            axis = 1)) 
        self.df.sort_values(TIME_AXIS, inplace=True)
        if drop:
            self.df.drop(columns=datetimes, inplace=True)

    #endregion

    #region preprocessing
    def new_feature(self, columns, newcol, agg_f:str='max', drop=DROP_OLD_COLUMNS):
        columns = self.columns(columns)
        match agg_f:
            case 'max':
                self.df[newcol] = self.df[columns].max(axis=1)
            case 'median':
                self.df[newcol] = self.df[columns].median(axis=1)
            case 'mean':
                self.df[newcol] = self.df[columns].mean(axis=1)
        if drop:
            self.drop(columns)


    def recovery_outliers(self, columns, save_interval, insert=True):
    
        columns = self.columns(columns)
        print("Восстановление выбросов...") if self.verbose else None
        for idx, col in enumerate(columns):
            mask = (
                    (self.df[col].between(*save_interval) )
                )
            
            data = pd.Series(np.where(mask, self.df[col], np.nan))
            print("Выбросы: ", self.df.loc[~mask, col]) if self.verbose else None
            data = data.ffill()
            self.__insert_or_replace(col, insert, 'outlier', data)
            


    def smooth(self, columns, frac=0.001, window=5, polyorder=3, insert=INSERT_NEARBY, method='lowess'):
        """
            Сглаживание значений в указанных признаках
            Поддерживаемые методы:
            - lowess - Сглаживание по методу Лоусса
            - rolling - Скользящее среднее окно
            - savgol_filter (https://en.wikipedia.org/wiki/Savitzky%E2%80%93Golay_filter) - Фильтр Савицкого-Голея 
                - подгоняет последующие окна смежных данных с полиномом низкого порядка
        """
        columns = self.columns(columns)
        print("Выполняется сглаживание значений рядов: ", columns) if self.verbose else None
        for col in columns:
            match method:
                case 'lowess':
                    smoothed = lowess(self.df[col], range(len(self.df)), frac=frac)[:, 1]
                case 'rolling':
                    smoothed = self.df[col].rolling(window=window).mean()
                case 'savgol_filter':
                    smoothed = signal.savgol_filter(self.df[col],
                               window, # window size used for filtering
                               polyorder) # order of fitted polynomial
                case _:
                    raise ValueError("Указан неподдерживаемый метод")

            self.__insert_or_replace(col, insert, method, smoothed)
           

    def zscore(self, s, window, thresh=3, return_all=False, coeff=1.0):
        roll = s.rolling(window=window, min_periods=1)
        avg = roll.mean()
        max = roll.max()
        min = roll.min()
        std = roll.std(coeff)
        z = s.sub(avg).div(std)   
        m = z.between(-thresh, thresh)
        
        if return_all:
            return s.where(m, avg), max, min
        return s.where(m, avg)

    def scale(self, columns, method='standard', insert=False):
        columns = self.columns(columns)
        print("Выполняется масштабирование значений признаков: ", columns) if self.verbose else None
        match method:
            case 'standard':
                scaled = preprocessing.StandardScaler().fit_transform(self.df[columns])
            case 'minmax':
                scaled = preprocessing.MinMaxScaler().fit_transform(self.df[columns])
            case _:
                raise ValueError("Указан неподдерживаемый метод")
        self.__insert_or_replace(columns, insert, method, scaled)
            

        

    #endregion

    #region plots 
    @staticmethod 
    def plot_template(fig: go.Figure, filepath=FILEPATH, show=SHOW_PLOTS, append=APPEND_TO_EXISTS, 
                      verbose=VERBOSE, mlflow_track=False, run_id=None, **layout_params):
        fig.update_layout(template=PLOT_THEME, **layout_params)
        if filepath:
            filepath = filepath if filepath.endswith('.html') else f'{filepath}.html'
            if not os.path.exists(filepath) or not append:
                fig.write_html(filepath)
            else:
                with open(filepath, 'a') as f:
                    f.write(fig.to_html(full_html=False, include_plotlyjs=False))
            print("Файл сгенерирован: ", filepath) if verbose else None
        if show:
            fig.show()
        if mlflow_track:
            mlflow.log_artifact(local_path=filepath, run_id=run_id)

    def report(self, columns=None, filepath=FILEPATH, show=SHOW_PLOTS):
        columns = self.columns(columns)
        report = ProfileReport(self.df[columns], 
                      title="Profiling Report")
        if filepath:
            report.to_file(output_file=filepath)
        if show:
            return report    

    def time_series(self, columns=None, appendix_cols=None, log=LOG, time_axis=TIME_AXIS,
                    **kwargs): # filepath, show, height, width, append, title and other (layout_params)
        columns = self.columns(columns)
        if appendix_cols:
            columns += appendix_cols
        if time_axis not in self.cols:
            columns.append(time_axis)
        print("График отображает признаки: ", columns) if self.verbose else None
    
        try:
            fig = px.line(self.df, x=time_axis, y=columns,
                          log_x=log[0], log_y=log[1])
            self.plot_template(fig, **kwargs)
        except BaseException as err:
            print(err)

        
    def scatter_matrix(self, columns=None, **kwargs):
        columns = self.columns(columns)
        fig = px.scatter_matrix(self.df[columns])
        fig.update_traces(diagonal_visible=False, showlowerhalf=False)
        self.plot_template(fig, **kwargs)

    def difference_with_smoothed(self, column, time_axis=TIME_AXIS, method='rolling',mode='markers',
                                 **kwargs): # filepath, show, height, width, append, title and other (layout_params)
        fig = go.Figure()
        fig.add_trace(go.Scatter(
            x=self.df[time_axis],
            y=self.df[column],
            marker=dict(size=2, color='black',),
            opacity=0.25,
            name=column
        ))
        xaxis = self.df[time_axis]
        try:
            ylabel = self.columns(column+","+method)[0]
            yaxis = self.df[ylabel]
        except BaseException as err:
            print(err, f'ylabel: {self.columns(column)}')
        
        fig.add_trace(go.Scatter(
            x=xaxis,
            y=yaxis,
            marker=dict(
                size=6,
                color='royalblue',
                symbol='circle-open'
            ),
            name=ylabel
        ))

        if mode:
            fig.add_trace(go.Scatter(
                x=xaxis,
                y=yaxis,
                mode=mode,
                marker=dict(
                    size=6,
                    color='mediumpurple',
                    symbol='triangle-up'
                ),
                name='Smoothed scatter'
            ))

        self.plot_template(fig, **kwargs)


    def corr_matrix(self, columns=None, target=TARGET, 
                    method='pearson', textfont_size=TEXTFONT_SIZE, 
                    **kwargs): # filepath, show, height, width, append, title and other (layout_params)
        columns = self.columns(columns)
        if target not in columns:
            columns.append(target)
       
        if target:
            corr = pd.concat([pd.DataFrame(self.df[columns].corr('pearson')[target]).rename(columns={target:'pearson'}).T,
                            pd.DataFrame(self.df[columns].corr('kendall')[target]).rename(columns={target:'kendall'}).T,
                            pd.DataFrame(self.df[columns].corr('spearman')[target]).rename(columns={target:'spearman'}).T,
                            pd.DataFrame(self.df[columns].phik_matrix()[target]).rename(columns={target:'phik'}).T]) 
        elif method=='phik': 
            corr = self.df[columns].phik_matrix()
        else:
            corr = self.df[columns].corr(method) 
       
        fig = px.imshow(corr, text_auto=True,  color_continuous_scale=COLORMAP)
        fig.update_traces(textfont_size=textfont_size,  texttemplate = "%{z:.2f}")
        self.plot_template(fig, **kwargs)

    #endregion

# EDA

In [16]:
date_columns = ['date_create', 'order_start_prepare', 'order_ready', 'order_pickup']
orders = pd.read_csv('../content/aggregated_df.csv', parse_dates=date_columns)
orders = type_casting(orders)

In [24]:
orders.columns

Index(['store_id', 'profit', 'delivery_distance', 'date_create',
       'order_start_prepare', 'planned_prep_time', 'order_ready',
       'order_pickup', 'region_id', 'status_id', 'products_count',
       'order_price', 'max_price', 'min_price', 'avg_price',
       'unique_products_sold_by_store'],
      dtype='object')

In [18]:
df = Dataset(orders)

In [22]:
df.time_series()

TypeError: all inputs must be Index

In [30]:
df.time_series(columns=df.cols,log=(False,False), show=True)

Value of 'x' is not the name of a column in 'data_frame'. Expected one of ['store_id', 'profit', 'delivery_distance', 'date_create', 'order_start_prepare', 'planned_prep_time', 'order_ready', 'order_pickup', 'region_id', 'status_id', 'products_count', 'order_price', 'max_price', 'min_price', 'avg_price', 'unique_products_sold_by_store'] but received: YY-MM-DD HH:00


# OUTLIER DETECTION & FUTURE ORDERS FORECASTING