In [136]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from darts import TimeSeries
from darts.dataprocessing import Pipeline
from darts.dataprocessing.transformers import Scaler, InvertibleMapper, StaticCovariatesTransformer
from darts.dataprocessing.transformers.missing_values_filler import MissingValuesFiller
from darts.metrics import rmsle
from sklearn.metrics import mean_squared_log_error
from darts.models import LinearRegressionModel, LightGBMModel, XGBModel, CatBoostModel
from sklearn.preprocessing import OneHotEncoder
from tqdm.notebook import tqdm_notebook
from darts.models.filtering.moving_average_filter import MovingAverageFilter

pd.set_option('display.max_columns', None)

In [137]:
data = pd.read_csv(r'originalni_datasetovi\train_test_final.csv', parse_dates=['date'])
data.drop(columns=['work_day', 'lag_16_sales', 'lag_17_sales', 'lag_18_sales', 'lag_19_sales', 'lag_20_sales', 'lag_30_sales', 'lag_365_sales', 'lag_730_sales', 'lag_1_oil', 'lag_2_oil', 'lag_3_oil', 'lag_4_oil'], inplace=True)
data.head()

Unnamed: 0,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,N Batalla de Pichincha,N Carnaval,N Cyber Monday,N Dia de Difuntos,N Dia de la Madre,N Dia del Trabajo,N Futbol,N Independencia de Cuenca,N Independencia de Guayaquil,N Navidad,N Primer dia del ano,N Terremoto Manabi,N Viernes Santo,oil_price,transactions,month,day_of_month,day_of_year,week_of_month,week_of_year,day_of_week,year,is_wknd,quarter,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,date_index,season,workday,wageday,day_to_nearest_holiday,day_from_nearest_holiday
0,2013-01-01,1,AUTOMOTIVE,0.0,0.0,Quito,Pichincha,D,13,0,0,0,0,0,0,0,0,0,0,1,0,0,93.14,0.0,1,1,1,1,1,2,2013,0,1,1,0,1,0,1,0,0,0,0,0,0,0
1,2013-01-01,1,BABY CARE,0.0,0.0,Quito,Pichincha,D,13,0,0,0,0,0,0,0,0,0,0,1,0,0,93.14,0.0,1,1,1,1,1,2,2013,0,1,1,0,1,0,1,0,0,0,0,0,0,0
2,2013-01-01,1,BEAUTY,0.0,0.0,Quito,Pichincha,D,13,0,0,0,0,0,0,0,0,0,0,1,0,0,93.14,0.0,1,1,1,1,1,2,2013,0,1,1,0,1,0,1,0,0,0,0,0,0,0
3,2013-01-01,1,BEVERAGES,0.0,0.0,Quito,Pichincha,D,13,0,0,0,0,0,0,0,0,0,0,1,0,0,93.14,0.0,1,1,1,1,1,2,2013,0,1,1,0,1,0,1,0,0,0,0,0,0,0
4,2013-01-01,1,BOOKS,0.0,0.0,Quito,Pichincha,D,13,0,0,0,0,0,0,0,0,0,0,1,0,0,93.14,0.0,1,1,1,1,1,2,2013,0,1,1,0,1,0,1,0,0,0,0,0,0,0


In [138]:
def get_pipeline(static_covs_transform=False, log_transform=False):
    '''
    Function used for preprocessing time series data, fills missing values using interpolation. \n
    Optional keyword arguments:
        static_covs_transform:  one hot encodes static covariates; defaults to False.
        log_transform:  log transforms data; defaults to False.
    '''
    # contains all steps of our pipeline
    lst = []
    
    # fills missing values, uses interpolation
    # n_jobs = -1 -> allows for parallel processing using all available CPU cores
    filler = MissingValuesFiller(n_jobs=-1)
    lst.append(filler)
    
    # one hot encoding static covariates
    if static_covs_transform:
        static_covs_transformer = StaticCovariatesTransformer(
            transformer_cat=OneHotEncoder(), 
            n_jobs=-1,
        )
        lst.append(static_covs_transformer)

    # perform log transformation on sales
    if log_transform:
        log_transformer = InvertibleMapper(
            fn=np.log1p,
            inverse_fn=np.expm1,
            n_jobs=-1,
        )
        lst.append(log_transformer)

    # rescale time series
    scaler = Scaler()
    lst.append(scaler)

    pipeline = Pipeline(lst)
    return pipeline

In [139]:
def get_target_series(data, static_cols, log_transform=True):
    '''
    Extract and preprocess time series data for different product families. It applies transformations such as filling missing values, 
    log transformation, and scaling, and organizes the transformed data into dictionaries for further use.\n
    Parameters:
        data: DataFrame from which target series will be extracted.
        static_cols:  A string or list of strings representing static variable columns from the DataFrame that should be appended as static covariates to the resulting TimeSeries groups.
    Optional keyword arguments:
        log_transform:  log transforms data; defaults to True.  
    '''    
    target_dict = {} # key is family, value is array of time series of sales by stores
    pipe_dict = {} # key is family, value is pipeline
    id_dict = {} # key is family, value is pair of store and family

    for fam in tqdm_notebook(data.family.unique(), desc="Extracting target series"):
        # using train and splitting by family
        df = data[(data.date.le('2017-08-15')) & (data.family.eq(fam))] 
        
        # initialize transformation pipeline for target series
        pipe = get_pipeline(static_covs_transform=True, log_transform=log_transform)
        
        # The TimeSeries.from_group_dataframe method is used to create a list of time series objects from the dataframe, 
        # grouped by store_nbr and including static covariates specified in static_cols
        target = TimeSeries.from_group_dataframe(
            df=df,
            time_col="date",
            value_cols="sales",
            group_cols="store_nbr",
            static_cols=static_cols,
        )
        
        # record identity of each target series
        target_id = [{"store_nbr": t.static_covariates.store_nbr, "family": fam} # pair family store
                     for t in target]
        id_dict[fam] = target_id

        # apply transformations
        target = pipe.fit_transform(target)
        target_dict[fam] = [t.astype(np.float32) for t in target]

        pipe_dict[fam] = pipe[2:]  # without MissingValuesFiller and OHEnc
        
    return target_dict, pipe_dict, id_dict

In [140]:
static_cols = ["city", "state", "type", "cluster"]
target_dict, pipe_dict, id_dict = get_target_series(data, static_cols)

Extracting target series:   0%|          | 0/33 [00:00<?, ?it/s]

In [141]:
def get_covariates(data, past_cols, future_cols, future_ma_cols=None, future_window_sizes=[7, 28]):
    '''
    Extract and preprocess time series data for different product families. It applies transformations such as filling missing values, 
    log transformation, and scaling, and organizes the transformed data into dictionaries for further use.\n
    Parameters:
        data: DataFrame from which future and past covariates will be extracted.
        past_cols:  covariates known only into the past.
        future_cols:  covariates known into the future.
    Optional parameters:
        future_ma_cols: columns that moving average will be computed on.
        future_window_sizes: window size for moving average.
    '''
    past_dict = {} #key is family, value is array of time series of transactions by stores
    future_dict = {} #key is family, value is array of array of time series of future covariate by stores for all future covariates
    
    # initialize transformation pipeline for covariates
    covs_pipe = get_pipeline()

    for fam in tqdm_notebook(data.family.unique(), desc="Extracting covariates"):
        # filter data for each model
        df = data[data.family.eq(fam)]
        
        # extract past covariates
        past_covs = TimeSeries.from_group_dataframe(
            df=df[df.date.le('2017-08-15')],
            time_col="date",
            value_cols=past_cols,
            group_cols="store_nbr",
        )
        past_covs = [p.with_static_covariates(None) for p in past_covs]
        past_covs = covs_pipe.fit_transform(past_covs)
        
        past_dict[fam] = [p.astype(np.float32) for p in past_covs]

        # extract future covariates
        future_covs = TimeSeries.from_group_dataframe(
            df=df,
            time_col="date",
            value_cols=future_cols,
            group_cols="store_nbr",
        )
        future_covs = [f.with_static_covariates(None) for f in future_covs]
        future_covs = covs_pipe.fit_transform(future_covs)
        
        if future_ma_cols is not None:
            for size in future_window_sizes:
                ma_filter = MovingAverageFilter(window=size)
                old_names = [f"rolling_mean_{size}_{col}" for col in future_ma_cols]
                new_names = [f"{col}_ma{size}" for col in future_ma_cols]
                future_ma_covs = [
                    ma_filter.filter(f[future_ma_cols]).with_columns_renamed(old_names, new_names) 
                    for f in future_covs
                ]
                future_covs = [f.stack(f_ma) for f, f_ma in zip(future_covs, future_ma_covs)]
        
        future_dict[fam] = [f.astype(np.float32) for f in future_covs]
            
    return past_dict, future_dict

In [142]:
# past covariates
past_cols = ["transactions"]

# future covariates
future_cols = [
    'N Dia de Difuntos', 'N Dia de la Madre', 'N Dia del Trabajo', 'N Futbol', 'N Navidad', 'N Primer dia del ano', 'N Terremoto Manabi', "oil_price", "month", "day_of_year", "week_of_year", "day_of_week", "year", "date_index", "workday", "onpromotion"]

# columns for moving average
future_ma_cols = ["oil_price", "onpromotion"]

past_dict, future_dict = get_covariates(data, past_cols, future_cols, future_ma_cols=future_ma_cols)

Extracting covariates:   0%|          | 0/33 [00:00<?, ?it/s]

In [244]:
TRAINER_CONFIG = {
    "target_dict": target_dict,
    "pipe_dict": pipe_dict,
    "id_dict": id_dict,
    "past_dict": past_dict,
    "future_dict": future_dict,
    
    # time series cross-validation using a rolling forecasting origin
    "forecast_horizon": 16, # the length of the validation set
    "folds": 1, # the number of training sets (setting to 1 means the standard train-validation split)
    
    # the number of previous days to check for zero sales; if all are zero, generate zero forecasts
    "zero_fc_window": 15,
    
    "static_covs": "keep_all",
    "past_covs": "keep_all",
    "future_covs": "keep_all",
}

In [227]:
from dataclasses import dataclass

@dataclass
class Trainer:
    target_dict: dict 
    pipe_dict: dict 
    id_dict: dict 
    past_dict: dict 
    future_dict: dict 
    forecast_horizon: int 
    folds: int
    zero_fc_window: int 
    static_covs: str
    past_covs: str
    future_covs: str
        
    def clip(self, array):
        '''
        Changes negative values of an array to zeroes.
        '''
        return np.clip(array, a_min=0., a_max=None)
    
    def train_valid_split(self, target, length):
        # length is (self.folds - j) * self.forecast_horizon 
        train = [timeseries[:-length] for timeseries in target]
        valid_end_idx = -length + self.forecast_horizon
        
        if valid_end_idx >= 0:
            valid_end_idx = None
        
        valid = [t[-length:valid_end_idx] for t in target]
        return train, valid
    
    def get_models(self, model_names, model_configs):
        models = {
            "lr": LinearRegressionModel,
            "lgbm": LightGBMModel,
            "cat": CatBoostModel,
            "xgb": XGBModel,
        }
        assert isinstance(model_names, list) and isinstance(model_configs, list),\
        "Both the model names and model configurations must be specified in lists."
        assert all(name in models for name in model_names),\
        f"Model names '{model_names}' not recognized."
        assert len(model_names) == len(model_configs),\
        "The number of model names and the number of model configurations do not match."
        
        if "xgb" in model_names:
            xgb_idx = np.where(np.array(model_names)=="xgb")[0]
            for idx in xgb_idx:
                # change to histogram-based method for XGBoost to get faster training time
                model_configs[idx] = {"tree_method": "hist", **model_configs[idx]}
        
        return [models[name](**model_configs[j]) for j, name in enumerate(model_names)]
    
    def generate_forecasts(self, models, train, pipe, past_covs, future_covs, drop_before):
        if drop_before is not None: 
            date = pd.Timestamp(drop_before) - pd.Timedelta(days=1) 
            # train without specifed dates
            train = [t.drop_before(date) for t in train]
             
        # inputs for a model
        inputs = {
            "series": train,
            "past_covariates": past_covs,
            "future_covariates": future_covs,
        }

        # generates validation dates and all zero values
        zero_pred = pd.DataFrame({ 
            "date": pd.date_range(train[0].end_time(), periods=self.forecast_horizon+1)[1:],
            "sales": np.zeros(self.forecast_horizon),
        })
        
        # transforming that df to time series
        zero_pred = TimeSeries.from_dataframe( 
            df=zero_pred,
            time_col="date",
            value_cols="sales",
        )
        
        pred_list = []
        ens_pred = [0 for _ in range(len(train))] # zero for every store 
        
        for m in models:
            # fit training data to model
            m.fit(**inputs)

            # generate forecasts
            pred = m.predict(n=self.forecast_horizon, **inputs, show_warnings=False)
            # apply inverse transformations
            pred = pipe.inverse_transform(pred)

            for j in range(len(train)):
                # if there is all zeros in j time series in the last specifed period of time predict zeros
                if train[j][-self.zero_fc_window:].values().sum() == 0:
                    pred[j] = zero_pred
            
            # clip negative forecasts to 0s
            pred = [p.map(self.clip) for p in pred]
            pred_list.append(pred)
            
            # ensemble averaging
            for j in range(len(ens_pred)): # 54
                ens_pred[j] += pred[j] / len(models) 

        return pred_list, ens_pred
    
    def metric(self, valid, pred):
        valid_df = pd.concat([ts.pd_dataframe() for ts in valid], axis=1)
        pred_df = pd.concat([ts.pd_dataframe() for ts in pred], axis=1)

        # calculate RMSLE for each pair of valid and predicted values
        rmsle_values = [mean_squared_log_error(valid_df[col], pred_df[col],squared=False) for col in valid_df.columns]

        # calculate the mean of RMSLE values of all series of that family
        mean_rmsle = np.mean(rmsle_values)

        return mean_rmsle
    
    def validate(self, model_names, model_configs, drop_before=None):
        # helper value to align printed text below
        longest_len = len(max(self.target_dict.keys(), key=len)) #33 kao broj prodavnica
        
        # store metric values for each model
        model_metrics_history = []
        ens_metric_history = []
        
        for fam in tqdm_notebook(self.target_dict, desc="Performing validation"):
            target = self.target_dict[fam]
            pipe = self.pipe_dict[fam]
            past_covs = self.past_dict[fam]
            future_covs = self.future_dict[fam]
            
            # record average metric value over all folds
            model_metrics = []
            ens_metric = 0
            
            for j in range(self.folds):    # folds = 1
                # perform train-validation split and apply transformations
                length = (self.folds - j) * self.forecast_horizon # 16
                train, valid = self.train_valid_split(target, length) 
                valid = pipe.inverse_transform(valid) 

                # generate forecasts and compute metric
                models = self.get_models(model_names, model_configs)
                pred_list, ens_pred = self.generate_forecasts(models, train, pipe, past_covs, future_covs, drop_before) 
                metric_list = [self.metric(valid, pred) / self.folds for pred in pred_list]
                model_metrics.append(metric_list)
                if len(models) > 1:
                    ens_metric_fold = self.metric(valid, ens_pred) / self.folds
                    ens_metric += ens_metric_fold
                
            # store final metric value for each model
            model_metrics = np.sum(model_metrics, axis=0)
            model_metrics_history.append(model_metrics)
            ens_metric_history.append(ens_metric)
            
            # print metric value for each family
            print(
                fam,
                " " * (longest_len - len(fam)),
                " | ",
                " - ".join([f"{model}: {metric:.5f}" for model, metric in zip(model_names, model_metrics)]),
                f" - ens: {ens_metric:.5f}" if len(models) > 1 else "",
                sep="",
            )
            
        # print overall metric value
        print(
            "Average RMSLE | "
            + " - ".join([f"{model}: {metric:.5f}" 
                          for model, metric in zip(model_names, np.mean(model_metrics_history, axis=0))])
            + (f" - ens: {np.mean(ens_metric_history):.5f}" if len(models) > 1 else ""),
        )
        
    def ensemble_predict(self, model_names, model_configs, drop_before=None):        
        forecasts = []
        for fam in tqdm_notebook(self.target_dict.keys(), desc="Generating forecasts"):
            target = self.target_dict[fam]
            pipe = self.pipe_dict[fam]
            target_id = self.id_dict[fam]
            past_covs = self.past_dict[fam]
            future_covs = self.future_dict[fam]
            
            models = self.get_models(model_names, model_configs)
            pred_list, ens_pred = self.generate_forecasts(models, target, pipe, past_covs, future_covs, drop_before)
            ens_pred = [p.pd_dataframe().assign(store_nbr=i['store_nbr']['sales'], family=i['family']) for p, i in zip(ens_pred, target_id)]
            ens_pred = pd.concat(ens_pred, axis=0)
            forecasts.append(ens_pred)
            
        # combine all forecasts into one dataframe
        forecasts = pd.concat(forecasts, axis=0)
        forecasts = forecasts.rename_axis(None, axis=1).reset_index(names="date")
        
        return forecasts

In [245]:
trainer = Trainer(**TRAINER_CONFIG)

In [229]:
BASE_CONFIG = {
    "random_state": 0,
    
    # the number of lag values of the target series
    "lags": 63,
    
    # the number of lag values of the past covariates
    "lags_past_covariates": list(range(-16, -23, -1)),
    
    # the number of (past, future-1) lag values of the future covariates
    "lags_future_covariates": (14, 1),
    
    # the number of days ahead that the model is forecasting given today's input data
    "output_chunk_length": 1,
}

In [246]:
trainer.validate(["lr"], [BASE_CONFIG], drop_before="2015-01-01")

Performing validation:   0%|          | 0/33 [00:00<?, ?it/s]

AUTOMOTIVE                 | lr: 0.49327
BABY CARE                  | lr: 0.18225
BEAUTY                     | lr: 0.48382
BEVERAGES                  | lr: 0.20918
BOOKS                      | lr: 0.03108
BREAD/BAKERY               | lr: 0.19334
CELEBRATION                | lr: 0.53190
CLEANING                   | lr: 0.30859
DAIRY                      | lr: 0.17460
DELI                       | lr: 0.18786
EGGS                       | lr: 0.27028
FROZEN FOODS               | lr: 0.27630
GROCERY I                  | lr: 0.16795
GROCERY II                 | lr: 0.53997
HARDWARE                   | lr: 0.51593
HOME AND KITCHEN I         | lr: 0.49408
HOME AND KITCHEN II        | lr: 0.45205
HOME APPLIANCES            | lr: 0.28248
HOME CARE                  | lr: 0.21183
LADIESWEAR                 | lr: 0.50623
LAWN AND GARDEN            | lr: 0.44412
LINGERIE                   | lr: 0.61564
LIQUOR,WINE,BEER           | lr: 0.48699
MAGAZINES                  | lr: 0.49007
MEATS           

In [230]:
GBDT_CONFIG1 = {
    **BASE_CONFIG,
    "verbose":-1
    # the additional hyperparameters to be specified
#     "n_estimators": 100,
#     "learning_rate": 0.1,
#     "max_depth": 6,
    
}

GBDT_CONFIG2 = GBDT_CONFIG1.copy()
GBDT_CONFIG2["lags"] = 7

GBDT_CONFIG3 = GBDT_CONFIG1.copy()
GBDT_CONFIG3["lags"] = 365

GBDT_CONFIG4 = GBDT_CONFIG1.copy()
GBDT_CONFIG4["lags"] = 730



# 'lgbm' for LightGBM, 'xgb' for XGBoost, 'cat' for CatBoost
ENS_MODELS = ["lgbm", "lgbm", "lgbm", "lgbm"]
ENS_CONFIGS = [GBDT_CONFIG1, GBDT_CONFIG2, GBDT_CONFIG3, GBDT_CONFIG4]

trainer.validate(
    model_names=ENS_MODELS, 
    model_configs=ENS_CONFIGS,
    drop_before="2015-01-01",
)


Performing validation:   0%|          | 0/33 [00:00<?, ?it/s]

AUTOMOTIVE                 | lgbm: 0.50570 - lgbm: 0.50604 - lgbm: 0.51086 - lgbm: 0.51247 - ens: 0.50254
BABY CARE                  | lgbm: 0.20649 - lgbm: 0.20381 - lgbm: 0.21096 - lgbm: 0.20666 - ens: 0.20542
BEAUTY                     | lgbm: 0.46667 - lgbm: 0.50958 - lgbm: 0.47006 - lgbm: 0.47430 - ens: 0.47147
BEVERAGES                  | lgbm: 0.24228 - lgbm: 0.25068 - lgbm: 0.23535 - lgbm: 0.24958 - ens: 0.23261
BOOKS                      | lgbm: 0.05721 - lgbm: 0.05388 - lgbm: 0.05237 - lgbm: 0.04573 - ens: 0.05195
BREAD/BAKERY               | lgbm: 0.18822 - lgbm: 0.19438 - lgbm: 0.17740 - lgbm: 0.16981 - ens: 0.17652
CELEBRATION                | lgbm: 0.52838 - lgbm: 0.54156 - lgbm: 0.52413 - lgbm: 0.53648 - ens: 0.52407
CLEANING                   | lgbm: 0.32624 - lgbm: 0.36302 - lgbm: 0.31425 - lgbm: 0.34321 - ens: 0.32493
DAIRY                      | lgbm: 0.14960 - lgbm: 0.17488 - lgbm: 0.14309 - lgbm: 0.15696 - ens: 0.15003
DELI                       | lgbm: 0.19366 - l

In [247]:
# predictions1 = trainer.ensemble_predict(
#     model_names=ENS_MODELS, 
#     model_configs=ENS_CONFIGS,
# )

# generate forecasts for model trained on a subset of the data
predictions2 = trainer.ensemble_predict(
    model_names=ENS_MODELS, 
    model_configs=ENS_CONFIGS,
    drop_before="2015-01-01",
)

Generating forecasts:   0%|          | 0/33 [00:00<?, ?it/s]

In [238]:
predictions1.head()

Unnamed: 0,date,sales,store_nbr,family
0,2017-08-16,3.293292,1.0,AUTOMOTIVE
1,2017-08-17,2.968894,1.0,AUTOMOTIVE
2,2017-08-18,3.360378,1.0,AUTOMOTIVE
3,2017-08-19,4.467464,1.0,AUTOMOTIVE
4,2017-08-20,1.770783,1.0,AUTOMOTIVE


In [248]:
# compute the average of the ensemble models
final_predictions = predictions1.merge(
    predictions2, on=["date", 'store_nbr', "family"], how="left",
)
final_predictions["sales"] = final_predictions[["sales_x", "sales_y"]].mean(axis=1)
final_predictions = final_predictions.drop(columns=["sales_x", "sales_y"])

KeyError: "None of [Index(['sales_x', 'sales_y'], dtype='object')] are in the [columns]"

In [240]:
test=data[data['date']>= '2017-08-16']

In [241]:
test=test.copy()
start_id = 3000888
end_id = start_id + len(test)
ids = range(start_id, end_id)

test['id'] = ids
test.head()

Unnamed: 0,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,N Batalla de Pichincha,N Carnaval,N Cyber Monday,N Dia de Difuntos,N Dia de la Madre,N Dia del Trabajo,N Futbol,N Independencia de Cuenca,N Independencia de Guayaquil,N Navidad,N Primer dia del ano,N Terremoto Manabi,N Viernes Santo,oil_price,transactions,month,day_of_month,day_of_year,week_of_month,week_of_year,day_of_week,year,is_wknd,quarter,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,date_index,season,workday,wageday,day_to_nearest_holiday,day_from_nearest_holiday,id
3008016,2017-08-16,1,AUTOMOTIVE,,0.0,Quito,Pichincha,D,13,0,0,0,0,0,0,0,0,0,0,0,0,0,46.8,0.0,8,16,228,3,33,3,2017,0,3,0,0,0,0,0,0,1688,2,1,0,5,54,3000888
3008017,2017-08-16,1,BABY CARE,,0.0,Quito,Pichincha,D,13,0,0,0,0,0,0,0,0,0,0,0,0,0,46.8,0.0,8,16,228,3,33,3,2017,0,3,0,0,0,0,0,0,1688,2,1,0,5,54,3000889
3008018,2017-08-16,1,BEAUTY,,2.0,Quito,Pichincha,D,13,0,0,0,0,0,0,0,0,0,0,0,0,0,46.8,0.0,8,16,228,3,33,3,2017,0,3,0,0,0,0,0,0,1688,2,1,0,5,54,3000890
3008019,2017-08-16,1,BEVERAGES,,20.0,Quito,Pichincha,D,13,0,0,0,0,0,0,0,0,0,0,0,0,0,46.8,0.0,8,16,228,3,33,3,2017,0,3,0,0,0,0,0,0,1688,2,1,0,5,54,3000891
3008020,2017-08-16,1,BOOKS,,0.0,Quito,Pichincha,D,13,0,0,0,0,0,0,0,0,0,0,0,0,0,46.8,0.0,8,16,228,3,33,3,2017,0,3,0,0,0,0,0,0,1688,2,1,0,5,54,3000892


In [249]:
final = predictions2.copy()
submission = test.merge(
    final, on=["date", "store_nbr", "family"], how="left",
)
submission=submission.reset_index()
submission=submission[['id','sales_y']]
submission.rename(columns={'sales_y': 'sales'}, inplace=True)
submission.head()

Unnamed: 0,id,sales
0,3000888,3.26674
1,3000889,0.0
2,3000890,3.955883
3,3000891,2218.753332
4,3000892,0.0


In [250]:
submission.to_csv("submission2.csv", index=False)