In [1]:
import pandas as pd
import os

PATH_CSV = '/kaggle/input/applications-of-deep-learning-wustlfall-2022/beach_demand_forecast/'

df_sales_train = pd.read_csv(os.path.join(PATH_CSV,"sales_train.csv"))
df_items = pd.read_csv(os.path.join(PATH_CSV,"items.csv"))
df_resturant = pd.read_csv(os.path.join(PATH_CSV,"resturants.csv"))
df_sales_test = pd.read_csv(os.path.join(PATH_CSV,"sales_test.csv"))

df_sales_train.date = pd.to_datetime(df_sales_train.date, errors='coerce') 
df_sales_test.date = pd.to_datetime(df_sales_test.date, errors='coerce') 

We begin by creating one long sequence that combines training and test data. The test data occurs just after the training "in time".

In [2]:
df_sales = pd.concat([df_sales_train,df_sales_test])
df_sales.columns = ['date','item_id','price','item_count','submit_id']
df_sales.loc[~df_sales.submit_id.isna(),'submit_id'] = df_sales[~df_sales.submit_id.isna()].submit_id.astype(int)

df_sales

Unnamed: 0,date,item_id,price,item_count,submit_id
0,2019-01-01,3,29.22,2.0,
1,2019-01-01,4,26.42,22.0,
2,2019-01-01,12,4.87,7.0,
3,2019-01-01,13,4.18,12.0,
4,2019-01-01,16,3.21,136.0,
...,...,...,...,...,...
9195,2021-12-31,96,21.93,,9195.0
9196,2021-12-31,97,28.65,,9196.0
9197,2021-12-31,98,5.00,,9197.0
9198,2021-12-31,99,5.32,,9198.0


### Extract Image Data

In [3]:
import sys

!git clone https://github.com/ultralytics/yolov5 --tag 6.2  # clone
!mv /kaggle/working/6.2 /kaggle/working/yolov5
%pip install -qr /kaggle/working/yolov5/requirements.txt  # install
sys.path.insert(0,'/kaggle/working/yolov5/')

import torch
import utils
display = utils.notebook_init()  # checks

YOLOv5 🚀 v7.0-10-g10c025d Python-3.7.12 torch-1.11.0 CUDA:0 (Tesla P100-PCIE-16GB, 16281MiB)


Setup complete ✅ (2 CPUs, 15.6 GB RAM, 3964.5/4030.6 GB disk)


In [4]:
from os import walk
import datetime
import tqdm
import torch

# Model
yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s')  # or yolov5n - yolov5x6, custom

PATH_CAM = '/kaggle/input/applications-of-deep-learning-wustlfall-2022/beach_demand_forecast/cam/'
filenames = next(walk(PATH_CAM), (None, None, []))[2]  

list_date = []
list_people = []

for file in tqdm.tqdm(filenames):
    if file=='1.jpg': continue
    filename = os.path.join(PATH_CAM, file)
    results = yolo_model(filename)
    df = results.pandas().xyxy[0]
    people = len(df[df.name=='person'])
    dt = datetime.datetime.strptime(file[:10], '%Y_%m_%d')
    list_date.append(dt)
    list_people.append(people)

df_street_view = pd.DataFrame({'date':list_date,'people':list_people})
df_street_view

Downloading: "https://github.com/ultralytics/yolov5/archive/master.zip" to /root/.cache/torch/hub/master.zip
YOLOv5 🚀 v7.0-10-g10c025d Python-3.7.12 torch-1.11.0 CUDA:0 (Tesla P100-PCIE-16GB, 16281MiB)

Downloading https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s.pt to yolov5s.pt...


  0%|          | 0.00/14.1M [00:00<?, ?B/s]


Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 
100%|██████████| 1097/1097 [00:31<00:00, 34.64it/s]


Unnamed: 0,date,people
0,2020-01-04,14
1,2019-07-15,18
2,2021-09-13,18
3,2021-06-15,20
4,2019-05-31,18
...,...,...
1091,2020-08-29,21
1092,2020-08-06,16
1093,2019-03-07,16
1094,2019-09-18,19


Utility function to create sequences.

In [5]:
def process_title(model, name):
    v = None
    i = 0
    for word in name.split(' '):
        word = word.lower()
    if word == 'vegi': word = "vegetable"
    if word == 'smoothy': word = "malt"
    i+=1
    if v is None and word in model:
        v=model[word].copy()
    elif word in model:
        v+=model[word]
    v/=i
    return v

def series_to_supervised(data, window=1, lag=1, dropnan=True):
    cols, names = list(), list()
    # Input sequence (t-n, ... t-1)
    for i in range(window, 0, -1):
        cols.append(data.shift(i))
        names += [('%s(t-%d)' % (col, i)) for col in data.columns]
    # Current timestep (t=0)
    cols.append(data)
    names += [('%s(t)' % (col)) for col in data.columns]
    # Target timestep (t=lag)
    cols.append(data.shift(-lag))
    names += [('%s(t+%d)' % (col, lag)) for col in data.columns]
    # Put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # Drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg
    
def drop_column(df, col):
    columns_to_drop = [('%s(t+%d)' % (col, lag_size))]
    for i in range(window, 0, -1):
        columns_to_drop += [('%s(t-%d)' % (col, i))]
    df.drop(columns_to_drop, axis=1, inplace=True, errors='ignore')
    df.drop([f"{col}(t)"], axis=1, inplace=True, errors='ignore')

def cat_seq(df, col):
    return to_categorical(df[col].values)

### Load the Glove Embeddings

In [6]:
!wget -c "https://nlp.stanford.edu/data/glove.6B.zip"
!unzip glove.6B.zip

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

glove_file = 'glove.6B.300d.txt'
tmp_file = get_tmpfile("test_word2vec.txt")
_ = glove2word2vec(glove_file, tmp_file)
w2vec_model = KeyedVectors.load_word2vec_format(tmp_file)

--2022-11-29 16:20:45--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-11-29 16:20:46--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2022-11-29 16:23:27 (5.13 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


  # Remove the CWD from sys.path while we load stuff.


### Engineer Time Series Features

In [7]:
item_lookup = {}
for i, name in zip(list(df_items.id),list(df_items.name)):
    v = process_title(w2vec_model,name)
    item_lookup[i] = v

# Join the items and sales tables so that we can look up the store id and kcal for each item.
df_items2 = df_items[['id', 'store_id', 'kcal']]
df_train = df_sales.merge(df_items2,left_on='item_id',right_on='id')
df_train[['date','item_id','item_count','store_id']]

# Merge people counts (new)
temp = len(df_train)
df_train = df_train.merge(df_street_view)
assert len(df_train) == temp

# Sort/agg
df_train = df_train.sort_values('date').groupby(['item_id', 'store_id', 'date'], as_index=False)
df_train = df_train.agg({'item_count':['mean'], 'people':['mean'], 'price':['mean'], 'kcal':['mean'],'submit_id':['mean']})
df_train.columns = ['item', 'store', 'date', 'sales', 'people', 'price', 'kcal', 'submit_id']
df_train['dow'] = df_train['date'].dt.dayofweek
df_train['doy'] = df_train['date'].dt.dayofyear
df_train.head()

Unnamed: 0,item,store,date,sales,people,price,kcal,submit_id,dow,doy
0,1,4,2019-01-01,0.0,17.0,6.71,554.0,,1,1
1,1,4,2019-01-02,0.0,16.0,6.71,554.0,,2,2
2,1,4,2019-01-03,0.0,18.0,6.71,554.0,,3,3
3,1,4,2019-01-04,0.0,16.0,6.71,554.0,,4,4
4,1,4,2019-01-05,0.0,18.0,6.71,554.0,,5,5


Join the items and sales tables so that we can look up the store id for each item.

Determine the time gap between the last day from training set from the last day of the test set, this will be out lag (the amount of day that need to be forecast).

In [8]:
lag_size = (df_sales_test['date'].max().date() - df_sales_train['date'].max().date()).days
print('Max date from train set: %s' % df_sales_train['date'].max().date())
print('Max date from test set: %s' % df_sales_test['date'].max().date())
print('Forecast lag size', lag_size)

Max date from train set: 2021-09-30
Max date from test set: 2021-12-31
Forecast lag size 92


Build the sequence data.

In [9]:
window = 30
series = series_to_supervised(df_train.drop('date', axis=1), window=window, lag=lag_size, dropnan=False)
series.head()

Unnamed: 0,item(t-30),store(t-30),sales(t-30),people(t-30),price(t-30),...,price(t+92),kcal(t+92),submit_id(t+92),dow(t+92),doy(t+92)
0,,,,,,...,6.71,554.0,,2.0,93.0
1,,,,,,...,6.71,554.0,,3.0,94.0
2,,,,,,...,6.71,554.0,,4.0,95.0
3,,,,,,...,6.71,554.0,,5.0,96.0
4,,,,,,...,6.71,554.0,,6.0,97.0


Remove sequences that did not have enough data.

In [10]:
# Remove edge cases, where there were not enough values to complete a series
last_item = 'item(t-%d)' % window
last_store = 'store(t-%d)' % window
series = series[(series['store(t)'] == series[last_store])]
series = series[(series['item(t)'] == series[last_item])]
series = series[(series['store(t+%d)' % lag_size] == series[last_store])]
series = series[(series['item(t+%d)' % lag_size] == series[last_item])]

Split the training series and submit series

In [11]:
submit_id_col = 'submit_id(t+%d)' % lag_size
labels_col = 'sales(t+%d)' % lag_size

series_train = series.loc[series[submit_id_col].isna()].copy(deep=True)
series_submit = series.loc[~series[submit_id_col].isna()].copy(deep=True)
series_submit = series_submit.drop(labels_col, axis=1)

In [12]:
# Drop all columns except sales
drop_column(series_train,'submit_id')
series_train.dropna(inplace=True)
#series_submit.dropna(inplace=True)
submit_id = series_submit[submit_id_col].astype(int)
drop_column(series_submit,'submit_id')
#series_submit.drop(labels_col, axis=1, inplace=True)

Extract the predictors (x sequences) and the label (future prediction)

##### Training set

In [13]:
import numpy as np
from keras.utils.np_utils import to_categorical   

# Label
labels = series_train[labels_col]

# item information and predicting information
series1_train = series_train[['item(t+%d)' % lag_size, 'store(t+%d)' % lag_size, 'dow(t+%d)' % lag_size, 'doy(t+%d)' % lag_size, 'people(t+%d)' % lag_size, 'price(t+%d)' % lag_size, 'kcal(t+%d)' % lag_size]]

series_train.drop(labels_col, axis=1, inplace=True)
series_train.drop('item(t+%d)' % lag_size, axis=1, inplace=True)
series_train.drop('store(t+%d)' % lag_size, axis=1, inplace=True)
series_train.drop('dow(t+%d)' % lag_size, axis=1, inplace=True)
series_train.drop('doy(t+%d)' % lag_size, axis=1, inplace=True)
series_train.drop('people(t+%d)' % lag_size, axis=1, inplace=True)
series_train.drop('price(t+%d)' % lag_size, axis=1, inplace=True)
series_train.drop('kcal(t+%d)' % lag_size, axis=1, inplace=True)

# Get sales sequences
series2_train = series_train.copy()
drop_column(series2_train, "item")
drop_column(series2_train, "store")
drop_column(series2_train, "dow")
drop_column(series2_train, "doy")
drop_column(series2_train, "people")
drop_column(series2_train, "price")
drop_column(series2_train, "kcal")
sales_series_train = series2_train.values

# Day of week as a number
series2_train = series_train.copy()
drop_column(series2_train, "item")
drop_column(series2_train, "store")
drop_column(series2_train, "sales")
drop_column(series2_train, "doy")
drop_column(series2_train, "people")
drop_column(series2_train, "price")
drop_column(series2_train, "kcal")
dow_series_train = series2_train.values

# Get day of year sequences
series2_train = series_train.copy()
drop_column(series2_train, "item")
drop_column(series2_train, "store")
drop_column(series2_train, "dow")
drop_column(series2_train, "sales")
drop_column(series2_train, "people")
drop_column(series2_train, "price")
drop_column(series2_train, "kcal")
doy_series_train = series2_train.values

# Get number of people sequences
series2_train = series_train.copy()
drop_column(series2_train, "item")
drop_column(series2_train, "store")
drop_column(series2_train, "dow")
drop_column(series2_train, "doy")
drop_column(series2_train, "sales")
drop_column(series2_train, "price")
drop_column(series2_train, "kcal")
people_series_train = series2_train.values


# Create x
t1_train = sales_series_train.reshape(sales_series_train.shape + (1,))
t2_train = dow_series_train.reshape(dow_series_train.shape + (1,)) 
t3_train = doy_series_train.reshape(doy_series_train.shape + (1,))
t4_train = people_series_train.reshape(people_series_train.shape + (1,))
x1_train = np.concatenate([t1_train,t2_train,t3_train,t4_train],axis=2)

In [14]:
print(t1_train.shape)
print(t2_train.shape)
print(t3_train.shape)
print(t4_train.shape)

(88200, 31, 1)
(88200, 31, 1)
(88200, 31, 1)
(88200, 31, 1)


In [15]:
# Create predictors (x)
vec_size = w2vec_model['test'].shape[0]

lst = []
for item in list(series_train['item(t-1)']):
    lst.append(item_lookup[item])

x2_train = np.concatenate(lst).reshape((series_train.shape[0],vec_size))

x3_train = series1_train.values

#x_train = [x1_train,x2_train,x3_train]
x_train = [x1_train,x3_train]

##### Submit set

In [16]:
import numpy as np
from keras.utils.np_utils import to_categorical   


# item information and predicting information
series1_submit = series_submit[['item(t+%d)' % lag_size, 'store(t+%d)' % lag_size, 'dow(t+%d)' % lag_size, 'doy(t+%d)' % lag_size, 'people(t+%d)' % lag_size, 'price(t+%d)' % lag_size, 'kcal(t+%d)' % lag_size]]

series_submit.drop('item(t+%d)' % lag_size, axis=1, inplace=True)
series_submit.drop('store(t+%d)' % lag_size, axis=1, inplace=True)
series_submit.drop('dow(t+%d)' % lag_size, axis=1, inplace=True)
series_submit.drop('doy(t+%d)' % lag_size, axis=1, inplace=True)
series_submit.drop('people(t+%d)' % lag_size, axis=1, inplace=True)
series_submit.drop('price(t+%d)' % lag_size, axis=1, inplace=True)
series_submit.drop('kcal(t+%d)' % lag_size, axis=1, inplace=True)

# Get sales sequences
series2_submit = series_submit.copy()
drop_column(series2_submit, "item")
drop_column(series2_submit, "store")
drop_column(series2_submit, "dow")
drop_column(series2_submit, "doy")
drop_column(series2_submit, "people")
drop_column(series2_submit, "price")
drop_column(series2_submit, "kcal")
sales_series_submit = series2_submit.values

# Day of week as a number
series2_submit = series_submit.copy()
drop_column(series2_submit, "item")
drop_column(series2_submit, "store")
drop_column(series2_submit, "sales")
drop_column(series2_submit, "doy")
drop_column(series2_submit, "people")
drop_column(series2_submit, "price")
drop_column(series2_submit, "kcal")
dow_series_submit = series2_submit.values

# Get day of year sequences
series2_submit = series_submit.copy()
drop_column(series2_submit, "item")
drop_column(series2_submit, "store")
drop_column(series2_submit, "dow")
drop_column(series2_submit, "sales")
drop_column(series2_submit, "people")
drop_column(series2_submit, "price")
drop_column(series2_submit, "kcal")
doy_series_submit = series2_submit.values

# Get number of people sequences
series2_submit = series_submit.copy()
drop_column(series2_submit, "item")
drop_column(series2_submit, "store")
drop_column(series2_submit, "dow")
drop_column(series2_submit, "doy")
drop_column(series2_submit, "sales")
drop_column(series2_submit, "price")
drop_column(series2_submit, "kcal")
people_series_submit = series2_submit.values


# Create x
t1_submit = sales_series_submit.reshape(sales_series_submit.shape + (1,))
t2_submit = dow_series_submit.reshape(dow_series_submit.shape + (1,)) 
t3_submit = doy_series_submit.reshape(doy_series_submit.shape + (1,))
t4_submit = people_series_submit.reshape(people_series_submit.shape + (1,))
x1_submit = np.concatenate([t1_submit,t2_submit,t3_submit,t4_submit],axis=2)

In [17]:
print(t1_submit.shape)
print(t2_submit.shape)
print(t3_submit.shape)
print(t4_submit.shape)

(9200, 31, 1)
(9200, 31, 1)
(9200, 31, 1)
(9200, 31, 1)


In [18]:
# Create predictors (x)
vec_size = w2vec_model['test'].shape[0]

lst = []
for item in list(series_submit['item(t-1)']):
    lst.append(item_lookup[item])

x2_submit = np.concatenate(lst).reshape((series_submit.shape[0],vec_size))

x3_submit = series1_submit.values

#x_submit = [x1_submit,x2_submit,x3_submit]
x_submit = [x1_submit,x3_submit]

### Train the Network
Extract the predictors (x sequences) and the label (future prediction)

In [19]:
TEST_SIZE = 0.25

mask = np.random.random(size=x_train[0].shape[0])<TEST_SIZE

X_train = []
X_valid = []

for subx in x_train:
    X_train.append(subx[~mask])
    X_valid.append(subx[mask])

Y_train = labels.values[~mask]
Y_valid = labels.values[mask]

print('Train set shape x1:', X_train[0].shape)
print('Train set shape x2:', X_train[1].shape)
print('Validation set shape x1:', X_valid[0].shape)
print('Validation set shape x2:', X_valid[1].shape)

Train set shape x1: (66036, 31, 4)
Train set shape x2: (66036, 7)
Validation set shape x1: (22164, 31, 4)
Validation set shape x2: (22164, 7)


### Construct the neural network

In [42]:
import tensorflow as tf 
from keras.models import Sequential, Model
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers import Dense, LSTM, RepeatVector, TimeDistributed, Flatten, Dropout, concatenate, Input
import keras

epochs = 500
batch = 256
lr = 0.0003
adam = tf.keras.optimizers.Adam(lr)
sgd = tf.keras.optimizers.SGD(learning_rate=lr)

model = Sequential()

A1 = Input(shape=(X_train[0].shape[1], X_train[0].shape[2]),name='A1')
A2 = Conv1D(filters=64, kernel_size=8, activation='relu')(A1)
A3 = MaxPooling1D(pool_size=2)(A2)
A4 = Conv1D(filters=32, kernel_size=8, activation='relu')(A3)
A5 = MaxPooling1D(pool_size=2)(A4)
A6 = Flatten()(A5)
A7 = Dense(64, activation='relu')(A6)
A8 = Dropout(0.4)(A7)
A9 = Dense(32, activation='relu')(A8)
A10 = Dropout(0.4)(A9)

#B1 = Input(shape=X_train[1].shape[1],name='B1')
#B2 = Dense(16, activation='relu',name='B2')(B1)
#B3 = Dropout(0.2)(B2)
#B4 = Dense(8, activation='relu',name='B4')(B3)

C1 = Input(shape=X_train[1].shape[1], name='C1')
C2 = Dense(16, activation='relu', name='C2')(C1)
#C3 = Dropout(0.2)(C2)
#C4 = Dense(8, activation='relu',name='C4')(C3)

#M1 = concatenate([A8,B4,C4])
M1 = concatenate([A10,C2])
M2 = Dense(16,name='M2')(M1)
M3 = Dropout(0.2)(M2)
M4 = Dense(1,name='M4')(M3)

#model = Model(inputs=[A1, B1, C1],outputs=[M4])
model = Model(inputs=[A1, C1],outputs=[M4])
model.compile(loss='mse', optimizer=adam)
model.summary()

Model: "model_8"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
A1 (InputLayer)                 [(None, 31, 4)]      0                                            
__________________________________________________________________________________________________
conv1d_10 (Conv1D)              (None, 24, 64)       2112        A1[0][0]                         
__________________________________________________________________________________________________
max_pooling1d_10 (MaxPooling1D) (None, 12, 64)       0           conv1d_10[0][0]                  
__________________________________________________________________________________________________
conv1d_11 (Conv1D)              (None, 5, 32)        16416       max_pooling1d_10[0][0]           
____________________________________________________________________________________________

Fit the neural network.

In [45]:
from keras.callbacks import EarlyStopping

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=10, 
        verbose=1, mode='auto', restore_best_weights=True)

cnn_history = model.fit(X_train, Y_train, callbacks=[monitor],
    validation_data=(X_valid, Y_valid), epochs=epochs, verbose=2)

Epoch 1/500
2064/2064 - 7s - loss: 205.7678 - val_loss: 195.8160
Epoch 2/500
2064/2064 - 7s - loss: 194.8998 - val_loss: 111.4221
Epoch 3/500
2064/2064 - 6s - loss: 199.4159 - val_loss: 114.7372
Epoch 4/500
2064/2064 - 7s - loss: 193.4206 - val_loss: 125.6539
Epoch 5/500
2064/2064 - 7s - loss: 190.2381 - val_loss: 107.3292
Epoch 6/500
2064/2064 - 7s - loss: 194.0457 - val_loss: 194.6439
Epoch 7/500
2064/2064 - 7s - loss: 195.5252 - val_loss: 109.6966
Epoch 8/500
2064/2064 - 6s - loss: 187.1138 - val_loss: 102.1987
Epoch 9/500
2064/2064 - 7s - loss: 195.7028 - val_loss: 106.3573
Epoch 10/500
2064/2064 - 7s - loss: 204.4639 - val_loss: 101.2723
Epoch 11/500
2064/2064 - 6s - loss: 192.9635 - val_loss: 106.1486
Epoch 12/500
2064/2064 - 7s - loss: 184.1046 - val_loss: 102.8355
Epoch 13/500
2064/2064 - 6s - loss: 192.8688 - val_loss: 108.7577
Epoch 14/500
2064/2064 - 7s - loss: 180.4367 - val_loss: 177.4986
Epoch 15/500
2064/2064 - 7s - loss: 181.2008 - val_loss: 105.2366
Epoch 16/500
2064/2

Predict and evaluate the validation data.

In [46]:
from sklearn.metrics import mean_squared_error
import numpy as np

cnn_train_pred = model.predict(X_train)
cnn_valid_pred = model.predict(X_valid)
print('Train rmse:', np.sqrt(mean_squared_error(Y_train, cnn_train_pred)))
print('Validation rmse:', np.sqrt(mean_squared_error(Y_valid, cnn_valid_pred)))

Train rmse: 10.759702571207375
Validation rmse: 9.615046609777272


Plot the training curve.

In [35]:
import matplotlib.pyplot as plt

fig = plt.figure()
plt.plot(cnn_history.history['loss'], label='Train loss')
plt.plot(cnn_history.history['val_loss'], label='Validation loss')
fig.legend()
fig.suptitle('CNN')
plt.xlabel("Epochs")
plt.ylabel("MSE")

plt.show()

## Build a Submission File

In [36]:
submit_pred = model.predict(x_submit)

In [37]:
df_submit = pd.DataFrame()
df_submit['id'] = submit_id.to_list()
df_submit['item_count'] = submit_pred

In [38]:
df_submit.to_csv('submit.csv',index=False)

## Download the Submit File

You only need to do this if you wish to view it locally. Otherwise, submit through Kaggle.

In [39]:
from IPython.display import FileLink

FileLink(r'submit.csv')