# References


### Techniques for Feature Engineering in time series data
https://www.analyticsvidhya.com/blog/2019/12/6-powerful-feature-engineering-techniques-time-series/

### Video References
https://www.youtube.com/watch?v=c0k-YLQGKjY&t=180s&ab_channel=GregHogg
https://www.youtube.com/watch?v=S8tpSG6Q2H0&ab_channel=NachiketaHebbar

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM

In [2]:
train = pd.read_csv('train.csv', parse_dates=['date'])
train

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10
...,...,...,...,...
912995,2017-12-27,10,50,63
912996,2017-12-28,10,50,59
912997,2017-12-29,10,50,74
912998,2017-12-30,10,50,62


In [3]:
test = pd.read_csv('test.csv', parse_dates=['date'])
test

Unnamed: 0,id,date,store,item
0,0,2018-01-01,1,1
1,1,2018-01-02,1,1
2,2,2018-01-03,1,1
3,3,2018-01-04,1,1
4,4,2018-01-05,1,1
...,...,...,...,...
44995,44995,2018-03-27,10,50
44996,44996,2018-03-28,10,50
44997,44997,2018-03-29,10,50
44998,44998,2018-03-30,10,50


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913000 entries, 0 to 912999
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   date    913000 non-null  datetime64[ns]
 1   store   913000 non-null  int64         
 2   item    913000 non-null  int64         
 3   sales   913000 non-null  int64         
dtypes: datetime64[ns](1), int64(3)
memory usage: 27.9 MB


In [5]:
train['store'].unique

<bound method Series.unique of 0          1
1          1
2          1
3          1
4          1
          ..
912995    10
912996    10
912997    10
912998    10
912999    10
Name: store, Length: 913000, dtype: int64>

In [21]:
# Convert the "date" column to datetime object
# train['date'] = pd.to_datetime(train['date'])

In [None]:
# train['date']

In [11]:
train[["store"]].nunique() 

store    10
dtype: int64

In [12]:
train[["item"]].nunique() 

item    50
dtype: int64

In [13]:
train.groupby(["store", "item"]).agg({"sales": ["sum", "mean", "median", "std"]}) 

Unnamed: 0_level_0,Unnamed: 1_level_0,sales,sales,sales,sales
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,median,std
store,item,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1,1,36468,19.971522,19.0,6.741022
1,2,97050,53.148959,52.0,15.005779
1,3,60638,33.208105,33.0,10.072529
1,4,36440,19.956188,20.0,6.640618
1,5,30335,16.612815,16.0,5.672102
...,...,...,...,...,...
10,46,120601,66.046550,65.0,18.114991
10,47,45204,24.755750,24.0,7.924820
10,48,105570,57.814896,57.0,15.898538
10,49,60317,33.032311,32.0,10.091610


# Feature Engineering

In [23]:
# Extract year, month, day, and day of the week from the "date" column
train['year'] = train['date'].dt.year
train['month'] = train['date'].dt.month
train['day'] = train['date'].dt.day
train['dayofweek'] = train['date'].dt.dayofweek

In [24]:
# Group the data by store, item, and date, and calculate the mean sales
train = train.groupby(['store', 'item', 'date']).mean().reset_index()

In [39]:
train

Unnamed: 0,store,item,date,sales,year,month,day,dayofweek
0,1,1,2013-01-01,13.0,2013.0,1.0,1.0,1.0
1,1,1,2013-01-02,11.0,2013.0,1.0,2.0,2.0
2,1,1,2013-01-03,14.0,2013.0,1.0,3.0,3.0
3,1,1,2013-01-04,13.0,2013.0,1.0,4.0,4.0
4,1,1,2013-01-05,10.0,2013.0,1.0,5.0,5.0
...,...,...,...,...,...,...,...,...
912995,10,50,2017-12-27,63.0,2017.0,12.0,27.0,2.0
912996,10,50,2017-12-28,59.0,2017.0,12.0,28.0,3.0
912997,10,50,2017-12-29,74.0,2017.0,12.0,29.0,4.0
912998,10,50,2017-12-30,62.0,2017.0,12.0,30.0,5.0


In [25]:
# Split the data into training and validation sets
train_size = int(len(train) * 0.8)
train_set = train[:train_size]
val_set = train[train_size:]

In [27]:
# Scale the data using MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))

In [28]:
# Scale the training data
train_set_scaled = scaler.fit_transform(train_set[['sales']])

In [29]:
# Create a function to generate training data
def generate_train_data(data, lookback):
    X, y = [], []
    for i in range(len(data)-lookback):
        X.append(data[i:(i+lookback)])
        y.append(data[i+lookback])
    return np.array(X), np.array(y)

In [30]:
# Generate training data with a lookback of 60 days
lookback = 60
X_train, y_train = generate_train_data(train_set_scaled, lookback)

# Reshape the training data to be 3-dimensional
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))

# Scale the validation data
val_set_scaled = scaler.transform(val_set[['sales']])

# Generate validation data
X_val, y_val = generate_train_data(val_set_scaled, lookback)

# Reshape the validation data to be 3-dimensional
X_val = np.reshape(X_val, (X_val.shape[0], X_val.shape[1], 1))

In [34]:
tf.random.set_seed(1)

# Define the LSTM model
model = Sequential()
model.add(LSTM(units=50, input_shape=(X_train.shape[1], 1)))
model.add(Dense(1))
model.add(Dense(units=1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train, y_train, epochs=1, batch_size=32, validation_data=(X_val, y_val))



<keras.callbacks.History at 0x1b0a396aa40>

In [38]:
# Scale the test data
test_scaled = scaler.transform(test[['sales']])

# Generate test data
X_test, y_test = generate_train_data(test_scaled, lookback)

# Reshape the test data to be 3-dimensional
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

# Make predictions on the test data
y_pred = model.predict(X_test)

# Inverse transform the predictions
y_pred = scaler.inverse_transform(y_pred)

# Create a dataframe with the predicted sales
predictions = pd.DataFrame(y_pred, columns=['sales'])

# Save the predictions to a CSV file
predictions.to_csv('predictions.csv', index=False)

KeyError: "None of [Index(['sales'], dtype='object')] are in the [columns]"

In [None]:
# Evaluate the model on the training set
train_loss = model.evaluate(X_train, y_train, verbose=0)
print('Training loss: %.4f' % train_loss)

# Evaluate the model on the validation set
val_loss = model.evaluate(X_val, y_val, verbose=0)
print('Validation loss: %.4f' % val_loss)