In [1]:
import numpy as np
import pandas as pd
import sys; sys.path.insert(0, '..')
from src.utils import get_memory_usage, reduce_memory_usage

In [2]:
def categorical_encoder(data, columns):
    df = data.copy()
    for col in columns:
        df[col] = df[col].astype("category")
    return df

In this notebook we merge the three different datasets so we have one single table to work with.

In [3]:
%%time
sales = pd.read_csv("../data/raw/sales_train_validation.csv")

CPU times: user 4.54 s, sys: 690 ms, total: 5.23 s
Wall time: 5.3 s


In [4]:
%%time
sample = pd.read_csv("../data/raw/sample_submission.csv")

CPU times: user 109 ms, sys: 29 ms, total: 138 ms
Wall time: 138 ms


In [5]:
%%time
calendar = pd.read_csv("../data/raw/calendar.csv")

CPU times: user 5.72 ms, sys: 1.9 ms, total: 7.62 ms
Wall time: 7.18 ms


In [6]:
%%time
prices = pd.read_csv("../data/raw/sell_prices.csv")

CPU times: user 1.84 s, sys: 249 ms, total: 2.09 s
Wall time: 2.12 s


In [7]:
%%time
submission = pd.read_csv("../data/raw/sample_submission.csv")

CPU times: user 107 ms, sys: 47.5 ms, total: 154 ms
Wall time: 154 ms


Before merging the datasets we change column types for more efficient memory usage.

In [8]:
%%time
reduced_sales = reduce_memory_usage(sales)

Memory usage decreased to 95.0 Mb(78.72% decrease)
CPU times: user 2min 34s, sys: 2min 9s, total: 4min 44s
Wall time: 4min 45s


In [9]:
%%time
reduced_calendar = reduce_memory_usage(calendar)

Memory usage decreased to 0.12 Mb(41.94% decrease)
CPU times: user 10.2 ms, sys: 14.4 ms, total: 24.5 ms
Wall time: 24.2 ms


In [10]:
%%time
reduced_prices = reduce_memory_usage(prices)

Memory usage decreased to 130.48 Mb(37.5% decrease)
CPU times: user 228 ms, sys: 167 ms, total: 395 ms
Wall time: 407 ms


For the first part of the competition the days 1914-1941 will be used for calculating the score on the leaderboard. For the second part of the competition the sales for 1914-1941 will be available and 1942-1969 will be used for calculating the public score. We therefore modify naming of columns in the submission table to make it consistent with the sales table.

In [14]:
validation = submission[submission["id"].str.contains("validation")]
validation.columns = ['id', 'd_1914', 'd_1915', 'd_1916', 'd_1917', 'd_1918', 'd_1919', 'd_1920', 'd_1921', 'd_1922', 'd_1923', 'd_1924', 'd_1925', 'd_1926', 'd_1927', 'd_1928', 'd_1929', 'd_1930', 'd_1931', 
                      'd_1932', 'd_1933', 'd_1934', 'd_1935', 'd_1936', 'd_1937', 'd_1938', 'd_1939', 'd_1940', 'd_1941']

In [19]:
evaluation = submission[submission["id"].str.contains("evaluation")]
evaluation.columns = ['id', 'd_1942', 'd_1943', 'd_1944', 'd_1945', 'd_1946', 'd_1947', 'd_1948', 'd_1949', 'd_1950', 'd_1951', 'd_1952', 'd_1953', 'd_1954', 'd_1955', 'd_1956', 'd_1957', 'd_1958', 'd_1959', 
                      'd_1960', 'd_1961', 'd_1962', 'd_1963', 'd_1964', 'd_1965', 'd_1966', 'd_1967', 'd_1968', 'd_1969']

In [24]:
#product table to merge with validation/evaluation tables
product = reduced_sales[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']].drop_duplicates()

In [25]:
# merge with product table
evaluation['id'] = evaluation['id'].str.replace('_evaluation','_validation')
validation = validation.merge(product, how = 'left', on = 'id')
evaluation = evaluation.merge(product, how = 'left', on = 'id')
evaluation['id'] = evaluation['id'].str.replace('_validation','_evaluation')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Next we melt the sales data such that the days columns are merged into one column and the values in that column indicates the day.

In [28]:
%%time
melt_sales = pd.melt(reduced_sales, 
                  id_vars = ['id','item_id','dept_id','cat_id','store_id','state_id'], 
                  var_name = "d", 
                  value_name = "sales")

CPU times: user 11.4 s, sys: 7.81 s, total: 19.2 s
Wall time: 22.6 s


In [29]:
%%time
melt_validation = pd.melt(validation, 
                  id_vars = ['id','item_id','dept_id','cat_id','store_id','state_id'], 
                  var_name = "d", 
                  value_name = "sales")

CPU times: user 162 ms, sys: 47 ms, total: 209 ms
Wall time: 222 ms


In [30]:
%%time
melt_evaluation = pd.melt(evaluation, 
                  id_vars = ['id','item_id','dept_id','cat_id','store_id','state_id'], 
                  var_name = "d", 
                  value_name = "sales")

CPU times: user 150 ms, sys: 27.9 ms, total: 178 ms
Wall time: 180 ms


Create column that indicates if the row belongs to train, validation or test(evaluation)

In [34]:
melt_sales["data"] = "train"
melt_validation["data"] = "val"
melt_evaluation["data"] = "test"

Now are ready to merge the three tables into one. They can easily be separated again using the "data" column.

In [35]:
merge_df = pd.concat([melt_sales, melt_validation, melt_evaluation], axis = 0)

In [36]:
merge_df.shape

(60034810, 9)

Now we can join the sales data with the calendar data and the price data.

In [38]:
merge_df = merge_df.merge(reduced_calendar, on = "d", how = "left")

In [39]:
merge_df.shape

(60034810, 22)

In [40]:
merge_df = merge_df.merge(reduced_prices, on = ["store_id", "item_id", "wm_yr_wk"], how = "left")

In [41]:
merge_df.shape

(60034810, 23)

In [42]:
get_memory_usage(merge_df)

7958.258237838745

Before saving the merged table we change the columns to their correct data types.

In [43]:
#change date column to datetime
merge_df["date"] = pd.to_datetime(merge_df["date"])

In [44]:
#change weekday column to integer
weekday_str = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
weekday_int = np.arange(1, 8)
weekday_dict = dict(zip(weekday_str, weekday_int))

In [45]:
merge_df["weekday"] = merge_df["weekday"].map(weekday_dict)

In [46]:
#change day column to integer
merge_df["d"] = merge_df["d"].str[2:]
merge_df["d"] = pd.to_numeric(merge_df["d"])

We label encode all categorical columns in order to save memory.

In [47]:
categorical_cols =  list(merge_df.select_dtypes(include = ["object"]).columns)
categorical_cols.remove("id")

In [48]:
get_memory_usage(merge_df)

7958.258237838745

In [49]:
merge_df = categorical_encoder(merge_df, categorical_cols)

In [50]:
get_memory_usage(merge_df)

4007.860279083252

In [51]:
merge_df = reduce_memory_usage(merge_df)

Memory usage decreased to 2920.04 Mb(27.14% decrease)


In [52]:
%%time
merge_df.to_pickle("../data/interim/merged_raw_data.pkl")

CPU times: user 5.12 s, sys: 9.27 s, total: 14.4 s
Wall time: 18.7 s


In [53]:
%%time
df = pd.read_pickle("../data/interim/merged_raw_data.pkl")

CPU times: user 2.88 s, sys: 3.04 s, total: 5.92 s
Wall time: 6.18 s
