## 1. Set up

### 1 - Drive

1 - Mount Drive

In [1]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


2 - Move to the data folder

In [2]:
cd "gdrive/MyDrive/Projects/1 - Numericals/Predict Future Sales/2 - Production/data"

/content/gdrive/MyDrive/Projects/1 - Numericals/Predict Future Sales/2 - Production/data


## 2. Libraries

In [3]:
# Load data
import pandas as pd
import numpy as np
import io
import os
import glob

# Meta
import time

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sb
from matplotlib.lines import Line2D

# Analysis
from scipy.stats import zscore

## 3. Data

1 - List file names

In [4]:
ls

cleaned_1.csv                                      kaggle.json
cleaned_2.csv                                      sales_train.csv
competitive-data-science-predict-future-sales.zip  sample_submission.csv
item_categories.csv                                shops.csv
items.csv                                          test.csv


In [5]:
# Read cleaned_1.csv from the current working directory into `df`.
cleaned_1_path = 'cleaned_1.csv'
df = pd.read_csv(cleaned_1_path)

In [12]:
# Read test.csv from the current working directory into `test`.
test_path = 'test.csv'
test = pd.read_csv(test_path)

## 4. Clean Data

1. Aggregate item_cnt_day into item_cnt_month by summing within each group, and average item_price within each group, since an item's price is not constant.

In [13]:
# Sum the daily counts and average the prices within each group.
# NOTE(review): the keys include day_of_week and is_weekend, so each output row
# is a per-weekday total within a month rather than one row per
# shop/item/month (visible in the displayed rows below) -- confirm that this
# granularity is intended for a column named item_cnt_month.
group_keys = [
    'shop_id', 'item_id', 'item_category_id', 'month',
    'date_block_num', 'year', 'day_of_week', 'is_weekend',
]
groups = df.groupby(group_keys)
train = (
    groups
    .agg(
        item_cnt_month=('item_cnt_day', 'sum'),
        item_price=('item_price', 'mean'),
    )
    .reset_index()
)

In [14]:
# Display the aggregated frame (the rendering is truncated to the first and last rows).
train

Unnamed: 0,shop_id,item_id,item_category_id,month,date_block_num,year,day_of_week,is_weekend,item_cnt_month,item_price
0,0,30,40,2,1,2013,0,False,4.0,265.0
1,0,30,40,2,1,2013,1,False,3.0,265.0
2,0,30,40,2,1,2013,2,False,2.0,265.0
3,0,30,40,2,1,2013,3,False,2.0,265.0
4,0,30,40,2,1,2013,4,False,4.0,265.0
...,...,...,...,...,...,...,...,...,...,...
2509039,59,22164,37,11,25,2015,0,False,1.0,749.0
2509040,59,22167,49,3,11,2013,1,False,1.0,299.0
2509041,59,22167,49,6,17,2014,5,True,1.0,299.0
2509042,59,22167,49,10,9,2013,4,False,1.0,299.0


In [15]:
# True if any cell of `train` holds a missing value.
train.isna().to_numpy().any()


False

2 - Compared to the train set, the test set contains item_ids that never appear in the training data (644 of its 5100 distinct item_ids, per the check below). Append the test rows to the data set and set their missing values to 0.

In [16]:
# (test item_ids absent from train, distinct test item_ids, number of test rows)
len(set(test.item_id).difference(train.item_id)), len(set(test.item_id)), len(test)

(644, 5100, 214200)

In [17]:
# Keep a reference to the column labels of the aggregated training frame.
cols = train.columns

In [18]:
# Append the test rows beneath the aggregated training rows with a fresh
# 0..n-1 index.
# `keys` is omitted: it labels the concatenated pieces in a hierarchical index,
# which `ignore_index=True` discards, and the column labels previously passed
# did not correspond to the two frames being concatenated.
train = pd.concat([train, test], ignore_index=True, sort=False)

# Columns that are missing on the appended test rows are filled with 0.
# NOTE(review): this zeroes item_category_id, month, year, date_block_num and
# item_price for those rows as well, not only item_cnt_month -- confirm that
# is intended.
train = train.fillna(0)

In [22]:
# Remove the ID and is_weekend columns.
# (ID is not among the aggregated columns, so it presumably comes from test.csv.)
train = train.drop(['ID', 'is_weekend'], axis='columns')

In [23]:
# Display the combined frame (the rendering is truncated to the first and last rows).
train

Unnamed: 0,shop_id,item_id,item_category_id,month,date_block_num,year,day_of_week,item_cnt_month,item_price
0,0,30,40.0,2.0,1.0,2013.0,0.0,4.0,265.0
1,0,30,40.0,2.0,1.0,2013.0,1.0,3.0,265.0
2,0,30,40.0,2.0,1.0,2013.0,2.0,2.0,265.0
3,0,30,40.0,2.0,1.0,2013.0,3.0,2.0,265.0
4,0,30,40.0,2.0,1.0,2013.0,4.0,4.0,265.0
...,...,...,...,...,...,...,...,...,...
2723239,45,18454,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2723240,45,16188,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2723241,45,15757,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2723242,45,19648,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# True if any cell of `train` still holds a missing value after the fill.
train.isna().to_numpy().any()


False

## 5. Save Data

In [None]:
# Write the combined frame to cleaned_2.csv in the current working directory.
# `index` is left at its default, so the row index is written as the first column.
cleaned_2_path = 'cleaned_2.csv'
train.to_csv(cleaned_2_path)
