In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Sklearn
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, PowerTransformer,QuantileTransformer, OneHotEncoder

#Visualization
import matplotlib.cm as cm
from sklearn import random_projection

import datetime

In [2]:
dtype_dict = {
    'ProductFamily_ID':'category',
    'ProductCategory_ID':'category',
    'ProductBrand_ID':'category',
    'ProductName_ID':'category',
    'ProductPackSKU_ID':'category',
    'Point-of-Sale_ID':'category',
    'Measures':'category',
    'Quantity':'float32',
    'Total_Sales':'float32',
    'Value':'float32',
    'Week': 'int32',
    'Year':'int32'
}
original_path = r'D:\NOVAIMS_MAA\NOVAIMS_MAA_2020e21_BusinessCasesDataScience_MindOverData_RetailChallenge.csv'
path = r'D:\NOVAIMS_MAA\NOVAIMS_MAA_final_1.csv'

In [3]:
example = pd.read_csv(path,dtype=dtype_dict,parse_dates=['Date'],nrows = 1000,usecols = [1,2,3,4,5,6,7,8,9])
example.head()

Unnamed: 0,ProductFamily_ID,ProductCategory_ID,ProductBrand_ID,ProductName_ID,ProductPackSKU_ID,Point-of-Sale_ID,Date,Quantity,Total_Sales
0,16,11,306,649,1970,1,2017-03-04,2.0,1540.0
1,16,11,306,649,1970,1,2016-05-02,4.0,3080.0
2,16,11,306,649,1970,1,2016-10-24,2.0,1540.0
3,16,11,306,649,1970,1,2017-10-13,2.0,1620.0
4,16,11,306,649,1970,1,2017-10-14,2.0,1620.0


In [11]:
df = pd.read_csv(path,dtype=dtype_dict,usecols = [4,7,8])

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91187504 entries, 0 to 91187503
Data columns (total 3 columns):
 #   Column          Dtype   
---  ------          -----   
 0   ProductName_ID  category
 1   Date            object  
 2   Quantity        float32 
dtypes: category(1), float32(1), object(1)
memory usage: 1.2+ GB


In [13]:

def memoize_dt(s):
    """
    Memoization technique
    """
    dates = {date:datetime.datetime.strptime(date,'%Y-%m-%d') for date in s.unique()}
    return s.map(dates)

def memoize_week(s):
    """
    Memoization technique
    """
    weeks = {date: date.week for date in s.unique()}
    return s.map(weeks)
def memoize_month(s):
    """
    Memoization technique
    """
    months = {date: date[5:7] for date in s.unique()}
    return s.map(months)
def memoize_year(s):
    """
    Memoization technique
    """
    years = {date: date[:4] for date in s.unique()}
    return s.map(years)

def preprocessing(dataframe):
    print("Generating Year and Month...")
    dataframe['Month'] = memoize_month(dataframe.Date)
    dataframe['Year'] = memoize_year(dataframe.Date)
    print("Converting datetime...")
    dataframe.Date  = memoize_dt(dataframe.Date)
    print("Generating week...")
    dataframe['Week'] = dataframe.Date.dt.week
    dataframe.drop(columns = "Date",inplace = True)
    print("Finalize...")
    dataframe.Month = dataframe.Month.astype("category")
    dataframe.Year = dataframe.Year.astype("category")
    dataframe.Week = dataframe.Week.astype("category")
    return dataframe

In [14]:
df = preprocessing(df)

Generating Year and Month
Converting datetime
Generating week
Finalize...


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91187504 entries, 0 to 91187503
Data columns (total 6 columns):
 #   Column          Dtype         
---  ------          -----         
 0   ProductName_ID  category      
 1   Date            datetime64[ns]
 2   Quantity        float32       
 3   Month           category      
 4   Year            category      
 5   Week            category      
dtypes: category(4), datetime64[ns](1), float32(1)
memory usage: 1.4 GB


In [17]:
df[:100].groupby(['Year','Month','Week','ProductName_ID']).sum()['Quantity'].reset_index()

Unnamed: 0,Year,Month,Week,ProductName_ID,Quantity
0,2016,01,1,1000,
1,2016,01,1,1004,
2,2016,01,1,1006,
3,2016,01,1,1007,
4,2016,01,1,1013,
...,...,...,...,...,...
7174075,2019,12,53,2231,
7174076,2019,12,53,546,
7174077,2019,12,53,1665,
7174078,2019,12,53,1127,


In [30]:
df = df.groupby(['Year','Month','Week','ProductName_ID']).sum()['Quantity'].reset_index()
#df.to_csv(r'D:\NOVAIMS_MAA\timeseries\product_total_ts.csv')

In [34]:
df = pd.read_csv(r'D:\NOVAIMS_MAA\timeseries\product_total_ts.csv',dtype=dtype_dict,usecols = [1,2,3,4,5])

In [36]:
df.info()
df.dropna(inplace=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7174080 entries, 0 to 7174079
Data columns (total 5 columns):
 #   Column          Dtype   
---  ------          -----   
 0   Year            int32   
 1   Month           int64   
 2   Week            int32   
 3   ProductName_ID  category
 4   Quantity        float32 
dtypes: category(1), float32(1), int32(2), int64(1)
memory usage: 150.6 MB


In [None]:
df.head