In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from functions import outlierCorrections, plotTS, combine_columns, TS_Stats, timeShiftTs, MovingAverage, combineAggregatedInfo, outlierCorColWrapper, calendarFeatures, checkNanValues, countNaNs

# Load data
deptId = 'Dept16'
path = 'Data/' + deptId
df_train = pd.read_csv(path + '/Train_2Y.csv')
df_test = pd.read_csv(path + '/Test_1Y.csv')
# df_test

In [18]:
df_train = calendarFeatures(df_train)
df_test = calendarFeatures(df_test)

df_train = outlierCorColWrapper(df_train, ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment'], plot=False)
df_test = outlierCorColWrapper(df_test, ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment'], plot=False)

In [19]:
countNaNs(df_train)

Series([], dtype: int64)


In [20]:
# def add_noise(timeSeries, columns, mean = 0, std = 1):
#     timeSeries = timeSeries.copy()
#     for col in columns:
#         noise = np.random.normal(mean, std, timeSeries.shape[0])
#         timeSeries[col] = timeSeries[col] + noise
#     return timeSeries

# df_test = add_noise(df_test, ['corrected_Temperature', 'corrected_Fuel_Price', 'corrected_CPI', 'corrected_Unemployment'])

In [21]:
# df_test.to_csv(path + '/tmp.csv', index=False)
# df_test = pd.read_csv(path + '/tmp.csv')
# df_test

In [22]:
def tsColRange(timeSeries, col):
    """
    It returns the minimum and maximum values of a column in a time series
    
    :param timeSeries: The dataframe containing the time series data
    :param col: the column of the dataframe you want to get the range of
    :return: The min and max values of the column
    """
    max = timeSeries[col].max()
    min = timeSeries[col].min()
    return min, max

for col in ['corrected_Temperature', 'corrected_Fuel_Price', 'corrected_CPI', 'corrected_Unemployment']:

    min_train, max_train = tsColRange(df_train, col)
    min_test, max_test = tsColRange(df_test, col)
    diff = max(max_train, max_test) - min(min_train, min_test)
    print(f'Column {col} has range ({min(min_train, min_test)}, {max(max_train, max_test)}) and difference is {diff}')


Column corrected_Temperature has range (34.14, 94.22) and difference is 60.08
Column corrected_Fuel_Price has range (2.514, 3.907) and difference is 1.3930000000000002
Column corrected_CPI has range (210.0011018, 223.0783366) and difference is 13.077234800000014
Column corrected_Unemployment has range (6.17, 8.324) and difference is 2.154


In [23]:
for col, new_col in zip(['corrected_Temperature', 'corrected_Fuel_Price', 'corrected_CPI', 'corrected_Unemployment'], ['temperature_class', 'fuel_price_class', 'cpi_class', 'unemployment_class']):
    bins = pd.cut(df_train[col], bins=3, labels=False, retbins=True, duplicates='drop')[1]
    df_train[new_col]  = pd.cut(df_train[col], bins=bins, labels=False, duplicates='drop')

for col, new_col in zip(['corrected_Temperature', 'corrected_Fuel_Price', 'corrected_CPI', 'corrected_Unemployment'], ['temperature_class', 'fuel_price_class', 'cpi_class', 'unemployment_class']):
    bins = pd.cut(df_test[col], bins=3, labels=False, retbins=True, duplicates='drop')[1]
    df_test[new_col]  = pd.cut(df_test[col], bins=bins, labels=False, duplicates='drop')

In [24]:
df_test['unemployment_class'].value_counts()

2    28
1    17
0     5
Name: unemployment_class, dtype: int64

In [25]:
df_train = df_train.drop(columns=['sales'])
df_train = df_train.rename(columns={'corrected_sales':'sales'})

df_test = df_test.drop(columns=['sales'])
df_test = df_test.rename(columns={'corrected_sales':'sales'})

In [27]:
aggMethods = ['mean', 'median', 'std', 'var', 'mad', 'max', 'min']
colNames = ['meanSls', 'medianSls', 'stdSls', 'varSls', 'madSls', 'maxSls', 'minSls']
for method, col_name in zip(aggMethods, colNames):
    df_train, df_temp = combineAggregatedInfo(timeSeries = df_train, column = 'temperature_class', new_col_name = col_name + 'PerTmpClass', method = method)
    df_test = pd.merge(df_test, df_temp, on='temperature_class', how='left')
    df_test = df_test.rename(columns={'sales_x':'sales', 'sales_y':col_name + 'PerTmpClass'})

    df_train, df_fp = combineAggregatedInfo(timeSeries = df_train, column = 'fuel_price_class', new_col_name = col_name + 'PerFPClass', method = method)
    df_test = pd.merge(df_test, df_fp, on='fuel_price_class', how='left')
    df_test = df_test.rename(columns={'sales_x':'sales', 'sales_y':col_name + 'PerFPClass'})

    df_train, df_cpi = combineAggregatedInfo(timeSeries = df_train, column = 'cpi_class', new_col_name = col_name + 'PerCPIClass', method = method)
    df_test = pd.merge(df_test, df_cpi, on='cpi_class', how='left')
    df_test = df_test.rename(columns={'sales_x':'sales', 'sales_y':col_name + 'PerCPIClass'})

    df_train, df_unempl = combineAggregatedInfo(timeSeries = df_train, column = 'unemployment_class', new_col_name = col_name + 'PerUnemplClass', method = method)
    df_test = pd.merge(df_test, df_unempl, on='unemployment_class', how='left')
    df_test = df_test.rename(columns={'sales_x':'sales', 'sales_y':col_name + 'PerUnemplClass'})

    df_train, df_woy = combineAggregatedInfo(timeSeries = df_train, column = 'woy', new_col_name = col_name + 'PerWoY', method = method)
    df_test = pd.merge(df_test, df_woy, on='woy', how='left')
    df_test = df_test.rename(columns={'sales_x':'sales', 'sales_y':col_name + 'PerWoY'})

    df_train,  df_month  = combineAggregatedInfo(timeSeries = df_train, column = 'month', new_col_name = col_name + 'PerMonth', method = method)
    df_test = pd.merge(df_test, df_month, on='month', how='left')
    df_test = df_test.rename(columns={'sales_x':'sales', 'sales_y':col_name + 'PerMonth'})

In [28]:
countNaNs(df_test)
countNaNs(df_train)

Series([], dtype: int64)
Series([], dtype: int64)


In [29]:
df_train = MovingAverage(timeSeries=df_train, method='')

df_train = TS_Stats(df_train)
df_train

No selected method all the available methods will be used!


Unnamed: 0,week,Temperature,Fuel_Price,CPI,Unemployment,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,...,minSlsPerUnemplClass,minSlsPerWoY,minSlsPerMonth,sma,cma,ema,momentum,norm,kurtosis,skew
0,2010-01-04,70.280,2.603,211.329874,8.1630,0.00,0.0,0.000,0.0,0.00,...,938.32,1269.97,912.67,1269.970000,1269.970000,1269.970000,1.000000,0.303966,3.774326,1.732051
1,2011-01-03,71.965,3.524,214.662779,7.8915,0.00,0.0,0.000,0.0,0.00,...,790.54,1269.97,912.67,2678.140000,2678.140000,1551.604000,3.217643,0.978054,3.774326,1.732051
2,2010-01-11,0.000,0.000,0.000000,0.0000,0.00,0.0,0.000,0.0,0.00,...,938.32,921.64,912.67,2208.750000,2208.750000,1523.440600,0.310787,0.303966,3.774326,1.732051
3,2011-01-10,34.140,2.983,211.117671,8.0280,0.00,0.0,0.000,0.0,0.00,...,790.54,921.64,912.67,1886.972500,1886.972500,1463.260540,0.725718,0.220594,3.774326,1.924076
4,2010-01-18,0.000,0.000,0.000000,0.0000,0.00,0.0,0.000,0.0,0.00,...,938.32,1019.16,912.67,1886.972500,1763.572000,1443.931486,1.377946,0.303966,4.752788,2.159031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,2011-09-12,61.030,3.342,217.149737,7.6465,0.75,0.0,1.705,0.0,887.39,...,504.49,1624.77,790.54,1964.422500,1912.603813,1882.931067,1.001483,0.389464,-0.007600,0.624381
100,2010-09-20,80.790,2.624,211.255258,8.0990,0.00,0.0,0.000,0.0,0.00,...,938.32,790.54,790.54,1595.792500,1907.984370,1839.241960,0.888679,0.346108,0.018131,0.639872
101,2011-09-19,75.680,3.467,216.028236,7.8520,0.00,0.0,0.000,0.0,0.00,...,790.54,790.54,790.54,1372.132500,1897.029033,1734.371764,0.546693,0.189215,0.004868,0.639464
102,2010-09-27,0.000,0.000,0.000000,0.0000,0.00,0.0,0.000,0.0,0.00,...,938.32,1146.58,790.54,1458.541153,1897.741417,1757.975049,2.492479,0.471614,0.031737,0.639706


In [30]:
past1Year = df_train.shape[0] - 50
past2Year = df_train.shape[0] - 2*50

In [31]:
for col in ['sma', 'cma', 'ema', 'momentum', 'norm', 'kurtosis', 'skew']:
    df_test[col] = np.ma.mean([df_train[past1Year:][col].to_numpy(), df_train[past2Year:past1Year][col].to_numpy()], axis=0)

In [32]:
path

'Data/Dept16'

In [33]:
df_train.to_csv(path + '/data_train.csv', index=False)
df_test.to_csv(path + '/data_test.csv', index=False)