In [None]:
!pip install xgboost
!pip install linearmodels
!pip install mlxtend
import sys
!{sys.executable} -m pip install pandas-profiling

In [None]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import statistics
from itertools import chain
# visualization
from pandas.plotting import scatter_matrix
from statsmodels.graphics.tsaplots import plot_acf
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from pprint import pprint
import pandas_profiling as pp
%matplotlib inline
sns.set_style('whitegrid')
#plotly
import plotly.io as pio
import plotly.express as px
from plotly.offline import download_plotlyjs,init_notebook_mode, plot, iplot
import plotly as py 
import plotly.graph_objs as go # plotly graphical object
import plotly.io as pio
pio.renderers.default='notebook'
# setting the general visualization style
sns.set_style('whitegrid')
# feature engineering
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
#Libraries for Statistical Models
import statsmodels.api as sm
# ignoring warnings in the notebook
import warnings 
warnings.filterwarnings('ignore') 
# To display full output 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# feature engineering
import pandas_profiling as pp
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.decomposition import PCA
# machine learning models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.impute import KNNImputer
import statsmodels.api as sm
from statsmodels.tsa.api import VAR
from statsmodels.tsa.vector_ar.vecm import coint_johansen
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# model selection
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, StratifiedKFold, learning_curve
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
# Tuning and Esembling
from sklearn.model_selection import GridSearchCV        
from sklearn.model_selection import RandomizedSearchCV 
from mlxtend.regressor import StackingCVRegressor

In [None]:
# building plot for feature importance for each model
def feature_importance(regressor, name, figsize1, feature_names=None):
    """Plot the (up to) 30 largest feature importances of a fitted regressor.

    Parameters
    ----------
    regressor : object
        Fitted estimator exposing ``feature_importances_``.
    name : str
        Text placed in front of " feature importance" in the plot title.
    figsize1 : tuple
        Figure size forwarded to ``plt.subplots``.
    feature_names : sequence of str, optional
        Labels in the same order as ``regressor.feature_importances_``.
        When omitted, the columns of the notebook-level ``X_train`` are used,
        which is what this function always did before the parameter existed.
    """
    if feature_names is None:
        # Backward-compatible fallback: relies on a global X_train being defined.
        feature_names = X_train.columns
    feature_names = np.asarray(feature_names)
    importances = np.asarray(regressor.feature_importances_)
    # setting up the frame
    fig, axes = plt.subplots(figsize=figsize1)
    # positions of the 30 most important features, largest first
    indices = np.argsort(importances)[::-1][:30]
    # plotting feature importance on the axes created above
    g = sns.barplot(y=feature_names[indices],
                    x=importances[indices],
                    orient='h', ax=axes)
    # labeling
    g.set_xlabel("Relative importance", fontsize=15)
    g.set_ylabel("Features", fontsize=15)
    g.tick_params(labelsize=15)
    g.set_title(name + " feature importance");

In [None]:
# def a function to plot the learning curves
def plot_learning_curve(estimator, title, X, y, scoring, ylim = None, cv = None,
                        n_jobs = -1, train_sizes = np.linspace(.1, 1.0, 5)):
    """Draw training and cross-validation score curves for `estimator`.

    A new pyplot figure is opened, `learning_curve` is evaluated on (X, y)
    with the given `scoring`, `cv`, `n_jobs` and `train_sizes` (rows are
    shuffled), and the mean score of each curve is plotted with a band of
    +/- one standard deviation. `ylim`, when given, is unpacked into
    `plt.ylim`. The `matplotlib.pyplot` module is returned.
    """
    # figure scaffolding: title, optional y-range, axis labels
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    # cross-validated scores for each training-set size
    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y,
        cv = cv, n_jobs = n_jobs,
        train_sizes = train_sizes,
        scoring = scoring, shuffle = True)

    # (colour, legend label, mean per size, std per size) for both curves
    curves = [
        ("r", "Training score",
         np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1)),
        ("g", "Cross-validation score",
         np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1)),
    ]

    plt.grid()
    # shaded +/- one standard deviation bands first ...
    for colour, _label, centre, spread in curves:
        plt.fill_between(sizes, centre - spread, centre + spread,
                         alpha=0.1, color=colour)
    # ... then the mean lines that feed the legend
    for colour, label, centre, _spread in curves:
        plt.plot(sizes, centre, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt

In [None]:
def _sku_line_chart(sku_rows, column, title):
    """Show one interactive line chart of `column` over `date`, one line per channel."""
    fig = px.line(sku_rows,
                  x='date', y=column, title=title,
                  color='channel', template="none")
    # range slider plus quick-zoom buttons, identical for every chart
    fig.update_xaxes(
        rangeslider_visible=True,
        rangeselector=dict(
            buttons=list([
                dict(count=1, label="1m", step="month", stepmode="backward"),
                dict(count=6, label="6m", step="month", stepmode="backward"),
                dict(count=1, label="YTD", step="year", stepmode="todate"),
                dict(count=1, label="1y", step="year", stepmode="backward"),
                dict(step="all")
            ])
        )
    )
    fig.show();


def plotly(name, mode):
    """Plot the time series of one SKU from the notebook-level `df_sku`.

    Parameters
    ----------
    name : str
        Value of `full_name` selecting the SKU.
    mode : int
        1 shows four charts (Sales, units_sold, kg_sold, distribution);
        any other value shows a single price_per_unit chart.
    """
    # filter once instead of once per chart
    sku_rows = df_sku[df_sku['full_name'] == name]
    if mode == 1:
        for column in ['Sales', 'units_sold', 'kg_sold', 'distribution']:
            _sku_line_chart(sku_rows, column, column + ' for ' + name)
    else:
        _sku_line_chart(sku_rows, 'price_per_unit', 'Price_per_units for ' + name)


In [None]:
df_sku_raw1 = pd.read_csv("/project/raw_data/SKU.csv",index_col=0)
df_sku_raw2 = pd.read_csv("/project/raw_data/SKU2.csv",index_col=0)

# The first file stores biscuit_category as a numeric code; translate the codes
# to text labels before merging (presumably the labels used by SKU2.csv - confirm).
category_codes = {1: 'EVERYDAY TREATS',
                  2: 'EVERYDAY BISCUITS',
                  3: 'CHOCOLATE BISCUIT BARS'}
for code, label in category_codes.items():
    df_sku_raw1.loc[df_sku_raw1['biscuit_category'] == code, 'biscuit_category'] = label

# merge two dfs (outer merge on all shared columns) and keep the 3-year window;
# the dates are still ISO-formatted strings here, so string comparison is chronological
df_sku_raw = pd.merge(df_sku_raw1, df_sku_raw2, how="outer")
df_sku_raw = df_sku_raw.drop(columns=['sheet'])
in_window = (df_sku_raw.date >= '2018-05-22') & (df_sku_raw.date < '2021-05-22')
df_sku_raw = df_sku_raw.loc[in_window].copy()

# Snapshot used by the describe/profile cells. It is an independent copy so the
# clean-up below does not alter it.
df_raw = df_sku_raw.copy()

# fix 0 and less than 0 data.
# The previous form `df[mask]['col'] = 0` assigned into a temporary copy and
# therefore changed nothing; `.loc` writes into the frame itself.
for measure in ['Sales', 'units_sold', 'kg_sold']:
    df_sku_raw.loc[df_sku_raw[measure] <= 0, measure] = 0

# reshaping the levels: keep the four store-format channels and derive the retailer
channel_retailer = {'Tesco Express': 'Tesco',
                    'Tesco excl. Express': 'Tesco',
                    'Sainsbury Local': 'Sainsbury',
                    'Sainsbury excl Local': 'Sainsbury'}
df_sku_raw = df_sku_raw[df_sku_raw['channel'].isin(list(channel_retailer))].copy()
df_sku_raw['retailer'] = df_sku_raw['channel'].map(channel_retailer)

# applying date dtype
df_sku_raw['date'] = pd.to_datetime(df_sku_raw['date'])
# `Series.dt.weekofyear` was removed from pandas; isocalendar().week yields the
# same ISO week number. Cast back to a plain integer dtype for downstream models.
df_sku_raw['WeekOfYear'] = df_sku_raw['date'].dt.isocalendar().week.astype('int64')
df_sku_raw['Year'] = df_sku_raw['date'].dt.year
df_sku_raw['Month'] = df_sku_raw['date'].dt.month

# renaming features
subsegment_labels = {'HEALTHIER BISCUITS': 'Healthier',
                     'CHOCOLATE BISCUIT BARS': 'CBB',
                     'EVERYDAY TREATS': 'EDT',
                     'EVERYDAY BISCUITS': 'EDB'}
df_sku_raw['biscuit_category'] = df_sku_raw['biscuit_category'].replace(subsegment_labels)
df_sku_raw = df_sku_raw.rename(columns={'biscuit_category': 'Subsegment'})

# re-leveling: store format of each channel (the retailer-level channels
# 'Tesco'/'Sainsbury' were already excluded by the channel filter above)
channel_format = {'Tesco Express': 'Express',
                  'Tesco excl. Express': 'Main',
                  'Sainsbury Local': 'Express',
                  'Sainsbury excl Local': 'Main'}
df_sku_raw['format'] = df_sku_raw['channel'].map(channel_format)

# drop useless variable
df_sku_raw = df_sku_raw.drop(['off_shelf'], axis=1)

# preview
df_sku_raw
df_sku_raw.info()
df_sku_raw.describe()
df_sku_raw.to_csv("/project/data_cleaning/SKU_new.csv")
# NOTE: df_sku is the same object as df_sku_raw, not a copy.
df_sku = df_sku_raw

In [None]:
# Summary statistics of `df_raw`, the frame bound in the cleaning cell after the
# date-window filter and before the channel filter.
df_raw.describe()

In [None]:
# errors='ignore' keeps this cell re-runnable: after its first execution the
# `off_shelf` column is already gone and a plain drop would raise KeyError.
df_raw = df_raw.drop(columns=['off_shelf'], errors='ignore')
# profiling report (accessor registered by the pandas_profiling import)
df_raw.profile_report()

In [None]:
def distribution(variable,title,x_title):
    """Draw a 60-bin histogram of one column of the notebook-level `df_sku_raw`.

    `variable` names the column, `title` is the figure title and `x_title`
    the x-axis label; the y-axis is always labelled "Counts".
    """
    fig, ax = plt.subplots(figsize=(13,5))
    df_sku_raw[variable].hist(bins=60, ax=ax)
    ax.set_title(title, fontsize=16)
    ax.set_xlabel(x_title, fontsize=12)
    ax.set_ylabel("Counts", fontsize=12)

# (column, figure title, x-axis label) for each histogram to draw
histograms = [
    ('Sales', 'Figure 4: Histogram of Sales for SKU level dataset', 'Sales'),
    ('distribution', 'Figure 5: Histogram of Distribution for SKU level dataset', 'Distribution'),
]
for column, caption, x_label in histograms:
    distribution(column, caption, x_label)

In [None]:
# identifying valid data points for each sku, based on the distribution(managing out of stock/delisting)
# A data point counts as valid when its distribution is above Threshold.
Threshold = 0.2
# summary column -> channel label used in df_sku
channel_columns = {'Tesco_Express': 'Tesco Express',
                   'Tesco_Main': 'Tesco excl. Express',
                   'Sainsbury_Local': 'Sainsbury Local',
                   'Sainsbury_Main': 'Sainsbury excl Local'}

# numeric_only=True: text and date columns cannot be summed; only 'Sales' is used below.
df_sku_gb = df_sku.groupby(['full_name'],as_index=False,sort=False).sum(numeric_only=True)
# SKUs in order of first appearance; same order as the sort=False groupby above
product = df_sku['full_name'].drop_duplicates().values.tolist()
total_sales = df_sku_gb['Sales'].values.tolist()

# Number of valid data points per SKU and channel, computed in one pass
# instead of re-filtering the whole frame four times for every SKU.
valid_counts = (df_sku[df_sku['distribution'] > Threshold]
                .groupby(['full_name', 'channel'])['date'].count()
                .unstack('channel')
                .reindex(index=product, columns=list(channel_columns.values()))
                .fillna(0)          # SKU/channel pairs without any valid point
                .astype(int))
Tesco_Express = valid_counts['Tesco Express'].tolist()
Tesco_Main = valid_counts['Tesco excl. Express'].tolist()
Sainsbury_Local = valid_counts['Sainsbury Local'].tolist()
Sainsbury_Main = valid_counts['Sainsbury excl Local'].tolist()

# Descriptive attributes: exactly one value per SKU is required, otherwise the
# summary rows would no longer line up with `product`.
attribute_columns = ['company', 'brand', 'pack_type']
ambiguous = (df_sku.groupby('full_name', sort=False)[attribute_columns]
             .nunique(dropna=False).gt(1).any(axis=1))
if ambiguous.any():
    raise ValueError('SKUs with more than one company/brand/pack_type: '
                     + ', '.join(map(str, ambiguous[ambiguous].index)))
# first row of every SKU, already in `product` order
sku_attributes = df_sku.drop_duplicates(subset='full_name')
company = sku_attributes['company'].tolist()
brand = sku_attributes['brand'].tolist()
pack = sku_attributes['pack_type'].tolist()

data = {'full_name':product, "Tesco_Express":Tesco_Express,"Tesco_Main": Tesco_Main,
        'Sainsbury_Local':Sainsbury_Local,'Sainsbury_Main':Sainsbury_Main,
       'total_sales':total_sales,'company':company,'brand':brand, 'pack':pack}
df_flt = pd.DataFrame(data)
df_flt = df_flt.sort_values(by=['total_sales'], ascending=False)

# minimum number of valid data points required per channel
fuse1=150
fuse2=0
# One combined mask built from df_flt itself (the previous step-wise filtering
# indexed the shrinking df_valid with masks taken from the larger df_flt).
valid_mask = ((df_flt['Tesco_Express'] >= fuse2)
              & (df_flt['Tesco_Main'] >= fuse1)
              & (df_flt['Sainsbury_Local'] >= fuse2)
              & (df_flt['Sainsbury_Main'] >= fuse2))
df_valid = df_flt[valid_mask]
# Display only: the result is deliberately NOT assigned, because later cells
# drop rows of df_valid by their original index labels.
df_valid.reset_index(drop=True)
df_valid.describe()

In [None]:
# SKUs whose Tesco Express count of valid data points differs from 156
# (presumably the number of weeks in the three-year window - confirm).
incomplete_express = df_valid['Tesco_Express'] != 156
# SKUs that cannot be imputed, identified by their df_valid index labels.
# NOTE(review): the list inside impute() omits label 16 - confirm which is intended.
drop = [36,223,154,141,152,187,15,237,7,16]
df_problematic = df_valid[incomplete_express].drop(index=drop)
df_problematic
# keep only SKUs that have valid data in both Sainsbury channels ...
lacks_sainsbury = ((df_problematic['Sainsbury_Local'] == 0)
                   | (df_problematic['Sainsbury_Main'] == 0))
# ... and additionally exclude the SKU with index label 37
df_not_bad = df_problematic[~lacks_sainsbury].drop(index=37)
df_not_bad

# XGBoost imputation for large chunks of missing data in a single channel

In [None]:
def scale(col_names, df, scaler=None):
    """Standardise the columns `col_names` of `df` in place.

    Parameters
    ----------
    col_names : list of str
        Columns to transform.
    df : pandas.DataFrame
        Frame modified in place.
    scaler : fitted scaler, optional
        When given, it is applied as-is, so that several frames (e.g. training
        and prediction rows) share the same transformation. When omitted, a new
        ``StandardScaler`` is fitted on ``df[col_names]``.

    Returns
    -------
    The scaler that was applied, or the `scaler` argument unchanged when `df`
    has no rows (nothing to fit or transform).
    """
    if len(df) == 0:
        # StandardScaler cannot be fitted on zero rows
        return scaler
    # locate the attributes we need to standardise
    features = df[col_names]
    if scaler is None:
        scaler = StandardScaler().fit(features.values)
    df[col_names] = scaler.transform(features.values)
    return scaler

In [None]:
# Rows with a distribution of at most 0.2 are treated as having no data:
# their measures and the distribution itself are set to 0.
low_distribution = df_sku.distribution <= 0.2
for column in ['kg_sold', 'units_sold', 'distribution', 'Sales']:
    df_sku.loc[low_distribution, column] = 0

In [None]:
def impute(full_name, variable,dependent, independ1,independ2, independ3, n_iter):
    """Impute the zero values of one SKU/channel series with a tuned XGBoost model.

    The SKU's `variable` is pivoted to one column per channel. Rows where
    `dependent` is non-zero are used to tune and train an ``XGBRegressor``;
    rows where it equals 0 are replaced by the model's prediction. The result
    is written back into the notebook-level ``df_sku``.

    Parameters
    ----------
    full_name : str
        SKU to process (value of ``df_sku['full_name']``).
    variable : str
        Measure to impute, e.g. 'Sales'.
    dependent : str
        Pivoted target column, ``variable + '_' + channel``.
    independ1, independ2, independ3 : str
        Pivoted columns of the other channels; standardised unless `variable`
        is 'distribution'.
    n_iter : int
        Number of parameter settings sampled by the randomized search.

    Side effects
    ------------
    Reads the globals ``df_valid`` and ``df_sku``, mutates ``df_sku``, draws a
    feature-importance plot and three learning curves, and prints test errors.

    Raises
    ------
    ValueError
        If `full_name` is not among the SKUs selected for this imputation.
    """
    # ---- 1. SKUs eligible for model-based imputation -------------------------
    # select skus that has missing values (Tesco Express count differs from 156)
    df_problematic = df_valid[df_valid['Tesco_Express']!=156]
    # drop skus that cannot be imputed (hard-coded df_valid index labels)
    drop = [36,223,154,141,152,187,15,237,7]
    df_problematic = df_problematic.drop(index=drop)
    df_not_bad = df_problematic.drop(df_problematic[(df_problematic['Sainsbury_Local']==0)].index)
    df_not_bad = df_not_bad.drop(df_not_bad[(df_not_bad['Sainsbury_Main']==0)].index)
    df_not_bad = df_not_bad.drop(index=37)
    if full_name not in df_not_bad['full_name'].values:
        raise ValueError('SKU is not eligible for XGBoost imputation: ' + str(full_name))

    # ---- 2. One row per date, one column per channel --------------------------
    sku_rows = df_sku[df_sku['full_name']==full_name]
    # dropping useless ones
    sku_rows = sku_rows.drop(columns=['format', 'flavour', 'brand', 'company', 'pack_type',
                                      'weight', 'Subsegment','full_name','retailer'])
    wide = sku_rows.pivot_table(index=["date",'WeekOfYear','Year','Month'],
                                columns='channel',
                                values=[variable]).reset_index()
    # flatten the two-level header: ('Sales', 'Tesco Express') -> 'Sales_Tesco Express',
    # ('date', '') -> 'date_'
    wide.columns = ['_'.join(col) for col in wide.columns.values]
    df_final = wide
    features = wide.drop(columns=['date_'])

    # ---- 3. Observed rows train the model, zero rows are to be predicted ------
    is_missing = features[dependent] == 0
    # rows with an undefined target can neither train the model nor be imputed
    data_model = features[features[dependent].notna() & ~is_missing].copy()
    data_predict = features[is_missing].drop(columns=[dependent]).copy()
    if data_predict.empty:
        print('Nothing to impute: ' + dependent + ' has no zero values for ' + str(full_name))
        return

    # scaling: ONE scaler, fitted on the observed rows and applied to both sets,
    # so a raw value maps to the same scaled value at training and prediction time
    col_names=[independ1,independ2, independ3]
    if variable != 'distribution':
        scaler = StandardScaler().fit(data_model[col_names].values)
        data_model[col_names] = scaler.transform(data_model[col_names].values)
        data_predict[col_names] = scaler.transform(data_predict[col_names].values)

    # ---- 4. Hyper-parameter search on the training split only -----------------
    # train test split; the test rows stay unseen until the error report below
    train, test = train_test_split(data_model, test_size=0.2, random_state=42)
    X_fit = train.drop(columns=[dependent])
    Y_fit = train[dependent].values.ravel()
    X_test = test.drop(columns=[dependent])
    Y_test = test[dependent].values.ravel()
    # all observed rows: used for the learning curves and for the final model
    X_train = data_model.drop(columns=[dependent])
    Y_train = data_model[dependent].values.ravel()
    skf = KFold(n_splits=3, shuffle = True, random_state = 42)

    params = {
            'max_depth': [10, 20, 50,100],
            'learning_rate': [0.01,0.05,0.1],
            'subsample': [0.7,0.8,0.9],
            'seed' :[42],
            'colsample_bytree': [0.3, 0.5, 0.7],
            'colsample_bynode': [0.3, 0.5, 0.7],
            'colsample_bylevel': [0.3, 0.5, 0.7],
            'min_child_weight': [5,7,10],
            'n_estimators': [100, 200,500],
            'gamma': [0.1, 0.25, 0.5],
            'objective': ['reg:squarederror']}

    # random_state makes the sampled parameter settings reproducible
    model_unit = RandomizedSearchCV(estimator = XGBRegressor(), param_distributions=params,
                             scoring='neg_root_mean_squared_error', n_jobs=4, n_iter = n_iter,
                             cv=skf, random_state=42,
                             verbose=2)
    model_unit.fit(X_fit, Y_fit)
    # a bare expression is not displayed inside a function, so print explicitly
    print('Best parameters: ' + str(model_unit.best_params_))

    # ---- 5. Diagnostics --------------------------------------------------------
    def _plot_importance(regressor, name, figsize1, feature_names):
        """Bar chart of the (up to) 30 largest feature importances."""
        fig, axes = plt.subplots(figsize = figsize1)
        importances = regressor.feature_importances_
        indices = np.argsort(importances)[::-1][:30]
        g = sns.barplot(y=feature_names[indices],
                        x = importances[indices],
                        orient='h', ax=axes)
        g.set_xlabel("Relative importance",fontsize=15)
        g.set_ylabel("Features",fontsize=15)
        g.tick_params(labelsize=15)
        g.set_title(name + " feature importance");

    _plot_importance(model_unit.best_estimator_, 'XGBoost Best Model ',(18,8), X_fit.columns)
    # the title now names the metric that is actually plotted (root MSE)
    g = plot_learning_curve(model_unit.best_estimator_,
                            "XGBoost best model learning curves: neg_root_mean_squared_error score",
                            X_train,Y_train, 'neg_root_mean_squared_error',ylim = [0, -50000],cv=3)
    g = plot_learning_curve(model_unit.best_estimator_,
                            "XGBoost best model learning curves: neg_mean_absolute_error",
                            X_train,Y_train, 'neg_mean_absolute_error',ylim = [0, -50000],cv=3)
    g = plot_learning_curve(model_unit.best_estimator_,
                            "XGBoost best model learning curves: r2",
                            X_train,Y_train, 'r2',ylim = [0, 1],cv=3)

    # error on the held-out rows (not used for tuning or fitting above)
    test_pred = model_unit.predict(X_test)
    test_mse = mean_squared_error(Y_test, test_pred)
    test_rmse = np.sqrt(test_mse)
    print('Testing Set: Root Mean Squared Error (RMSE): '+ str(np.round(test_rmse,4)))
    rmspe_test = (np.sqrt(np.mean(np.square((Y_test - test_pred) / Y_test))))*100
    print('Testing Set: Root Mean Squared percentage Error (RMSPE): '+ str(np.round(rmspe_test,4))+'%' +
          ' (avg. accuracy: ' +str(100 - np.round(rmspe_test,4))+ '%)')

    # ---- 6. Final model on all observed rows, then impute ----------------------
    final_model = XGBRegressor(**model_unit.best_params_)
    final_model.fit(X_train, Y_train)
    df_final.loc[(df_final[dependent]==0),dependent] = final_model.predict(data_predict)
    df_final[dependent] = np.round(df_final[dependent],4)

    # ---- 7. Write back into df_sku, matched by date ----------------------------
    # The channel is taken from `dependent` ('<variable>_<channel>'); 'Tesco Express'
    # remains the fallback for names that do not follow that pattern.
    prefix = variable + '_'
    target_channel = dependent[len(prefix):] if dependent.startswith(prefix) else 'Tesco Express'
    target_rows = (df_sku['full_name'] == full_name) & (df_sku['channel'] == target_channel)
    imputed_by_date = df_final.set_index('date_')[dependent]
    # matching on date (not on row position) keeps values aligned even if the
    # rows of df_sku are not sorted by date
    df_sku.loc[target_rows, variable] = df_sku.loc[target_rows, 'date'].map(imputed_by_date).values

In [None]:
# Impute the Tesco Express series of both SKUs, one measure at a time, from the
# other three channels (30 sampled parameter settings per search), then plot
# the SKU. Replaces ten copy-pasted call cells; the call order is unchanged.
xgb_imputed_skus = ['GNGR MCVTS GNGR NTS GNGR 250 GM SNGL',
                    'JFF CKS DRK CHCLT & ORNG 244 GM SNGL']
for sku in xgb_imputed_skus:
    for measure in ['Sales', 'units_sold', 'kg_sold', 'distribution']:
        impute(sku,
               measure, measure + '_Tesco Express',
               measure + '_Sainsbury Local',
               measure + '_Sainsbury excl Local',
               measure + '_Tesco excl. Express', 30)
    plotly(sku, 1)

In [None]:
# Checkpoint: write df_sku as it stands after the impute() calls above;
# the file is read back a few cells further down.
df_sku.to_csv("/project/data_cleaning/imputed1.csv")  

# Dropping the Sainsbury channel rows of SKUs with missing Sainsbury data

In [None]:
# Reload the checkpoint. parse_dates restores the datetime dtype of `date`,
# which a CSV round trip would otherwise turn into plain strings.
df_sku = pd.read_csv("/project/data_cleaning/imputed1.csv",index_col=0,parse_dates=['date'])

In [None]:
# SKUs whose rows in both Sainsbury channels are removed from df_sku
drop_name = ['CSTRD CRMS PRVT LBL CSTRD 400 GM SNGL',
            'JFF CKS DRK CHCLT & ORNG 122 GM SNGL',
            'STRWBRR&LV-YGRT 51 GM 5 PCK',
            'TRTS JM FXS-JM\'N-CRM RSPBRR 150 GM SNGL',
            'BRS KLLGGS CC PPS BR CHCLT 20 GM 6 PCK',
            'EVRD TRTS CKS PRVT LBL CHCLT 200 GM SNGL',
            'BLVT-BRKFST-SFT-BKS BLBRR 50 GM 5 PCK']
sainsbury_channels = ['Sainsbury Local', 'Sainsbury excl Local']
# one mask for every (listed SKU, Sainsbury channel) row, dropped in place
rows_to_drop = df_sku.full_name.isin(drop_name) & df_sku.channel.isin(sainsbury_channels)
df_sku.drop(index=df_sku.index[rows_to_drop], inplace=True)

# k-Nearest Neighbors imputation for missing at random data

In [None]:
# Plot the four SKUs handled by knn_impute() before imputation
# (one loop instead of four copy-pasted cells).
for sku in ['TWX FNGRS CRML & SHRTCK 23 GM 9 PCK',
            'RC KRSPS SQRS MRSHMLLW 28 GM 4 PCK',
            'BLVT-BRKFST-SFT-BKS CHC-CHPS 50 GM 5 PCK',
            'BRS KLLGGS CC PPS BR CHCLT 20 GM 6 PCK']:
    plotly(sku, 1)

In [None]:
def knn_impute(sku,neighbors,threshold):
    """Fill the zero-distribution rows of one SKU in `df_sku` with a KNN imputer.

    For rows of `sku` whose distribution equals 0, units_sold, kg_sold and
    Sales are blanked to NaN and distribution is set to `threshold`. A
    ``KNNImputer`` with `neighbors` neighbours is then fitted on the SKU's
    four columns (all channels together) and its output is written back
    into the notebook-level `df_sku`.
    """
    measures = ['units_sold','kg_sold','Sales']
    knn_columns = measures + ['distribution']
    is_sku = df_sku.full_name==sku
    # rows to impute; evaluated once, before distribution is overwritten
    no_distribution = is_sku & (df_sku.distribution==0)

    df_sku.loc[no_distribution, measures] = np.nan
    df_sku.loc[no_distribution, ['distribution']] = threshold

    imputer = KNNImputer(n_neighbors=neighbors,copy = False)
    df_sku.loc[is_sku, knn_columns] = imputer.fit_transform(df_sku.loc[is_sku, knn_columns])

In [None]:
# SKU -> distribution value written to the rows that get imputed.
# Each SKU is imputed from its 3 nearest neighbours
# (one loop instead of four copy-pasted cells; call order unchanged).
knn_distribution_fill = {'TWX FNGRS CRML & SHRTCK 23 GM 9 PCK': 0.22,
                         'RC KRSPS SQRS MRSHMLLW 28 GM 4 PCK': 0.23,
                         'BLVT-BRKFST-SFT-BKS CHC-CHPS 50 GM 5 PCK': 0.3,
                         'BRS KLLGGS CC PPS BR CHCLT 20 GM 6 PCK': 0.33}
for sku, fill_value in knn_distribution_fill.items():
    knn_impute(sku, 3, fill_value)


In [None]:
# Plot the same four SKUs again to inspect the result of the KNN imputation
# (one loop instead of four copy-pasted cells).
for sku in ['TWX FNGRS CRML & SHRTCK 23 GM 9 PCK',
            'RC KRSPS SQRS MRSHMLLW 28 GM 4 PCK',
            'BLVT-BRKFST-SFT-BKS CHC-CHPS 50 GM 5 PCK',
            'BRS KLLGGS CC PPS BR CHCLT 20 GM 6 PCK']:
    plotly(sku, 1)

# dropping missing data at the beginning of the time series

In [None]:
# Plot this SKU's series (mode 1) before its early rows are removed by drop_first().
plotly('TRTS OTHR CRM OR-THNS VNLL 192 GM SNGL',1)

In [None]:
def drop_first(name,date):
    """Remove, in place, the rows of SKU `name` dated on or before `date` from `df_sku`."""
    leading_rows = (df_sku.full_name==name) & (df_sku.date<=date)
    df_sku.drop(index=df_sku.index[leading_rows], inplace=True)

In [None]:
# SKU -> last date of the leading stretch to discard (rows on or before it are dropped)
series_start_cutoffs = {'TRTS OTHR CRM OR-THNS VNLL 192 GM SNGL': '2018-07-14',
                        'BLVT-BRKFST-SFT-BKS BLBRR 50 GM 5 PCK': '2018-07-07'}
for sku, cutoff in series_start_cutoffs.items():
    drop_first(sku, cutoff)

# plot both trimmed series to check the result
for sku in series_start_cutoffs:
    plotly(sku, 1)

# some final adjustments

In [None]:
#create new variables
# unit and per-kg price: Sales divided by the quantity sold, rounded to 3 decimals
df_sku['price_per_unit'] = df_sku['Sales'].div(df_sku['units_sold']).round(3)
df_sku['price_per_kg'] = df_sku['Sales'].div(df_sku['kg_sold']).round(3)

# fillna for new variables
#df_sku['price_per_unit'] = df_sku['price_per_unit'].fillna(0.0)
#df_sku['price_per_kg'] = df_sku['price_per_kg'].fillna(0.0)
#df_sku.isnull().mean()


In [None]:
# Tesco Express rows up to 2019-09-14 of the two SKUs processed with impute()
early_period = df_sku.date<='2019-09-14'
tesco_express = df_sku.channel=='Tesco Express'
tesco_main = df_sku.channel=='Tesco excl. Express'
is_gngr = df_sku.full_name=='GNGR MCVTS GNGR NTS GNGR 250 GM SNGL'
is_jff = df_sku.full_name=='JFF CKS DRK CHCLT & ORNG 244 GM SNGL'

# GNGR: fixed unit price of 1
df_sku.loc[is_gngr & tesco_express & early_period, 'price_per_unit'] = 1
# JFF: copy the unit prices of the Tesco main channel, row by row
# (positional assignment: both selections must have the same number of rows)
jff_main_prices = df_sku.loc[is_jff & tesco_main & early_period, 'price_per_unit'].values.tolist()
df_sku.loc[is_jff & tesco_express & early_period, 'price_per_unit'] = jff_main_prices

In [None]:
# Recompute units_sold = Sales / price_per_unit for the Tesco Express rows up
# to 2019-09-14 of both SKUs.
# Each SKU uses ITS OWN rows on the right-hand side. The earlier version divided
# the JFF rows when assigning to GNGR; because .loc assignment aligns on the
# index, that filled the GNGR rows with NaN.
express_early = (df_sku.channel=='Tesco Express') & (df_sku.date<='2019-09-14')
for sku in ['GNGR MCVTS GNGR NTS GNGR 250 GM SNGL',
            'JFF CKS DRK CHCLT & ORNG 244 GM SNGL']:
    sku_rows = express_early & (df_sku.full_name==sku)
    df_sku.loc[sku_rows, 'units_sold'] = (df_sku.loc[sku_rows, 'Sales']
                                          / df_sku.loc[sku_rows, 'price_per_unit'])


In [None]:
# Checkpoint: write df_sku including the price columns and overrides added above;
# the file is read back in the next cell.
df_sku.to_csv("/project/data_cleaning/imputed2.csv")  

In [None]:
# NOTE(review): df_to_be_imputed.csv is not written anywhere in the visible part
# of this notebook - confirm where it comes from.
df_to_be_imputed = pd.read_csv("/project/data_cleaning/df_to_be_imputed.csv",index_col=0)
# parse_dates restores the datetime dtype of `date` lost in the CSV round trip
df_sku = pd.read_csv("/project/data_cleaning/imputed2.csv",index_col=0,parse_dates=['date'])

In [None]:
# Outer merge on all shared columns: rows of df_valid plus rows of df_to_be_imputed
df_final_valid = df_valid.merge(df_to_be_imputed, how="outer")
df_final_valid.to_csv("/project/data_cleaning/df_final_valid.csv")

In [None]:
# keep only the rows of SKUs listed in df_final_valid, summarise and save them
final_sku_names = df_final_valid['full_name'].values.tolist()
df_final1 = df_sku.loc[df_sku['full_name'].isin(final_sku_names)]
df_final1.describe()
df_final1.to_csv("/project/data_cleaning/df_final1.csv")

In [None]:
# identifying valid data points for each sku, based on the distribution(managing out of stock/delisting)
# Recomputed on the imputed df_sku. NOTE: this rebinds df_flt and df_valid.
Threshold = 0.2
# summary column -> channel label used in df_sku
channel_columns = {'Tesco_Express': 'Tesco Express',
                   'Tesco_Main': 'Tesco excl. Express',
                   'Sainsbury_Local': 'Sainsbury Local',
                   'Sainsbury_Main': 'Sainsbury excl Local'}

# numeric_only=True: text and date columns cannot be summed; only 'Sales' is used below.
df_sku_gb = df_sku.groupby(['full_name'],as_index=False,sort=False).sum(numeric_only=True)
# SKUs in order of first appearance; same order as the sort=False groupby above
product = df_sku['full_name'].drop_duplicates().values.tolist()
total_sales = df_sku_gb['Sales'].values.tolist()

# Number of valid data points (distribution above Threshold) per SKU and
# channel, computed in one pass instead of re-filtering the frame per SKU.
valid_counts = (df_sku[df_sku['distribution'] > Threshold]
                .groupby(['full_name', 'channel'])['date'].count()
                .unstack('channel')
                .reindex(index=product, columns=list(channel_columns.values()))
                .fillna(0)          # SKU/channel pairs without any valid point
                .astype(int))
Tesco_Express = valid_counts['Tesco Express'].tolist()
Tesco_Main = valid_counts['Tesco excl. Express'].tolist()
Sainsbury_Local = valid_counts['Sainsbury Local'].tolist()
Sainsbury_Main = valid_counts['Sainsbury excl Local'].tolist()

# Descriptive attributes: exactly one value per SKU is required, otherwise the
# summary rows would no longer line up with `product`.
attribute_columns = ['company', 'brand', 'pack_type']
ambiguous = (df_sku.groupby('full_name', sort=False)[attribute_columns]
             .nunique(dropna=False).gt(1).any(axis=1))
if ambiguous.any():
    raise ValueError('SKUs with more than one company/brand/pack_type: '
                     + ', '.join(map(str, ambiguous[ambiguous].index)))
# first row of every SKU, already in `product` order
sku_attributes = df_sku.drop_duplicates(subset='full_name')
company = sku_attributes['company'].tolist()
brand = sku_attributes['brand'].tolist()
pack = sku_attributes['pack_type'].tolist()

data = {'full_name':product, "Tesco_Express":Tesco_Express,"Tesco_Main": Tesco_Main,
        'Sainsbury_Local':Sainsbury_Local,'Sainsbury_Main':Sainsbury_Main,
       'total_sales':total_sales,'company':company,'brand':brand, 'pack':pack}
df_flt = pd.DataFrame(data)
df_flt = df_flt.sort_values(by=['total_sales'], ascending=False)

# minimum number of valid data points required per channel; here both Tesco
# channels must reach fuse1
fuse1=150
fuse2=0
# One combined mask built from df_flt itself (the previous step-wise filtering
# indexed the shrinking df_valid with masks taken from the larger df_flt).
valid_mask = ((df_flt['Tesco_Express'] >= fuse1)
              & (df_flt['Tesco_Main'] >= fuse1)
              & (df_flt['Sainsbury_Local'] >= fuse2)
              & (df_flt['Sainsbury_Main'] >= fuse2))
df_valid = df_flt[valid_mask]
df_valid
df_valid.describe()