In [11]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt

In [12]:
data_inventory = pd.read_csv("../data/inventory.csv")
data_products = pd.read_csv("../data/products.csv")
data_promotions = pd.read_csv("../data/promotions.csv")
data_transactions = pd.read_csv("../data/transactions.csv")

In [13]:
data_transactions[data_transactions['description'] == 'Rundergehakt'].head()

Unnamed: 0,day,time,customer,bank acount,category,product_id,description,size,std_sales_price,purchase_price,bio,basic
374,1/1/2018,13:52:41,23.0,53801240.0,meat,m_7,Rundergehakt,300g,2.29,2.29,0.0,0.0
383,1/1/2018,13:52:41,23.0,53801240.0,meat,m_1,Rundergehakt,500g,3.69,3.69,0.0,0.0
403,1/1/2018,14:09:57,24.0,55280334.0,meat,m_7,Rundergehakt,300g,2.29,2.29,0.0,0.0
496,1/1/2018,14:30:47,27.0,51176058.0,meat,m_1,Rundergehakt,500g,3.69,3.69,0.0,0.0
553,1/1/2018,15:19:45,31.0,51171942.0,meat,m_7,Rundergehakt,300g,2.29,2.29,0.0,0.0


In [14]:
data_promotions

Unnamed: 0,week,category,product_id,description,discount
0,1,vegetable,v_14,Biologische knoflook,30
1,1,vegetable,v_2,Courgette,5
2,1,vegetable,v_16,Biologische kikkererwten,10
3,1,vegetable,v_12,Biologische courgette,15
4,1,bread,b_41,Chinois brioche,30
...,...,...,...,...,...
775,52,vegetable,v_5,Paprika,10
776,52,meat,m_2,Gerookte spekreepjes,35
777,52,vegetable,v_32,Buitenbeentjes komkommer,10
778,52,meat,m_12,Unox Gelderse rookworst,30


In [15]:
#data_promotions['description'].value_counts()

In [16]:
def sales_different_prices(df_transactions, product :str, size):
    product_sales = data_transactions[data_transactions['description'] == product]
    product_sales = product_sales[product_sales['size'] == size]
    sales_per_day = product_sales['day'].value_counts()
    
    daily_sales = []
    daily_price = []
    for day in range(len(sales_per_day.index)):
        sales = sales_per_day[day]
        price1 = product_sales[product_sales['day'] == sales_per_day.index[day]]['purchase_price']
        #price1 = product_sales[product_sales['day'] == sales_per_day.index[day]]['purchase_price']
        price = [price1[price1.index[0]]]
        daily_sales.append(sales)
        daily_price.append(price)
        
    return daily_price, daily_sales   

In [17]:
def sales_different_prices_id(df_transactions, product_id):
    product_sales = data_transactions[data_transactions['product_id'] == product_id]
    sales_per_day = product_sales['day'].value_counts()
    
    daily_sales = []
    daily_price = []
    for day in range(len(sales_per_day.index)):
        sales = sales_per_day[day]
        price1 = product_sales[product_sales['day'] == sales_per_day.index[day]]['purchase_price']
        #price1 = product_sales[product_sales['day'] == sales_per_day.index[day]]['purchase_price']
        price = [price1[price1.index[0]]]
        daily_sales.append(sales)
        daily_price.append(price)
        
    return daily_price, daily_sales   

In [18]:
def sales_different_pricesandsizes(df_transactions, product):
    product_sales = data_transactions[data_transactions['description'] == product]
    #product_sales = product_sales[product_sales['size'] == size]
    #sales_per_day = product_sales['day'].value_counts()
    days = product_sales['day'].unique()
    
    daily_sales = []
    daily_price = []
    size_dummy = -1
    

    for size in product_sales['size'].unique():
        product_sales_loop = product_sales[product_sales['size'] == size]
        sales_per_day = product_sales_loop['day'].value_counts()
        size_dummy += 1
        for day in range(len(sales_per_day)):
            sales = sales_per_day[day]
            price1 = product_sales[product_sales['day'] == sales_per_day.index[day]]['purchase_price']
            #price1 = product_sales[product_sales['day'] == sales_per_day.index[day]]['purchase_price']
            price = [price1[price1.index[0]], size_dummy]
            daily_sales.append(sales)
            daily_price.append(price)
        
    return daily_price, daily_sales   

In [19]:
def forest_df(data, product):
    days = []
    sales = []
    prices = []
    size_list = []
    
    different_sizes = data[data['description'] == product]['size'].unique()
    different_sizes = [x for x in different_sizes if x == x]
    product_data = data[data['description'] == product]
    
    for day in data['day'].unique():
        size_variable = -1
        daily_sales = data[data['day'] == day]
        product_sales = daily_sales[daily_sales['description'] == product]
        
        for size in different_sizes:
            size_variable += 1
            sales_per_size = len(product_sales[product_sales['size'] == size])
            try:
                price = product_sales[product_sales['size'] == size]['purchase_price'].iloc[0]
            except:
                price = product_data[product_data['size'] == size]['purchase_price'].max()
            
            days.append(day)
            sales.append(sales_per_size)
            size_list.append(size_variable)
            prices.append(price)
     
    raw_data = {'date' : days, 'amount_sold':sales, 'price': prices, 'size':size_list}
    dataframe = pd.DataFrame(raw_data)
    return dataframe

In [20]:
def forest_variables(forest_data):
    prices = forest_data['price']
    sales = forest_data['amount_sold']
    sizes = forest_data['size']
    
    zipped = zip(prices, sizes)
    x_train = [list(a) for a in zipped]
    y_train = sales
    
    return x_train, y_train
    

In [21]:
#product_sales = data_transactions[data_transactions['description'] == 'Rundergehakt']
#product_sales_loop = product_sales[product_sales['size'] == '500g']
#sales_per_day = product_sales_loop['day'].value_counts()
#sales_per_day[150:180]

In [22]:
#x_price, y_sales = sales_different_prices(data_transactions, 'Rundergehakt', '500g')

In [23]:
#x_price1, y_sales1 = sales_different_pricesandsizes(data_transactions, 'Rundergehakt')

In [24]:
#x_price2, y_sales2 = sales_different_pricesandsizes(data_transactions, 'm_1')

In [25]:
forest_data = forest_df(data_transactions, 'Rundergehakt')

In [203]:
forest_data.head()

Unnamed: 0,date,amount_sold,price,size
0,1/1/2018,5,2.29,0
1,1/1/2018,3,3.69,1
2,,0,2.29,0
3,,0,3.69,1
4,2/1/2018,4,2.29,0


In [26]:
x_price, y_sales = forest_variables(forest_data)

In [43]:
X_train, X_test, y_train, y_test =  train_test_split(x_price, y_sales, test_size = 0.25, random_state=42)


In [55]:
from math import ceil

bos = RandomForestRegressor(n_estimators = 500, min_samples_split = 0.05, min_samples_leaf = 0.02, max_features = ceil(len(X_train[0])/3), max_depth = 100, bootstrap = True, random_state=42)
bos.fit(X_train, y_train)

RandomForestRegressor(max_depth=100, max_features=1, min_samples_leaf=0.02,
                      min_samples_split=0.05, n_estimators=500,
                      random_state=42)

In [56]:
from sklearn import metrics

y_pred = bos.predict(X_test)
np.sqrt(metrics.mean_squared_error(y_test, y_pred))

2.265150091284047

In [218]:
bos_clf = RandomForestClassifier(n_estimators = 600, min_samples_split =10, min_samples_leaf = 4, max_features = 'sqrt', max_depth = 90, bootstrap = False)
bos_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=90, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=600, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [212]:
def Accuracy(bos, X_test, y_test):
    predictions = bos.predict(X_test)
    rounded_predictions = [round(num) for num in predictions]
    acc = 0
    for i in range(len(y_test)):
        if rounded_predictions[i] == y_test.iloc[i]:
            acc += 1
            
    return acc/len(predictions)

In [223]:
Accuracy(bos, X_test, y_test)

0.14754098360655737

In [None]:
# input: [Samples, features]
# output: [Samples, quantity demand]

## Tuning parameters

In [17]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

In [18]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', len(X_train[0])/3]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(20, 200, num = 10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [x for x in np.linspace(0.01, 0.07, num = 4)]
# Minimum number of samples required at each leaf node
min_samples_leaf = [x for x in np.linspace(0.01, 0.07, num = 4)]
# Method of selecting samples for training each tree
bootstrap = [True]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [220]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor(random_state=42)
#rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:   10.9s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=25, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [221]:
rf_random.best_params_

{'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}

In [None]:
try:
    for key, value in grid_best_param.items():
        best_param = rf_random.best_params_[key]
        if best_param not in value:
            value.append(rf_random.best_params_[key])
except:
    grid_best_param = {key: [value] for key, value in rf_random.best_params_.items()}

In [None]:
rf = RandomForestRegressor()
rf_grid = GridSearchCV(estimator = rf, param_grid = grid_best_param, cv = 3, n_jobs = -1)
rf_grid.fit(X_train, y_train)

In [None]:
rf_grid.best_params_

In [None]:
# Fill in the rf_grid.best_params_
rf = RandomForestRegressor(n_estimators = , min_samples_split = , min_samples_leaf = , max_depth = , max_features = , bootstrap = True)
rf.fit(X_train, y_train)

In [None]:
rf.score(X_test, y_test)

In [31]:
# # Test
# d = {1: ["one"], 2: ["three"]}
# d1 = {1: 'hi', 2: "three"}

# # try:
# for key, value in d.items():
#     best_param = d1[key]
#     if best_param not in value:
#         value.append(d1[key])

# print(d)
# # d = {key:  for key, value in d.items()}
# # # except:
# # #     d = {key: [value] for key, value in d.items()}
# # print(d)

{1: ['one', 'hi'], 2: ['three']}
