In [1]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt

In [2]:
data_inventory = pd.read_csv(r"C:\Users\20193727\Downloads\Data\Data\inventory.csv")
data_products = pd.read_csv(r"C:\Users\20193727\Downloads\Data\Data\products.csv")
data_promotions = pd.read_csv(r"C:\Users\20193727\Downloads\Data\Data\promotions.csv")
data_transactions = pd.read_csv(r"C:\Users\20193727\Downloads\Data\Data\transactions.csv")

In [3]:
def forest_df(data, product):
    days = []
    sales = []
    prices = []
    size_list = []
    
    different_sizes = data[data['description'] == product]['size'].unique()
    different_sizes = [x for x in different_sizes if x == x]
    product_data = data[data['description'] == product]
    
    for day in data['day'].unique():
        size_variable = -1
        daily_sales = data[data['day'] == day]
        product_sales = daily_sales[daily_sales['description'] == product]
        
        for size in different_sizes:
            size_variable += 1
            sales_per_size = len(product_sales[product_sales['size'] == size])
            try:
                price = product_sales[product_sales['size'] == size]['purchase_price'].iloc[0]
            except:
                price = product_data[product_data['size'] == size]['purchase_price'].max()
            
            days.append(day)
            sales.append(sales_per_size)
            size_list.append(size_variable)
            prices.append(price)
     
    raw_data = {'date' : days, 'amount_sold':sales, 'price': prices, 'size':size_list}
    dataframe = pd.DataFrame(raw_data)
    return dataframe

In [None]:
def days_to_weeks(data):
    

In [4]:
def forest_variables(forest_data):
    prices = forest_data['price']
    sales = forest_data['amount_sold']
    sizes = forest_data['size']
    
    zipped = zip(prices, sizes)
    x_train = [list(a) for a in zipped]
    y_train = sales
    
    return x_train, y_train
    

In [5]:
forest_data = forest_df(data_transactions, 'Rundergehakt')

In [6]:
x_price, y_sales = forest_variables(forest_data)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(x_price, y_sales, test_size=0.25, random_state=42)

## models


In [15]:
bos = RandomForestRegressor(n_estimators = 2000, min_samples_split =5, min_samples_leaf = 2, max_features = 'auto', max_depth = 50, bootstrap = True)
bos.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=50,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=5,
           min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [None]:
bos_clf = RandomForestClassifier(n_estimators = 600, min_samples_split =10, min_samples_leaf = 4, max_features = 'sqrt', max_depth = 90, bootstrap = False)
bos_clf.fit(X_train, y_train)

In [9]:
def Accuracy(bos, X_test, y_test):
    predictions = bos.predict(X_test)
    rounded_predictions = [round(num) for num in predictions]
    acc = 0
    for i in range(len(y_test)):
        if rounded_predictions[i] == y_test.iloc[i]:
            acc += 1
            
    return acc/len(predictions)

In [16]:
Accuracy(bos, X_test, y_test)

0.14754098360655737

## parameter tuning

In [11]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

In [12]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 64, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [13]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
#rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 25, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:   11.0s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=25, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [14]:
rf_random.best_params_

{'n_estimators': 2000,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': 50,
 'bootstrap': True}