# Route Demand Forecasting
## Using the revenue data of a logistics company on a particular route, we forecasted the revenue for the next 6 months


In [227]:
#libraries for computation and dataframe handling
import pandas as pd
from pandas import DataFrame
import numpy as np

#libraries for Random Forest algorithm
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor

#Libraries for date series
from datetime import datetime
import calendar
from datetime import timedelta
import datetime as dt

In [228]:
# Import the dataset

df = pd.read_csv('revenue.csv')
df.head()

Unnamed: 0,date,total_revenue
0,2019/04/01,158
1,2019/05/01,199
2,2019/06/01,132
3,2019/07/01,146
4,2019/08/01,161


In [229]:
# Function to create additional rows based on the forecast length

def add_month(df, forecast_length, forecast_period):
    end_point = len(df)
    df1 = pd.DataFrame(index=range(forecast_length), columns=range(2))
    df1.columns = ['total_revenue', 'date']
    df = df.append(df1)
    df = df.reset_index(drop=True)
    
    x = df.at[end_point - 1, 'date']
    x = pd.to_datetime(x, format='%Y-%m-%d')
    days_in_month=calendar.monthrange(x.year, x.month)[1]
    if forecast_period == 'Week':
        for i in range(forecast_length):
            df.at[df.index[end_point + i], 'date'] = x + timedelta(days=7 + 7 * i)
            df.at[df.index[end_point + i], 'total_revenue'] = 0
    elif forecast_period == 'Month':
        for i in range(forecast_length):
            df.at[df.index[end_point + i], 'date'] = x + timedelta(days=days_in_month + days_in_month * i)
            df.at[df.index[end_point + i], 'total_revenue'] = 0
            
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
    df['month'] = df['date'].dt.month
    df = df.drop(['date'], axis=1)
    
    return df

In [230]:
# Function to create lag variables to be used for forecasting

def create_lag(df2):
    dataframe = DataFrame()
    for i in range(12, 0, -1):
        dataframe['t-' + str(i)] = df2.total_revenue.shift(i)
    df3 = pd.concat([df2, dataframe], axis=1)
    df3.dropna(inplace=True)
    return df3


In [231]:
def randomForest(df1, forecast_length, forecast_period):
    df2 = df1[['total_revenue', 'date']]
    df2 = add_month(df2, forecast_length, forecast_period) # calling function add_month()
    
    finaldf = create_lag(df2) # calling function create_lag()
    finaldf = finaldf.reset_index(drop=True)
    
    n = forecast_length
    end_point = len(finaldf)
    x = end_point - n
    
    #Subsetting training and testing set from the original data
    dataset = finaldf.loc[:x - 1, :]
    finaldf_train = dataset.loc[:n - 1, :]
    finaldf_test = dataset.loc[n:, :]
    finaldf_test_x = finaldf_test.loc[:, finaldf_test.columns != 'total_revenue']
    finaldf_test_y = finaldf_test['total_revenue']
    finaldf_train_x = finaldf_train.loc[:, finaldf_train.columns != 'total_revenue']
    finaldf_train_y = finaldf_train['total_revenue']
        
    #Initialising Random Forest Regressor
    print("Starting model train..")
    rfe = RFE(RandomForestRegressor(n_estimators=100, random_state=1), 4)
    fit = rfe.fit(finaldf_train_x, finaldf_train_y)
    print("Model train completed..")
    
    #Predicting the testing set using the trained model and printing the accuracy
    print("Starting testing set prediction")
    y_pred = fit.predict(finaldf_test_x)
    y_true = np.array(finaldf_test_y)
    sumvalue=np.sum(y_true)
    mape=np.sum(np.abs((y_true - y_pred)))/sumvalue*100
    accuracy=100-mape
    print('Accuracy:', round(accuracy,2),'%.')
    
    #Forecasting the revenue for April - September 2021 and appending it to the main dataset
    
    print("Creating forecasted set..")
    yhat = []
    end_point = len(finaldf)
    n = forecast_length
    df2_end = len(df2)
    
    for i in range(n, 0, -1):
        y = end_point - i
        inputfile = finaldf.loc[y:end_point, :]
        inputfile_x = inputfile.loc[:, inputfile.columns != 'total_revenue']
        pred_set = inputfile_x.head(1)
        pred = fit.predict(pred_set)
        df2.at[df2.index[df2_end - i], 'total_revenue'] = pred[0]
        finaldf = create_lag(df2)
        finaldf = finaldf.reset_index(drop=True)
        yhat.append(pred)
    yhat = np.array(yhat)
    print("Forecast complete..")
    return finaldf

In [232]:
predicted_value=randomForest(df, 6, 'Month')

Starting model train..




Model train completed..
Starting testing set prediction
Accuracy: 62.46 %.
Creating forecasted set..
Forecast complete..


In [233]:
# Revenue prediction for April 2021 to September 2021
predicted_value[:6]

Unnamed: 0,total_revenue,month,t-12,t-11,t-10,t-9,t-8,t-7,t-6,t-5,t-4,t-3,t-2,t-1
0,53,4,158,199,132,146,161,170,158,53,200,198,193,146
1,132,5,199,132,146,161,170,158,53,200,198,193,146,53
2,158,6,132,146,161,170,158,53,200,198,193,146,53,132
3,143,7,146,161,170,158,53,200,198,193,146,53,132,158
4,170,8,161,170,158,53,200,198,193,146,53,132,158,143
5,169,9,170,158,53,200,198,193,146,53,132,158,143,170


The model has a accuracy of 62.46%.

The dataset had only revenue data for 24 months. 

It is hoped that with more data, a higher accuracy can be achieved in revenue forecasting