In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor
import xgboost as xgb
import pickle

In [13]:
data=pd.read_csv("Monthly_Training_Data.csv",usecols=["Service Name","Month","Year","Count"])
data.head()

Unnamed: 0,Service Name,Count,Month,Year
0,Anterior Chamber Wash,1,1,2016
1,Athens Protocol,16,1,2016
2,Clear Lens Exchange (CLE / RLE),1,1,2016
3,Combined Phaco Vitrectomy,40,1,2016
4,Combined Phacotrabeculectomy,8,1,2016


In [14]:
ServiceNames = {name : idx for idx, name in enumerate(data['Service Name'].unique())}
ServiceIndex= {idx : name for idx, name in enumerate(data['Service Name'].unique())}

In [15]:
processed_data = data.copy()
processed_data['Service Name']=processed_data['Service Name'].map(ServiceNames)
processed_data.head()

Unnamed: 0,Service Name,Count,Month,Year
0,0,1,1,2016
1,1,16,1,2016
2,2,1,1,2016
3,3,40,1,2016
4,4,8,1,2016


In [16]:
X=data.drop(["Count"],axis=1)
y=data["Count"]

In [17]:
split_index = processed_data[(processed_data['Month'] == 6)].iloc[-1].name

In [18]:
# X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2,random_state=42)

X_train = processed_data.iloc[:split_index].drop('Count', axis=1)
X_test = processed_data.iloc[split_index:].drop('Count', axis=1)

y_train = processed_data.iloc[:split_index]['Count']
y_test = processed_data.iloc[split_index:]['Count']

X_train.shape, X_test.shape, y_train.shape, y_test.shape


((1158, 3), (234, 3), (1158,), (234,))

In [19]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [20]:
def prediction_cases(regressor):
     month_ahead = processed_data[(processed_data['Year'] == 2018) & (processed_data['Month'] == 7)]
     
     quarter_year = processed_data[(processed_data['Year'] == 2018) & (processed_data['Month'] >= 9)]
     
     half_year = processed_data[(processed_data['Year'] == 2018) & (processed_data['Month'] >= 7)]

     month_ahead_pred = regressor.predict(month_ahead.drop('Count', axis=1))
     quarter_year_pred = regressor.predict(quarter_year.drop('Count', axis=1))
     half_year_pred = regressor.predict(half_year.drop('Count', axis=1))
     
     err = mean_absolute_error(month_ahead['Count'], month_ahead_pred)
     month_MAPE = mean_absolute_percentage_error(month_ahead['Count'], month_ahead_pred)
     quarter_year_MAPE = mean_absolute_percentage_error( quarter_year['Count'], quarter_year_pred)
     half_year_MAPE = mean_absolute_percentage_error(half_year['Count'], half_year_pred) 

     return err,month_MAPE, quarter_year_MAPE, half_year_MAPE

In [21]:
def perform_regression(regressor):
    # regressor = DecisionTreeRegressor(random_state=0)
    regressor.fit(X_train, y_train)
    # y_pred = regressor.predict(X_test)
    # print(f'Total MAE {mean_absolute_percentage_error(y_pred, y_test)}')
    err,month_MAPE ,quarter_year_MAPE, half_year_MAPE = prediction_cases(regressor)

    print(f'1 month predection MAE {err}')
    print(f'1 month predection MAPE {month_MAPE}')
    print(f'Last quarter year MAPE {quarter_year_MAPE}')
    print(f'half year MAPE {half_year_MAPE}')

In [22]:
gbc=GradientBoostingRegressor(n_estimators=500,max_depth=8,min_samples_split=2,loss="ls",random_state=7)
perform_regression(gbc)

1 month predection MAE 12.616720472808476
1 month predection MAPE 65.28717760323758
Last quarter year MAPE 68.6403748200217
half year MAPE 69.63421056416956
