In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor
import xgboost as xgb
import pickle
import plotly.graph_objects as go

In [2]:
# Read only the columns used downstream; the weekly file additionally carries "wom".
MONTHLY_COLUMNS = ["Service Name","Month","Year","Count"]
WEEKLY_COLUMNS = MONTHLY_COLUMNS + ["wom"]

monthly_data = pd.read_csv("Monthly_Training_Data.csv", usecols=MONTHLY_COLUMNS)
weekly_data = pd.read_csv("Weekly_training_data.csv", usecols=WEEKLY_COLUMNS)
print(weekly_data.shape,monthly_data.shape)
monthly_data.head()

(4148, 5) (1392, 4)


Unnamed: 0,Service Name,Count,Month,Year
0,Anterior Chamber Wash,1,1,2016
1,Athens Protocol,16,1,2016
2,Clear Lens Exchange (CLE / RLE),1,1,2016
3,Combined Phaco Vitrectomy,40,1,2016
4,Combined Phacotrabeculectomy,8,1,2016


In [3]:
# # monthly_data=monthly_data.sort_values(by=["Count"],ascending=False).iloc[0:100]
# monthly_data=monthly_data[(monthly_data['Year'] == 2018) & (monthly_data['Month'] == 12)]
# monthly_data=monthly_data.sort_values(by=["Count"],ascending=False).iloc[0:20]
# monthly_data

In [4]:
# English -> Arabic service-name lookup used to add the 'Service Name AR' column.
# pickle.load can execute arbitrary code: only load a file you produced yourself.
with open('operations_names_en_to_ar.pickle', 'rb') as names_file:
    operations_names_en_to_ar = pickle.load(names_file)

In [5]:
# Service_Names_to_index = {name : idx for idx, name in enumerate(weekly_data['Service Name'].unique())}
# Service_Index_to_name= {idx : name for idx, name in enumerate(weekly_data['Service Name'].unique())}
# monthly_Service_Index_to_name= {idx : name for idx, name in enumerate(weekly_data['Service Name'].unique())}

In [6]:
def encode_monthly_names(df):
    """Replace the 'Service Name' column of ``df`` with integer codes.

    The code of a name is its position among the unique service names of the
    module-level ``monthly_data`` frame (order of first appearance). Names
    that do not occur in ``monthly_data`` have no entry in the lookup.

    ``df`` is modified in place and also returned.
    """
    known_names = monthly_data['Service Name'].unique()
    code_by_name = dict(zip(known_names, range(len(known_names))))
    df['Service Name'] = df['Service Name'].map(code_by_name)
    return df

In [7]:
def decode_monthly_names(df):
    """Turn integer codes in ``df['Service Name']`` back into service names.

    Uses the inverse of the lookup built by ``encode_monthly_names``: code
    ``i`` maps to the i-th unique service name of the module-level
    ``monthly_data`` frame.

    ``df`` is modified in place and also returned.
    """
    name_by_code = dict(enumerate(monthly_data['Service Name'].unique()))
    df['Service Name'] = df['Service Name'].map(name_by_code)
    return df

In [8]:
def encode_weekly_names(df):
    """Replace the 'Service Name' column of ``df`` with integer codes.

    The code of a name is its position among the unique service names of the
    module-level ``weekly_data`` frame (order of first appearance). Names
    that do not occur in ``weekly_data`` have no entry in the lookup.

    ``df`` is modified in place and also returned.
    """
    known_names = weekly_data['Service Name'].unique()
    code_by_name = dict(zip(known_names, range(len(known_names))))
    df['Service Name'] = df['Service Name'].map(code_by_name)
    return df

In [9]:
def decode_weekly_names(df):
    """Turn integer codes in ``df['Service Name']`` back into service names.

    Uses the inverse of the lookup built by ``encode_weekly_names``: code
    ``i`` maps to the i-th unique service name of the module-level
    ``weekly_data`` frame.

    ``df`` is modified in place and also returned.
    """
    name_by_code = dict(enumerate(weekly_data['Service Name'].unique()))
    df['Service Name'] = df['Service Name'].map(name_by_code)
    return df

In [10]:
# The encode_* helpers overwrite 'Service Name' on the frame they are given,
# so hand them copies and keep the raw frames (with readable names) untouched.
weekly_processed_data = encode_weekly_names(weekly_data.copy())
monthly_processed_data = encode_monthly_names(monthly_data.copy())
monthly_processed_data.head()

Unnamed: 0,Service Name,Count,Month,Year
0,0,1,1,2016
1,1,16,1,2016
2,2,1,1,2016
3,3,40,1,2016
4,4,8,1,2016


In [11]:
# X=data.drop(["Count"],axis=1)
# y=data["Count"]

In [12]:
def _last_row_label(frame, year, month):
    """Return the index label of the last row of ``frame`` for ``year``/``month``."""
    in_period = (frame['Year'] == year) & (frame['Month'] == month)
    return frame[in_period].iloc[-1].name


# NOTE(review): these are labels of the last row *inside* the period, while
# get_X_y slices positionally with iloc[:split_index]; with a default integer
# index that leaves this last row out of the training partition -- confirm
# whether that is intended.
monthly_split_index = _last_row_label(monthly_processed_data, 2018, 6)
weekly_split_index = _last_row_label(weekly_processed_data, 2018, 11)
print(monthly_split_index,weekly_split_index)

1158 4014


In [13]:
# X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2,random_state=42)
def get_X_y(df, split_index, target='Count'):
    """Split ``df`` into train/test features and targets at a row position.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns plus the ``target`` column.
    split_index : int
        Positional boundary (used with ``iloc``): rows ``[:split_index]`` form
        the training partition, rows ``[split_index:]`` the test partition.
    target : str, default 'Count'
        Name of the column to predict. It is removed from the feature frames
        and returned separately.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Feature frames and target series for the two partitions. ``df``
        itself is not modified.
    """
    # Slice each partition once, then separate features from the target.
    train_rows = df.iloc[:split_index]
    test_rows = df.iloc[split_index:]

    X_train = train_rows.drop(columns=target)
    X_test = test_rows.drop(columns=target)

    y_train = train_rows[target]
    y_test = test_rows[target]

    return X_train, X_test, y_train, y_test


In [14]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (in percent).

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Entries whose true value is exactly 0 are left out of the mean: the
    percentage error is undefined there, and dividing by zero would turn the
    whole result into ``inf``/``nan``. If no entry has a non-zero true value
    (including empty input) the result is ``nan``.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Actual and predicted values. They are broadcast against each other.

    Returns
    -------
    float
        The error as a percentage, or ``nan`` when it cannot be computed.
    """
    # Coerce to float so integer or object inputs divide as real numbers.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Keep plain NumPy broadcasting semantics before masking.
    y_true, y_pred = np.broadcast_arrays(y_true, y_pred)

    defined = y_true != 0
    if not defined.any():
        return np.nan

    relative_error = (y_true[defined] - y_pred[defined]) / y_true[defined]
    return np.mean(np.abs(relative_error)) * 100

In [15]:
# weekly_processed_data[(weekly_processed_data['Year'] == 2018) & (weekly_processed_data['Month'] == 12) & (weekly_processed_data['wom'] == 1) & (weekly_processed_data['wom'] == 2)]

In [16]:
def weekly_prediction_cases(regressor):
    """Score a fitted regressor on the busiest December-2018 weekly rows.

    Two evaluation frames are cut from the module-level
    ``weekly_processed_data`` (Year == 2018 and Month == 12), each keeping the
    10 rows with the largest actual ``Count``:

    * ``week_ahead``      -- rows with ``wom == 1``
    * ``two_weeks_ahead`` -- rows with ``wom`` equal to 1 or 2

    (``wom`` is presumably the week-of-month number -- confirm against the
    data source.)

    Each frame gets a ``predictions`` column, its ``Service Name`` codes are
    decoded back to names, and a ``Service Name AR`` column is added from
    ``operations_names_en_to_ar``.

    Parameters
    ----------
    regressor
        Fitted estimator exposing ``predict``; it receives every column of
        the frame except ``Count``.

    Returns
    -------
    tuple
        ``(err, week_ahead_MAPE, two_weeks_ahead_MAPE, week_ahead,
        two_weeks_ahead)`` where ``err`` is the mean absolute error of the
        one-week frame.
    """
    in_dec_2018 = (weekly_processed_data['Year'] == 2018) & (weekly_processed_data['Month'] == 12)
    week_of_month = weekly_processed_data['wom']

    def _busiest(mask, n_rows=10):
        # .copy() detaches the selection from weekly_processed_data: columns
        # are added and overwritten below, and assigning onto an .iloc slice
        # triggers pandas' SettingWithCopyWarning.
        return (
            weekly_processed_data[mask]
            .sort_values(by=["Count"], ascending=False)
            .iloc[0:n_rows]
            .copy()
        )

    week_ahead = _busiest(in_dec_2018 & (week_of_month == 1))
    two_weeks_ahead = _busiest(in_dec_2018 & ((week_of_month == 1) | (week_of_month == 2)))

    # Predict before any column is added so the feature set matches training.
    week_ahead_pred = regressor.predict(week_ahead.drop('Count', axis=1))
    two_weeks_ahead_pred = regressor.predict(two_weeks_ahead.drop('Count', axis=1))

    err = mean_absolute_error(week_ahead['Count'], week_ahead_pred)
    week_ahead_MAPE = mean_absolute_percentage_error(week_ahead['Count'], week_ahead_pred)
    two_weeks_ahead_MAPE = mean_absolute_percentage_error(two_weeks_ahead['Count'], two_weeks_ahead_pred)

    week_ahead["predictions"] = week_ahead_pred
    two_weeks_ahead["predictions"] = two_weeks_ahead_pred

    # Back to readable English names, then look up the Arabic equivalents.
    week_ahead = decode_weekly_names(week_ahead)
    two_weeks_ahead = decode_weekly_names(two_weeks_ahead)

    week_ahead['Service Name AR'] = week_ahead['Service Name'].map(operations_names_en_to_ar)
    two_weeks_ahead['Service Name AR'] = two_weeks_ahead['Service Name'].map(operations_names_en_to_ar)

    return err, week_ahead_MAPE, two_weeks_ahead_MAPE, week_ahead, two_weeks_ahead

In [17]:
def monthly_prediction_cases(regressor):
    """Score a fitted regressor on the busiest 2018 monthly rows.

    Three evaluation frames are cut from the module-level
    ``monthly_processed_data`` (all restricted to Year == 2018), each keeping
    the rows with the largest actual ``Count``:

    * ``month_ahead``  -- Month == 12, top 10 rows
    * ``quarter_year`` -- Month >= 9,  top 20 rows
    * ``half_year``    -- Month >= 7,  top 20 rows

    NOTE(review): ``Month >= 9`` covers months 9 and later, which is more than
    three months if the data runs through month 12 -- confirm the intended
    window for "quarter".

    Each frame gets a ``predictions`` column, its ``Service Name`` codes are
    decoded back to names, and a ``Service Name AR`` column is added from
    ``operations_names_en_to_ar``.

    Parameters
    ----------
    regressor
        Fitted estimator exposing ``predict``; it receives every column of
        the frame except ``Count``.

    Returns
    -------
    tuple
        ``(err, month_MAPE, quarter_year_MAPE, half_year_MAPE, month_ahead,
        quarter_year, half_year)`` where ``err`` is the mean absolute error of
        the one-month frame.
    """
    in_2018 = monthly_processed_data['Year'] == 2018
    month = monthly_processed_data['Month']

    def _busiest(mask, n_rows):
        # .copy() detaches the selection from monthly_processed_data: columns
        # are added and overwritten below, and assigning onto an .iloc slice
        # triggers pandas' SettingWithCopyWarning.
        return (
            monthly_processed_data[mask]
            .sort_values(by=["Count"], ascending=False)
            .iloc[0:n_rows]
            .copy()
        )

    month_ahead = _busiest(in_2018 & (month == 12), 10)
    quarter_year = _busiest(in_2018 & (month >= 9), 20)
    half_year = _busiest(in_2018 & (month >= 7), 20)

    # Predict before any column is added so the feature set matches training.
    month_ahead_pred = regressor.predict(month_ahead.drop('Count', axis=1))
    quarter_year_pred = regressor.predict(quarter_year.drop('Count', axis=1))
    half_year_pred = regressor.predict(half_year.drop('Count', axis=1))

    err = mean_absolute_error(month_ahead['Count'], month_ahead_pred)
    month_MAPE = mean_absolute_percentage_error(month_ahead['Count'], month_ahead_pred)
    quarter_year_MAPE = mean_absolute_percentage_error(quarter_year['Count'], quarter_year_pred)
    half_year_MAPE = mean_absolute_percentage_error(half_year['Count'], half_year_pred)

    month_ahead["predictions"] = month_ahead_pred
    quarter_year["predictions"] = quarter_year_pred
    half_year["predictions"] = half_year_pred

    # Back to readable English names, then look up the Arabic equivalents.
    month_ahead = decode_monthly_names(month_ahead)
    quarter_year = decode_monthly_names(quarter_year)
    half_year = decode_monthly_names(half_year)

    month_ahead['Service Name AR'] = month_ahead['Service Name'].map(operations_names_en_to_ar)
    quarter_year['Service Name AR'] = quarter_year['Service Name'].map(operations_names_en_to_ar)
    half_year['Service Name AR'] = half_year['Service Name'].map(operations_names_en_to_ar)

    return err, month_MAPE, quarter_year_MAPE, half_year_MAPE, month_ahead, quarter_year, half_year

In [18]:
def perform_weekly_regression(regressor):
    """Fit ``regressor`` on the weekly training split and print its errors.

    The training partition comes from ``get_X_y(weekly_processed_data,
    weekly_split_index)``; the test partition returned by that call is not
    used. After fitting, ``weekly_prediction_cases`` scores the model and the
    resulting MAE / MAPE figures are printed.

    Note that ``regressor`` is fitted in place.

    Returns
    -------
    tuple
        ``(week_ahead, two_weeks_ahead)`` evaluation frames with predictions.
    """
    train_features, _, train_target, _ = get_X_y(weekly_processed_data, weekly_split_index)
    regressor.fit(train_features, train_target)

    (mae,
     one_week_mape,
     two_week_mape,
     one_week_frame,
     two_week_frame) = weekly_prediction_cases(regressor)

    print(f'1 week predection MAE {mae}')
    print(f'1 week predection MAPE {one_week_mape}')
    print(f'2 weeks predection MAPE {two_week_mape}')
    return one_week_frame, two_week_frame

In [19]:
def perform_monthly_regression(regressor):
    """Fit ``regressor`` on the monthly training split and print its errors.

    The training partition comes from ``get_X_y(monthly_processed_data,
    monthly_split_index)``; the test partition returned by that call is not
    used. After fitting, ``monthly_prediction_cases`` scores the model and the
    resulting MAE / MAPE figures are printed.

    Note that ``regressor`` is fitted in place.

    Returns
    -------
    tuple
        ``(month_ahead, quarter_year, half_year)`` evaluation frames with
        predictions.
    """
    train_features, _, train_target, _ = get_X_y(monthly_processed_data, monthly_split_index)
    regressor.fit(train_features, train_target)

    (mae,
     one_month_mape,
     quarter_mape,
     half_mape,
     one_month_frame,
     quarter_frame,
     half_frame) = monthly_prediction_cases(regressor)

    print(f'1 month predection MAE {mae}')
    print(f'1 month predection MAPE {one_month_mape}')
    print(f'Last quarter year MAPE {quarter_mape}')
    print(f'half year MAPE {half_mape}')
    return one_month_frame, quarter_frame, half_frame

In [21]:
def vis(df):
    """Show a grouped bar chart of actual vs. predicted counts per service.

    ``df`` must provide the columns ``Service Name`` (x axis), ``Count`` and
    ``predictions`` (the two bar series). Returns whatever ``fig.show()``
    returns.
    """
    services = df["Service Name"]
    actual_bars = go.Bar(name = 'Real Data', x=services, y=df["Count"], marker_color='#800000')
    predicted_bars = go.Bar(name = 'Predicted', x=services, y=df["predictions"], marker_color='#228B22')

    fig = go.Figure(data=[actual_bars, predicted_bars])
    # 'group' places the two series side by side for each service.
    fig.update_layout(barmode='group', width=1200, height=400)
    return fig.show()

1. **Decision Tree Regression**

In [22]:
# regressor = DecisionTreeRegressor(random_state=0)
# perform_monthly_regression(regressor)

2. **Support Vector Machine Regression**

In [23]:
# svr = SVR()
# perform_monthly_regression(svr)

3. **AdaBoost Regression**

In [24]:
# Weekly model: AdaBoost over MAE-criterion decision trees, fitted and scored
# by perform_weekly_regression. `w` is the (week_ahead, two_weeks_ahead) tuple.
# NOTE(review): `base_estimator=` and `criterion='mae'` are the older
# scikit-learn spellings (later releases use `estimator=` and
# `criterion='absolute_error'`); confirm the scikit-learn version this notebook
# is pinned to before re-running.
ada = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mae'), n_estimators=200, learning_rate=0.02, loss='square',random_state=72)
w=perform_weekly_regression(ada)
# vis(x[0])

1 week predection MAE 13.4
1 week predection MAPE 38.14023682283974
2 weeks predection MAPE 29.571080727284848


**One Week Visualization**

In [25]:
# w[0] is the one-week-ahead frame returned by perform_weekly_regression.
vis(w[0])

**Two Weeks Visualization**

In [30]:
# w[1] is the two-weeks-ahead frame returned by perform_weekly_regression.
vis(w[1])

In [31]:
# Monthly model: same AdaBoost setup with its own hyperparameters. This rebinds
# `ada`, so the weekly estimator from the earlier cell is no longer reachable
# under that name. `m` is the (month_ahead, quarter_year, half_year) tuple.
# NOTE(review): `base_estimator=` and `criterion='mae'` are the older
# scikit-learn spellings (later releases use `estimator=` and
# `criterion='absolute_error'`); confirm the pinned scikit-learn version.
ada = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mae'), n_estimators=300, learning_rate=0.01, loss='square',random_state=7)
m=perform_monthly_regression(ada)


1 month predection MAE 14.2
1 month predection MAPE 21.59803388752053
Last quarter year MAPE 11.875189475844564
half year MAPE 13.232832052992654


**One Month Visualization**

In [32]:
# m[0] is the one-month-ahead frame returned by perform_monthly_regression.
vis(m[0]) 

**Four Months Visualization**

In [33]:
# m[1] is the `quarter_year` frame (2018 rows with Month >= 9).
vis(m[1]) 

In [25]:
# xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=7)
# perform_monthly_regression(xgb_model)

In [26]:
# gbc=GradientBoostingRegressor(n_estimators=500,max_depth=8,min_samples_split=2,loss="ls",random_state=7)
# perform_monthly_regression(gbc)

**Choosing the best model so far (AdaBoost), so we will pickle it**

In [38]:
# with open('./ada_regressor.pickle', 'wb') as f:
#     pickle.dump(ada, f)

**Testing the pickled model**

In [35]:
# Reload the estimator stored in ada_regressor.pickle. pickle.load can execute
# arbitrary code from the file, so only load a pickle you created yourself.
# NOTE(review): the cell that writes this file is commented out above, so the
# file must already exist on disk for a fresh run to get past this point.
with open('./ada_regressor.pickle', 'rb') as f:
  reg = pickle.load(f)
# NOTE(review): perform_monthly_regression calls reg.fit(...) before scoring,
# so the metrics printed here come from a re-trained model rather than from
# the fitted state stored in the pickle.
month=perform_monthly_regression(reg)

1 month predection MAE 14.2
1 month predection MAPE 21.59803388752053
Last quarter year MAPE 11.875189475844564
half year MAPE 13.232832052992654


In [36]:
# Re-fits the same `reg` object on the weekly data (perform_weekly_regression
# calls fit), replacing the monthly fit from the previous cell.
week=perform_weekly_regression(reg)

1 week predection MAE 13.3
1 week predection MAPE 37.34658602918894
2 weeks predection MAPE 29.694537517408303


In [37]:
# Export the one-week and one-month prediction frames (first element of each
# result tuple). 'utf-8-sig' writes a UTF-8 byte-order mark -- presumably so
# spreadsheet tools display the Arabic service names correctly.
week[0].to_csv("pred_one_week_ada.csv",encoding='utf-8-sig')
month[0].to_csv("pred_one_month_ada.csv",encoding='utf-8-sig')
# quarter.to_csv("pred_last_quarter_ada.csv",encoding='utf-8-sig')
# half_year.to_csv("pred_half_year_ada.csv",encoding='utf-8-sig')