In [None]:
# import packages
import warnings
from pprint import pprint

# Installed before the third-party imports so import-time warnings are silenced too.
warnings.filterwarnings("ignore")

import lux
import matplotlib as mpl
import numpy as np
import pandas as pd
import plotly
import plotly.graph_objs as go
import plotly.offline as pyo
import seaborn as sns
import statsmodels as sm
from matplotlib import pyplot as plt
from plotly.offline import init_notebook_mode, iplot
from pylab import rcParams
from sklearn.linear_model import LinearRegression
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa import api as smt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller

import plots, metric # custom files for metrics/plots from util.plots - etc

In [None]:
# Load the pipe-delimited fleet data set, parsing the date column as datetimes.
# `input_data_raw` stays untouched; all later cells work on `input_data_copy`.
date_column = 'EffectiveDate'
file_location = 'FleetForecasting_Top100ProductSubCategory_WithIHSData_Weather_BYDay_V2.csv'
input_data_raw = pd.read_csv(
    file_location,
    sep='|',
    parse_dates=[date_column],
)
input_data_copy = input_data_raw.copy()

In [None]:
# Preview the first rows of the working copy right after loading.
input_data_copy.head()

In [None]:
# Distinct region labels. Author's note: 13 regions in total; the raw strings
# need to be cleaned before processing.
input_data_copy.RegionName.unique()

In [None]:
# Number of distinct product category numbers. Author's note: 207 different products.
input_data_copy.ProductCategory_Nbl.nunique()

In [None]:
# Number of distinct product category descriptions. Author's note: this count
# shows a discrepancy against the number of distinct ProductCategory_Nbl values.
input_data_copy.ProductCategory_Desc.nunique()

In [None]:
# List every available column. Author's note: the region column is considered useless.
input_data_copy.columns

In [None]:
#input_data_copy.groupby('ProductCategory_Nbl').size().plot(kind='bar')

In [None]:
# Summary statistics for OnRent, rendered as fixed-point strings so large
# values are not shown in scientific notation.
onrent_summary = input_data_copy['OnRent'].describe()
onrent_summary.map('{:f}'.format)

In [None]:
# Summary statistics: there are 207 unique product categories.
# The quantity we are taking a stab at modelling is OnRent.

In [None]:
# Normalise the text identifiers and build a combined equipment key.
#
# The pattern below is a regular-expression character class, so regex=True is
# passed explicitly: pandas >= 2.0 defaults Series.str.replace to regex=False,
# which would look for the literal pattern text and strip nothing.
unwanted_chars = r"[\"\',< ]"  # double quote, single quote, comma, '<' and space
for text_col in ('RegionName', 'ProductCategory_Desc'):
    input_data_copy[text_col] = input_data_copy[text_col].str.replace(unwanted_chars, '', regex=True)

# eq_nm = "<ProductCategory_Nbl>_<ProductCategory_Desc>". Built column-wise
# rather than with a row-by-row apply, which is far slower on a large frame.
cols = ['ProductCategory_Nbl', 'ProductCategory_Desc']
input_data_copy['eq_nm'] = input_data_copy[cols[0]].astype(str) + '_' + input_data_copy[cols[1]].astype(str)

# Division is a numeric code; store it as a string so it is treated as categorical.
# Bracket assignment is used because attribute assignment only works for existing columns.
input_data_copy['Division'] = input_data_copy['Division'].astype(str)

### Earliest and latest timestamps in the data

In [None]:
# First and last timestamp present in the data, shown as a (min, max) pair.
tuple(input_data_copy[date_column].agg(['min', 'max']))

## Distribution of On Rent per region

In [None]:
# Histogram of the OnRent quantity, one panel per RegionName value.
input_data_copy['OnRent'].hist(by=input_data_copy['RegionName'], figsize = (16,18))

In [None]:
# Mean of every numeric column for each (OnRent, ProductCategory_Desc) pair.
# numeric_only=True is required: the frame also holds string columns, and from
# pandas 2.0 onward .mean() raises a TypeError on them instead of silently
# dropping them.
input_data_copy.groupby(['OnRent', 'ProductCategory_Desc'], as_index=False).mean(numeric_only=True)

In [None]:
# Preview the last rows of the working copy.
input_data_copy.tail()

In [None]:
# Rows where at least 100 units are on rent.
at_least_100_on_rent = input_data_copy['OnRent'].ge(100)
subset_100_input_data_copy = input_data_copy[at_least_100_on_rent]

In [None]:
# Number of distinct product descriptions that reach an OnRent of 100 or more.
# Author's note: 41 machines, i.e. about 20% of products have rentals over 100
# at a given point.
subset_100_input_data_copy.ProductCategory_Desc.nunique()

In [None]:
# Product description of every row in the >= 100 subset (one entry per row, not de-duplicated).
subset_100_input_data_copy['ProductCategory_Desc']

In [None]:
# Histogram of OnRent per RegionName, restricted to the rows with OnRent >= 100.
subset_100_input_data_copy['OnRent'].hist(by=subset_100_input_data_copy['RegionName'], figsize = (16,18))

In [None]:
# Rows where at least 250 units are on rent.
at_least_250_on_rent = input_data_copy['OnRent'].ge(250)
subset_250_input_data_copy = input_data_copy[at_least_250_on_rent]

In [None]:
# Number of distinct product descriptions that reach an OnRent of 250 or more.
# Author's note: 11 machines are over 250 on rent at a single point.
subset_250_input_data_copy.ProductCategory_Desc.nunique()

## Total Company On Rent 

In [None]:
# Company-wide OnRent total per date: one row per distinct date value.
daily_OnRent = (
    input_data_copy
    .groupby(date_column)['OnRent']
    .sum()
    .reset_index()
)

In [None]:
# Interactive line chart of the company-wide daily OnRent total.
# Notebook mode is enabled through the `init_notebook_mode` already imported
# in the first cell; `go` and `iplot` come from that cell as well, so nothing
# is re-imported here.
init_notebook_mode()

daily_sales_sc = go.Scatter(x=daily_OnRent[date_column], y=daily_OnRent['OnRent'])
layout = go.Layout(title='Total Daily OnRent', xaxis=dict(title='Date'), yaxis=dict(title='OnRent'))
fig = go.Figure(data=[daily_sales_sc], layout=layout)
iplot(fig)

## Regional On Rent on Daily Scale

In [None]:
# OnRent total per (region, date) pair, returned as a flat three-column frame.
region_daily_sales = (
    input_data_copy
    .groupby(['RegionName', date_column])['OnRent']
    .sum()
    .reset_index()
)

In [None]:
# One plotly trace per region, all drawn on a single interactive chart.
# A single groupby pass hands out each region's rows; sort=False keeps the
# regions in order of first appearance, the same order `.unique()` would give,
# without re-scanning the whole frame with a boolean mask for every region.
store_daily_sales_sc = [
    go.Scatter(x=region_rows[date_column], y=region_rows['OnRent'], name=('Region: %s' % region))
    for region, region_rows in region_daily_sales.groupby('RegionName', sort=False)
]

layout = go.Layout(title='Region daily OnRent', xaxis=dict(title='Date'), yaxis=dict(title='OnRent'))
fig = go.Figure(data=store_daily_sales_sc, layout=layout)
iplot(fig)

In [None]:
# Re-inspect the first rows of the working copy at this point in the notebook.
input_data_copy.head()

### Daily Item analysis 

In [None]:
# OnRent total per (equipment key, date) pair, returned as a flat three-column frame.
item_daily_sales = (
    input_data_copy
    .groupby(['eq_nm', date_column])['OnRent']
    .sum()
    .reset_index()
)

In [None]:
# One plotly trace per equipment key (eq_nm), all drawn on a single interactive chart.
# A single groupby pass hands out each item's rows; sort=False keeps the items
# in order of first appearance, the same order `.unique()` would give, without
# re-scanning the whole frame with a boolean mask for every item.
item_daily_sales_sc = [
    go.Scatter(x=item_rows[date_column], y=item_rows['OnRent'], name=('Item %s' % item))
    for item, item_rows in item_daily_sales.groupby('eq_nm', sort=False)
]

layout = go.Layout(title='Item daily OnRent', xaxis=dict(title='Date'), yaxis=dict(title='OnRent'))
fig = go.Figure(data=item_daily_sales_sc, layout=layout)
iplot(fig)

In [None]:
## Random Autocorrelation for unique time series 

In [None]:
# Re-inspect the first rows of the working copy before the additional EDA section.
input_data_copy.head()

### Additional EDA stats if necessary

In [None]:
# Keep only the series key, the date and the target for the time-series EDA.
# .copy() makes this an independent frame, so later transformations of
# `eda_discovery` never operate on a slice of `input_data_copy` (a
# chained-assignment hazard that the global warnings filter would hide).
eda_discovery = input_data_copy[['DRKey', 'EffectiveDate', 'OnRent']].copy()
date_column = 'EffectiveDate'

In [None]:
# Preview the first rows of the reduced EDA frame.
eda_discovery.head()

In [None]:
# Reshape the EDA frame to the (unique_id, ds, y) layout, restrict it to the
# analysis window and split it into training and test sets around split_date.
#
# Every step returns a new frame instead of mutating in place, and the rename
# silently skips columns that are already renamed, so the cell can be re-run
# without raising a KeyError.
split_date = '2019-5-20'
window_start = '2015-05-01 00:00:00'  # rows before this timestamp are discarded
window_end = '2020-02-01 01:00:00'    # rows after this timestamp are discarded (pre-covid cut-off)

eda_discovery = (
    eda_discovery
    .rename(columns={'DRKey': 'unique_id', 'EffectiveDate': 'ds', 'OnRent': 'y'})
    .sort_values(by=['unique_id', 'ds'], ascending=True)
)
date_column = 'ds' #changing column name for stat forecast

# Select with a boolean mask rather than dropping by index label, so the result
# does not depend on the index being unique. Rows with a missing date compare
# False on both sides and are therefore kept, as before.
outside_window = (eda_discovery[date_column] < window_start) | (eda_discovery[date_column] > window_end)
eda_discovery = eda_discovery.loc[~outside_window]

df_training = eda_discovery.loc[eda_discovery[date_column] <= split_date]
df_test = eda_discovery.loc[eda_discovery[date_column] > split_date]

In [None]:
# Display the training split (rows with ds on or before split_date).
df_training

In [None]:
# Plot the training and test splits with the project's custom `plots` module.
# NOTE(review): presumably draws a grid of per-series charts — confirm in plots.py.
plots.plot_grid(df_training,df_test)

In [None]:
# Autocorrelation view of the training split from the project's custom `plots` module.
# NOTE(review): presumably one autocorrelation panel per series — confirm in plots.py.
plots.plot_autocorrelation_grid(df_training)

In [None]:
## filter region look up 
# use_region = ['REGION 04']
# input_data_copy = input_data_copy[input_data_copy.RegionName.isin(use_region)]

In [None]:
## Shorten Columns

In [None]:
#feature_importance = [col for col in input_data_copy.columns if col not in ['Region','clean_time','Rental', 'QtyOwned', "ProductCategory_Nbl","ProductCategory_Desc"]]