RMSE between two datasets

In [21]:
import numpy as np
import pandas as pd
from scipy.stats import gamma
from scipy.stats import norm

In [38]:
def rmse_percent(a,b):
    """Return the RMSE between ``a`` and ``b`` as a percentage of ``mean(b)``.

    Parameters
    ----------
    a, b : array-like
        Values compared element by element (positionally). NumPy arrays,
        pandas Series, lists and tuples are all accepted.

    Returns
    -------
    float
        ``100 * sqrt(mean((a - b) ** 2)) / mean(b)``, rounded to one decimal.
    """
    # Coerce up front so plain Python sequences work too; ``list - list`` would
    # otherwise raise a TypeError.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    # ``np.mean`` reduces over every element in one vectorised pass, unlike the
    # built-in ``sum``/``len`` pair which loops in Python over the first axis.
    rmse = np.sqrt(np.mean((a - b) ** 2)) / np.mean(b)
    return round(rmse * 100, 1)

In [39]:
def get_data(car_maker, path='norway_new_car_sales_by_make.csv'):
    """Load the monthly sales history of a single car maker.

    Parameters
    ----------
    car_maker : str
        Value matched exactly against the ``Make`` column.
    path : str or path-like, optional
        CSV file to read. It must provide the ``Year``, ``Month``, ``Make``
        and ``Quantity`` columns. Defaults to the file name this function
        previously had hard-coded.

    Returns
    -------
    pandas.DataFrame
        One ``Sales`` column (the renamed ``Quantity``) indexed by ``Date``,
        the first day of each (Year, Month) pair, in file order.
    """
    raw = pd.read_csv(path)
    # Assemble the timestamp from numeric components instead of concatenating
    # unpadded strings ('2007' + '1' -> '20071'), which depends on the parser
    # accepting a one-digit month for '%m'.
    dates = pd.to_datetime(
        pd.DataFrame({'year': raw['Year'], 'month': raw['Month'], 'day': 1})
    )
    return (
        raw.assign(Date=dates)
        .loc[lambda frame: frame['Make'] == car_maker, ['Date', 'Quantity']]
        .rename(columns={'Quantity': 'Sales'})
        .set_index('Date')
    )

In [40]:
# Sales history of the maker analysed in the cells that follow.
df = get_data(car_maker='Ford')


In [41]:
# Empirical density of the sales figures on 30 equal-width bins spanning
# 0.8 x the smallest observation to 1.2 x the largest one.
hist_range = (df['Sales'].min()*0.8, df['Sales'].max()*1.2)
# Histogram the Sales column explicitly: np.histogram flattens its input, so
# passing the whole frame is only correct while it has exactly one column.
y_actuals, edges = np.histogram(df['Sales'], bins=30, density=True, range=hist_range)
# Bin midpoints (average of each pair of adjacent edges); the fitted densities
# are evaluated at these points.
x = (edges[:-1] + edges[1:]) / 2.0

In [42]:
# --- Normal fit: parameterised by the sample mean and standard deviation ---
mu = df['Sales'].mean()
std = df['Sales'].std()
y_normal = norm(loc=mu, scale=std).pdf(x)

# --- Gamma fit: shape/scale chosen so that mean = mu and variance = std**2 ---
shape = mu**2/std**2  # k
scale = std**2/mu  # theta
y_gamma = gamma(shape, loc=0, scale=scale).pdf(x)

# --- Gamma fit with min demand: same recipe, distribution shifted to start
# at the smallest observed value ---
mini = df['Sales'].min()
mu_p = mu-mini  # mean measured from the shifted origin
shape_p = mu_p**2/std**2
scale_p = std**2/mu_p
y_gamma_p = gamma(shape_p, loc=mini, scale=scale_p).pdf(x)

In [44]:
# Rescale every curve so that its sampled values sum to 1, which puts the
# histogram and the three fitted densities on a common scale.
# ``/=`` on a NumPy array divides in place, so the arrays bound to the
# original names are the ones being updated.
for _curve in (y_actuals, y_normal, y_gamma, y_gamma_p):
    _curve /= _curve.sum()

In [46]:
# Score each fitted curve against the empirical histogram, in the order
# normal, gamma, shifted gamma.
rmse_normal, rmse_gamma, rmse_gamma_p = (
    rmse_percent(y_actuals, fitted) for fitted in (y_normal, y_gamma, y_gamma_p)
)

In [47]:
# One score per line: normal, gamma, shifted gamma.
print(rmse_normal, rmse_gamma, rmse_gamma_p, sep='\n')

60.9
51.5
44.8
