In [37]:
import tqdm
import numpy as np
import pandas as pd
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.arima_model import ARIMAResults
from sklearn.metrics import mean_squared_error

In [42]:
# Silence every warning for the rest of the session so that repeated ARIMA
# fits do not flood the cell output.
import warnings
warnings.filterwarnings(action="ignore")

In [2]:
# Load the records from a CSV file located relative to the working directory.
df = pd.read_csv(filepath_or_buffer="small_results.csv")

In [3]:
# Peek at the first five rows of the loaded frame.
df.head(5)

Unnamed: 0.1,Unnamed: 0,views,timestamp,page_name
0,0,29,2018-01-01,Хелловін
1,1,29,2018-01-02,Хелловін
2,2,32,2018-01-03,Хелловін
3,3,23,2018-01-04,Хелловін
4,4,39,2018-01-05,Хелловін


In [6]:
# Preview only: one standard-normal draw per row of df (the result is not stored).
np.random.randn(len(df))

array([ 1.88762427,  0.72465953,  1.26144787, -1.16698146, -0.63881992,
        0.48280226,  1.22699345, -1.02888135, -0.34950983,  0.30320045,
       -0.53642526, -0.41771718,  0.10973228, -0.2535655 ,  2.66058477,
       -0.71324346,  1.16134096, -1.23055132,  0.4692298 , -0.1196932 ,
        0.82918103, -0.67772005,  1.60041806, -1.07106042,  0.41139457,
       -0.62925946, -0.90345067, -1.90550645,  1.11859434,  0.54821075,
       -1.10251094, -0.79524134,  1.02014372, -0.19619806,  1.04611723,
       -0.0535095 ,  0.18381274,  1.14750751,  0.40104736,  1.02322566,
        1.13431133, -1.03465789, -0.031713  , -1.28973525, -0.1271336 ,
       -1.14381696, -0.02767995, -1.52248187, -2.44019872,  1.78793925,
        0.48959161,  0.32982456,  0.81472203,  0.78934424,  0.9001609 ,
        0.15331596,  1.62741761,  0.39102561, -1.02330182, -0.60137263,
       -1.18066262, -0.66192402, -0.08162862, -0.52817455,  0.69747372,
       -0.54783576, -1.00824901, -0.54647559,  1.42440942,  1.85

In [9]:
# Generate a second, synthetic time-series column used during development.
# A dedicated, seeded RandomState makes the column identical on every run
# (Restart & Run All reproduces it), and assigning the raw array instead of a
# freshly indexed Series avoids NaNs from index alignment should df ever carry
# a non-default index.
df["rand"] = np.random.RandomState(42).randn(len(df))

In [10]:
# Confirm the new "rand" column is present by showing the first five rows.
df.head(5)

Unnamed: 0.1,Unnamed: 0,views,timestamp,page_name,rand
0,0,29,2018-01-01,Хелловін,-1.186046
1,1,29,2018-01-02,Хелловін,-1.222354
2,2,32,2018-01-03,Хелловін,-0.530051
3,3,23,2018-01-04,Хелловін,-1.342433
4,4,39,2018-01-05,Хелловін,0.870854


In [11]:
# Distinct page names, in order of first appearance; one model is fitted per page.
unique_pages = pd.unique(df['page_name'])

In [13]:
def evaluate_arima_model(X, arima_order, train_fraction=0.66):
    """Score one ARIMA order on a series using walk-forward validation.

    The leading ``train_fraction`` of ``X`` seeds the history. For every
    remaining observation a fresh ARIMA model is fitted on the history so
    far, its one-step-ahead forecast is recorded, and the true observation
    is then appended to the history before moving on.

    Parameters
    ----------
    X : sequence of numbers
        Observations in time order; must support ``len`` and slicing.
    arima_order : tuple
        ``(p, d, q)`` order handed to ``ARIMA``.
    train_fraction : float, optional
        Share of ``X`` used as the initial history. Defaults to 0.66, the
        value that was previously hard-coded.

    Returns
    -------
    The mean squared error between the held-out observations and their
    one-step-ahead forecasts, as computed by ``mean_squared_error``.

    Raises
    ------
    ValueError
        If the split would leave the initial history or the hold-out part
        empty (series too short, or ``train_fraction`` outside ``(0, 1)``).
    """
    n_obs = len(X)
    train_size = int(n_obs * train_fraction)
    # Fail early with a readable message instead of an opaque error from
    # deep inside the model fit or the metric.
    if not 0 < train_size < n_obs:
        raise ValueError(
            "cannot split %d observation(s) with train_fraction=%r: "
            "both the training and the test part must be non-empty"
            % (n_obs, train_fraction)
        )

    train, test = X[:train_size], X[train_size:]
    history = list(train)
    predictions = []
    for actual in test:
        # Refit from scratch on everything observed so far.
        model_fit = ARIMA(history, order=arima_order).fit(disp=0)
        # Element 0 of forecast() carries the one-step-ahead prediction
        # (legacy statsmodels API returns forecast, stderr, conf_int).
        predictions.append(model_fit.forecast()[0])
        # Reveal the true value only after predicting it.
        history.append(actual)

    # Out-of-sample error over the whole hold-out part.
    return mean_squared_error(test, predictions)

In [39]:
import sys

def fit_arima(dataset, p_values=range(0, 2), d_values=range(0, 2), q_values=range(0, 2)):
    """Grid-search the ARIMA order with the lowest walk-forward MSE.

    Every ``(p, d, q)`` combination from the three grids is scored with
    ``evaluate_arima_model``; the order with the smallest error wins, and on
    ties the first order encountered is kept.

    Parameters
    ----------
    dataset : array-like of numbers
        Observations in time order. Converted to a float32 NumPy array, so
        lists and pandas Series are accepted as well as ndarrays.
    p_values, d_values, q_values : iterable of int, optional
        Candidate values for the AR order, the differencing order and the MA
        order. Each defaults to ``range(0, 2)``, i.e. 0 and 1.

    Returns
    -------
    tuple or None
        The best ``(p, d, q)`` order, or ``None`` if no candidate could be
        evaluated. Callers that unpack the result must handle ``None``.
    """
    # np.asarray (rather than dataset.astype) accepts any array-like input and
    # always hands a plain positional array to the evaluator.
    series = np.asarray(dataset, dtype='float32')
    best_score, best_cfg = float("inf"), None

    candidate_orders = [
        (p, d, q)
        for p in p_values
        for d in d_values
        for q in q_values
    ]
    for order in candidate_orders:
        try:
            mse = evaluate_arima_model(series, order)
        except Exception:
            # Deliberate best-effort: an order that cannot be fitted or scored
            # is skipped so the remaining candidates still get a chance.
            continue
        if mse < best_score:
            best_score, best_cfg = mse, order
    return best_cfg


In [43]:
# Per-page summary statistics of the view counts, plus the best ARIMA order
# found by the grid search. Kept as a dict of equally long lists so it can be
# turned into a DataFrame in one step.
stats = {
    'page_name':[],
    'count_records':[],
    'mean':[],
    'std':[],
    'ar':[],
    'd':[],
    'ma':[],
    'max':[],
    'min':[]
}

for page in unique_pages:
    # View counts of this page only, as a plain NumPy array.
    page_ts = df.loc[df['page_name'] == page, 'views'].values

    # Run the fallible model search before touching `stats`, so an error
    # cannot leave the columns with unequal lengths.
    best_order = fit_arima(page_ts)
    # fit_arima returns None when no candidate order could be evaluated;
    # record NaNs for that page instead of crashing on the tuple unpack.
    ar, d, ma = best_order if best_order is not None else (np.nan, np.nan, np.nan)

    stats['page_name'].append(page)
    stats['count_records'].append(len(page_ts))
    stats['mean'].append(page_ts.mean())
    stats['std'].append(page_ts.std())
    stats['min'].append(page_ts.min())
    stats['max'].append(page_ts.max())
    stats['ar'].append(ar)
    stats['d'].append(d)
    stats['ma'].append(ma)

In [44]:
# Show the collected per-page statistics as a table, one row per page.
pd.DataFrame(stats)

Unnamed: 0,ar,count_records,d,ma,max,mean,min,page_name,std
0,0,41,1,1,62,37.097561,23,Хелловін,8.319254
1,0,41,1,0,160,69.658537,37,Танці_з_зірками,28.406981
2,0,41,1,1,169,31.219512,11,Мартин_(птах),28.642891
