In [1]:
import tqdm
import numpy as np
import pandas as pd
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.arima_model import ARIMAResults
from sklearn.metrics import mean_squared_error

In [2]:
# to ignore warnings when fitting arima
import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv("small_results.csv")

In [4]:
df[:5]

Unnamed: 0.1,Unnamed: 0,views,timestamp,page_name
0,0,29,2018-01-01,Хелловін
1,1,29,2018-01-02,Хелловін
2,2,32,2018-01-03,Хелловін
3,3,23,2018-01-04,Хелловін
4,4,39,2018-01-05,Хелловін


In [5]:
# generation of the second time series column for development process
df["rand"] = pd.Series(np.random.randn(df.shape[0]))

In [6]:
df[:5]

Unnamed: 0.1,Unnamed: 0,views,timestamp,page_name,rand
0,0,29,2018-01-01,Хелловін,0.879687
1,1,29,2018-01-02,Хелловін,0.06633
2,2,32,2018-01-03,Хелловін,-0.742322
3,3,23,2018-01-04,Хелловін,0.205484
4,4,39,2018-01-05,Хелловін,-1.179973


In [7]:
# evaluate an ARIMA model for a given order (p,d,q)
# source: https://machinelearningmastery.com/grid-search-arima-hyperparameters-with-python/

def evaluate_arima_model(X, arima_order):
    # prepare training dataset
    train_size = int(len(X) * 0.66)
    train, test = X[0:train_size], X[train_size:]
    history = [x for x in train]
    # make predictions
    predictions = list()
    for t in range(len(test)):
        model = ARIMA(history, order=arima_order)
        model_fit = model.fit(disp=0)
        yhat = model_fit.forecast()[0]
        predictions.append(yhat)
        history.append(test[t])
    # calculate out of sample error
    error = mean_squared_error(test, predictions)
    return error

In [8]:
import sys

def fit_arima(dataset):
    # define grid of parameters
    p_values = range(0, 2)
    d_values = range(0, 2)
    q_values = range(0, 2)
#     evaluate_models(series.values, p_values, d_values, q_values)
    dataset = dataset.astype('float32')
    best_score, best_cfg = float("inf"), None

    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p,d,q)
                try:
                    mse = evaluate_arima_model(dataset, order)
                    if mse < best_score:
                        best_score, best_cfg = mse, order
#                     print('ARIMA%s MSE=%.3f' % (order,mse))
                except Exception as e: 
#                     print(e)
                    continue
#     print('Best ARIMA%s MSE=%.3f' % (best_cfg, best_score))                
    return best_cfg

In [12]:
unique_pages = df['page_name'].unique()
# ts_columns should be hard coded...
ts_column_names = ['views', 'rand']

In [41]:
def initiate_stats_dict(ts_column_names):
    feature_dict = { 'page_name':[] }
    ts_features = [
        'count_records',
        'mean',
        'std',
        'ar',
        'd',
        'ma',
        'max',
        'min'
    ]
    for col in ts_column_names:
        for feature in ts_features:
            feature_dict[col+'_'+feature] = []
    return feature_dict

In [42]:
stats = initiate_stats_dict(ts_column_names)
stats

{'page_name': [],
 'rand_ar': [],
 'rand_count_records': [],
 'rand_d': [],
 'rand_ma': [],
 'rand_max': [],
 'rand_mean': [],
 'rand_min': [],
 'rand_std': [],
 'views_ar': [],
 'views_count_records': [],
 'views_d': [],
 'views_ma': [],
 'views_max': [],
 'views_mean': [],
 'views_min': [],
 'views_std': []}

In [43]:
for page in unique_pages:
    stats['page_name'].append(page)
    for col in ts_column_names:
        page_ts = df[df['page_name'] == page][col].values
        stats[col+'_'+'count_records'].append(len(page_ts))
        stats[col+'_'+'mean'].append(page_ts.mean())
        stats[col+'_'+'std'].append(page_ts.std())
        stats[col+'_'+'min'].append(page_ts.min())
        stats[col+'_'+'max'].append(page_ts.max())
        
        ar, d, ma = fit_arima(page_ts)
        stats[col+'_'+'ar'].append(ar)
        stats[col+'_'+'d'].append(d)
        stats[col+'_'+'ma'].append(ma)

In [44]:
stats

{'page_name': ['Хелловін', 'Танці_з_зірками', 'Мартин_(птах)'],
 'rand_ar': [0, 0, 1],
 'rand_count_records': [41, 41, 41],
 'rand_d': [0, 0, 1],
 'rand_ma': [1, 0, 1],
 'rand_max': [2.2125441016427052, 2.3855768452463888, 2.3103443680416138],
 'rand_mean': [0.052474308304936011,
  0.032639600671850955,
  0.20022495535858972],
 'rand_min': [-2.3459598207990613, -1.8858183543190452, -2.4443781980976151],
 'rand_std': [1.07267159964303, 0.99880316567393612, 1.1373816539321333],
 'views_ar': [0, 0, 0],
 'views_count_records': [41, 41, 41],
 'views_d': [1, 1, 1],
 'views_ma': [1, 0, 1],
 'views_max': [62, 160, 169],
 'views_mean': [37.097560975609753, 69.658536585365852, 31.219512195121951],
 'views_min': [23, 37, 11],
 'views_std': [8.319254416782794, 28.406981069240246, 28.642891422311735]}

In [45]:
pd.DataFrame.from_dict(stats)

Unnamed: 0,page_name,rand_ar,rand_count_records,rand_d,rand_ma,rand_max,rand_mean,rand_min,rand_std,views_ar,views_count_records,views_d,views_ma,views_max,views_mean,views_min,views_std
0,Хелловін,0,41,0,1,2.212544,0.052474,-2.34596,1.072672,0,41,1,1,62,37.097561,23,8.319254
1,Танці_з_зірками,0,41,0,0,2.385577,0.03264,-1.885818,0.998803,0,41,1,0,160,69.658537,37,28.406981
2,Мартин_(птах),1,41,1,1,2.310344,0.200225,-2.444378,1.137382,0,41,1,1,169,31.219512,11,28.642891
