In [1]:
import tqdm
import numpy as np
import pandas as pd
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.arima_model import ARIMAResults
from sklearn.metrics import mean_squared_error

In [2]:
# # to ignore warnings when fitting arima
# import warnings
# warnings.filterwarnings("ignore")

In [3]:
# read articles which were translated from Ukr to Eng
# this will be a filter for test
import pickle

pkl_file = open('pages.pkl', 'rb')
df = pickle.load(pkl_file)

In [35]:
# read data where each row has a structure: (datetime, pagen_name, [features])
df = pd.read_csv("40days100pages.csv")

In [33]:
df[:5]

Unnamed: 0.1,Unnamed: 0,timestamp,page_name,revisions_count,contributors_count,age_of_page_days
0,0,2018-06-07T12:00:00Z,Himantolophus_borealis,5,1,1294
1,1,2018-06-07T12:00:00Z,Himantolophus_brevirostris,6,2,1294
2,2,2018-06-07T12:00:00Z,Himantolophus_compressus,7,3,1294
3,3,2018-06-07T12:00:00Z,Himantolophus_cornifer,9,2,1294
4,4,2018-06-07T12:00:00Z,Himantolophus_crinitus,4,2,1294


In [34]:
df[df['page_name']=='Himantolophus_borealis'][:5]

Unnamed: 0.1,Unnamed: 0,timestamp,page_name,revisions_count,contributors_count,age_of_page_days
0,0,2018-06-07T12:00:00Z,Himantolophus_borealis,5,1,1294
100,100,2018-06-06T12:00:00Z,Himantolophus_borealis,5,1,1293
200,200,2018-06-05T12:00:00Z,Himantolophus_borealis,5,1,1292
300,300,2018-06-04T12:00:00Z,Himantolophus_borealis,5,1,1291
400,400,2018-06-03T12:00:00Z,Himantolophus_borealis,5,1,1290


In [18]:
df.describe()

Unnamed: 0.1,Unnamed: 0,revisions_count,contributors_count,age_of_page_days
count,4000.0,4000.0,4000.0,4000.0
mean,1999.5,19.5055,10.015,2834.289
std,1154.844867,51.502681,12.913383,1437.922958
min,0.0,1.0,1.0,287.0
25%,999.75,5.0,2.0,1323.0
50%,1999.5,11.0,8.5,3354.0
75%,2999.25,16.5,13.0,4189.0
max,3999.0,472.0,112.0,4463.0


In [89]:
# find coefficient characterising trend of TS by fitting a simple arima
def get_trend(dataset):
    if dataset.std() == 0:
        return 0
    else:
        model = ARIMA(dataset, order=(1,0,0))
        model_fit = model.fit()
        return model_fit.arparams[0]

In [43]:
dataset = df[df['page_name']=='Himantolophus_borealis']
dataset.sort_values(by = ['timestamp'],ascending=False)
dataset[:5]

Unnamed: 0.1,Unnamed: 0,timestamp,page_name,revisions_count,contributors_count,age_of_page_days
0,0,2018-06-07T12:00:00Z,Himantolophus_borealis,5,1,1294
100,100,2018-06-06T12:00:00Z,Himantolophus_borealis,5,1,1293
200,200,2018-06-05T12:00:00Z,Himantolophus_borealis,5,1,1292
300,300,2018-06-04T12:00:00Z,Himantolophus_borealis,5,1,1291
400,400,2018-06-03T12:00:00Z,Himantolophus_borealis,5,1,1290


In [71]:
def initiate_stats_dict(ts_column_names):
    feature_dict = { 'page_name':[] }
    ts_features = [
        'mean',
        'std',
        'ar',
        'max',
        'min',
        'avg_last_week',
        'avg_last_month',
        'avg_all_period'
    ]
    for col in ts_column_names:
        for feature in ts_features:
            feature_dict[col+'_'+feature] = []
    return feature_dict

In [72]:
unique_pages = df['page_name'].unique()
# ts_columns should be hard coded...
ts_column_names = [
    'revisions_count', 
    'contributors_count'
#     'age_of_page_days'
]

In [73]:
stats = initiate_stats_dict(ts_column_names)
stats

{'contributors_count_ar': [],
 'contributors_count_avg_all_period': [],
 'contributors_count_avg_last_month': [],
 'contributors_count_avg_last_week': [],
 'contributors_count_max': [],
 'contributors_count_mean': [],
 'contributors_count_min': [],
 'contributors_count_std': [],
 'page_name': [],
 'revisions_count_ar': [],
 'revisions_count_avg_all_period': [],
 'revisions_count_avg_last_month': [],
 'revisions_count_avg_last_week': [],
 'revisions_count_max': [],
 'revisions_count_mean': [],
 'revisions_count_min': [],
 'revisions_count_std': []}

In [74]:
for page in unique_pages:
    stats['page_name'].append(page)
    page_data = df[df['page_name'] == page].sort_values(by = ['timestamp'],ascending=False)
    for col in ts_column_names:
        page_ts = page_data[col].values
#         stats[col+'_'+'count_records'].append(len(page_ts))
        stats[col+'_'+'mean'].append(page_ts.mean())
        stats[col+'_'+'std'].append(page_ts.std())
        stats[col+'_'+'min'].append(page_ts.min())
        stats[col+'_'+'max'].append(page_ts.max())
        stats[col+'_'+'ar'].append(get_trend(page_ts))
        stats[col+'_'+'avg_last_week'].append(np.average(page_ts[:7]))
        stats[col+'_'+'avg_last_month'].append(np.average(page_ts[:30]))
        stats[col+'_'+'avg_all_period'].append(np.average(page_ts))

In [75]:
stats

{'contributors_count_ar': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0.94818226145364404,
  0.94818226145364404,
  0.94818226145364404,
  0.94818225292017222,
  0.94818224627325809,
  0.94818227987173009,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0.88856211160988274,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'contributors_count_avg_all_period': [1.0,
  2.0,
  3.0,
  2.0,
  2.0,
  1.0,
  2.0,
  2.0,
  1.0,
  2.0,
  2.0,
  1.0,
  1.0,
  2.0,
  2.0,
  2.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  3.0,
  16.0,
  31.0,
  3.0,
  7.0,
  22.0,
  8.0,
  1.0,
  48.0,
  7.0,
  5.0,
  18.0,
  4.0,
  5.0,
  4.0,
  1.0,
  1.0,
  26.0,
  6.0,
  2.0,
  11.425000

In [76]:
result = pd.DataFrame.from_dict(stats)
result[:5]

Unnamed: 0,contributors_count_ar,contributors_count_avg_all_period,contributors_count_avg_last_month,contributors_count_avg_last_week,contributors_count_max,contributors_count_mean,contributors_count_min,contributors_count_std,page_name,revisions_count_ar,revisions_count_avg_all_period,revisions_count_avg_last_month,revisions_count_avg_last_week,revisions_count_max,revisions_count_mean,revisions_count_min,revisions_count_std
0,0.0,1.0,1.0,1.0,1,1.0,1,0.0,Himantolophus_borealis,0.0,5.0,5.0,5.0,5,5.0,5,0.0
1,0.0,2.0,2.0,2.0,2,2.0,2,0.0,Himantolophus_brevirostris,0.0,6.0,6.0,6.0,6,6.0,6,0.0
2,0.0,3.0,3.0,3.0,3,3.0,3,0.0,Himantolophus_compressus,0.0,7.0,7.0,7.0,7,7.0,7,0.0
3,0.0,2.0,2.0,2.0,2,2.0,2,0.0,Himantolophus_cornifer,0.0,9.0,9.0,9.0,9,9.0,9,0.0
4,0.0,2.0,2.0,2.0,2,2.0,2,0.0,Himantolophus_crinitus,0.0,4.0,4.0,4.0,4,4.0,4,0.0


In [81]:
# change order of columns
cols = result.columns.tolist()
cols = [cols[8]] +cols[:8] +cols[9:]
result = result[cols]

In [82]:
result.to_csv("../data/preprocessed.csv", index=False)