In [1]:
import tqdm
import numpy as np
import pandas as pd
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.arima_model import ARIMAResults
from sklearn.metrics import mean_squared_error

In [2]:
# # to ignore warnings when fitting arima
# import warnings
# warnings.filterwarnings("ignore")

In [4]:
# Load the pickled object describing articles that were translated from
# Ukrainian to English; per the original note it is meant to act as a filter
# for the test set.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
# NOTE(review): `df` is rebound by the CSV load in the following cell, so the
# value loaded here is discarded unless one of the two names changes.
import pickle

# The context manager closes the file handle even if unpickling fails.
with open('../data/pages.pkl', 'rb') as pkl_file:
    df = pickle.load(pkl_file)

In [5]:
# Read the data; each row has the structure (datetime, page_name, [features]).
# Alternative input file, currently disabled:
# df = pd.read_csv("40days100pages.csv")
df = pd.read_csv("../data/Not_translated40days100pages.csv")

In [6]:
# Preview the first five rows of the loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,timestamp,page_name,revisions_count,contributors_count,age_of_page_days
0,0,2018-06-07T12:00:00Z,(163950)_2003_UN22,9,7,2565
1,1,2018-06-07T12:00:00Z,(164207)_2004_GU9,21,4,896
2,2,2018-06-07T12:00:00Z,(16553)_1991_TL14,11,8,2568
3,3,2018-06-07T12:00:00Z,(166609)_2002_RF232,10,8,2565
4,4,2018-06-07T12:00:00Z,(170025)_2002_VO,10,7,2565


In [7]:
# Drop the "Unnamed: 0" column (in the preview above it simply mirrors the row
# number -- presumably a row index saved by an earlier CSV export).
# Re-running this cell on the same `df` raises, because the column is already gone.
df = df.drop("Unnamed: 0", axis = 1)

In [8]:
# First five rows recorded for one specific page (empty if the page is absent).
df.loc[df['page_name'] == 'Himantolophus_borealis'].head()

Unnamed: 0,timestamp,page_name,revisions_count,contributors_count,age_of_page_days


In [9]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,revisions_count,contributors_count,age_of_page_days
count,4000.0,4000.0,4000.0
mean,11.57,7.49,2364.81
std,7.045389,2.941439,548.472834
min,3.0,1.0,163.0
25%,9.0,7.0,2532.0
50%,10.0,7.0,2544.0
75%,13.0,8.0,2555.0
max,64.0,25.0,2818.0


In [11]:
def get_trend(dataset):
    """Return a coefficient characterising the trend of a time series.

    A simple ARIMA model of order (1, 0, 0) is fitted to `dataset` and the
    first autoregressive parameter of the fit is returned.

    A series whose standard deviation is exactly zero is not fitted at all;
    0 is returned for it instead.
    """
    # Guard clause: constant series skip the model fit entirely.
    if dataset.std() == 0:
        return 0

    fitted = ARIMA(dataset, order=(1, 0, 0)).fit()
    return fitted.arparams[0]

In [12]:
# Rows of a single page, newest timestamp first.
# sort_values returns a new frame instead of sorting in place, so its result
# must be kept; previously it was discarded and the preview was unsorted.
dataset = (
    df[df['page_name'] == 'Himantolophus_borealis']
    .sort_values(by=['timestamp'], ascending=False)
)
dataset[:5]

Unnamed: 0,timestamp,page_name,revisions_count,contributors_count,age_of_page_days


In [13]:
def initiate_stats_dict(ts_column_names):
    """Create the empty accumulator for per-page statistics.

    The returned dictionary maps 'page_name' to an empty list, followed by one
    '<column>_<statistic>' key (also mapped to its own empty list) for every
    column in `ts_column_names` and every statistic name listed below.
    """
    statistic_names = (
        'mean',
        'std',
        'ar',
        'max',
        'min',
        'avg_last_week',
        'avg_last_month',
        'avg_all_period',
    )
    accumulator = {'page_name': []}
    # Columns vary slowest, statistics fastest; each key gets a fresh list.
    accumulator.update(
        (column + '_' + statistic, [])
        for column in ts_column_names
        for statistic in statistic_names
    )
    return accumulator

TODO: Process the age of the page (`age_of_page_days`), which is currently excluded from the time-series features.

In [14]:
# Time-series columns to summarise for each page; the list is fixed by hand.
# 'age_of_page_days' is deliberately left out for now.
ts_column_names = ['revisions_count', 'contributors_count']
unique_pages = df['page_name'].unique()

In [15]:
# Create the empty accumulator: 'page_name' plus one list per
# (time-series column, statistic) pair, then display it.
stats = initiate_stats_dict(ts_column_names)
stats

{'contributors_count_ar': [],
 'contributors_count_avg_all_period': [],
 'contributors_count_avg_last_month': [],
 'contributors_count_avg_last_week': [],
 'contributors_count_max': [],
 'contributors_count_mean': [],
 'contributors_count_min': [],
 'contributors_count_std': [],
 'page_name': [],
 'revisions_count_ar': [],
 'revisions_count_avg_all_period': [],
 'revisions_count_avg_last_month': [],
 'revisions_count_avg_last_week': [],
 'revisions_count_max': [],
 'revisions_count_mean': [],
 'revisions_count_min': [],
 'revisions_count_std': []}

In [16]:
# Fill `stats` with one entry per page for every time-series column.
# The accumulator is re-created here so that re-running this cell does not
# append a second copy of every page to lists filled by a previous run.
stats = initiate_stats_dict(ts_column_names)

for page in unique_pages:
    stats['page_name'].append(page)
    # Newest timestamp first, so the leading slices below are the most recent records.
    page_data = df[df['page_name'] == page].sort_values(by=['timestamp'], ascending=False)
    for col in ts_column_names:
        page_ts = page_data[col].values
        stats[col + '_mean'].append(page_ts.mean())
        stats[col + '_std'].append(page_ts.std())
        stats[col + '_min'].append(page_ts.min())
        stats[col + '_max'].append(page_ts.max())
        # NOTE(review): the series is passed newest-first (reverse chronological
        # order) to the ARIMA fit -- confirm this is intended.
        stats[col + '_ar'].append(get_trend(page_ts))
        # Averages over the 7 / 30 most recent records; this equals a week /
        # a month only if there is one record per day -- TODO confirm.
        stats[col + '_avg_last_week'].append(np.average(page_ts[:7]))
        stats[col + '_avg_last_month'].append(np.average(page_ts[:30]))
        stats[col + '_avg_all_period'].append(np.average(page_ts))

In [17]:
# Display the filled accumulator (prints every list in full, so the output is long).
stats

{'contributors_count_ar': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'contributors_count_avg_all_period': [7.0,
  4.0,
  8.0,
  8.0,
  7.0,
  7.0,
  7.0,
  6.0,
  8.0,
  7.0,
  7.0,
  7.0,
  7.0,
  2.0,
  7.0,
  7.0,
  7.0,
  7.0,
  7.0,
  7.0,
  7.0,
  7.0,
  7.0,
  8.0,
  7.0,
  8.0,
  7.0,
  7.0,
  8.0,
  7.0,
  7.0,
  7.0,
  8.0,
  7.0,
  1.0,
  3.0,
  7.0,
  8.0,
  7.0,
  7.0,
  7.0,
  8.0,
  8.0,
  7.0,
  8.0,
  10.0,
  7.0,
  8.0,
  8.0,
  8.0,
  9.0,
  8.0,
  8.0,
  7.0,
  6.0,
  6.0,
  25.0,
  7.0,
  7.0,
  6.0,
  7.0,
  7.0,
 

In [18]:
# Turn the accumulator into a frame: one column per key, one row per page.
result = pd.DataFrame(stats)
result.head()

Unnamed: 0,contributors_count_ar,contributors_count_avg_all_period,contributors_count_avg_last_month,contributors_count_avg_last_week,contributors_count_max,contributors_count_mean,contributors_count_min,contributors_count_std,page_name,revisions_count_ar,revisions_count_avg_all_period,revisions_count_avg_last_month,revisions_count_avg_last_week,revisions_count_max,revisions_count_mean,revisions_count_min,revisions_count_std
0,0,7.0,7.0,7.0,7,7.0,7,0.0,(163950)_2003_UN22,0,9.0,9.0,9.0,9,9.0,9,0.0
1,0,4.0,4.0,4.0,4,4.0,4,0.0,(164207)_2004_GU9,0,21.0,21.0,21.0,21,21.0,21,0.0
2,0,8.0,8.0,8.0,8,8.0,8,0.0,(16553)_1991_TL14,0,11.0,11.0,11.0,11,11.0,11,0.0
3,0,8.0,8.0,8.0,8,8.0,8,0.0,(166609)_2002_RF232,0,10.0,10.0,10.0,10,10.0,10,0.0
4,0,7.0,7.0,7.0,7,7.0,7,0.0,(170025)_2002_VO,0,10.0,10.0,10.0,10,10.0,10,0.0


In [19]:
# Move 'page_name' to the front and keep the other columns in their current order.
# The column is selected by name rather than by position (previously cols[8]):
# the positional index only pointed at 'page_name' when the columns happened to
# be sorted alphabetically, and it picked a different column when the cell was
# run a second time or when pandas kept the dictionary's insertion order.
cols = ['page_name'] + [col for col in result.columns.tolist() if col != 'page_name']
result = result[cols]

In [20]:
# Write the per-page feature table to CSV without the row index.
# Alternative output path, currently disabled:
# result.to_csv("../data/preprocessed.csv", index=False)
result.to_csv("../data/not_translated_preprocessed.csv", index=False)