In [None]:
import tqdm
import numpy as np
import pandas as pd
from statsmodels.tsa.arima_model import ARIMA
# from statsmodels.tsa.arima_model import ARIMAResults
from sklearn.metrics import mean_squared_error

In [None]:
# # to ignore warnings when fitting arima
# import warnings
# warnings.filterwarnings("ignore")

In [None]:
TR_PAGES_PATH = "../data/40days100pages.csv"
NOT_TR_PAGES_PATH = "../data/Not_translated40days100pages.csv"

In [None]:
# read data where each row has a structure: (datetime, pagen_name, [features])
df_translated = pd.read_csv(TR_PAGES_PATH)
df_not_translated = pd.read_csv(NOT_TR_PAGES_PATH)


In [None]:
# remove unneccessary fields
df_translated = df_translated.drop('Unnamed: 0',axis=1)
df_not_translated = df_not_translated.drop('Unnamed: 0',axis=1)
df_arr = [df_translated, df_not_translated]

In [25]:
for df in df_arr:
    display(df[:3])

Unnamed: 0,timestamp,page_name,revisions_count,contributors_count,age_of_page_days
0,2018-06-07T12:00:00Z,Himantolophus_borealis,5,1,1294
1,2018-06-07T12:00:00Z,Himantolophus_brevirostris,6,2,1294
2,2018-06-07T12:00:00Z,Himantolophus_compressus,7,3,1294


Unnamed: 0,timestamp,page_name,revisions_count,contributors_count,age_of_page_days
0,2018-06-07T12:00:00Z,(163950)_2003_UN22,9,7,2565
1,2018-06-07T12:00:00Z,(164207)_2004_GU9,21,4,896
2,2018-06-07T12:00:00Z,(16553)_1991_TL14,11,8,2568


In [44]:
df_translated[df_translated['page_name']=='Himantolophus_borealis'][:5]

Unnamed: 0,timestamp,page_name,revisions_count,contributors_count,age_of_page_days
0,2018-06-07T12:00:00Z,Himantolophus_borealis,5,1,1294
100,2018-06-06T12:00:00Z,Himantolophus_borealis,5,1,1293
200,2018-06-05T12:00:00Z,Himantolophus_borealis,5,1,1292
300,2018-06-04T12:00:00Z,Himantolophus_borealis,5,1,1291
400,2018-06-03T12:00:00Z,Himantolophus_borealis,5,1,1290


In [28]:
for df in df_arr:
    display(df.describe())

Unnamed: 0,revisions_count,contributors_count,age_of_page_days
count,4000.0,4000.0,4000.0
mean,19.5055,10.015,2785.6
std,51.502681,12.913383,1438.134923
min,1.0,1.0,219.0
25%,5.0,2.0,1281.0
50%,11.0,8.5,3305.5
75%,16.5,13.0,4138.0
max,472.0,112.0,4434.0


Unnamed: 0,revisions_count,contributors_count,age_of_page_days
count,4000.0,4000.0,4000.0
mean,11.57,7.49,2364.81
std,7.045389,2.941439,548.472834
min,3.0,1.0,163.0
25%,9.0,7.0,2532.0
50%,10.0,7.0,2544.0
75%,13.0,8.0,2555.0
max,64.0,25.0,2818.0


In [29]:
# find coefficient characterising trend of TS by fitting a simple arima
def get_trend(dataset):
    if dataset.std() == 0:
        return 0
    else:
        model = ARIMA(dataset, order=(1,0,0))
        model_fit = model.fit()
        return model_fit.arparams[0]

In [106]:
from collections import OrderedDict

def initiate_stats_dict(ts_column_names):
    feature_dict = OrderedDict( [('page_name',[])] )
    feature_dict['page_age_days'] = []
    ts_features = [
        'mean',
        'std',
        'ar',
        'max',
        'min',
        'range',
        'avg_last_week',
        'avg_last_month',
        'avg_all_period'
    ]
    for col in ts_column_names:
        for feature in ts_features:
            feature_dict[col+'_'+feature] = []
    return feature_dict

In [37]:
# ts_columns should be hard coded...
ts_column_names = [
    'revisions_count', 
    'contributors_count'
]

In [114]:
def create_stats(df):
    unique_pages = df['page_name'].unique()
    stats = initiate_stats_dict(ts_column_names)
    # display(stats)
    
    for page in unique_pages:
        stats['page_name'].append(page)
        page_data = df[df['page_name'] == page].sort_values(by = ['timestamp'],ascending=False)
        stats['page_age_days'].append(page_data['age_of_page_days'].values.max())
        for col in ts_column_names:
            page_ts = page_data[col].values
            stats[col+'_'+'mean'].append(float(page_ts.mean()))
            stats[col+'_'+'std'].append(float(page_ts.std()))
            stats[col+'_'+'min'].append(float(page_ts.min()))
            stats[col+'_'+'max'].append(float(page_ts.max() - page_ts.min()))
            stats[col+'_'+'range'].append(float(page_ts.max()))
            stats[col+'_'+'ar'].append(float(get_trend(page_ts)))
            stats[col+'_'+'avg_last_week'].append(np.average(page_ts[:7]))
            stats[col+'_'+'avg_last_month'].append(np.average(page_ts[:30]))
            stats[col+'_'+'avg_all_period'].append(np.average(page_ts))
    
    #validate stats structure (all columns should be of the same length
    n_names = len(unique_pages)
    assert all(len(stats[name])==n_names for name in stats.keys())
    
    result = pd.DataFrame.from_dict(stats)
    
#     display(result[:3])
    return result

In [115]:
result_tr = create_stats(df_translated)
result_tr['translated'] = 1
display(result_tr[:3])

result_not_tr = create_stats(df_not_translated)
result_not_tr['translated'] = 0
display(result_not_tr[:3])

Unnamed: 0,page_name,page_age_days,revisions_count_mean,revisions_count_std,revisions_count_ar,revisions_count_max,revisions_count_min,revisions_count_range,revisions_count_avg_last_week,revisions_count_avg_last_month,...,contributors_count_mean,contributors_count_std,contributors_count_ar,contributors_count_max,contributors_count_min,contributors_count_range,contributors_count_avg_last_week,contributors_count_avg_last_month,contributors_count_avg_all_period,translated
0,Himantolophus_borealis,1294,5.0,0.0,0.0,0.0,5.0,5.0,5.0,5.0,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1
1,Himantolophus_brevirostris,1294,6.0,0.0,0.0,0.0,6.0,6.0,6.0,6.0,...,2.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,2.0,1
2,Himantolophus_compressus,1294,7.0,0.0,0.0,0.0,7.0,7.0,7.0,7.0,...,3.0,0.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,1


Unnamed: 0,page_name,page_age_days,revisions_count_mean,revisions_count_std,revisions_count_ar,revisions_count_max,revisions_count_min,revisions_count_range,revisions_count_avg_last_week,revisions_count_avg_last_month,...,contributors_count_mean,contributors_count_std,contributors_count_ar,contributors_count_max,contributors_count_min,contributors_count_range,contributors_count_avg_last_week,contributors_count_avg_last_month,contributors_count_avg_all_period,translated
0,(163950)_2003_UN22,2565,9.0,0.0,0.0,0.0,9.0,9.0,9.0,9.0,...,7.0,0.0,0.0,0.0,7.0,7.0,7.0,7.0,7.0,0
1,(164207)_2004_GU9,896,21.0,0.0,0.0,0.0,21.0,21.0,21.0,21.0,...,4.0,0.0,0.0,0.0,4.0,4.0,4.0,4.0,4.0,0
2,(16553)_1991_TL14,2568,11.0,0.0,0.0,0.0,11.0,11.0,11.0,11.0,...,8.0,0.0,0.0,0.0,8.0,8.0,8.0,8.0,8.0,0


In [116]:
# write results
result_tr.to_csv("../data/tr_preprocessed.csv", index=False)
result_not_tr.to_csv("../data/not_tr_preprocessed.csv", index=False)