In [1]:
import tqdm
import numpy as np
import pandas as pd
from statsmodels.tsa.arima_model import ARIMA
# from statsmodels.tsa.arima_model import ARIMAResults
from sklearn.metrics import mean_squared_error

In [None]:
# # to ignore warnings when fitting arima
# import warnings
# warnings.filterwarnings("ignore")

In [2]:
TR_PAGES_PATH = "../data/Clean_df_timeseries.csv"
NOT_TR_PAGES_PATH = "../data/Not_tranlated_timeseries.csv"

In [3]:
# read data where each row has a structure: (datetime, pagen_name, [features])
df_translated = pd.read_csv(TR_PAGES_PATH)
df_not_translated = pd.read_csv(NOT_TR_PAGES_PATH)


In [4]:
# remove unneccessary fields
df_translated = df_translated.drop('Unnamed: 0',axis=1)
df_not_translated = df_not_translated.drop('Unnamed: 0',axis=1)
df_arr = [df_translated, df_not_translated]

In [5]:
for df in df_arr:
    display(df[:3])

Unnamed: 0,timestamp,page_name,revisions_count,contributors_count,age_of_page_days,num_of_views
0,2004-11-05T23:24:05Z,Мільярд,2,2,280,0.0
1,2004-11-04T23:24:05Z,Мільярд,2,2,279,0.0
2,2004-11-03T23:24:05Z,Мільярд,2,2,278,0.0


Unnamed: 0,timestamp,page_name,revisions_count,contributors_count,age_of_page_days,num_of_views
0,2018-07-10T00:00:00Z,Іванівська сільська рада (Кам'янсько-Дніпровсь...,9,7,2298,0.0
1,2018-07-10T00:00:00Z,Категорія:Архітектори Кам'янського,4,2,633,0.0
2,2018-07-10T00:00:00Z,Категорія:Посли Непалу в Болгарії,1,1,416,0.0


In [6]:
df_translated[df_translated['page_name']=='Himantolophus_borealis'][:5]

Unnamed: 0,timestamp,page_name,revisions_count,contributors_count,age_of_page_days,num_of_views


In [7]:
for df in df_arr:
    display(df.describe())

Unnamed: 0,revisions_count,contributors_count,age_of_page_days,num_of_views
count,167250.0,167250.0,167250.0,167250.0
mean,13.837232,7.118164,1150.646547,0.220897
std,27.987941,9.285681,1005.995837,8.950244
min,1.0,1.0,1.0,0.0
25%,3.0,2.0,365.0,0.0
50%,6.0,4.0,780.0,0.0
75%,14.0,8.0,1714.0,0.0
max,731.0,208.0,4642.0,1958.0


Unnamed: 0,revisions_count,contributors_count,age_of_page_days,num_of_views
count,149548.0,149548.0,149548.0,149548.0
mean,12.055975,5.737201,2031.270936,0.030204
std,22.666954,6.606972,1272.215753,0.331828
min,1.0,1.0,0.0,0.0
25%,3.0,2.0,941.0,0.0
50%,6.0,4.0,1972.0,0.0
75%,14.0,8.0,3028.0,0.0
max,586.0,102.0,5195.0,21.0


In [8]:
# find coefficient characterising trend of TS by fitting a simple arima
def get_trend(dataset):
    if dataset.std() == 0:
        return 0
    else:
        model = ARIMA(dataset, order=(1,0,0))
        model_fit = model.fit()
        return model_fit.arparams[0]

In [9]:
from collections import OrderedDict

def initiate_stats_dict(ts_column_names):
    feature_dict = OrderedDict( [('page_name',[])] )
    feature_dict['page_age_days'] = []
    ts_features = [
        'mean',
        'std',
        'ar',
        'max',
        'min',
        'range',
        'avg_last_week',
        'avg_last_month',
        'avg_all_period'
    ]
    for col in ts_column_names:
        for feature in ts_features:
            feature_dict[col+'_'+feature] = []
    return feature_dict

In [10]:
# ts_columns should be hard coded...
ts_column_names = [
    'revisions_count', 
    'contributors_count'
]

In [11]:
def create_stats(df):
    unique_pages = df['page_name'].unique()
    stats = initiate_stats_dict(ts_column_names)
    # display(stats)
    
    for page in unique_pages:
        stats['page_name'].append(page)
        page_data = df[df['page_name'] == page].sort_values(by = ['timestamp'],ascending=False)
        stats['page_age_days'].append(page_data['age_of_page_days'].values.max())
        for col in ts_column_names:
            page_ts = page_data[col].values
            stats[col+'_'+'mean'].append(float(page_ts.mean()))
            stats[col+'_'+'std'].append(float(page_ts.std()))
            stats[col+'_'+'min'].append(float(page_ts.min()))
            stats[col+'_'+'max'].append(float(page_ts.max() - page_ts.min()))
            stats[col+'_'+'range'].append(float(page_ts.max()))
            stats[col+'_'+'ar'].append(float(get_trend(page_ts)))
            stats[col+'_'+'avg_last_week'].append(np.average(page_ts[:7]))
            stats[col+'_'+'avg_last_month'].append(np.average(page_ts[:30]))
            stats[col+'_'+'avg_all_period'].append(np.average(page_ts))
    
    #validate stats structure (all columns should be of the same length
    n_names = len(unique_pages)
    assert all(len(stats[name])==n_names for name in stats.keys())
    
    result = pd.DataFrame.from_dict(stats)
    
#     display(result[:3])
    return result

In [12]:
result_tr = create_stats(df_translated)
result_tr['translated'] = 1
display(result_tr[:3])

result_not_tr = create_stats(df_not_translated)
result_not_tr['translated'] = 0
display(result_not_tr[:3])

  newparams = ((1-np.exp(-params))/
  (1+np.exp(-params))).copy()
  (1+np.exp(-params))).copy()
  tmp = ((1-np.exp(-params))/
  (1+np.exp(-params))).copy()
  (1+np.exp(-params))).copy()
  R_mat, T_mat)


Unnamed: 0,page_name,page_age_days,revisions_count_mean,revisions_count_std,revisions_count_ar,revisions_count_max,revisions_count_min,revisions_count_range,revisions_count_avg_last_week,revisions_count_avg_last_month,...,contributors_count_mean,contributors_count_std,contributors_count_ar,contributors_count_max,contributors_count_min,contributors_count_range,contributors_count_avg_last_week,contributors_count_avg_last_month,contributors_count_avg_all_period,translated
0,Мільярд,280,2.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,...,2.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,2.0,1
1,1020-ті,1204,6.0,0.0,0.0,0.0,6.0,6.0,6.0,6.0,...,4.0,0.0,0.0,0.0,4.0,4.0,4.0,4.0,4.0,1
2,1345,1334,31.2,0.4,0.914108,1.0,31.0,32.0,31.857143,31.2,...,11.0,0.0,0.0,0.0,11.0,11.0,11.0,11.0,11.0,1




Unnamed: 0,page_name,page_age_days,revisions_count_mean,revisions_count_std,revisions_count_ar,revisions_count_max,revisions_count_min,revisions_count_range,revisions_count_avg_last_week,revisions_count_avg_last_month,...,contributors_count_mean,contributors_count_std,contributors_count_ar,contributors_count_max,contributors_count_min,contributors_count_range,contributors_count_avg_last_week,contributors_count_avg_last_month,contributors_count_avg_all_period,translated
0,Іванівська сільська рада (Кам'янсько-Дніпровсь...,2298,9.0,0.0,0.0,0.0,9.0,9.0,9.0,9.0,...,7.0,0.0,0.0,0.0,7.0,7.0,7.0,7.0,7.0,0
1,Категорія:Архітектори Кам'янського,633,4.0,0.0,0.0,0.0,4.0,4.0,4.0,4.0,...,2.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,2.0,0
2,Категорія:Посли Непалу в Болгарії,416,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0


In [13]:
# write results
result_tr.to_csv("../data/tr_preprocessed.csv", index=False)
result_not_tr.to_csv("../data/not_tr_preprocessed.csv", index=False)