In [2]:
import tqdm
import numpy as np
import pandas as pd
from statsmodels.tsa.arima_model import ARIMA
# from statsmodels.tsa.arima_model import ARIMAResults
from sklearn.metrics import mean_squared_error

In [79]:
# Show up to 99 columns when a DataFrame is rendered.
pd.options.display.max_columns = 99

In [85]:
# CSV that is loaded into df_translated (rows for translated pages).
TR_PAGES_PATH = "../data/all_translated.csv"
# CSV that is loaded into df_not_translated (rows for pages that were not translated).
NOT_TR_PAGES_PATH = "../data/all_not_translated.csv"

In [86]:
# Load both datasets; each row has the structure (datetime, page_name, [features]).
df_translated, df_not_translated = [
    pd.read_csv(csv_path) for csv_path in (TR_PAGES_PATH, NOT_TR_PAGES_PATH)
]

In [87]:
# Remove the unnecessary 'Unnamed: *' columns (presumably index columns written
# by earlier to_csv calls - confirm) and replace missing values with 0.
# errors='ignore' keeps this cell re-runnable: it rebinds its own inputs, so a
# second execution would otherwise raise KeyError on the already-dropped columns.
leftover_index_columns = ['Unnamed: 0', 'Unnamed: 0.1']
df_translated = (
    df_translated
    .drop(columns=leftover_index_columns, errors='ignore')
    .fillna(0)
)
df_not_translated = (
    df_not_translated
    .drop(columns=leftover_index_columns, errors='ignore')
    .fillna(0)
)
# Both frames in a fixed order (translated first) for the inspection loops below.
df_arr = [df_translated, df_not_translated]

In [88]:
# Preview the first five rows of each dataset.
for df in df_arr:
    preview = df.head(5)
    display(preview)

Unnamed: 0,timestamp,page_name,revisions_count,contributors_count,age_of_page_days,num_of_views,incoming_links,outcoming_links
0,2004-11-05T23:24:05Z,Мільярд,2,2,280,0.0,166.0,29
1,2004-11-04T23:24:05Z,Мільярд,2,2,279,0.0,166.0,29
2,2004-11-03T23:24:05Z,Мільярд,2,2,278,0.0,166.0,29
3,2004-11-02T23:24:05Z,Мільярд,2,2,277,0.0,166.0,29
4,2004-11-01T23:24:05Z,Мільярд,2,2,276,0.0,166.0,29


Unnamed: 0,timestamp,page_name,revisions_count,contributors_count,age_of_page_days,num_of_views,incoming_links,outcoming_links
0,2018-07-10T00:00:00Z,Іванівська сільська рада (Кам'янсько-Дніпровський район),9,7,2298,0.0,0.0,0.0
1,2018-07-10T00:00:00Z,Категорія:Архітектори Кам'янського,4,2,633,0.0,0.0,0.0
2,2018-07-10T00:00:00Z,Категорія:Посли Непалу в Болгарії,1,1,416,0.0,0.0,0.0
3,2018-07-10T00:00:00Z,Князєв,12,8,2472,0.0,10.0,22.0
4,2018-07-10T00:00:00Z,Дущак Анна Кирилівна,23,15,2182,0.0,0.0,0.0


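The number of views is not 0 everywhere, even though the first rows above show only zeros: the summary statistics below have a non-zero mean and maximum for `num_of_views`.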

In [89]:
# Summary statistics of the numeric columns of each dataset.
for df in df_arr:
    summary = df.describe()
    display(summary)

Unnamed: 0,revisions_count,contributors_count,age_of_page_days,num_of_views,incoming_links,outcoming_links
count,251040.0,251040.0,251040.0,251040.0,251040.0,251040.0
mean,16.185739,7.921108,1144.876195,0.273642,119.940368,72.577557
std,33.09134,10.341049,1023.530605,10.154088,282.56764,100.32246
min,1.0,1.0,1.0,0.0,0.0,1.0
25%,3.0,2.0,316.0,0.0,6.0,8.0
50%,7.0,5.0,777.0,0.0,42.0,32.0
75%,16.0,9.0,1756.0,0.0,127.0,108.0
max,731.0,208.0,4642.0,1958.0,6041.0,1223.0


Unnamed: 0,revisions_count,contributors_count,age_of_page_days,num_of_views,incoming_links,outcoming_links
count,158368.0,158368.0,158368.0,158368.0,158368.0,158368.0
mean,12.616052,6.004622,2053.011757,0.035613,12.114625,12.863394
std,25.360375,7.098979,1290.915155,0.37159,212.942454,38.768984
min,1.0,1.0,0.0,0.0,0.0,0.0
25%,3.0,2.0,931.0,0.0,0.0,0.0
50%,6.0,4.0,2029.0,0.0,0.0,0.0
75%,15.0,8.0,3141.0,0.0,0.0,0.0
max,586.0,102.0,5195.0,21.0,15238.0,818.0


In [90]:
def get_trend(dataset):
    """Return the AR(1) coefficient of ``dataset`` from an ARIMA(1, 0, 0) fit.

    The coefficient is used as a simple characteristic of the trend of the
    time series. A series whose standard deviation is exactly zero is not
    fitted at all; ``0`` is returned for it.

    NOTE(review): ``ARIMA`` is imported from ``statsmodels.tsa.arima_model``,
    which newer statsmodels releases no longer ship - confirm the pinned version.
    """
    # Constant series: skip the fit and report a zero coefficient.
    if dataset.std() == 0:
        return 0

    fitted = ARIMA(dataset, order=(1, 0, 0)).fit()
    return fitted.arparams[0]

In [95]:
from collections import OrderedDict

def initiate_stats_dict(ts_column_names):
    """Build the empty ``column name -> list`` container for the page statistics.

    The keys are, in order: ``'page_name'``, ``'page_age_days'`` and then, for
    every entry of ``ts_column_names``, one ``'<column>_<feature>'`` key per
    summary feature. Every key maps to its own fresh empty list.
    """
    # 'avg_last_month' is currently disabled, so no key is created for it.
    feature_suffixes = (
        'mean', 'std', 'ar', 'max', 'min', 'range',
        'avg_last_week', 'avg_all_period',
    )
    keys = ['page_name', 'page_age_days']
    keys.extend(
        col + '_' + suffix
        for col in ts_column_names
        for suffix in feature_suffixes
    )
    return OrderedDict((key, []) for key in keys)

In [96]:
# Names of the time-series columns that create_stats summarises for every page
# and that initiate_stats_dict expands into '<column>_<feature>' keys.
# The list is hard-coded.
ts_column_names = [
    'revisions_count', 
    'contributors_count',
    'num_of_views',
    'incoming_links',
    'outcoming_links'
]

In [97]:
def create_stats(df, ts_columns=None):
    """Collapse the per-timestamp rows of every page into one row of features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'page_name', 'timestamp', 'age_of_page_days' and every
        column listed in ``ts_columns``.
    ts_columns : list of str, optional
        Time-series columns to summarise. When omitted, the notebook-level
        ``ts_column_names`` list is used.

    Returns
    -------
    pandas.DataFrame
        One row per distinct page name (in order of first appearance in
        ``df``) with the columns created by ``initiate_stats_dict``.

    Raises
    ------
    AssertionError
        If the collected feature lists do not all have one entry per page.
    """
    if ts_columns is None:
        ts_columns = ts_column_names
    stats = initiate_stats_dict(ts_columns)

    # Group once instead of filtering the whole frame with a boolean mask for
    # every page; sort=False keeps the pages in order of first appearance.
    for page, page_rows in df.groupby('page_name', sort=False):
        # Newest timestamp first, so the leading entries are the latest ones.
        page_data = page_rows.sort_values(by=['timestamp'], ascending=False)

        stats['page_name'].append(page)
        stats['page_age_days'].append(page_data['age_of_page_days'].values.max())

        for col in ts_columns:
            page_ts = page_data[col].values
            ts_min = page_ts.min()
            ts_max = page_ts.max()
            stats[col + '_mean'].append(float(page_ts.mean()))
            stats[col + '_std'].append(float(page_ts.std()))
            stats[col + '_min'].append(float(ts_min))
            stats[col + '_max'].append(float(ts_max))
            stats[col + '_range'].append(float(ts_max - ts_min))
            stats[col + '_ar'].append(float(get_trend(page_ts)))
            # The 7 latest entries; a full week only if there is one row per
            # day - TODO confirm against the data collection step.
            stats[col + '_avg_last_week'].append(np.average(page_ts[:7]))
            stats[col + '_avg_all_period'].append(np.average(page_ts))

    # Validate the structure: every column must hold exactly one value per page.
    n_pages = df['page_name'].nunique()
    assert all(len(values) == n_pages for values in stats.values())

    return pd.DataFrame.from_dict(stats)

In [98]:
# Features of the translated pages, labelled with translated = 1.
result_tr = create_stats(df_translated).assign(translated=1)
display(result_tr.head(3))

# Features of the not-translated pages, labelled with translated = 0.
result_not_tr = create_stats(df_not_translated).assign(translated=0)
display(result_not_tr.head(3))

  R_mat, T_mat)
  newparams = ((1-np.exp(-params))/
  (1+np.exp(-params))).copy()
  (1+np.exp(-params))).copy()
  tmp = ((1-np.exp(-params))/
  (1+np.exp(-params))).copy()
  (1+np.exp(-params))).copy()


Unnamed: 0,page_name,page_age_days,revisions_count_mean,revisions_count_std,revisions_count_ar,revisions_count_max,revisions_count_min,revisions_count_range,revisions_count_avg_last_week,revisions_count_avg_all_period,contributors_count_mean,contributors_count_std,contributors_count_ar,contributors_count_max,contributors_count_min,contributors_count_range,contributors_count_avg_last_week,contributors_count_avg_all_period,num_of_views_mean,num_of_views_std,num_of_views_ar,num_of_views_max,num_of_views_min,num_of_views_range,num_of_views_avg_last_week,num_of_views_avg_all_period,incoming_links_mean,incoming_links_std,incoming_links_ar,incoming_links_max,incoming_links_min,incoming_links_range,incoming_links_avg_last_week,incoming_links_avg_all_period,outcoming_links_mean,outcoming_links_std,outcoming_links_ar,outcoming_links_max,outcoming_links_min,outcoming_links_range,outcoming_links_avg_last_week,outcoming_links_avg_all_period,translated
0,Мільярд,280,2.0,0.0,0.0,2.0,2.0,0.0,2.0,2.0,2.0,0.0,0.0,2.0,2.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,166.0,0.0,0.0,166.0,166.0,0.0,166.0,166.0,29.0,0.0,0.0,29.0,29.0,0.0,29.0,29.0,1
1,1020-ті,1204,6.0,0.0,0.0,6.0,6.0,0.0,6.0,6.0,4.0,0.0,0.0,4.0,4.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,176.0,0.0,0.0,176.0,176.0,0.0,176.0,176.0,55.0,0.0,0.0,55.0,55.0,0.0,55.0,55.0,1
2,1345,1334,31.2,0.4,0.958504,32.0,31.0,1.0,32.0,31.2,11.0,0.0,0.0,11.0,11.0,0.0,11.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,146.0,0.0,0.0,146.0,146.0,0.0,146.0,146.0,55.5,36.5,-0.79661,92.0,19.0,73.0,60.714286,55.5,1




Unnamed: 0,page_name,page_age_days,revisions_count_mean,revisions_count_std,revisions_count_ar,revisions_count_max,revisions_count_min,revisions_count_range,revisions_count_avg_last_week,revisions_count_avg_all_period,contributors_count_mean,contributors_count_std,contributors_count_ar,contributors_count_max,contributors_count_min,contributors_count_range,contributors_count_avg_last_week,contributors_count_avg_all_period,num_of_views_mean,num_of_views_std,num_of_views_ar,num_of_views_max,num_of_views_min,num_of_views_range,num_of_views_avg_last_week,num_of_views_avg_all_period,incoming_links_mean,incoming_links_std,incoming_links_ar,incoming_links_max,incoming_links_min,incoming_links_range,incoming_links_avg_last_week,incoming_links_avg_all_period,outcoming_links_mean,outcoming_links_std,outcoming_links_ar,outcoming_links_max,outcoming_links_min,outcoming_links_range,outcoming_links_avg_last_week,outcoming_links_avg_all_period,translated
0,Іванівська сільська рада (Кам'янсько-Дніпровський район),2298,9.0,0.0,0.0,9.0,9.0,0.0,9.0,9.0,7.0,0.0,0.0,7.0,7.0,0.0,7.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,Категорія:Архітектори Кам'янського,633,4.0,0.0,0.0,4.0,4.0,0.0,4.0,4.0,2.0,0.0,0.0,2.0,2.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,Категорія:Посли Непалу в Болгарії,416,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


### Remove columns with 0 standard deviation

In [104]:
# Column totals of the incoming-links dynamics features, one line per dataset.
dynamics_columns = ['incoming_links_range', 'incoming_links_std', 'incoming_links_ar']
for features in (result_tr, result_not_tr):
    print(*(features[column].sum() for column in dynamics_columns))

0.0 0.0 0.0
0.0 0.0 0.0


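We can see that the number of incoming links does not change over the 30-day period: the range, standard deviation and AR coefficient are zero for every page. That is why we exclude from the datasets the columns that characterize the dynamics of the incoming links number, together with the max, min and average columns, which for a constant series all equal the mean.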

In [None]:
# Incoming-links features removed from both datasets; only the mean is kept.
incoming_links_dropped = [
    'incoming_links_range', 'incoming_links_std', 'incoming_links_ar',
    'incoming_links_max', 'incoming_links_min', 'incoming_links_avg_last_week',
    'incoming_links_avg_all_period',
]
# errors='ignore' keeps this cell re-runnable: a second execution would
# otherwise raise KeyError because the columns are already gone.
result_tr = result_tr.drop(columns=incoming_links_dropped, errors='ignore')
result_not_tr = result_not_tr.drop(columns=incoming_links_dropped, errors='ignore')

### Save results

In [116]:
# Write both feature tables to disk without the index column.
for features, out_path in (
    (result_tr, "../data/tr_preprocessed.csv"),
    (result_not_tr, "../data/not_tr_preprocessed.csv"),
):
    features.to_csv(out_path, index=False)