In [None]:
# from SPARQLWrapper import SPARQLWrapper, JSON

#from ratelimiter import RateLimiter
from collections import OrderedDict
import pandas as pd
#%load_ext sql_magic

import os
import glob
from urllib.parse import unquote

import wmfdata as wmf
from wmfdata import charting, mariadb, hive
from wmfdata.utils import pct_str, pd_display_all

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns

In [None]:
# Load the pre-computed relative-length dataset into a DataFrame.
# `index=` is a DataFrame.to_csv() option; pd.read_csv() does not accept it and
# raises TypeError, so it is dropped here.
# NOTE(review): the file name says "indonesia" but the wikis analysed below are
# as/bn/gu/hi/... — confirm this is the intended input file.
df_relative_length = pd.read_csv(
    '../../../../data/processed/relative_length/indonesia_relative_lengths.csv',
    encoding='utf-8',
)

In [None]:
# Number of non-null page_len values per wiki, keyed by the database_code column.
df_relative_length.groupby('database_code')['page_len'].count()

In [None]:
# Same non-null page_len count, this time keyed by the wikicode column.
df_relative_length.groupby('wikicode')['page_len'].count()

In [None]:
# Median page_len per wiki; the resulting Series is indexed by database_code.
page_len_by_db = df_relative_length.groupby('database_code')['page_len']
median_values = page_len_by_db.median()

In [None]:
# Persist median_values with IPython's %store magic so it can be restored in
# another kernel/notebook via `%store -r median_values`.
%store median_values

In [None]:
# Bar chart of the median page length per wiki.
ax = median_values.plot(kind='bar', figsize=(10, 6), color="indigo", fontsize=11)
ax.set_alpha(0.8)
ax.set_title("Median Values", fontsize=22)
ax.set_ylabel("page_len", fontsize=15)

# Save BEFORE plt.show(): with the inline backend show() renders and closes the
# figure, so a savefig issued afterwards writes an empty canvas.
path = './results/figs/relative_length/'
ax.figure.savefig(path+'1_median_values.png')
plt.show()

In [None]:
# Per-wiki page_len bar plot on a log-scaled y axis. The whiskers drawn by
# seaborn are error bars, see https://en.wikipedia.org/wiki/Error_bar
sns.set(style='whitegrid')
fig = plt.figure(figsize=(13, 7))
g = sns.barplot(data=df_relative_length, x='wikicode', y='page_len')
g.set_yscale("log")
g.set_title('Logscale Pagelen Bar Plot: as,bn,gu,hi,kn,ml,mr,or,pa,sa,ta,te,ur')
path = './results/figs/relative_length/'
fig.savefig(path+'2_logscale_pagelens_by_wiki');

In [None]:
# Filter the dataframe to one wiki at a time and overlay their page_len KDEs.
# For the meaning of the KDE y axis see:
# https://stackoverflow.com/questions/51666784/what-is-y-axis-in-seaborn-distplot
wikicode_bn = df_relative_length.loc[df_relative_length['wikicode'] == 'bnwiki']  # author's note: normal distribution
wikicode_ml = df_relative_length.loc[df_relative_length['wikicode'] == 'mlwiki']
sns.distplot(wikicode_bn[['page_len']], hist=False, rug=True, label='bnwiki')
sns.distplot(wikicode_ml[['page_len']], hist=False, rug=True, label='mlwiki')
plt.ylabel('Density')
plt.xlabel('Page Length')
plt.title('Page Length & KDE: bn, ml')
plt.legend()

# Save BEFORE plt.show(): with the inline backend show() renders and closes the
# figure, so a savefig issued afterwards writes an empty canvas.
path = './results/figs/relative_length/'
plt.savefig(path+'3_Pagelen_KDE_bn_ml.png')
plt.show()

In [None]:
# Overlay the page_len KDEs (with rug marks) of hiwiki, knwiki, guwiki and tewiki.
wikicode_hi = df_relative_length.loc[df_relative_length['wikicode'] == 'hiwiki']
wikicode_kn = df_relative_length.loc[df_relative_length['wikicode'] == 'knwiki']
wikicode_gu = df_relative_length.loc[df_relative_length['wikicode'] == 'guwiki']
wikicode_te = df_relative_length.loc[df_relative_length['wikicode'] == 'tewiki']

sns.distplot(wikicode_hi[['page_len']], hist=False, rug=True, label='hiwiki')
sns.distplot(wikicode_kn[['page_len']], hist=False, rug=True, label='knwiki')
sns.distplot(wikicode_gu[['page_len']], hist=False, rug=True, label='guwiki')
sns.distplot(wikicode_te[['page_len']], hist=False, rug=True, label='tewiki')
plt.ylabel('Density')
plt.xlabel('Page Length')
plt.title('Page Length & KDE: hi, kn, gu, te')
plt.legend()

# Save BEFORE plt.show(): with the inline backend show() renders and closes the
# figure, so a savefig issued afterwards writes an empty canvas.
path = './results/figs/relative_length/'
plt.savefig(path+'4_Pagelen_KDE_hi_kn_gu_te')
plt.show()

In [None]:
# page_len distribution of urwiki (histogram + KDE) against sawiki (KDE only).
wikicode_ur = df_relative_length.loc[df_relative_length['wikicode'] == 'urwiki']
wikicode_sa = df_relative_length.loc[df_relative_length['wikicode'] == 'sawiki']
sns.distplot(wikicode_ur[['page_len']], rug=True, label='urwiki')
sns.distplot(wikicode_sa[['page_len']], hist=False, rug=True, label='sawiki')
plt.ylabel('Density')
plt.xlabel('Page Length')
plt.title('Page Length & KDE: sa, ur')
plt.legend()

# Save BEFORE plt.show(): with the inline backend show() renders and closes the
# figure, so a savefig issued afterwards writes an empty canvas.
path = './results/figs/relative_length/'
plt.savefig(path+'5_Pagelen_KDE_sa_ur')
plt.show()

In [None]:
# page_len distribution of orwiki (histogram + KDE + rug marks).
wikicode_or = df_relative_length.loc[df_relative_length['wikicode'] == 'orwiki']
sns.distplot(wikicode_or[['page_len']], rug=True, label='orwiki')
plt.ylabel('Density')
plt.xlabel('Page Length')
plt.title('Page Length & KDE: or')
plt.legend()

# Save BEFORE plt.show(): with the inline backend show() renders and closes the
# figure, so a savefig issued afterwards writes an empty canvas.
path = './results/figs/relative_length/'
plt.savefig(path+'6_Pagelen_KDE_or')
plt.show()

In [None]:
# page_len distributions (histogram + KDE + rug marks) of aswiki, mrwiki and
# tawiki on one wide figure.
plt.figure(figsize=(17, 8))
wikicode_as = df_relative_length.loc[df_relative_length['wikicode'] == 'aswiki']  # 12 featured articles
wikicode_mr = df_relative_length.loc[df_relative_length['wikicode'] == 'mrwiki']
wikicode_ta = df_relative_length.loc[df_relative_length['wikicode'] == 'tawiki']  # 13 featured articles
sns.distplot(wikicode_as[['page_len']], rug=True, label='aswiki')
sns.distplot(wikicode_mr[['page_len']], rug=True, label='mrwiki')
sns.distplot(wikicode_ta[['page_len']], rug=True, label='tawiki')
plt.ylabel('Density')
plt.xlabel('Page Length')
plt.title('Page Length & KDE: as, mr, ta')
plt.legend()

# Save BEFORE plt.show(): with the inline backend show() renders and closes the
# figure, so a savefig issued afterwards writes an empty canvas.
path = './results/figs/relative_length/'
plt.savefig(path+'7_Pagelen_KDE_as_mr_ta')
plt.show()

In [None]:
# Keep only the rows of the three wikis that get a closer look below.
in_review_set = df_relative_length['wikicode'].isin(['aswiki', 'mrwiki', 'tawiki'])
review_wikis = df_relative_length[in_review_set]

In [None]:
# Grouped histogram of page_len: one series per wiki present in review_wikis.
review_wiki_list = review_wikis.wikicode.unique()
plt.figure(figsize=(13, 7))
plt.hist(
    [review_wikis.loc[review_wikis.wikicode == x, 'page_len'] for x in review_wiki_list],
    label=review_wiki_list,
)
plt.ylabel('Count')
plt.xlabel('Page Length')
plt.title('Page Length & Count: mr, ta, as')
plt.legend()

# Save BEFORE plt.show(): with the inline backend show() renders and closes the
# figure, so a savefig issued afterwards writes an empty canvas.
path = './results/figs/relative_length/'
plt.savefig(path+'8_Pagelen_count_mr_ta_as')
plt.show()

In [None]:
# Per-wiki page_len bar plot (log-scaled y axis) restricted to review_wikis.
# The whiskers are error bars, see https://en.wikipedia.org/wiki/Error_bar
sns.set(style='whitegrid')
fig = plt.figure(figsize=(13, 7))
g = sns.barplot(data=review_wikis, x='wikicode', y='page_len')
g.set_yscale("log")
g.set_title('Logscale Pagelen Bar Plot: mr, ta, as')
path = './results/figs/relative_length/'
fig.savefig(path+'9_logscale_barplot_mr_ta_as');

### Address pawiki which only had one featured article

In [None]:
# Display the pawiki rows (per the author's note there is only one featured article).
is_pawiki = df_relative_length['wikicode'] == 'pawiki'
df_relative_length.loc[is_pawiki]

In [None]:
# page_len Series of the pawiki rows; it keeps the original row index of df_relative_length.
median_value_pa = df_relative_length.loc[df_relative_length['wikicode'] == 'pawiki', 'page_len']

### Address tcywiki and satwiki which had zero featured articles

In [None]:
# Run the same article-length query against the tcywiki and satwiki databases.
# NOTE(review): articles_len_wiki_query is not defined in any cell visible in
# this notebook — it must exist in the kernel before this cell runs; confirm
# where it comes from so Restart & Run All works.
tcy_articles_r = mariadb.run(articles_len_wiki_query, 'tcywiki')
sat_articles_r = mariadb.run(articles_len_wiki_query, 'satwiki')

#### start by cleaning tcy_articles_r data

In [None]:
# &
# Is there any tcywiki row flagged as BOTH a redirect and a double redirect?
tcy_articles_r[['p1_is_redirect', 'is_double_redirect']].eq(1).all(axis=1).any()

In [None]:
# |
# Is there any tcywiki row flagged as a redirect OR a double redirect?
tcy_articles_r[['p1_is_redirect', 'is_double_redirect']].eq(1).any(axis=1).any()

In [None]:
# Act on the redirect check: drop redirect pages from the tcywiki result.
#
# The previous version removed redirects with
#   df[~df.isin(redirect_df)].dropna(how='all')
# which only works through index/column alignment and, by masking cells with
# NaN, turns the integer columns into floats. A boolean row mask expresses the
# same filter directly and leaves the dtypes alone.
article_columns_tcy = ['page_id', 'page_title', 'page_len']
is_redirect_tcy = tcy_articles_r['p1_is_redirect'] == 1

# every page returned by the query, restricted to the columns of interest
all_surviving_articles_tcy = tcy_articles_r[article_columns_tcy]

# the redirect pages on their own (kept for inspection)
redirects_tcy = tcy_articles_r.loc[is_redirect_tcy]
redirect_df_tcy = redirects_tcy[article_columns_tcy]

# the non-redirect articles; dropna(how='all') is kept so fully-empty rows are
# still discarded as before
articles_tcy = all_surviving_articles_tcy.loc[~is_redirect_tcy].dropna(how='all')


#### now clean sat_articles_r data

In [None]:
# Now with sat_articles_r
# Is there any satwiki row flagged as BOTH a redirect and a double redirect?
sat_articles_r[['p1_is_redirect', 'is_double_redirect']].eq(1).all(axis=1).any()

In [None]:
# |
# Is there any satwiki row flagged as a redirect OR a double redirect?
sat_articles_r[['p1_is_redirect', 'is_double_redirect']].eq(1).any(axis=1).any()

In [None]:
# Act on the redirect check: drop redirect pages from the satwiki result.
#
# The previous version removed redirects with
#   df[~df.isin(redirect_df)].dropna(how='all')
# which only works through index/column alignment and, by masking cells with
# NaN, turns the integer columns into floats. A boolean row mask expresses the
# same filter directly and leaves the dtypes alone.
article_columns_sat = ['page_id', 'page_title', 'page_len']
is_redirect_sat = sat_articles_r['p1_is_redirect'] == 1

# every page returned by the query, restricted to the columns of interest
all_surviving_articles_sat = sat_articles_r[article_columns_sat]

# the redirect pages on their own (kept for inspection)
redirects_sat = sat_articles_r.loc[is_redirect_sat]
redirect_df_sat = redirects_sat[article_columns_sat]

# the non-redirect articles; dropna(how='all') is kept so fully-empty rows are
# still discarded as before
articles_sat = all_surviving_articles_sat.loc[~is_redirect_sat].dropna(how='all')


In [None]:
# Summarise the cleaned tcywiki frame (row count, columns, dtypes, non-null counts).
articles_tcy.info()

In [None]:
# Summarise the cleaned satwiki frame (row count, columns, dtypes, non-null counts).
articles_sat.info()

In [None]:
# Median article length over the cleaned satwiki articles; displayed as the cell output.
median_values_sat = articles_sat.loc[:, 'page_len'].median()
median_values_sat

In [None]:
# Histogram (30 bins) of satwiki article lengths, saved as figure 10.
ax_sat = articles_sat['page_len'].hist(bins=30, figsize=(16, 18))
ax_sat.set_ylabel('Count')
ax_sat.set_xlabel('Page Length')
ax_sat.set_title('Page Length Distribution: sat')
path = './results/figs/relative_length/'
ax_sat.figure.savefig(path+'10_pagelen_dis_sat');

In [None]:
# Median article length over the cleaned tcywiki articles; displayed as the cell output.
median_values_tcy = articles_tcy.loc[:, 'page_len'].median()
median_values_tcy

In [None]:
# Histogram (20 bins) of tcywiki article lengths, saved as figure 11.
ax_tcy = articles_tcy['page_len'].hist(bins=20, figsize=(16, 18))
ax_tcy.set_ylabel('Count')
ax_tcy.set_xlabel('Page Length')
ax_tcy.set_title('Page Length Distribution: tcy')
path = './results/figs/relative_length/'
ax_tcy.figure.savefig(path+'11_pagelen_dis_tcy');

## create an index of median values

In [None]:
# Put the satwiki and tcywiki medians into a (wikicode, page_len) frame so they
# can be concatenated with the other per-wiki medians.
# The medians were previously typed in by hand (6438 and 3133); using the
# values computed in the cells above keeps this frame in sync with the data.
tcy_sat = pd.DataFrame({
    'wikicode': ['satwiki', 'tcywiki'],
    'page_len': [median_values_sat, median_values_tcy],
})

In [None]:
# Turn the per-wiki median Series into a two-column frame.
# The Series index is named 'database_code' (from the groupby), while the other
# frames concatenated later key on 'wikicode'; without the rename the concat
# produces two half-empty key columns.
# NOTE(review): assumes database_code holds the same identifiers as wikicode
# (e.g. 'bnwiki') — confirm against the CSV.
median_values = median_values.rename_axis('wikicode').reset_index()

In [None]:
# Convert the pawiki page_len Series to a frame; the old row index lands in a
# column that is renamed to 'wikicode' (its values are still row numbers here).
median_value_pa = (
    median_value_pa
    .reset_index()
    .rename(columns={'index': 'wikicode'})
)

In [None]:
# Every row of median_value_pa comes from the pawiki filter, so label the whole
# column directly. The previous .map({461: 'pawiki'}) depended on the row
# sitting at index 461 of the CSV and yielded NaN whenever that changed.
median_value_pa['wikicode'] = 'pawiki'

In [None]:
# Stack the three median tables (featured-article wikis, pawiki, sat/tcy) row-wise.
median_frames = [median_values, median_value_pa, tcy_sat]
IN_median_vi = pd.concat(median_frames)

In [None]:
# sort_values() returns a new frame; the result was previously discarded, so the
# index was never actually sorted. Assign it back and renumber the rows.
IN_median_vi = IN_median_vi.sort_values('page_len').reset_index(drop=True)

In [None]:
# Call the median page length column 'mpl_index' in the final index table.
IN_median_vi = IN_median_vi.rename(columns={'page_len': 'mpl_index'})

In [None]:
# Persist the finished index with IPython's %store magic so it can be restored
# in another kernel/notebook via `%store -r IN_median_vi`.
%store IN_median_vi