In [12]:
# from SPARQLWrapper import SPARQLWrapper, JSON

#from ratelimiter import RateLimiter
from collections import OrderedDict
import pandas as pd
#%load_ext sql_magic

import os
import glob
from urllib.parse import unquote

import wmfdata as wmf
from wmfdata import charting, mariadb, hive
from wmfdata.utils import pct_str, pd_display_all

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns

In [13]:
%store -r query_vars

In [14]:
#In the absence of a functional SPARQL LOOP CALL, run SPARQL Query manually for each wiki: https://w.wiki/Gh$

#download results into csv files, one for each wiki
#note, some wikis don't have featured articles

In [None]:
os.chdir("../../data/raw/relative_length/indonesia/")

extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

In [8]:
#combine all files in the list
combined = pd.concat([pd.read_csv(f) for f in all_filenames ], sort=True)
#export to csv
combined.to_csv( "_combined.csv", index=False, encoding='utf-8-sig')

In [17]:
#os.chdir("./data/raw/relative_length")
combined_r = pd.read_csv("_combined.csv")
combined_r2 = combined_r.copy()

In [18]:
#extract wikicode
combined_r2['wikicode'] = combined_r2['sitelink'].str.extract('([a-z]+(?=\.))', expand=True)

#add 'wiki'
combined_r2['wikicode'] = combined_r2['wikicode']+'wiki'

#extract url title
combined_r2['encoded_title'] = combined_r2['sitelink'].str.extract('([^\/]+$)', expand=True)

#encoded URL to decoded title
combined_r2['page_title'] = combined_r2['encoded_title'].apply(lambda x: unquote(x))

In [21]:
combined_r2 = combined_r2.rename(columns={'wikicode':'database_code'})

In [30]:
# adapted from https://github.com/nettrom/suggestbot/blob/master/tool-labs/link-rec/link-recommender.py#L208
#https://www.mediawiki.org/wiki/Manual:Redirect_table
#https://www.mediawiki.org/wiki/Manual:Page_table
#https://www.mediawiki.org/wiki/Manual:Pagelinks_table
#--rd.redirect_id -- where is this field located? in which table can it be found?

articles = []

def get_clean_ids_mariadb(df):
    
    '''
    Connect to the MediaWiki databases for the wikis found
    in the given `pandas.DataFrame` `df` and return a
    DataFrame with pageids for non redirect articles
    '''

    clean_id_query = '''
    SELECT 
       p1.page_title AS page_title,
       DATABASE() AS database_code,
       p1.page_id  AS page_id,
       p1.page_len page_len,
       p1.page_is_redirect AS p1_is_redirect,
       p2.page_id AS rpage_id,
       p2.page_title AS rpage_title,
       p2.page_len rpage_len,
       p2.page_is_redirect AS is_double_redirect
    FROM page AS p1 
    LEFT JOIN redirect AS rd 
        ON p1.page_id=rd.rd_from 
    LEFT JOIN page AS p2 
        ON (rd_namespace = p2.page_namespace)
            AND rd_title = p2.page_title  
    WHERE p1.page_namespace = 0
        AND p1.page_title IN {raw_articles}
    '''
    
    clean_id_query_one_article = '''
    SELECT 
       p1.page_title AS page_title,
       DATABASE() AS database_code,
       p1.page_id  AS page_id,
       p1.page_len page_len,
       p1.page_is_redirect AS p1_is_redirect,
       p2.page_id AS rpage_id,
       p2.page_title AS rpage_title,
       p2.page_len rpage_len,
       p2.page_is_redirect AS is_double_redirect
    FROM page AS p1 
    LEFT JOIN redirect AS rd 
        ON p1.page_id=rd.rd_from 
    LEFT JOIN page AS p2 
        ON (rd_namespace = p2.page_namespace)
            AND rd_title = p2.page_title  
    WHERE p1.page_namespace = 0
        AND p1.page_title = '{raw_articles}'
    '''

    for wiki in df['database_code'].unique():
        print('***')
        print(wiki)
        
        grouping = df.loc[df['database_code'] == wiki].groupby('database_code')['page_title'].apply(pd.DataFrame)
        raw_articles = tuple(list(grouping[wiki]))
        article = grouping.reset_index(drop=True).iloc[0][0]
        
        if len(raw_articles)>= 2:
            redirects_r = mariadb.run(clean_id_query.format(raw_articles=raw_articles), wiki )
        else: redirects_r = mariadb.run(clean_id_query_one_article.format(raw_articles=article), wiki )
        articles.append(redirects_r)   
    
    return(articles)

In [31]:
get_clean_ids_mariadb(combined_r2)
featuredtitles = pd.concat(articles)

***
idwiki
***
jvwiki
***
suwiki
***
minwiki


In [32]:
# |
#check to see if any of the page_ids are redirects or double redirects
((featuredtitles['p1_is_redirect']==1) & (featuredtitles['is_double_redirect']==1)).any()

False

In [33]:
# |
#check to see if any of the page_ids are redirects or double redirects
((featuredtitles['p1_is_redirect']==1) | (featuredtitles['is_double_redirect']==1)).any()

False

In [35]:
# act on the results ...featured_pages_redirects_r

#create a df 
all_surviving_articles = featuredtitles[['page_id','page_title', 'page_len']] 

#seperate the redirected items into their own df
redirects = featuredtitles.loc[featuredtitles['p1_is_redirect']==1]
#pull only p1.page_id, p1.page_title, p1.page_len 
redirect_df = redirects[['page_id','page_title','page_len']] 

#remove the redirect items from the all_surviving_articles df & create global articles df
articles =  all_surviving_articles[~all_surviving_articles.isin(redirect_df)].dropna(how='all')

In [None]:
#check for duplicates
article_dupe_count = articles.groupby(['page_id','page_title', 'page_len']).size().reset_index()

In [37]:
#drop duplicates
articles_clean = articles.drop_duplicates(subset=['page_id', 'page_title'], keep='first')

In [38]:
#check for nulls
articles_clean[articles_clean.isnull().any(axis=1)]

Unnamed: 0,page_id,page_title,page_len


In [39]:
#combine combined_r2 and articles_clean3 on page_title
df = pd.merge(combined_r2, articles_clean, on='page_title', how='outer')

In [None]:
#check for nulls
df[df.isnull().any(axis=1)].count()

In [42]:
#create a df of just the null rows
df_nulls = df[df.isnull().any(axis=1)]

#remove the null items from the df 
df_clean =  df[~df.isin(df_nulls)].dropna(how='all')

In [43]:
df_nulls.reset_index(drop=True);

In [45]:
#create a df of just the null rows
df_no_info = df_nulls[df_nulls.isnull().any(axis=1)]

#remove the null items from the df 
df_formerly_nulls =  df_nulls[~df_nulls.isin(df_no_info)].dropna(how='all')

df_no_info.reset_index(drop=True);

In [46]:
#concat the formerly nulls back into the df
df_relative_length = pd.concat([df_clean, df_formerly_nulls], ignore_index=True)

In [47]:
#convert page_len to int
df_relative_length['page_len'] = df_relative_length['page_len'].astype(str).astype(int);

In [None]:
#validate results by pulling a sample and checking page info
#https://{}.wikipedia.org/w/index.php?title={}&action=info


In [55]:
#save df to csv
path = '../../../../data/processed/relative_length/'
df_relative_length.to_csv(path+'indonesia_relative_lengths.csv', encoding='utf-8', index=False)

In [None]:
#count of clean-non-null entries of featured articles in each wiki
df_relative_length.groupby(df_relative_length['wikicode'])['page_len'].count()

In [None]:
#count of clean-non-null entries of featured articles in each wiki
df_relative_length.groupby(df_relative_length['database_code'])['page_len'].count()

In [None]:
median_values = df_relative_length.groupby(df_relative_length['database_code'])['page_len'].median().rename_axis(['database_code']).rename('FA_median_len').reset_index()

In [66]:
%store median_values

Stored 'median_values' (DataFrame)


### Address pawiki which only had one featured article

In [None]:
#wikicode_pa has only one featured article
df_relative_length.loc[df_relative_length['wikicode'] == 'pawiki']

In [74]:
median_value_pa = df_relative_length[df_relative_length['wikicode']== 'pawiki'].page_len

### Address tcywiki and satwiki which had zero featured articles

In [31]:
tcy_articles_r = wmf.mariadb.run(articles_len_wiki_query, 'tcywiki')
sat_articles_r = wmf.mariadb.run(articles_len_wiki_query, 'satwiki')

#### start by cleaning tcy_articles_r data

In [33]:
# |
#check to see if any of the page_ids are redirects or double redirects
((tcy_articles_r['p1_is_redirect']==1) & (tcy_articles_r['is_double_redirect']==1)).any()

False

In [34]:
# |
#check to see if any of the page_ids are redirects or double redirects
((tcy_articles_r['p1_is_redirect']==1) | (tcy_articles_r['is_double_redirect']==1)).any()

True

In [35]:
# act on the results ...featured_pages_redirects_r

#create a df 
all_surviving_articles_tcy = tcy_articles_r[['page_id','page_title', 'page_len']] 

#seperate the redirected items into their own df
redirects_tcy = tcy_articles_r.loc[tcy_articles_r['p1_is_redirect']==1]
#pull only p1.page_id, p1.page_title, p1.page_len 
redirect_df_tcy = redirects_tcy[['page_id','page_title','page_len']] 

#remove the redirect items from the all_surviving_articles df & create global articles df
articles_tcy =  all_surviving_articles_tcy[~all_surviving_articles_tcy.isin(redirect_df_tcy)].dropna(how='all')


#### now clean sat_articles_r data

In [37]:
# Now with sat_articles_r
#check to see if any of the page_ids are redirects or double redirects
((sat_articles_r['p1_is_redirect']==1) & (sat_articles_r['is_double_redirect']==1)).any()

True

In [38]:
# |
#check to see if any of the page_ids are redirects or double redirects
((sat_articles_r['p1_is_redirect']==1) | (sat_articles_r['is_double_redirect']==1)).any()

True

In [40]:
# act on the results ...featured_pages_redirects_r

#create a df 
all_surviving_articles_sat = sat_articles_r[['page_id','page_title', 'page_len']] 

#seperate the redirected items into their own df
redirects_sat = sat_articles_r.loc[sat_articles_r['p1_is_redirect']==1]
#pull only p1.page_id, p1.page_title, p1.page_len 
redirect_df_sat = redirects_sat[['page_id','page_title','page_len']] 

#remove the redirect items from the all_surviving_articles df & create global articles df
articles_sat =  all_surviving_articles_sat[~all_surviving_articles_sat.isin(redirect_df_sat)].dropna(how='all')


In [None]:
median_values_sat = articles_sat['page_len'].median()
median_values_sat 

## create an index of median values

In [153]:
#first convert tcy and sat median values into a df
tcy_sat = pd.DataFrame({'wikicode': {0:'satwiki', 1: 'tcywiki'},
                        'page_len': {0:6438, 1:3133}
                       })

In [156]:
median_values = median_values.to_frame().reset_index()

In [157]:
median_value_pa = median_value_pa.to_frame().reset_index().rename(columns={'index':'wikicode'})

In [158]:
median_value_pa['wikicode']= median_value_pa['wikicode'].map({461:'pawiki'})

In [159]:
IN_median_vi = pd.concat([median_values,median_value_pa, tcy_sat])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [164]:
IN_median_vi.sort_values(['page_len']).reset_index(drop=True);

In [179]:
IN_median_vi.rename(columns={'page_len': 'mpl_index'}, inplace=True)

In [180]:
%store IN_median_vi

Stored 'IN_median_vi' (DataFrame)
