In [2]:
import os
from glob import glob
import pandas as pd

import wmfdata as wmf
from wmfdata import charting, mariadb, hive
from wmfdata.utils import pct_str, pd_display_all
from urllib.parse import unquote

You are using wmfdata 0.1.0 (latest).

You can find the source for `wmfdata` at https://github.com/neilpquinn/wmfdata


In [82]:
# Load the raw contest article list; later cells read its 'Language' and 'Articles' columns
df = pd.read_csv(filepath_or_buffer="../../data/raw/articles/2019/N_India_stats.csv")

In [83]:
# Add a wiki_db column (language code + 'wiki') to use when querying.
# NOTE(review): presumably these match ips_site_id values such as 'hiwiki' - confirm.
df['wiki_db'] = df['Language']+'wiki'

In [11]:
# Every value of the 'Articles' column, as a tuple so it can be formatted into the SQL IN (...) list
contest_titles_denormalized = tuple(df['Articles'])

In [33]:
# `wiki_dbs` is not assigned in any earlier cell, so a fresh kernel would raise
# NameError here. Derive it from the wiki_db column built above: the distinct,
# non-null database names.
# NOTE(review): assumes the lost definition was the set of wikis in `df` - confirm.
# NOTE(review): a one-element tuple formats as ('x',), which is not valid SQL.
wiki_dbs = tuple(df['wiki_db'].dropna().unique())

# Values substituted into the {placeholders} of the QID query
article_vars = dict(
    contest_titles_denormalized = contest_titles_denormalized,
    wiki_dbs = wiki_dbs
)

In [35]:
# Fetch the Wikidata item id (QID) for each contest title on the selected wikis.
# The tuples in article_vars are substituted as their Python repr to form the IN (...) lists.
qid_query = """
SELECT
  ips_site_page AS article,
  ips_item_id AS QID
FROM  wb_items_per_site  
WHERE ips_site_id IN {wiki_dbs} 
      AND ips_site_page IN {contest_titles_denormalized}
"""
qid_r = wmf.mariadb.run(qid_query.format(**article_vars), "wikidatawiki")

In [159]:
# Attach QIDs to the contest rows. It is a left join, so titles without a match keep a null QID.
# NOTE(review): the join key is the title only; the query result carries no wiki column,
# so the same title on two wikis would be cross-matched - confirm this cannot happen.
qid_lookup = qid_r[['article', 'QID']]
df_w_ids = (
    df.merge(qid_lookup, how="left", left_on=['Articles'], right_on=['article'])
      .drop(columns='article')
)

# CLEAN DF - rows that have a QID and are unique on the key columns.
# keep=False removes every member of a duplicate group, not just the repeats.
has_qid = df_w_ids['QID'].notna()
df_w_ids_no_nulls = df_w_ids[has_qid]
df_w_ids_clean = df_w_ids_no_nulls.drop_duplicates(
    subset=['Language', 'Articles', 'wiki_db', 'QID'],
    keep=False,
)

In [145]:
# MISSING DF - rows whose QID is null after the merge (data still to be added)
qid_is_missing = df_w_ids['QID'].isna()
articles_w_QID_missing = df_w_ids.loc[qid_is_missing]

In [179]:
# Identify duplicate groups and keep exactly one row (the first) from each.
# Selecting rows with duplicated(keep='first') returns every repeat, so a row that
# occurs three or more times would still be duplicated in the result. Instead, take
# all members of duplicate groups (keep=False) and then drop the repeats.
dup_key_cols = ['Language', 'Articles', 'wiki_db', 'QID']
in_dup_group = df_w_ids_no_nulls.duplicated(subset=dup_key_cols, keep=False)
articles_w_QID_duplicated = (
    df_w_ids_no_nulls[in_dup_group]
    .drop_duplicates(subset=dup_key_cols, keep='first')
)

In [90]:
# A bare expression is only rendered when it is the last line of a cell, so the
# first two frames were never shown. Use display() to render all three.

# MISSING DF - to add data
display(articles_w_QID_missing)

# DUPES DF - to clean
display(articles_w_QID_duplicated)

# CLEAN DF - ready to use
display(df_w_ids_clean)

In [186]:
# Stack the duplicate rows on top of the clean rows, keeping the original index labels.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_updated = pd.concat([articles_w_QID_duplicated, df_w_ids_clean])

In [190]:
# Write the rows that have no QID to CSV (no index column)
articles_w_QID_missing.to_csv(
    "../../data/raw/articles/2019/articles_QID_missing.csv",
    sep=',',
    index=False,
    encoding='utf-8',
)

In [191]:
# Write the combined duplicate + clean rows to CSV (no index column)
df_updated.to_csv(
    "../../data/raw/articles/2019/contest_titles_n_updated.csv",
    sep=',',
    index=False,
    encoding='utf-8',
)

In [None]:
#df.replace(['ਸੁਦਾਮਾ_ਪਾਂਡੇ_ਧੂਮਿਲ""'], ["ਸੁਦਾਮਾ_ਪਾਂਡੇ_ਧੂਮਿਲ"], inplace=True)

## Method 2 - from API

In [3]:
# get list of wikis_in_submitted_articles_df from 1b notebook
%store -r wikis_in_submitted_articles_df

#get working contest api urls
ptp_2018_base = 'https://tools.wmflabs.org/fountain/api/editathons/project-tiger-2018-{}'
ptp_2_base = 'https://tools.wmflabs.org/fountain/api/editathons/project-tiger-2.0-{}'

#wikicodes = wikis_in_submitted_articles_df
wikicodes = ['as', 'bn', 'gu', 'hi', 'mli', 'or', 'pa', 'sat', 'ta', 'ur', 'te', 'mr', 'tcy', 'kn', 'sa', 'pnb']

def get_contest_urls(url_base, codes=None):
    """Return the contest API URLs that exist for the given wiki codes.

    Parameters
    ----------
    url_base : str
        URL template with one '{}' placeholder that receives the wiki code.
    codes : iterable of str, optional
        Wiki codes to try. Defaults to the notebook-level `wikicodes` list.

    Returns
    -------
    list of str
        The formatted URLs whose GET response status was not 404, in the
        order of `codes`. Each distinct URL appears once.

    Notes
    -----
    Relies on `requests` being available in the notebook namespace.
    """
    if codes is None:
        codes = wikicodes

    working_urls = []
    seen = set()
    for wikicode in codes:
        url = url_base.format(wikicode)
        if url in seen:
            continue
        seen.add(url)
        # Only a 404 marks a contest as missing; any other status is kept.
        if requests.get(url).status_code != 404:
            working_urls.append(url)
    return working_urls

In [4]:
def get_contest_data(working_urls):
    """Download each contest and return one row per submitted article.

    Parameters
    ----------
    working_urls : iterable of str
        Contest API URLs; each response body is parsed as JSON.

    Returns
    -------
    pandas.DataFrame
        Contest-level fields ('wiki', 'contest_code', 'finish', 'start',
        'jury') repeated for every entry of the contest's 'articles' list,
        plus one column per key of the article entries ('name' and 'user'
        are renamed to 'article_name' and 'user_name'). The index is
        renumbered from 0. An empty frame is returned when no URLs are given.

    Notes
    -----
    Relies on `requests`, `json`, `json_normalize` and `np` being available
    in the notebook namespace.
    """
    # Collect the per-contest frames and concatenate once at the end: growing a
    # DataFrame with .append inside the loop is quadratic, and DataFrame.append
    # was removed in pandas 2.0.
    frames = []
    for url in working_urls:
        r = requests.get(url)
        pretty_json = json.loads(r.text)
        data = json_normalize(pretty_json)
        core = data[['wiki','code', 'finish', 'start', 'jury', 'articles']]
        # Number of articles per contest row; used to repeat the contest fields
        lens = [len(item) for item in core['articles']]
        explode_elongate_prep = pd.DataFrame({
            'wiki':np.repeat(core['wiki'].values, lens),
            'code':np.repeat(core['code'].values, lens),
            'finish':np.repeat(core['finish'].values, lens),
            'start':np.repeat(core['start'].values, lens),
            'jury':np.repeat(core['jury'].values, lens),
            'articles':np.hstack(core['articles']),
                              })
        # Expand each article entry into its own columns next to the contest fields
        explode_elongate = pd.concat([explode_elongate_prep.drop(['articles'], axis=1), explode_elongate_prep['articles'].apply(pd.Series)],axis=1)
        df = explode_elongate.rename(columns={'name': 'article_name', 'user': 'user_name', 'code': 'contest_code'})

        frames.append(df)

    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame instead
        return pd.DataFrame([])
    return pd.concat(frames, ignore_index=True, sort=False)

In [5]:
# Project Tiger 2.0 contest URLs that did not return 404
ptp_2_w_urls = get_contest_urls(url_base=ptp_2_base)

In [6]:
# Download the article-level data for every working Project Tiger 2.0 URL
ptp_2_data = get_contest_data(working_urls=ptp_2_w_urls)

In [None]:
# Clean the Project Tiger 2.0 data: keep the columns of interest, rename
# article_name to page_title, and add the wiki database code (wiki + 'wiki').
# Built as one chain so the column subset is never modified in place, which
# avoids SettingWithCopyWarning and makes the cell safe to re-run.
df_ptp_2 = (
    ptp_2_data[['wiki', 'contest_code', 'start', 'finish', 'jury', 'dateAdded', 'article_name', 'user_name']]
    .rename(columns={'article_name': 'page_title'})
    .assign(database_code=lambda frame: frame['wiki'] + 'wiki')
)