In [1]:
import requests
import json
from pandas.io.json import json_normalize
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# get list of wikis_in_submitted_articles_df from 1b notebook
%store -r wikis_in_submitted_articles_df

#get working contest api urls
wlw_2019_base = 'https://tools.wmflabs.org/fountain/api/editathons/wlwsa2020-{}'
ptp_2018_base = 'https://tools.wmflabs.org/fountain/api/editathons/project-tiger-2018-{}'
wam_2019_base = 'https://tools.wmflabs.org/fountain/api/editathons/asian-month-2019-{}'

wikicodes = wikis_in_submitted_articles_df

def get_contest_urls(url_base):
    urls_to_review = list()
    for wikicode in wikicodes:
        urls = (url_base.format(wikicode))
        urls_to_review.append(urls)

    not_found_urls = []
    working_urls = []
    final_urls_list = []
    # Iterate here on the urls
    # The below code could be executed for each url
    for url in urls_to_review:
        r = requests.get(url)
        if r.status_code == 404:
            not_found_urls.append(url)

    working_urls = list(set(urls_to_review)-set(not_found_urls))
    return working_urls

In [95]:
def get_contest_data(working_urls):
    collected_data = pd.DataFrame([])
    for url in working_urls:
        URL = url
        r = requests.get(URL)
        pretty_json = json.loads(r.text)
        #pretty_data_dump = json.dumps(pretty_json, indent=2)
        data = json_normalize(pretty_json)
        core = data[['wiki','code', 'finish', 'start', 'jury', 'articles']]
        lens = [len(item) for item in core['articles']]
        explode_elongate_prep = pd.DataFrame({
            'wiki':np.repeat(core['wiki'].values, lens),
            'code':np.repeat(core['code'].values, lens),
            'finish':np.repeat(core['finish'].values, lens),
            'start':np.repeat(core['start'].values, lens),
            'jury':np.repeat(core['jury'].values, lens),
            'articles':np.hstack(core['articles']),
                              })
        explode_elongate = pd.concat([explode_elongate_prep.drop(['articles'], axis=1), explode_elongate_prep['articles'].apply(pd.Series)],axis=1)
        df = explode_elongate.rename(columns={'name': 'article_name', 'user': 'user_name', 'code': 'contest_code'})
        
        collected_data = collected_data.append(df, ignore_index=True, sort=False)
    return collected_data

In [None]:
#run
wlw_2019_w_urls = get_contest_urls(wlw_2019_base)
ptp_2018_w_urls = get_contest_urls(ptp_2018_base)
wam_2019_w_urls = get_contest_urls(wam_2019_base)

In [96]:
wlw_2019_data = get_contest_data(wlw_2019_w_urls)
ptp_2018_data = get_contest_data(ptp_2018_w_urls)
wam_2019_data = get_contest_data(wam_2019_w_urls)

In [97]:
#clean data each df

df_wam_2019 = wam_2019_data[['wiki', 'contest_code', 'start', 'finish', 'jury', 'dateAdded', 'article_name', 'user_name']]
cols_to_drop = ['marks','id']
df_wlw_2019 = wlw_2019_data[wlw_2019_data.columns.drop(cols_to_drop)]
df_ptp_2018 = ptp_2018_data[ptp_2018_data.columns.drop(cols_to_drop)]

In [107]:
#combine dfs
#indic_contests_data
idc_r = pd.concat([df_wlw_2019, df_ptp_2018, df_wam_2019])

#change date columns to datetime data type
idc_r[['start', 'finish', 'dateAdded']] = idc_r[['start', 'finish', 'dateAdded']].apply(pd.to_datetime)

#create seperate dfs from the raw df
idc_jury = idc_r[['contest_code','finish','jury','start','wiki'],sort=True]
idc = idc_r[['article_name','contest_code','dateAdded','finish','start','user_name','wiki'],sort=True]

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


In [131]:
idc_jury_clean = idc_jury.drop_duplicates(subset=['contest_code', 'finish', 'start', 'wiki'], keep='first')

In [123]:
dupes = idc[idc.duplicated(['article_name','contest_code','wiki','user_name'])]

In [124]:
dupes

Unnamed: 0,article_name,contest_code,dateAdded,finish,start,user_name,wiki


## Preparing to create table and load into hive

In [134]:
idc_r.to_csv("../../data/processed/contest_api/indic_contests_data_raw.csv", sep=',', encoding = 'utf-8', index=False)
idc.to_csv("../../data/processed/contest_api/indic_contests_data_clean.csv", sep=',', encoding = 'utf-8', index=False)
idc_jury_clean.to_csv("../../data/processed/contest_api/indic_contests_jury_data.csv", sep=',', encoding = 'utf-8', index=False)