# Load Approved Titles

In [1]:
import os
from glob import glob
import pandas as pd

import wmfdata as wmf
from wmfdata import charting, mariadb, hive
from wmfdata.utils import pct_str, pd_display_all
from urllib.parse import unquote

You are using wmfdata v1.0.3, but v1.0.4 is available.

To update, run `pip install --upgrade git+https://github.com/neilpquinn/wmfdata.git@release`.

To see the changes, refer to https://github.com/neilpquinn/wmfdata/blob/release/CHANGELOG.md


### Read Indonesia data

In [2]:
import glob
import pandas as pd

# Collect every per-wiki CSV for the Indonesia contest.
# glob returns paths in arbitrary order, so sort them to keep the row order of
# `articles_raw` reproducible across runs and machines.
files = sorted(glob.glob('../../data/raw/articles/2019/Indonesia/*.csv'))
if not files:
    # Fail with an actionable message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(
        "No CSV files found in '../../data/raw/articles/2019/Indonesia/'"
    )

# Read each file and tag its rows with the path they came from.
dfs = [pd.read_csv(path).assign(filename=path) for path in files]
articles_raw = pd.concat(dfs, ignore_index=True)

In [3]:
# Work on a deep copy so the raw concatenated frame stays untouched on re-runs.
articles = articles_raw.copy(deep=True)

# Rename the spreadsheet headers to the column names used in the rest of the notebook.
articles = articles.rename(columns={'Wiki': 'language_name',
                                    'Type of article': 'contest_article_type',
                                    'Page name': 'url'})

# Replace language labels so they comply with the db table values.
articles = articles.replace({'language_name': {'Bahasa Indonesia': 'Indonesian',
                                               'Sunda': 'Sundanese',
                                               'Jawa': 'Javanese'}})

# Page title is whatever follows the wiki path in the URL; rows whose URL does
# not contain that path end up with NaN here.
articles['page_title'] = articles['url'].str.split('.wikipedia.org/wiki/').str[1]

# Remove typos: strip any trailing run of commas and underscores in one pass.
# rstrip takes a set of characters, so ',_' also covers doubled underscores
# and mixed endings such as ',_' that sequential single-character strips miss.
articles['page_title'] = articles['page_title'].str.rstrip(',_')

# NOTE(review): titles are taken verbatim from the URL, so they may still be
# percent-encoded; confirm whether they should be decoded before matching.

# Keep only the file name of the source CSV (drop the directory part).
articles['filename'] = articles['filename'].map(os.path.basename)

# Host part of the URL, e.g. the text between 'https://' and the next '/'.
articles['url_base'] = articles['url'].str.split('https://').str[1]
articles['url_base'] = articles['url_base'].str.split('/').str[0]

# Link to the article-info page for the same host and title.
articles['url_article_info'] = (
    'https://xtools.wmflabs.org/articleinfo/'
    + articles['url_base'] + '/' + articles['page_title']
)

df = articles[['page_title', 'language_name', 'contest_article_type', 'filename', 'url', 'url_article_info']]

In [4]:
# Count the titles containing a '%' character; missing titles count as no match.
has_percent = df['page_title'].str.contains('%', na=False)
len(df[has_percent])

64

### Read India data

### Pull data

In [5]:
# Get database_code and language_code for the languages present in the article list.
# Missing language names are dropped because they cannot match any wiki.
lang_names = tuple(df['language_name'].dropna().unique())
if not lang_names:
    raise ValueError("no language names found in df['language_name']")

# Build the IN (...) list explicitly instead of formatting the tuple's repr:
# a one-element tuple renders as ('X',), which is not valid SQL, and names
# containing quotes would otherwise be emitted unescaped.
lang_names_sql = ", ".join(
    "'{}'".format(str(name).replace("\\", "\\\\").replace("'", "\\'"))
    for name in lang_names
)

ci = wmf.hive.run("""
SELECT  language_code, database_code, language_name
FROM canonical_data.wikis
WHERE language_name IN ({lang_names}) AND database_group = 'wikipedia'
""".format(lang_names=lang_names_sql))

In [6]:
# Left-join the wiki codes onto the articles by language name, so every
# article row is kept even when no matching wiki is found.
df_ci = pd.merge(df, ci, how='left', on='language_name')

In [7]:
# Columns to keep, in output order.
article_columns = [
    'page_title',
    'language_name',
    'database_code',
    'language_code',
    'filename',
    'contest_article_type',
    'url',
    'url_article_info',
]

# Keep only rows that have a page title. The explicit copy makes the result an
# independent frame, so assigning to its columns later does not trigger
# SettingWithCopyWarning.
has_title = df_ci['page_title'].notnull()
df_ci_articles = df_ci.loc[has_title, article_columns].copy()

In [8]:
# Normalize the titles: replace every space with an underscore.
# regex=False makes the literal replacement explicit; the former trailing
# shallow copy had no effect on the assigned column and is dropped.
df_ci_articles['page_title'] = df_ci_articles['page_title'].str.replace(' ', '_', regex=False)

### Save df

In [10]:
# Write the compiled article list. Create the target directory first, because
# to_csv does not create missing parent directories.
output_path = "../../data/raw/articles/2019/Indonesia/compiled/articles.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_ci_articles.to_csv(output_path, sep=',', encoding='utf-8', index=False)