In [1]:
import os
from glob import glob
import pandas as pd

import wmfdata as wmf
from wmfdata import charting, mariadb, hive
from wmfdata.utils import pct_str, pd_display_all
from urllib.parse import unquote

In [2]:
filename = '../../data/raw/articles/2019/Tiger 2.0 - article page stats.xlsx'

cols2skip = [0,2,3]
cols = [i for i in range(10) if i not in cols2skip]
dfs = pd.read_excel(filename, sheet_name=None, usecols=cols, header=None) #skiprows=1

df = pd.concat([df.assign(name=n) for n,df in dfs.items()], sort=True)
df.columns = ['page_title', 'language_name']

In [3]:
#check the langauage names (aka sheets by wiki) and make sure we can identify each language
#pnbwiki	Western Punjabi	https://pnb.wikipedia.org/
#pawiki	Punjabi	Punjabi(Gurumukhi)	https://pa.wikipedia.org/

df['language_name'] = df['language_name'].replace('Bengali', 'Bangla')
df['language_name'] = df['language_name'].replace('Punjabi(Gurumukhi)', 'Punjabi')

lang_names =tuple(df['language_name'].unique())
len(lang_names)

15

In [4]:
ci = wmf.hive.run("""
SELECT  database_code, language_name, language_code
FROM canonical_data.wikis
WHERE language_name IN {lang_names} AND database_group = 'wikipedia'
""".format(lang_names=lang_names))

In [5]:
#merge
df_ci = df.merge(ci, how="left", on=['language_name'])

In [7]:
df_ci_articles = df_ci.loc[df_ci['page_title'].notnull(), ['page_title', 'language_name', 'database_code', 'language_code']]

In [8]:
df_ci_articles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9173 entries, 0 to 9771
Data columns (total 4 columns):
page_title       9173 non-null object
language_name    9173 non-null object
database_code    9173 non-null object
language_code    9173 non-null object
dtypes: object(4)
memory usage: 358.3+ KB


In [10]:
#normalize the titles
df_ci_articles['page_title'] = df_ci_articles['page_title'].str.replace(' ', '_').copy(deep=False)

In [11]:
df_ci_articles.to_csv("../../data/raw/articles/2019/clean/articles.csv", sep=',', encoding = 'utf-8', index=False)

In [1]:
#read csv
articles = pd.read_csv("../../data/raw/articles/2019/clean/articles.csv") #assumes 2 columns (wiki, titles)
articles_s = articles[['page_title', 'language_code']]

In [2]:
articles_s['language_code'].unique()

array(['as', 'bn', 'gu', 'hi', 'kn', 'ml', 'mr', 'or', 'pa', 'sa', 'sat',
       'ta', 'te', 'tcy', 'ur'], dtype=object)

In [3]:
wikis_in_df = tuple(list(articles_s['language_code'].unique()))

In [4]:
articles_s

Unnamed: 0,page_title,language_code
0,আনোৱাৰুদ্দিন_চৌধুৰী,as
1,কাৱঁড়_যাত্ৰা,as
2,ভটিমা,as
3,বগা_বাবাৰ_মাজাৰ,as
4,পোৱামক্কা,as
...,...,...
9168,لاہور_کی_عزاداری,ur
9169,گایتری_منتر,ur
9170,مہابھارت,ur
9171,مومنہ_درید,ur


In [17]:
z=articles_s.groupby(['language_code']).agg(['nunique'])
z.reset_index().sort_values('page_title, nunique', ascending=False)

KeyError: 'page_title, nunique'

In [16]:
z.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15 entries, as to ur
Data columns (total 1 columns):
(page_title, nunique)    15 non-null int64
dtypes: int64(1)
memory usage: 240.0+ bytes


In [5]:
aswiki_titles_normalized = tuple(list(articles_s.loc[articles_s['language_code'] == 'as', 'page_title']))
bnwiki_titles_normalized = tuple(list(articles_s.loc[articles_s['language_code'] == 'bn', 'page_title']))
guwiki_titles_normalized = tuple(list(articles_s.loc[articles_s['language_code'] == 'gu', 'page_title']))
hiwiki_titles_normalized = tuple(list(articles_s.loc[articles_s['language_code'] == 'hi', 'page_title']))
knwiki_titles_normalized = tuple(list(articles_s.loc[articles_s['language_code'] == 'kn', 'page_title']))
mlwiki_titles_normalized = tuple(list(articles_s.loc[articles_s['language_code'] == 'ml', 'page_title']))
mrwiki_titles_normalized = tuple(list(articles_s.loc[articles_s['language_code'] == 'mr', 'page_title']))
orwiki_titles_normalized = tuple(list(articles_s.loc[articles_s['language_code'] == 'or', 'page_title']))
pawiki_titles_normalized = tuple(list(articles_s.loc[articles_s['language_code'] == 'pa', 'page_title']))
sawiki_titles_normalized = tuple(list(articles_s.loc[articles_s['language_code'] == 'sa', 'page_title']))
satwiki_titles_normalized = tuple(list(articles_s.loc[articles_s['language_code'] == 'sat', 'page_title']))
tawiki_titles_normalized = tuple(list(articles_s.loc[articles_s['language_code'] == 'ta', 'page_title']))
tewiki_titles_normalized = tuple(list(articles_s.loc[articles_s['language_code'] == 'te', 'page_title']))
tcywiki_titles_normalized = tuple(list(articles_s.loc[articles_s['language_code'] == 'tcy', 'page_title']))
urwiki_titles_normalized = tuple(list(articles_s.loc[articles_s['language_code'] == 'ur', 'page_title']))

In [6]:
len(aswiki_titles_normalized)

195

In [49]:
%store aswiki_titles_normalized
%store bnwiki_titles_normalized
%store guwiki_titles_normalized 
%store hiwiki_titles_normalized 
%store knwiki_titles_normalized
%store mlwiki_titles_normalized
%store mrwiki_titles_normalized
%store orwiki_titles_normalized
%store pawiki_titles_normalized
%store sawiki_titles_normalized
%store satwiki_titles_normalized
%store tawiki_titles_normalized 
%store tewiki_titles_normalized 
%store tcywiki_titles_normalized
%store urwiki_titles_normalized 

Stored 'aswiki_titles_normalized' (tuple)
Stored 'bnwiki_titles_normalized' (tuple)
Stored 'guwiki_titles_normalized' (tuple)
Stored 'hiwiki_titles_normalized' (tuple)
Stored 'knwiki_titles_normalized' (tuple)
Stored 'mlwiki_titles_normalized' (tuple)
Stored 'mrwiki_titles_normalized' (tuple)
Stored 'orwiki_titles_normalized' (tuple)
Stored 'pawiki_titles_normalized' (tuple)
Stored 'sawiki_titles_normalized' (tuple)
Stored 'satwiki_titles_normalized' (tuple)
Stored 'tawiki_titles_normalized' (tuple)
Stored 'tewiki_titles_normalized' (tuple)
Stored 'tcywiki_titles_normalized' (tuple)
Stored 'urwiki_titles_normalized' (tuple)
