# Table of Contents  <a class="anchor" id="toc"></a>

1. ['%' topic lists (topicality)](#tprcnt)

In [1]:
import os
from glob import glob
import pandas as pd
import numpy as np

import wmfdata as wmf
from wmfdata import charting, mariadb, hive
from wmfdata.utils import pct_str, pd_display_all
import urllib
from urllib.parse import unquote

You are using wmfdata 0.1.0 (latest).

You can find the source for `wmfdata` at https://github.com/neilpquinn/wmfdata


In [2]:
#compile all of the lists

f_mask = r'../../../GLOW/data/raw/g_topic_lists/*.xlsx'

gtl = \
pd.concat([gtl.assign(file=os.path.splitext(os.path.basename(f))[0],
                     sheet=sheet)
           for f in glob(f_mask)
           for sheet, gtl in pd.read_excel(f, sheet_name=None).items()],
          ignore_index=True, sort=True)

full_topic_rec_df = gtl.copy()

In [3]:
#combine Topic & entity name > article_suggestion
full_topic_rec_df['article_suggestion'] = full_topic_rec_df['entity_name'].combine_first(full_topic_rec_df['Topic'])
#rename 'sheet' to Google_topic
full_topic_rec_df = full_topic_rec_df.rename(columns={'sheet':'g_category', 
                                                      'Topic': 'g_suggested_en_title', 
                                                      'entity_name': 'g_suggested_local_title',
                                                      'english_wikipedia':'english_wikipedia_URL', 
                                                      'local_wikipedia':'local_wikipedia_URL'
                                                     })

#extract wiki name & suggestion_type (translation or edit)
#full_topic_rec_df['wiki'] = full_topic_rec_df['file'].str.extract('(^[A-Z_]+([^\(-]+))', expand=True)
full_topic_rec_df[['language_name', 'suggestion_type']] = full_topic_rec_df['file'].str.split(" ", 1, expand=True)
full_topic_rec_df['suggestion_type'] = full_topic_rec_df['file'].str.rsplit("for ").str[-1]

#extract url title
full_topic_rec_df['local_encoded_title'] = full_topic_rec_df['local_wikipedia_URL'].str.extract('([^\/]+$)', expand=True)

#extract lang code
#full_topic_rec_df['url_language_code'] = full_topic_rec_df['local_wikipedia_URL'].str.rsplit("http://").str[-1]
#full_topic_rec_df['url_language_code'] = full_topic_rec_df['url_language_code'].str.extract('([^.]+)', expand=True)

#reorder for visual skimming's sake
full_topic_rec_df = full_topic_rec_df[['article_suggestion', 'local_encoded_title','g_category', 'language_name','suggestion_type', 'file']] #'url_language_code'

#replace double coded translation entries
full_topic_rec_df['suggestion_type']=full_topic_rec_df['suggestion_type'].replace('Translating EXTERNAL', 'Translation EXTERNAL')
full_topic_rec_df['language_name'] = full_topic_rec_df['language_name'].replace('Bengali', 'Bangla')

In [4]:
full_topic_rec_df[full_topic_rec_df['language_name'].isnull()]

Unnamed: 0,article_suggestion,local_encoded_title,g_category,language_name,suggestion_type,file


In [5]:
#get database_code and language_code, confirm language_code (if needed)
lang_names =tuple(full_topic_rec_df['language_name'].unique())

ci = wmf.hive.run("""
SELECT  language_code, database_code, language_name
FROM canonical_data.wikis
WHERE language_name IN {lang_names} AND database_group = 'wikipedia'
""".format(lang_names=lang_names))

In [6]:
#merge
full_topic_rec_df_ci = full_topic_rec_df.merge(ci, how="left", on=['language_name'])

In [7]:
full_topic_rec_df_ci.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34295 entries, 0 to 34294
Data columns (total 8 columns):
article_suggestion     34295 non-null object
local_encoded_title    20102 non-null object
g_category             34295 non-null object
language_name          34295 non-null object
suggestion_type        34295 non-null object
file                   34295 non-null object
language_code          34295 non-null object
database_code          34295 non-null object
dtypes: object(8)
memory usage: 2.4+ MB


In [8]:
#change titles from denormalized (spaces) to normalized (underscore) for querying the page table etc.
full_topic_rec_df_ci['article_suggestion'] = full_topic_rec_df_ci['article_suggestion'].str.replace(' ', '_')


In [9]:
#if pnb article_suggestions exist, confirm language_code = url_language_code(see code above to add in url_language_code column)
#pnb	pnbwiki	Western Punjabi 

In [10]:
#use groupby to get two seperate dfs for each suggestion_type
translation_topic_rec_df = full_topic_rec_df_ci[full_topic_rec_df_ci['suggestion_type'] == 'Translation EXTERNAL'].copy(deep=False)

#get clean list - drop duplicates
translation_topic_rec_df_CLEAN = translation_topic_rec_df.drop_duplicates(subset=['article_suggestion', 'local_encoded_title','g_category','suggestion_type', 'language_name', 'language_code', 'database_code', 'file'], keep='first').copy(deep=False)

#keep just the duplicates - for checking data later on
translation_topic_rec_df_Dupes = pd.concat([translation_topic_rec_df, translation_topic_rec_df_CLEAN]).loc[translation_topic_rec_df.index.symmetric_difference(translation_topic_rec_df_CLEAN.index)]

In [11]:
#use groupby to get two seperate dfs for each suggestion_type
editing_topic_rec_df = full_topic_rec_df_ci.loc[full_topic_rec_df_ci['suggestion_type'] == 'Editing EXTERNAL'].copy(deep=False)

In [12]:
#encoded URL to decoded title
editing_topic_rec_df['page_title'] = editing_topic_rec_df['local_encoded_title'].apply(lambda x: unquote(x)).copy(deep=False)
editing_topic_rec_df['page_title'] = editing_topic_rec_df['page_title'].str.replace(' ', '_')

In [15]:
#get clean list - drop duplicates
editing_topic_rec_df_CLEAN = editing_topic_rec_df.drop_duplicates(subset=['article_suggestion', 'page_title','local_encoded_title','g_category', 'suggestion_type', 'language_name', 'language_code', 'database_code', 'file'], keep='first')

In [16]:
editing_topic_rec_df_CLEAN.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20102 entries, 0 to 30319
Data columns (total 9 columns):
article_suggestion     20102 non-null object
local_encoded_title    20102 non-null object
g_category             20102 non-null object
language_name          20102 non-null object
suggestion_type        20102 non-null object
file                   20102 non-null object
language_code          20102 non-null object
database_code          20102 non-null object
page_title             20102 non-null object
dtypes: object(9)
memory usage: 1.5+ MB


In [17]:
wd_vars = {}

## ASSESS THE DFs

In [18]:
print("total values in full:", len(full_topic_rec_df_ci))
print('***')
print("total values in translation:", len(translation_topic_rec_df))
print("total values in editing list", len(editing_topic_rec_df))
print('***')
print("total values in clean translation editing:", len(translation_topic_rec_df_CLEAN))
print("total values in clean editing:", len(editing_topic_rec_df_CLEAN))

total values in full: 34295
***
total values in translation: 14193
total values in editing list 20102
***
total values in clean translation editing: 14155
total values in clean editing: 20102


In [19]:
editing_diff = len(editing_topic_rec_df)-len(editing_topic_rec_df_CLEAN)
translation_diff = len(translation_topic_rec_df)-len(translation_topic_rec_df_CLEAN)
diffs_sum = editing_diff+translation_diff
print("Translation Dupes DF len", len(translation_topic_rec_df_Dupes))
print("putting it all together:",len(translation_topic_rec_df_Dupes) +len(translation_topic_rec_df_CLEAN)+len(editing_topic_rec_df_CLEAN))

Translation Dupes DF len 38
putting it all together: 34295


## Editing

## Get article ids, redirects <a class="anchor" id="get_clean_list"></a>

In [20]:
nnpt = editing_topic_rec_df_CLEAN.loc[editing_topic_rec_df_CLEAN['page_title'].notnull(), ['database_code', 'page_title']]

In [21]:
nnpt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20102 entries, 0 to 30319
Data columns (total 2 columns):
database_code    20102 non-null object
page_title       20102 non-null object
dtypes: object(2)
memory usage: 471.1+ KB


In [22]:
nnpt['database_code'].unique()

array(['mlwiki', 'pawiki', 'hiwiki', 'orwiki', 'urwiki', 'tawiki',
       'knwiki', 'mrwiki', 'guwiki', 'tewiki', 'bnwiki'], dtype=object)

In [23]:
wikis = tuple(list(nnpt['database_code'].unique()))

wd_vars.update({'wikis': wikis})

In [24]:
pawiki_titles_normalized = tuple(list(nnpt.loc[nnpt['database_code'] == 'pawiki', 'page_title']))
mlwiki_titles_normalized = tuple(list(nnpt.loc[nnpt['database_code'] == 'mlwiki', 'page_title']))
hiwiki_titles_normalized = tuple(list(nnpt.loc[nnpt['database_code'] == 'hiwiki', 'page_title']))
orwiki_titles_normalized = tuple(list(nnpt.loc[nnpt['database_code'] == 'orwiki', 'page_title']))
urwiki_titles_normalized = tuple(list(nnpt.loc[nnpt['database_code'] == 'urwiki', 'page_title']))
tawiki_titles_normalized = tuple(list(nnpt.loc[nnpt['database_code'] == 'tawiki', 'page_title']))
knwiki_titles_normalized = tuple(list(nnpt.loc[nnpt['database_code'] == 'knwiki', 'page_title']))
mrwiki_titles_normalized = tuple(list(nnpt.loc[nnpt['database_code'] == 'mrwiki', 'page_title']))
guwiki_titles_normalized = tuple(list(nnpt.loc[nnpt['database_code'] == 'guwiki', 'page_title']))
tewiki_titles_normalized = tuple(list(nnpt.loc[nnpt['database_code'] == 'tewiki', 'page_title']))
bnwiki_titles_normalized = tuple(list(nnpt.loc[nnpt['database_code'] == 'bnwiki', 'page_title']))

In [25]:
#update the query variable to use it in queries
wd_vars.update({'pawiki_titles_normalized': pawiki_titles_normalized,
                     'mlwiki_titles_normalized': mlwiki_titles_normalized,
                     'hiwiki_titles_normalized': hiwiki_titles_normalized,
                     'orwiki_titles_normalized': orwiki_titles_normalized,
                     'urwiki_titles_normalized': urwiki_titles_normalized,
                     'tawiki_titles_normalized': tawiki_titles_normalized,
                     'knwiki_titles_normalized': knwiki_titles_normalized,
                     'mrwiki_titles_normalized': mrwiki_titles_normalized,
                     'guwiki_titles_normalized': guwiki_titles_normalized,
                     'tewiki_titles_normalized': tewiki_titles_normalized,
                     'bnwiki_titles_normalized': bnwiki_titles_normalized,
                    })

In [26]:
# adapted from https://github.com/nettrom/suggestbot/blob/master/tool-labs/link-rec/link-recommender.py#L208
#https://www.mediawiki.org/wiki/Manual:Redirect_table
#https://www.mediawiki.org/wiki/Manual:Page_table
#https://www.mediawiki.org/wiki/Manual:Pagelinks_table
#--rd.redirect_id -- where is this field located? in which table can it be found?
hi_id_r = wmf.mariadb.run("""
SELECT 
       p1.page_id  AS page_id,
       p1.page_title AS page_title,
       p1.page_is_redirect AS p1_is_redirect,
       p1.page_len AS page_len,
       p2.page_id AS rpage_id,
       p2.page_title AS rpage_title,
       p2.page_len rpage_len,
       p2.page_is_redirect AS is_double_redirect
FROM page AS p1 
LEFT JOIN redirect AS rd 
    ON p1.page_id=rd.rd_from 
LEFT JOIN page AS p2 
    ON (rd_namespace = p2.page_namespace)
        AND rd_title = p2.page_title  
WHERE p1.page_namespace = 0
      AND p1.page_title IN {hiwiki_titles_normalized}
""".format(**wd_vars), 'hiwiki')


or_id_r = wmf.mariadb.run("""
SELECT 
       p1.page_id  AS page_id,
       p1.page_title AS page_title,
       p1.page_is_redirect AS p1_is_redirect,
       p1.page_len AS page_len,
       p2.page_id AS rpage_id,
       p2.page_title AS rpage_title,
       p2.page_len rpage_len,
       p2.page_is_redirect AS is_double_redirect
FROM page AS p1 
LEFT JOIN redirect AS rd 
    ON p1.page_id=rd.rd_from 
LEFT JOIN page AS p2 
    ON (rd_namespace = p2.page_namespace)
        AND rd_title = p2.page_title  
WHERE p1.page_namespace = 0
      AND p1.page_title IN {orwiki_titles_normalized}
""".format(**wd_vars), 'orwiki')

ur_id_r = wmf.mariadb.run("""
SELECT 
       p1.page_id  AS page_id,
       p1.page_title AS page_title,
       p1.page_is_redirect AS p1_is_redirect,
       p1.page_len AS page_len,
       p2.page_id AS rpage_id,
       p2.page_title AS rpage_title,
       p2.page_len rpage_len,
       p2.page_is_redirect AS is_double_redirect
FROM page AS p1 
LEFT JOIN redirect AS rd 
    ON p1.page_id=rd.rd_from 
LEFT JOIN page AS p2 
    ON (rd_namespace = p2.page_namespace)
        AND rd_title = p2.page_title  
WHERE p1.page_namespace = 0
      AND p1.page_title IN {urwiki_titles_normalized}
""".format(**wd_vars), 'urwiki')


ta_id_r = wmf.mariadb.run("""
SELECT 
       p1.page_id  AS page_id,
       p1.page_title AS page_title,
       p1.page_is_redirect AS p1_is_redirect,
       p1.page_len AS page_len,
       p2.page_id AS rpage_id,
       p2.page_title AS rpage_title,
       p2.page_len rpage_len,
       p2.page_is_redirect AS is_double_redirect
FROM page AS p1 
LEFT JOIN redirect AS rd 
    ON p1.page_id=rd.rd_from 
LEFT JOIN page AS p2 
    ON (rd_namespace = p2.page_namespace)
        AND rd_title = p2.page_title  
WHERE p1.page_namespace = 0
      AND p1.page_title IN {tawiki_titles_normalized}
""".format(**wd_vars), 'tawiki')

kn_id_r = wmf.mariadb.run("""
SELECT 
       p1.page_id  AS page_id,
       p1.page_title AS page_title,
       p1.page_is_redirect AS p1_is_redirect,
       p1.page_len AS page_len,
       p2.page_id AS rpage_id,
       p2.page_title AS rpage_title,
       p2.page_len rpage_len,
       p2.page_is_redirect AS is_double_redirect
FROM page AS p1 
LEFT JOIN redirect AS rd 
    ON p1.page_id=rd.rd_from 
LEFT JOIN page AS p2 
    ON (rd_namespace = p2.page_namespace)
        AND rd_title = p2.page_title  
WHERE p1.page_namespace = 0
      AND p1.page_title IN {knwiki_titles_normalized}
""".format(**wd_vars), 'knwiki')

mr_id_r = wmf.mariadb.run("""
SELECT 
       p1.page_id  AS page_id,
       p1.page_title AS page_title,
       p1.page_is_redirect AS p1_is_redirect,
       p1.page_len AS page_len,
       p2.page_id AS rpage_id,
       p2.page_title AS rpage_title,
       p2.page_len rpage_len,
       p2.page_is_redirect AS is_double_redirect
FROM page AS p1 
LEFT JOIN redirect AS rd 
    ON p1.page_id=rd.rd_from 
LEFT JOIN page AS p2 
    ON (rd_namespace = p2.page_namespace)
        AND rd_title = p2.page_title  
WHERE p1.page_namespace = 0
      AND p1.page_title IN {mrwiki_titles_normalized}
""".format(**wd_vars), 'mrwiki')

gu_id_r = wmf.mariadb.run("""
SELECT 
       p1.page_id  AS page_id,
       p1.page_title AS page_title,
       p1.page_is_redirect AS p1_is_redirect,
       p1.page_len AS page_len,
       p2.page_id AS rpage_id,
       p2.page_title AS rpage_title,
       p2.page_len rpage_len,
       p2.page_is_redirect AS is_double_redirect
FROM page AS p1 
LEFT JOIN redirect AS rd 
    ON p1.page_id=rd.rd_from 
LEFT JOIN page AS p2 
    ON (rd_namespace = p2.page_namespace)
        AND rd_title = p2.page_title  
WHERE p1.page_namespace = 0
      AND p1.page_title IN {guwiki_titles_normalized}
""".format(**wd_vars), 'guwiki')


te_id_r = wmf.mariadb.run("""
SELECT 
       p1.page_id  AS page_id,
       p1.page_title AS page_title,
       p1.page_is_redirect AS p1_is_redirect,
       p1.page_len AS page_len,
       p2.page_id AS rpage_id,
       p2.page_title AS rpage_title,
       p2.page_len rpage_len,
       p2.page_is_redirect AS is_double_redirect
FROM page AS p1 
LEFT JOIN redirect AS rd 
    ON p1.page_id=rd.rd_from 
LEFT JOIN page AS p2 
    ON (rd_namespace = p2.page_namespace)
        AND rd_title = p2.page_title  
WHERE p1.page_namespace = 0
      AND p1.page_title IN {tewiki_titles_normalized}
""".format(**wd_vars), 'tewiki')

    
bn_id_r = wmf.mariadb.run("""
SELECT 
       p1.page_id  AS page_id,
       p1.page_title AS page_title,
       p1.page_is_redirect AS p1_is_redirect,
       p1.page_len AS page_len,
       p2.page_id AS rpage_id,
       p2.page_title AS rpage_title,
       p2.page_len rpage_len,
       p2.page_is_redirect AS is_double_redirect
FROM page AS p1 
LEFT JOIN redirect AS rd 
    ON p1.page_id=rd.rd_from 
LEFT JOIN page AS p2 
    ON (rd_namespace = p2.page_namespace)
        AND rd_title = p2.page_title  
WHERE p1.page_namespace = 0
      AND p1.page_title IN {bnwiki_titles_normalized}
""".format(**wd_vars), 'bnwiki')
    
    
pa_id_r = wmf.mariadb.run("""
SELECT 
       p1.page_id  AS page_id,
       p1.page_title AS page_title,
       p1.page_is_redirect AS p1_is_redirect,
       p1.page_len AS page_len,
       p2.page_id AS rpage_id,
       p2.page_title AS rpage_title,
       p2.page_len rpage_len,
       p2.page_is_redirect AS is_double_redirect
FROM page AS p1 
LEFT JOIN redirect AS rd 
    ON p1.page_id=rd.rd_from 
LEFT JOIN page AS p2 
    ON (rd_namespace = p2.page_namespace)
        AND rd_title = p2.page_title  
WHERE p1.page_namespace = 0
      AND p1.page_title IN {pawiki_titles_normalized}
""".format(**wd_vars), 'pawiki')


ml_id_r = wmf.mariadb.run("""
SELECT 
       p1.page_id  AS page_id,
       p1.page_title AS page_title,
       p1.page_is_redirect AS p1_is_redirect,
       p1.page_len AS page_len,
       p2.page_id AS rpage_id,
       p2.page_title AS rpage_title,
       p2.page_len rpage_len,
       p2.page_is_redirect AS is_double_redirect
FROM page AS p1 
LEFT JOIN redirect AS rd 
    ON p1.page_id=rd.rd_from 
LEFT JOIN page AS p2 
    ON (rd_namespace = p2.page_namespace)
        AND rd_title = p2.page_title  
WHERE p1.page_namespace = 0
      AND p1.page_title IN {mlwiki_titles_normalized}
""".format(**wd_vars), 'mlwiki')

In [27]:
#add dbcolumn to each query df
pa_id_r['database_code'] = 'pawiki' 
ml_id_r['database_code'] = 'mlwiki'
hi_id_r['database_code'] = 'hiwiki'
or_id_r['database_code'] = 'orwiki'
ur_id_r['database_code'] = 'urwiki'
ta_id_r['database_code'] = 'tawiki'
kn_id_r['database_code'] = 'knwiki'
mr_id_r['database_code'] = 'mrwiki'
gu_id_r['database_code'] = 'guwiki'
te_id_r['database_code'] = 'tewiki'
bn_id_r['database_code'] = 'bnwiki'

In [28]:
nppt_ids = pd.concat([pa_id_r, 
                      ml_id_r,
                      hi_id_r,
                      or_id_r,
                      ur_id_r,
                      ta_id_r,
                      kn_id_r,
                      mr_id_r,
                      gu_id_r,
                      te_id_r,
                      bn_id_r,
                     ], sort=True, ignore_index=True)

nppt_ids.reset_index(drop=True);

In [29]:
#we do not want any duplicates here
nppt_ids[nppt_ids.index.duplicated()]

Unnamed: 0,database_code,is_double_redirect,p1_is_redirect,page_id,page_len,page_title,rpage_id,rpage_len,rpage_title


In [30]:
# |
#check to see if any of the page_ids are redirects or double redirects
((nppt_ids['p1_is_redirect']==1) & (nppt_ids['is_double_redirect']==1)).any()

False

In [31]:
# |
#check to see if any of the page_ids are redirects or double redirects
((nppt_ids['p1_is_redirect']==1) | (nppt_ids['is_double_redirect']==1)).any()

True

In [32]:
# act on the results from nppt_ids
#create a df 
all_surviving_articles = nppt_ids[['page_id','page_title', 'page_len', 'database_code']] 
#seperate the redirected items into their own df
redirects = nppt_ids.loc[nppt_ids['p1_is_redirect']==1]
#pull only p1.page_id, p1.page_title, p1.page_len 
redirect_df = redirects[['page_id','page_title','page_len', 'database_code']] 

In [33]:
#remove the redirect items from the all_surviving_articles df & create global articles df
nppt_articles =  all_surviving_articles[~all_surviving_articles.isin(redirect_df)].dropna(how='all')

#create a new wikicode column using quality_vars['wiki_db']
#ffill could also work here
#articles['wikicode'] = quality_vars['wiki_db']

In [34]:
nppt_articles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19316 entries, 0 to 19528
Data columns (total 4 columns):
page_id          19316 non-null float64
page_title       19316 non-null object
page_len         19316 non-null float64
database_code    19316 non-null object
dtypes: float64(2), object(2)
memory usage: 754.5+ KB


## Editing articles - edit date

In [35]:
pawiki_titles_norm_ed = tuple(list(nppt_articles.loc[nppt_articles['database_code'] == 'pawiki', 'page_title']))
mlwiki_titles_norm_ed = tuple(list(nppt_articles.loc[nppt_articles['database_code'] == 'mlwiki', 'page_title']))
hiwiki_titles_norm_ed = tuple(list(nppt_articles.loc[nppt_articles['database_code'] == 'hiwiki', 'page_title']))
orwiki_titles_norm_ed = tuple(list(nppt_articles.loc[nppt_articles['database_code'] == 'orwiki', 'page_title']))
urwiki_titles_norm_ed = tuple(list(nppt_articles.loc[nppt_articles['database_code'] == 'urwiki', 'page_title']))
tawiki_titles_norm_ed = tuple(list(nppt_articles.loc[nppt_articles['database_code'] == 'tawiki', 'page_title']))
knwiki_titles_norm_ed = tuple(list(nppt_articles.loc[nppt_articles['database_code'] == 'knwiki', 'page_title']))
mrwiki_titles_norm_ed = tuple(list(nppt_articles.loc[nppt_articles['database_code'] == 'mrwiki', 'page_title']))
guwiki_titles_norm_ed = tuple(list(nppt_articles.loc[nppt_articles['database_code'] == 'guwiki', 'page_title']))
tewiki_titles_norm_ed = tuple(list(nppt_articles.loc[nppt_articles['database_code'] == 'tewiki', 'page_title']))
bnwiki_titles_norm_ed = tuple(list(nppt_articles.loc[nppt_articles['database_code'] == 'bnwiki', 'page_title']))

#update the query variable to use it in queries
wd_vars.update({'pawiki_titles_norm_ed': pawiki_titles_norm_ed,
                     'mlwiki_titles_norm_ed': mlwiki_titles_norm_ed,
                     'hiwiki_titles_norm_ed': hiwiki_titles_norm_ed,
                     'orwiki_titles_norm_ed': orwiki_titles_norm_ed,
                     'urwiki_titles_norm_ed': urwiki_titles_norm_ed,
                     'tawiki_titles_norm_ed': tawiki_titles_norm_ed,
                     'knwiki_titles_norm_ed': knwiki_titles_norm_ed,
                     'mrwiki_titles_norm_ed': mrwiki_titles_norm_ed,
                     'guwiki_titles_norm_ed': guwiki_titles_norm_ed,
                     'tewiki_titles_norm_ed': tewiki_titles_norm_ed,
                     'bnwiki_titles_norm_ed': bnwiki_titles_norm_ed,
                    })

In [40]:
#https://www.mediawiki.org/wiki/Manual:Revision_table#rev_timestamp
#https://www.mediawiki.org/wiki/Manual:Timestamp

#filter for those edited during the contest - 10th oct 2019 & 11th jan 2020 ---> 20191010000000
#yyyymmddhhmmss --  August 9th, 2010 00:30:06 --- 20100809003006

ta_edits_r = wmf.mariadb.run("""
SELECT 
    page_id,
    DATE_FORMAT(rev_timestamp,"%y-%m-%d") AS edit_date,
    revactor_actor
FROM revision_actor_temp
JOIN revision ON(revactor_rev = rev_id AND revactor_page = rev_page)
JOIN page ON rev_page = page.page_id
WHERE rev_page = page_id
    AND rev_timestamp > 20191010000000 
    AND (rev_deleted & 4) = 0
    AND rev_page IN {tawiki_titles_norm_ed}
GROUP BY revactor_rev
""".format(**wd_vars), 'tawiki')

ml_edits_r = wmf.mariadb.run("""
SELECT 
    page_id,
    DATE_FORMAT(rev_timestamp,"%y-%m-%d") AS edit_date,
    revactor_actor
FROM revision_actor_temp
JOIN revision ON(revactor_rev = rev_id AND revactor_page = rev_page)
JOIN page ON rev_page = page.page_id
WHERE rev_page = page_id
    AND rev_timestamp > 20191010000000 
    AND (rev_deleted & 4) = 0
    AND rev_page IN {mlwiki_titles_norm_ed}
GROUP BY revactor_rev
""".format(**wd_vars), 'mlwiki')

hi_edits_r = wmf.mariadb.run("""
SELECT 
    page_id,
    DATE_FORMAT(rev_timestamp,"%y-%m-%d") AS edit_date,
    revactor_actor
FROM revision_actor_temp
JOIN revision ON(revactor_rev = rev_id AND revactor_page = rev_page)
JOIN page ON rev_page = page.page_id
WHERE rev_page = page_id
    AND rev_timestamp > 20191010000000 
    AND (rev_deleted & 4) = 0
    AND rev_page IN {hiwiki_titles_norm_ed}
GROUP BY revactor_rev
""".format(**wd_vars), 'hiwiki')

or_edits_r = wmf.mariadb.run("""
SELECT 
    page_id,
    DATE_FORMAT(rev_timestamp,"%y-%m-%d") AS edit_date,
    revactor_actor
FROM revision_actor_temp
JOIN revision ON(revactor_rev = rev_id AND revactor_page = rev_page)
JOIN page ON rev_page = page.page_id
WHERE rev_page = page_id
    AND rev_timestamp > 20191010000000 
    AND (rev_deleted & 4) = 0
    AND rev_page IN {orwiki_titles_norm_ed}
GROUP BY revactor_rev
""".format(**wd_vars), 'orwiki')

ur_edits_r = wmf.mariadb.run("""
SELECT 
    page_id,
    DATE_FORMAT(rev_timestamp,"%y-%m-%d") AS edit_date,
    revactor_actor
FROM revision_actor_temp
JOIN revision ON(revactor_rev = rev_id AND revactor_page = rev_page)
JOIN page ON rev_page = page.page_id
WHERE rev_page = page_id
    AND rev_timestamp > 20191010000000 
    AND (rev_deleted & 4) = 0
    AND rev_page IN {urwiki_titles_norm_ed}
GROUP BY revactor_rev
""".format(**wd_vars), 'urwiki')

kn_edits_r = wmf.mariadb.run("""
SELECT 
    page_id,
    DATE_FORMAT(rev_timestamp,"%y-%m-%d") AS edit_date,
    revactor_actor
FROM revision_actor_temp
JOIN revision ON(revactor_rev = rev_id AND revactor_page = rev_page)
JOIN page ON rev_page = page.page_id
WHERE rev_page = page_id
    AND rev_timestamp > 20191010000000 
    AND (rev_deleted & 4) = 0
    AND rev_page IN {knwiki_titles_norm_ed}
GROUP BY revactor_rev
""".format(**wd_vars), 'knwiki')

mr_edits_r = wmf.mariadb.run("""
SELECT 
    page_id,
    DATE_FORMAT(rev_timestamp,"%y-%m-%d") AS edit_date,
    revactor_actor
FROM revision_actor_temp
JOIN revision ON(revactor_rev = rev_id AND revactor_page = rev_page)
JOIN page ON rev_page = page.page_id
WHERE rev_page = page_id
    AND rev_timestamp > 20191010000000 
    AND (rev_deleted & 4) = 0
    AND rev_page IN {mrwiki_titles_norm_ed}
GROUP BY revactor_rev
""".format(**wd_vars), 'mrwiki')

gu_edits_r = wmf.mariadb.run("""
SELECT 
    page_id,
    DATE_FORMAT(rev_timestamp,"%y-%m-%d") AS edit_date,
    revactor_actor
FROM revision_actor_temp
JOIN revision ON(revactor_rev = rev_id AND revactor_page = rev_page)
JOIN page ON rev_page = page.page_id
WHERE rev_page = page_id
    AND rev_timestamp > 20191010000000 
    AND (rev_deleted & 4) = 0
    AND rev_page IN {guwiki_titles_norm_ed}
GROUP BY revactor_rev
""".format(**wd_vars), 'guwiki')

te_edits_r = wmf.mariadb.run("""
SELECT 
    page_id,
    DATE_FORMAT(rev_timestamp,"%y-%m-%d") AS edit_date,
    revactor_actor
FROM revision_actor_temp
JOIN revision ON(revactor_rev = rev_id AND revactor_page = rev_page)
JOIN page ON rev_page = page.page_id
WHERE rev_page = page_id
    AND rev_timestamp > 20191010000000 
    AND (rev_deleted & 4) = 0
    AND rev_page IN {tewiki_titles_norm_ed}
GROUP BY revactor_rev
""".format(**wd_vars), 'tewiki')

bn_edits_r = wmf.mariadb.run("""
SELECT 
    page_id,
    DATE_FORMAT(rev_timestamp,"%y-%m-%d") AS edit_date,
    revactor_actor
FROM revision_actor_temp
JOIN revision ON(revactor_rev = rev_id AND revactor_page = rev_page)
JOIN page ON rev_page = page.page_id
WHERE rev_page = page_id
    AND rev_timestamp > 20191010000000 
    AND (rev_deleted & 4) = 0
    AND rev_page IN {bnwiki_titles_norm_ed}
GROUP BY revactor_rev
""".format(**wd_vars), 'bnwiki')

pa_edits_r = wmf.mariadb.run("""
SELECT 
    page_id,
    DATE_FORMAT(rev_timestamp,"%y-%m-%d") AS edit_date,
    revactor_actor
FROM revision_actor_temp
JOIN revision ON(revactor_rev = rev_id AND revactor_page = rev_page)
JOIN page ON rev_page = page.page_id
WHERE rev_page = page_id
    AND rev_timestamp > 20191010000000 
    AND (rev_deleted & 4) = 0
    AND rev_page IN {pawiki_titles_norm_ed}
GROUP BY revactor_rev
""".format(**wd_vars), 'pawiki')






#add dbcolumn to each query df
pa_edits_r['database_code'] = 'pawiki' 
ml_edits_r['database_code'] = 'mlwiki'
hi_edits_r['database_code'] = 'hiwiki'
or_edits_r['database_code'] = 'orwiki'
ur_edits_r['database_code'] = 'urwiki'
ta_edits_r['database_code'] = 'tawiki'
kn_edits_r['database_code'] = 'knwiki'
mr_edits_r['database_code'] = 'mrwiki'
gu_edits_r['database_code'] = 'guwiki'
te_edits_r['database_code'] = 'tewiki'
bn_edits_r['database_code'] = 'bnwiki'

nppt_articles_edits = pd.concat([pa_edits_r, 
                      ml_edits_r,
                      hi_edits_r,
                      or_edits_r,
                      ur_edits_r,
                      ta_edits_r,
                      kn_edits_r,
                      mr_edits_r,
                      gu_edits_r,
                      te_edits_r,
                      bn_edits_r,
                     ], sort=True, ignore_index=True)

nppt_articles_edits.reset_index(drop=True);

nppt_articles_edits['edit_date'] = pd.to_datetime(nppt_articles_edits['edit_date'], format="%y-%m-%d")



In [41]:
nppt_articles_edits

Unnamed: 0,database_code,edit_date,page_id,revactor_actor


## > How many stump articles from the Google list were edited during the GLOW contest? 

#### 0 articles from the Google provided 'editing' list of articles have been edited since the contest started

In [38]:
test = wmf.mariadb.run("""
SELECT 
    page_id,
    DATE_FORMAT(rev_timestamp,"%y-%m-%d") AS edit_date,
    revactor_actor
FROM revision_actor_temp
JOIN revision ON(revactor_rev = rev_id AND revactor_page = rev_page)
JOIN page ON rev_page = page.page_id
WHERE rev_page = page_id
    AND rev_timestamp > 20191010000000 
    AND (rev_deleted & 4) = 0
GROUP BY revactor_rev
LIMIT 10
""", 'pawiki')

In [39]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
page_id           10 non-null int64
edit_date         10 non-null object
revactor_actor    10 non-null int64
dtypes: int64(2), object(1)
memory usage: 368.0+ bytes


In [44]:
#test

#https://www.mediawiki.org/wiki/Manual:Revision_table#rev_timestamp
#https://www.mediawiki.org/wiki/Manual:Timestamp

#filter for those edited during the contest - 10th oct 2019 & 11th jan 2020 ---> 20191010000000
#yyyymmddhhmmss --  August 9th, 2010 00:30:06 --- 20100809003006

ta_edits_test = wmf.mariadb.run("""
SELECT 
    page_id,
    DATE_FORMAT(rev_timestamp,"%y-%m-%d") AS edit_date,
    revactor_actor
FROM revision_actor_temp
JOIN revision ON(revactor_rev = rev_id AND revactor_page = rev_page)
JOIN page ON rev_page = page.page_id
WHERE rev_page = page_id
    AND rev_timestamp > 0 
    AND (rev_deleted & 4) = 0
    AND rev_page IN {tawiki_titles_norm_ed}
GROUP BY revactor_rev
""".format(**wd_vars), 'tawiki')

ml_edits_test = wmf.mariadb.run("""
SELECT 
    page_id,
    DATE_FORMAT(rev_timestamp,"%y-%m-%d") AS edit_date,
    revactor_actor
FROM revision_actor_temp
JOIN revision ON(revactor_rev = rev_id AND revactor_page = rev_page)
JOIN page ON rev_page = page.page_id
WHERE rev_page = page_id
    AND rev_timestamp > 0 
    AND (rev_deleted & 4) = 0
    AND rev_page IN {mlwiki_titles_norm_ed}
GROUP BY revactor_rev
""".format(**wd_vars), 'mlwiki')

hi_edits_test = wmf.mariadb.run("""
SELECT 
    page_id,
    DATE_FORMAT(rev_timestamp,"%y-%m-%d") AS edit_date,
    revactor_actor
FROM revision_actor_temp
JOIN revision ON(revactor_rev = rev_id AND revactor_page = rev_page)
JOIN page ON rev_page = page.page_id
WHERE rev_page = page_id
    AND rev_timestamp > 0 
    AND (rev_deleted & 4) = 0
    AND rev_page IN {hiwiki_titles_norm_ed}
GROUP BY revactor_rev
""".format(**wd_vars), 'hiwiki')

or_edits_test = wmf.mariadb.run("""
SELECT 
    page_id,
    DATE_FORMAT(rev_timestamp,"%y-%m-%d") AS edit_date,
    revactor_actor
FROM revision_actor_temp
JOIN revision ON(revactor_rev = rev_id AND revactor_page = rev_page)
JOIN page ON rev_page = page.page_id
WHERE rev_page = page_id
    AND rev_timestamp > 0 
    AND (rev_deleted & 4) = 0
    AND rev_page IN {orwiki_titles_norm_ed}
GROUP BY revactor_rev
""".format(**wd_vars), 'orwiki')

ur_edits_test = wmf.mariadb.run("""
SELECT 
    page_id,
    DATE_FORMAT(rev_timestamp,"%y-%m-%d") AS edit_date,
    revactor_actor
FROM revision_actor_temp
JOIN revision ON(revactor_rev = rev_id AND revactor_page = rev_page)
JOIN page ON rev_page = page.page_id
WHERE rev_page = page_id
    AND rev_timestamp > 0 
    AND (rev_deleted & 4) = 0
    AND rev_page IN {urwiki_titles_norm_ed}
GROUP BY revactor_rev
""".format(**wd_vars), 'urwiki')

kn_edits_test = wmf.mariadb.run("""
SELECT 
    page_id,
    DATE_FORMAT(rev_timestamp,"%y-%m-%d") AS edit_date,
    revactor_actor
FROM revision_actor_temp
JOIN revision ON(revactor_rev = rev_id AND revactor_page = rev_page)
JOIN page ON rev_page = page.page_id
WHERE rev_page = page_id
    AND rev_timestamp > 0 
    AND (rev_deleted & 4) = 0
    AND rev_page IN {knwiki_titles_norm_ed}
GROUP BY revactor_rev
""".format(**wd_vars), 'knwiki')

mr_edits_test = wmf.mariadb.run("""
SELECT 
    page_id,
    DATE_FORMAT(rev_timestamp,"%y-%m-%d") AS edit_date,
    revactor_actor
FROM revision_actor_temp
JOIN revision ON(revactor_rev = rev_id AND revactor_page = rev_page)
JOIN page ON rev_page = page.page_id
WHERE rev_page = page_id
    AND rev_timestamp > 0 
    AND (rev_deleted & 4) = 0
    AND rev_page IN {mrwiki_titles_norm_ed}
GROUP BY revactor_rev
""".format(**wd_vars), 'mrwiki')

gu_edits_test = wmf.mariadb.run("""
SELECT 
    page_id,
    DATE_FORMAT(rev_timestamp,"%y-%m-%d") AS edit_date,
    revactor_actor
FROM revision_actor_temp
JOIN revision ON(revactor_rev = rev_id AND revactor_page = rev_page)
JOIN page ON rev_page = page.page_id
WHERE rev_page = page_id
    AND rev_timestamp > 0 
    AND (rev_deleted & 4) = 0
    AND rev_page IN {guwiki_titles_norm_ed}
GROUP BY revactor_rev
""".format(**wd_vars), 'guwiki')

te_edits_test = wmf.mariadb.run("""
SELECT 
    page_id,
    DATE_FORMAT(rev_timestamp,"%y-%m-%d") AS edit_date,
    revactor_actor
FROM revision_actor_temp
JOIN revision ON(revactor_rev = rev_id AND revactor_page = rev_page)
JOIN page ON rev_page = page.page_id
WHERE rev_page = page_id
    AND rev_timestamp > 0 
    AND (rev_deleted & 4) = 0
    AND rev_page IN {tewiki_titles_norm_ed}
GROUP BY revactor_rev
""".format(**wd_vars), 'tewiki')

bn_edits_test = wmf.mariadb.run("""
SELECT 
    page_id,
    DATE_FORMAT(rev_timestamp,"%y-%m-%d") AS edit_date,
    revactor_actor
FROM revision_actor_temp
JOIN revision ON(revactor_rev = rev_id AND revactor_page = rev_page)
JOIN page ON rev_page = page.page_id
WHERE rev_page = page_id
    AND rev_timestamp > 0 
    AND (rev_deleted & 4) = 0
    AND rev_page IN {bnwiki_titles_norm_ed}
GROUP BY revactor_rev
""".format(**wd_vars), 'bnwiki')

pa_edits_test = wmf.mariadb.run("""
SELECT 
    page_id,
    DATE_FORMAT(rev_timestamp,"%y-%m-%d") AS edit_date,
    revactor_actor
FROM revision_actor_temp
JOIN revision ON(revactor_rev = rev_id AND revactor_page = rev_page)
JOIN page ON rev_page = page.page_id
WHERE rev_page = page_id
    AND rev_timestamp > 0 
    AND (rev_deleted & 4) = 0
    AND rev_page IN {pawiki_titles_norm_ed}
GROUP BY revactor_rev
""".format(**wd_vars), 'pawiki')






#add dbcolumn to each query df
pa_edits_test['database_code'] = 'pawiki' 
ml_edits_test['database_code'] = 'mlwiki'
hi_edits_test['database_code'] = 'hiwiki'
or_edits_test['database_code'] = 'orwiki'
ur_edits_test['database_code'] = 'urwiki'
ta_edits_test['database_code'] = 'tawiki'
kn_edits_test['database_code'] = 'knwiki'
mr_edits_test['database_code'] = 'mrwiki'
gu_edits_test['database_code'] = 'guwiki'
te_edits_test['database_code'] = 'tewiki'
bn_edits_test['database_code'] = 'bnwiki'

nppt_articles_edits_test = pd.concat([pa_edits_test, 
                      ml_edits_test,
                      hi_edits_test,
                      or_edits_test,
                      ur_edits_test,
                      ta_edits_test,
                      kn_edits_test,
                      mr_edits_test,
                      gu_edits_test,
                      te_edits_test,
                      bn_edits_test,
                     ], sort=True, ignore_index=True)

nppt_articles_edits_test.reset_index(drop=True);

nppt_articles_edits_test['edit_date'] = pd.to_datetime(nppt_articles_edits['edit_date'], format="%y-%m-%d")


In [45]:
#the above returns a df with a few hundred results, none from the glow tiger 2.0 contest period, all with null date fields
nppt_articles_edits_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 4 columns):
database_code     300 non-null object
edit_date         0 non-null datetime64[ns]
page_id           300 non-null object
revactor_actor    300 non-null object
dtypes: datetime64[ns](1), object(3)
memory usage: 9.5+ KB


## QIDs & Sitelinks

### wikidata Q item

In [None]:
#https://www.mediawiki.org/wiki/Wikibase/Schema/wb_items_per_site
#https://www.mediawiki.org/wiki/Manual:Page_table
#wb_items_per_site site:quarry.wmflabs.org

## QUERY CLEAN EDITING SUBLIST

In [46]:
#change titles from denormalized (spaces) to normalized (underscore) for querying the page table etc.
#nppt_articles['article_suggestion'] = nppt_articles['article_suggestion'].str.replace('_', ' ')
nppt_articles['page_title'] = nppt_articles['page_title'].str.replace('_', ' ')
#create tuples of the article_suggestions and wiki_codes to use when querying for the wikidata items
editing_titles_denormalized_CLEAN = tuple(list(editing_topic_rec_df_CLEAN['page_title']))
editing_titles_denormalized_database_codes_CLEAN = tuple(list(editing_topic_rec_df_CLEAN['database_code']))

#set up a dict variable to use with .format when querying
wd_vars.update({
    'editing_titles_denormalized' : editing_titles_denormalized_CLEAN,
    'editing_titles_denormalized_db_codes' : editing_titles_denormalized_database_codes_CLEAN,
})

In [47]:
qid_r2_editing_CLEAN = wmf.mariadb.run("""
SELECT
      ips_site_page AS page_title,
      ips_item_id AS QID,
      ips_site_id AS database_code
FROM  wb_items_per_site  
WHERE ips_site_id IN {editing_titles_denormalized_db_codes} AND
      ips_site_page IN {editing_titles_denormalized}
""".format(**wd_vars), "wikidatawiki")

In [48]:
qid_r2_editing_CLEAN.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9038 entries, 0 to 9037
Data columns (total 3 columns):
page_title       9038 non-null object
QID              9038 non-null int64
database_code    9038 non-null object
dtypes: int64(1), object(2)
memory usage: 212.0+ KB


In [49]:
#merge in en query results to nppt_articles
#editing_topic_rec_df_CLEAN_ids_q = editing_topic_rec_df_CLEAN_ids.merge(qid_r2_editing_CLEAN, how="left", on=['page_title', 'database_code'])
nppt_articles_q = qid_r2_editing_CLEAN.merge(nppt_articles, how="left", on=['page_title', 'database_code'])

In [50]:
nppt_articles_q.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9038 entries, 0 to 9037
Data columns (total 5 columns):
page_title       9038 non-null object
QID              9038 non-null int64
database_code    9038 non-null object
page_id          8530 non-null float64
page_len         8530 non-null float64
dtypes: float64(2), int64(1), object(2)
memory usage: 423.7+ KB


## QUERY TRANSLATION SUBLIST FOR QITEMS ON ENWIKI

In [58]:
translation_topic_rec_df_CLEAN['article_suggestion'] = translation_topic_rec_df_CLEAN['article_suggestion'].str.replace('_', ' ')
titles_denormalized_translation_CLEAN = tuple(list(translation_topic_rec_df_CLEAN['article_suggestion']))

In [59]:
#get qids for translation articles
qid_r_en_CLEAN = wmf.mariadb.run("""
SELECT
  ips_site_page AS article_suggestion,
  ips_item_id AS QID
FROM  wb_items_per_site  
WHERE ips_site_id = 'enwiki' 
  AND ips_site_page IN {titles_denormalized_translation_CLEAN}
""".format(titles_denormalized_translation_CLEAN=titles_denormalized_translation_CLEAN), "wikidatawiki")

In [60]:
qid_r_en_CLEAN.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7647 entries, 0 to 7646
Data columns (total 2 columns):
article_suggestion    7647 non-null object
QID                   7647 non-null int64
dtypes: int64(1), object(1)
memory usage: 119.6+ KB


## QUERY translation rec qids for sitelinks

In [61]:
translation_qids = tuple(list(qid_r_en_CLEAN['QID']))

#set up a dict variable to use with .format when querying
wd_vars.update({
    'translation_qids' : translation_qids})

In [62]:
#https://www.wikidata.org/wiki/Help:Sitelinks
#https://www.wikidata.org/w/api.php?action=wbgetentities&ids=Q42&props=sitelinks

iwl_r = wmf.mariadb.run("""
SELECT
  linked_item.ips_item_id AS QID,
  GROUP_CONCAT(ips_site_id SEPARATOR ', ') AS iwsites,
  COUNT(ips_site_page) AS iwsitelinks
FROM (
      SELECT ips_item_id
      FROM wb_items_per_site
      WHERE ips_item_id IN {translation_qids}
      AND ips_site_id IN {wikis}
    ) AS linked_item
LEFT JOIN wb_items_per_site 
  ON linked_item.ips_item_id = wb_items_per_site.ips_item_id
LEFT JOIN page 
  ON linked_item.ips_item_id = page.page_id
GROUP BY page_id
""".format(**wd_vars), "wikidatawiki")

In [63]:
iwl_r.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4025 entries, 0 to 4024
Data columns (total 3 columns):
QID            4025 non-null int64
iwsites        4025 non-null object
iwsitelinks    4025 non-null int64
dtypes: int64(2), object(1)
memory usage: 94.5+ KB


In [64]:
t_iwl_q = iwl_r.merge(qid_r_en_CLEAN, how="left", on=['QID'])

In [65]:
t_iwl_q.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4025 entries, 0 to 4024
Data columns (total 4 columns):
QID                   4025 non-null int64
iwsites               4025 non-null object
iwsitelinks           4025 non-null int64
article_suggestion    4025 non-null object
dtypes: int64(2), object(2)
memory usage: 157.2+ KB


In [66]:
translation_topic_rec_df_CLEAN.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14155 entries, 8111 to 34294
Data columns (total 8 columns):
article_suggestion     14155 non-null object
local_encoded_title    0 non-null object
g_category             14155 non-null object
language_name          14155 non-null object
suggestion_type        14155 non-null object
file                   14155 non-null object
language_code          14155 non-null object
database_code          14155 non-null object
dtypes: object(8)
memory usage: 995.3+ KB


In [67]:
t_rec_iwl_q = t_iwl_q.merge(translation_topic_rec_df_CLEAN, how="left", on=['article_suggestion'])

In [68]:
t_rec_iwl_q.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8462 entries, 0 to 8461
Data columns (total 11 columns):
QID                    8462 non-null int64
iwsites                8462 non-null object
iwsitelinks            8462 non-null int64
article_suggestion     8462 non-null object
local_encoded_title    0 non-null object
g_category             8462 non-null object
language_name          8462 non-null object
suggestion_type        8462 non-null object
file                   8462 non-null object
language_code          8462 non-null object
database_code          8462 non-null object
dtypes: int64(2), object(9)
memory usage: 793.3+ KB


In [69]:
t_rec_iwl_q[t_rec_iwl_q.duplicated()]

Unnamed: 0,QID,iwsites,iwsitelinks,article_suggestion,local_encoded_title,g_category,language_name,suggestion_type,file,language_code,database_code


In [70]:
dupe_check = t_rec_iwl_q[t_rec_iwl_q.duplicated(['article_suggestion'])]

In [71]:
#the article is sometimes suggested as an article suggestion for more than one wiki
dupe_check.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4437 entries, 3 to 8460
Data columns (total 11 columns):
QID                    4437 non-null int64
iwsites                4437 non-null object
iwsitelinks            4437 non-null int64
article_suggestion     4437 non-null object
local_encoded_title    0 non-null object
g_category             4437 non-null object
language_name          4437 non-null object
suggestion_type        4437 non-null object
file                   4437 non-null object
language_code          4437 non-null object
database_code          4437 non-null object
dtypes: int64(2), object(9)
memory usage: 416.0+ KB


In [72]:
translation_rec_iwl_q = t_rec_iwl_q[['article_suggestion','QID','database_code', 'iwsites','iwsitelinks','language_code','file','suggestion_type','language_name','g_category',]]

In [73]:
translation_rec_iwl_q.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8462 entries, 0 to 8461
Data columns (total 10 columns):
article_suggestion    8462 non-null object
QID                   8462 non-null int64
database_code         8462 non-null object
iwsites               8462 non-null object
iwsitelinks           8462 non-null int64
language_code         8462 non-null object
file                  8462 non-null object
suggestion_type       8462 non-null object
language_name         8462 non-null object
g_category            8462 non-null object
dtypes: int64(2), object(8)
memory usage: 727.2+ KB


In [74]:
#create boolean column WHERE 'iwsites' column str contains 'database_code' value

In [75]:
translation_rec_iwl_q['database_code_in_iwsites'] = [x[0] in x[1] if x[0] is not None else False for x in zip(translation_rec_iwl_q['database_code'], translation_rec_iwl_q['iwsites'])]

In [76]:
translation_rec_iwl_q['database_code_in_iwsites'].values.sum()

1873

In [77]:
suggestions_created_a = translation_rec_iwl_q[translation_rec_iwl_q['database_code_in_iwsites'] == True]

## Translation articles - Qid match

In [78]:
#use groupby to get a df of articles that DON'T have an interwiki link associated with the 
#suggestion database_code, aka, they weren't created as far as we know so far

located =  translation_rec_iwl_q[translation_rec_iwl_q['database_code_in_iwsites'] == True]
not_yet_located = translation_rec_iwl_q[translation_rec_iwl_q['database_code_in_iwsites'] == False]

#located_list = list(translation_rec_iwl_q.loc[translation_rec_iwl_q['database_code_in_iwsites']== True, ['QID']])

In [79]:
x = translation_topic_rec_df_CLEAN.merge(qid_r_en_CLEAN, how="left", on=['article_suggestion'])

In [80]:
translation_sugg_to_cull = x.loc[x['QID'].notnull(), ['database_code', 'article_suggestion', 'QID']].reindex()

In [81]:
located_to_drop =located[['article_suggestion','database_code','QID']]

In [82]:
translation_sugg_not_found_interim = pd.merge(translation_sugg_to_cull,located_to_drop, indicator=True, how='outer')

In [83]:
translation_sugg_not_yet_found = translation_sugg_not_found_interim.loc[translation_sugg_not_found_interim['_merge']=='left_only', ['database_code', 'article_suggestion', 'QID']].reindex()

In [84]:
#fyi, titles are denormalized
translation_sugg_not_yet_found.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11876 entries, 7 to 13750
Data columns (total 3 columns):
database_code         11876 non-null object
article_suggestion    11876 non-null object
QID                   11876 non-null float64
dtypes: float64(1), object(2)
memory usage: 371.1+ KB


In [85]:
fountain_titles = pd.read_csv("../../data/raw/articles/2019/contest_titles_n_updated.csv", sep=',', encoding = 'utf-8')

In [86]:
fountain_titles_to_cull = fountain_titles[['wiki_db', 'QID']]

In [87]:
fountain_titles_to_cull = fountain_titles_to_cull.rename(columns={'wiki_db': 'database_code'})

In [88]:
translation_sugg = translation_sugg_not_yet_found[['database_code','QID']]

In [89]:
fountain_titles_to_cull.info(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12712 entries, 0 to 12711
Data columns (total 2 columns):
database_code    12712 non-null object
QID              12712 non-null float64
dtypes: float64(1), object(1)
memory usage: 198.8+ KB


In [90]:
translation_sugg.info(2)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11876 entries, 7 to 13750
Data columns (total 2 columns):
database_code    11876 non-null object
QID              11876 non-null float64
dtypes: float64(1), object(1)
memory usage: 278.3+ KB


In [91]:
suggestions_created = pd.merge(fountain_titles_to_cull, translation_sugg, on=['database_code', 'QID'], how='inner')

In [92]:
len(suggestions_created)

133

## > How many translation articles from the Google list were created since the GLOW contest? 

### > 2006+ articles were picked from the suggestion lists (out of 34k+)

#### 1873+ articles from the Google provided 'translation' list of articles were created, since the contest started, and had the related iwl added 

In [93]:
translation_rec_iwl_q['database_code_in_iwsites'].values.sum()

1873

#### 133 articles from the Google provided 'translation' list of articles were created, since the contest started, and had a matching QID added

In [94]:
suggestions_created.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 133 entries, 0 to 132
Data columns (total 2 columns):
database_code    133 non-null object
QID              133 non-null float64
dtypes: float64(1), object(1)
memory usage: 3.1+ KB


#### to csv

In [95]:
#includes the 1873 with related interwiki links
suggestions_created_a.to_csv("../../data/processed/query_results/topic_lists/suggestions_created_a.csv", sep=',', encoding = 'utf-8', index=False)

In [96]:
#includes 133
suggestions_created.to_csv("../../data/processed/query_results/topic_lists/suggestions_created_b.csv", sep=',', encoding = 'utf-8', index=False)

#### To Do in the future, topics

In [98]:
editing_recs = qid_r2_editing_CLEAN[['page_title', 'QID']]

In [100]:
suggestion_qids = pd.concat([qid_r_en_CLEAN, editing_recs], sort=True, ignore_index=True)

In [101]:
suggestion_qids.to_csv("../../data/processed/query_results/topic_lists/suggestions_qids.csv", sep=',', encoding = 'utf-8', index=False)