# Get Recs Created/Edited - Ids & QIds

# Table of Contents  <a class="anchor" id="toc"></a>

Guiding Question:
Q: How many articles were created from the suggestions?

Process:

Prep:
1. Combine and clean suggestion lists
2. groupby type: editing suggestions

    a. get pageids (not including redirects) 
    
    b. get wikidata items using the local language (for use with topics, if time permits)
    
3. groupby type: translation suggestions

    a. get wikidata items using enwiki 
    
    b. get iwlinks
    
4. Read and clean the list of articles created and submitted to GLOW (includes articles ultimately disapproved)


Analysis:
5. groupby type: editing suggestions

    a. count matches by ids - count ids that were edited during the contest period
    
6. groupby type: translation suggestions

    a. count matches by wikidata item in the suggested language
    
    b. count matches by iwlink in the suggested language
    
    c. get a sum of items a+b

NOTES:
1. Analysis notes
    a. If the editor uses Content Translation, it should automatically assign the right QID
    b. If the editor doesn't use CT, either they or someone else has to assign the QID
    c. We will miss articles that were not created via Content Translation and don't have a manually added QID and/or the editor changed the suggested title to something new. 
2. Suggestion notes
    a. From the creators of the suggestions: "As a reminder, we have 2 lists: a list of suggested topics that exist in the local language but could be edited to be more complete based on the corresponding English page, and a list of topics that can be translated from English to the local language.  Based on feedback from the initial Project Tiger, we've separated out the topics by categories so editors can focus on the areas they like to write about.  The lists are ordered by popularity of what local language users are looking for."

In [1]:
import os
from glob import glob
import pandas as pd
import numpy as np

import wmfdata as wmf
from wmfdata import charting, mariadb, hive
from wmfdata.utils import pct_str, pd_display_all
import urllib
from urllib.parse import unquote

You are using wmfdata v1.0.3, but v1.0.4 is available.

To update, run `pip install --upgrade git+https://github.com/neilpquinn/wmfdata.git@release`.

To see the changes, refer to https://github.com/neilpquinn/wmfdata/blob/release/CHANGELOG.md


### Indonesia - reading data

In [2]:
# Compile every sheet of every suggestion workbook into one frame,
# tagging each row with the workbook (file) and sheet it came from.

f_mask = r'../../../GLOW/data/raw/g_topic_lists/Indonesia/*.xlsx'

sheet_frames = []
for workbook_path in glob(f_mask):
    workbook_label = os.path.splitext(os.path.basename(workbook_path))[0]
    workbook_sheets = pd.read_excel(workbook_path, sheet_name=None)
    for sheet_label, sheet_frame in workbook_sheets.items():
        sheet_frames.append(sheet_frame.assign(file=workbook_label, sheet=sheet_label))

gtl = pd.concat(sheet_frames, ignore_index=True, sort=True)

# Work on a copy so the raw concatenation stays available.
full_topic_rec_df = gtl.copy()

In [3]:
# Drop the second 'local_wikipedia' column (presumably a duplicated header in the source sheets -- TODO confirm)
del full_topic_rec_df['local_wikipedia.1']

#get article suggestion from url, as opposed to getting it from the google 'topic' which is sometimes a short hand version of the title
full_topic_rec_df['article_suggestion'] = full_topic_rec_df['english_wikipedia'].str.rsplit(".org/wiki/").str[-1]
# fall back to the Topic column where no English URL was supplied
full_topic_rec_df.loc[full_topic_rec_df["article_suggestion"].isnull(),'article_suggestion'] = full_topic_rec_df["Topic"]

#percent-encoded to decoded -- only strings are decoded; nulls (floats) and any other non-string Topic values pass through untouched
full_topic_rec_df['article_suggestion'] = full_topic_rec_df['article_suggestion'].apply(lambda x: unquote(x) if isinstance(x, str) else x)

full_topic_rec_df = full_topic_rec_df.rename(columns={'sheet':'g_category',
                                                      'wikidata_item':'QID',
                                                     }
                                            )

# File names look like "<language> ... <suggestion type>": the first word is the language,
# the last word is the suggestion type. `n` is passed by keyword because newer pandas
# releases no longer accept it positionally.
full_topic_rec_df[['language_name', 'suggestion_type']] = full_topic_rec_df['file'].str.split(" ", n=1, expand=True)
full_topic_rec_df['suggestion_type'] = full_topic_rec_df['file'].str.rsplit(" ").str[-1]
del full_topic_rec_df['file']
full_topic_rec_df = full_topic_rec_df.replace({'language_name' : { 'Sunda' : 'Sundanese', 'Jawa' : 'Javanese', 'Bahasa' : 'Indonesian' }})

#extract url title (everything after the last slash); raw string avoids the invalid "\/" escape warning
full_topic_rec_df['local_encoded_title'] = full_topic_rec_df['local_wikipedia'].str.extract(r'([^\/]+$)', expand=False)

#encoded URL to decoded title -- nulls are passed through unchanged
full_topic_rec_df['page_title'] = full_topic_rec_df['local_encoded_title'].apply(lambda x: unquote(x) if isinstance(x, str) else x)

#change titles from denormalized (spaces) to normalized (underscore) for querying the page table etc.
full_topic_rec_df['page_title'] = full_topic_rec_df['page_title'].str.replace(' ', '_')

#remove Q from QID if exists
full_topic_rec_df['QID'] = full_topic_rec_df['QID'].str.replace('Q', '')

#replace empty strings with Nans
full_topic_rec_df['QID'] = full_topic_rec_df['QID'].replace(r'^\s*$', np.nan, regex=True)

#confirm -- NOTE: this expression is not the last statement of the cell, so its result is not displayed
sub = '   '
full_topic_rec_df[full_topic_rec_df['QID'].str.contains(sub, na=False)]


#make numeric; values that cannot be parsed become NaN (errors='coerce')
full_topic_rec_df['QID'] = pd.to_numeric(full_topic_rec_df['QID'], errors='coerce', downcast='integer')

### India - reading data

In [4]:
#if pnb article_suggestions exist, confirm language_code = url_language_code(see code above to add in url_language_code column)
#pnb	pnbwiki	Western Punjabi 

### Pulling Data

In [5]:
# Look up language_code / database_code for every language present in the suggestion lists.
lang_names = tuple(full_topic_rec_df['language_name'].unique())

wiki_lookup_sql = """
SELECT  language_code, database_code, language_name
FROM canonical_data.wikis
WHERE language_name IN {lang_names} AND database_group = 'wikipedia'
""".format(lang_names=lang_names)

ci = wmf.hive.run(wiki_lookup_sql)

In [6]:
# Attach language_code / database_code to each suggestion by language name.
# Left join: suggestions whose language has no match in `ci` are kept with null codes.
full_topic_rec_df_ci = full_topic_rec_df.merge(ci, how="left", on=['language_name'])

In [7]:
# List the distinct suggestion types; the cells below filter on 'Translating' and 'Editing'.
full_topic_rec_df_ci['suggestion_type'].unique()

array(['Translating', 'Editing'], dtype=object)

In [8]:
# Keep only the translation suggestions (boolean filter on suggestion_type).
is_translation_row = full_topic_rec_df_ci['suggestion_type'] == 'Translating'
translation_topic_rec_df = full_topic_rec_df_ci[is_translation_row].copy(deep=False)

# Clean list: the first occurrence of each suggestion is kept, later repeats are dropped.
translation_dedupe_keys = ['article_suggestion', 'local_encoded_title','g_category','suggestion_type', 'language_code']
translation_topic_rec_df_CLEAN = (
    translation_topic_rec_df
    .drop_duplicates(subset=translation_dedupe_keys, keep='first')
    .copy(deep=False)
)

# The rows that were dropped as duplicates -- kept for checking data later on.
translation_dropped_index = translation_topic_rec_df.index.symmetric_difference(translation_topic_rec_df_CLEAN.index)
translation_stacked = pd.concat([translation_topic_rec_df, translation_topic_rec_df_CLEAN])
translation_topic_rec_df_Dupes = translation_stacked.loc[translation_dropped_index]

In [9]:
# Keep only the editing suggestions (boolean filter on suggestion_type).
is_editing_row = full_topic_rec_df_ci['suggestion_type'] == 'Editing'
editing_topic_rec_df = full_topic_rec_df_ci.loc[is_editing_row].copy(deep=False)

# Clean list: the first occurrence of each suggestion is kept, later repeats are dropped.
editing_dedupe_keys = ['article_suggestion', 'local_encoded_title','g_category','suggestion_type', 'language_code']
editing_topic_rec_df_CLEAN = editing_topic_rec_df.drop_duplicates(subset=editing_dedupe_keys, keep='first')

## Editing subset

## Get article ids, redirects <a class="anchor" id="get_clean_list"></a>

In [10]:
# adapted from https://github.com/nettrom/suggestbot/blob/master/tool-labs/link-rec/link-recommender.py#L208
#https://www.mediawiki.org/wiki/Manual:Redirect_table
#https://www.mediawiki.org/wiki/Manual:Page_table
#https://www.mediawiki.org/wiki/Manual:Pagelinks_table
#--rd.redirect_id -- where is this field located? in which table can it be found?

articles = []

def _sql_in_list(values):
    '''Render `values` as a parenthesised list of quoted SQL string literals for an IN clause.'''
    quoted = []
    for value in values:
        # escape backslashes first, then double any single quote
        text = str(value).replace('\\', '\\\\').replace("'", "''")
        quoted.append("'" + text + "'")
    return '(' + ', '.join(quoted) + ')'


def get_clean_ids_mariadb(df):

    '''
    Look up page ids and redirect targets for the suggested titles.

    For every distinct `database_code` in `df`, the `page_title` values
    of that wiki are looked up in the wiki's `page` table (namespace 0),
    left-joined to `redirect` and to the redirect target's `page` row.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns `database_code` and `page_title`.

    Returns
    -------
    list
        One query result per wiki, in order of first appearance. The
        same results are also stored in the module-level `articles`
        list. Each call replaces the previous contents, so re-running
        the calling cell does not duplicate rows.
    '''

    clean_id_query = '''
    SELECT 
       p1.page_title AS page_title,
       DATABASE() AS database_code,
       p1.page_id  AS page_id,
       p1.page_is_redirect AS p1_is_redirect,
       p2.page_id AS rpage_id,
       p2.page_title AS rpage_title,
       p2.page_len rpage_len,
       p2.page_is_redirect AS is_double_redirect
    FROM page AS p1 
    LEFT JOIN redirect AS rd 
        ON p1.page_id=rd.rd_from 
    LEFT JOIN page AS p2 
        ON (rd_namespace = p2.page_namespace)
            AND rd_title = p2.page_title  
    WHERE p1.page_namespace = 0
        AND p1.page_title IN {raw_articles}
    '''

    results = []

    # rows without a database_code cannot be routed to a wiki, so they are skipped
    for wiki in df['database_code'].dropna().unique():
        print('***')
        print(wiki)

        # distinct, non-null titles for this wiki
        titles = df.loc[df['database_code'] == wiki, 'page_title'].dropna().unique()
        if len(titles) == 0:
            continue

        # the IN list is rendered explicitly, so one title works the same as many
        query = clean_id_query.format(raw_articles=_sql_in_list(titles))
        results.append(mariadb.run(query, wiki))

    # replace (not extend) the shared list so repeated calls stay idempotent
    articles[:] = results

    return(articles)

In [11]:
# Query every wiki for page ids / redirect info and stack the per-wiki results into one frame.
edit_id_results = pd.concat(get_clean_ids_mariadb(editing_topic_rec_df))

***
jvwiki
***
idwiki
***
suwiki


In [12]:
# Attach page ids / redirect info to the de-duplicated editing suggestions.
# Left join: suggestions with no matching page row are kept, with null page_id.
editing_df = editing_topic_rec_df_CLEAN.merge(edit_id_results, how="left", on=['page_title', 'database_code'])

In [13]:
# &
#check whether any suggested page is BOTH a redirect AND points at a target that is itself a redirect
((editing_df['p1_is_redirect']==1) & (editing_df['is_double_redirect']==1)).any()

False

In [14]:
# |
#check whether any suggested page is a redirect OR has a redirect target that is itself a redirect
((editing_df['p1_is_redirect']==1) | (editing_df['is_double_redirect']==1)).any()

True

In [15]:
# Suggestions for which the page-table lookup returned no page_id (kept for inspection).
editing_df_missing_pageid = editing_df[editing_df['page_id'].isnull()]

##### wikidata Q item

In [16]:
#https://www.mediawiki.org/wiki/Wikibase/Schema/wb_items_per_site
#https://www.mediawiki.org/wiki/Manual:Page_table
#wb_items_per_site site:quarry.wmflabs.org

#### wikidata items

In [17]:
# Switch titles from normalized (underscores) to denormalized (spaces);
# the wb_items_per_site lookup below is made with this form.
editing_df['page_title'] = editing_df['page_title'].str.replace('_', ' ')

# Split the suggestions by whether they already carry a wikidata item.
editing_has_qid = editing_df['QID'].notnull()
missing_wd = editing_df[~editing_has_qid]
w_wd = editing_df[editing_has_qid]

# Titles and wiki codes of the rows still missing a wikidata item, as tuples for the SQL IN clauses.
editing_titles_denormalized_CLEAN = tuple(missing_wd['page_title'])
editing_titles_denormalized_database_codes_CLEAN = tuple(missing_wd['database_code'])

# Values substituted into the query with .format
wd_vars = {
    'editing_titles_denormalized' : editing_titles_denormalized_CLEAN,
    'editing_titles_denormalized_db_codes' : editing_titles_denormalized_database_codes_CLEAN,
}

In [18]:
#TODO if redirect then page_title = rpage_title

# Wikidata items for the editing suggestions that have none yet, matched by site id and page title.
qid_editing_sql = """
SELECT
      ips_site_page AS page_title,
      ips_item_id AS QID,
      ips_site_id AS database_code
FROM  wb_items_per_site  
WHERE ips_site_id IN {editing_titles_denormalized_db_codes} AND
      ips_site_page IN {editing_titles_denormalized}
""".format(**wd_vars)

qid_editing_CLEAN_r = wmf.mariadb.run(qid_editing_sql, "wikidatawiki")

In [19]:
qid_editing_query_results = qid_editing_CLEAN_r.copy()

# Fill the missing QIDs from the query result: the (all-null) original QID column is
# dropped and the queried one takes its place.
missing_wd_with_lookup = missing_wd.merge(qid_editing_query_results, how= 'left', on=['page_title', 'database_code'])
editing_articles_qid_query = (
    missing_wd_with_lookup
    .drop('QID_x', axis=1)
    .rename({'QID_y': 'QID'}, axis=1)
)

# Recombine with the rows that already had a QID.
editing_df_qid_base = pd.concat([editing_articles_qid_query, w_wd], sort=True)


##### Redirects from the edit df

In [20]:
# Redirect target titles: underscores -> spaces, the form used for the wb_items_per_site lookup below.
editing_df_qid_base['rpage_title'] = editing_df_qid_base['rpage_title'].str.replace('_', ' ')

# Rows that have a redirect target but still no wikidata item.
has_redirect_target = editing_df_qid_base.rpage_title.notna()
lacks_qid = editing_df_qid_base['QID'].isnull()
e_missing_qid = editing_df_qid_base[has_redirect_target & lacks_qid]

# Redirect target titles and wiki codes, as tuples for the SQL IN clauses.
editing_titles_denormalized_CLEAN_redirects = tuple(e_missing_qid['rpage_title'])
editing_titles_denormalized_database_codes_CLEAN_redirects = tuple(e_missing_qid['database_code'])

# Values substituted into the query with .format (starts a fresh dict)
wd_vars = {
    'editing_titles_denormalized_redirects' : editing_titles_denormalized_CLEAN_redirects,
    'editing_titles_denormalized_db_codes_redirects' : editing_titles_denormalized_database_codes_CLEAN_redirects,
}

In [21]:
# Wikidata items for the redirect targets, matched by site id and target title.
qid_editing_redirects_sql = """
SELECT
      ips_site_page AS rpage_title,
      ips_item_id AS QID,
      ips_site_id AS database_code
FROM  wb_items_per_site  
WHERE ips_site_id IN {editing_titles_denormalized_db_codes_redirects} AND
      ips_site_page IN {editing_titles_denormalized_redirects}
""".format(**wd_vars)

qid_editing_CLEAN_redirects = wmf.mariadb.run(qid_editing_redirects_sql, "wikidatawiki")

In [22]:
# Left-join the redirect-target QIDs onto every row by (rpage_title, database_code).
# After the merge, QID_x is the QID already on the row and QID_y is the one found via the redirect target.
editing_df_qid = editing_df_qid_base.merge(qid_editing_CLEAN_redirects, how= 'left', on=['rpage_title', 'database_code'], suffixes=('_x', '_y'))

In [23]:
# Coalesce the two QID columns: keep the QID already on the row and fall back to the
# one found via the redirect target only when the row has none. (Adding the two columns,
# as was done before, yields a meaningless number whenever both are present.)
editing_df_qid['QID'] = editing_df_qid['QID_x'].fillna(editing_df_qid['QID_y'])

# 0 is treated as "no item", as before
editing_df_qid['QID'] = editing_df_qid['QID'].replace(0,np.nan)

editing_df_qid = editing_df_qid.drop(['QID_x', 'QID_y'], axis=1)

#### get sitelinks

In [25]:
# QIDs of the editing suggestions that have one (nulls excluded), as a tuple for the SQL IN clause.
clean = editing_df_qid[editing_df_qid['QID'].notnull()]
editing_qids = tuple(clean['QID'])


# Add the tuple to the .format variables used when querying.
wd_vars['editing_qids'] = editing_qids

In [26]:
#https://www.wikidata.org/wiki/Help:Sitelinks
#https://www.wikidata.org/w/api.php?action=wbgetentities&ids=Q42&props=sitelinks

# One row per wikidata item: the sites it is linked from and the number of sitelinks.
# GROUP BY is required -- without it the aggregates collapse every item into a single row.
iwl_e_r = wmf.mariadb.run("""
SELECT
  ips_item_id AS QID,
  GROUP_CONCAT(ips_site_id SEPARATOR ', ') AS iwsites,
  COUNT(ips_site_page) AS iwsitelinks
FROM wb_items_per_site
      WHERE ips_item_id IN {editing_qids}
GROUP BY QID
""".format(**wd_vars), "wikidatawiki")

In [27]:
# Work on a copy so the raw sitelink query result stays untouched.
qid_e_query_results = iwl_e_r.copy()

In [28]:
# Attach the sitelink columns (iwsites, iwsitelinks) to the editing suggestions by QID; left join keeps rows without a QID match.
editing_df_qid_iwl = editing_df_qid.merge(qid_e_query_results, how='left', on=['QID'])

In [29]:
# Row counts at each stage of the editing pipeline, as a sanity check.
editing_stage_sizes = [
    ("# of suggestions:", editing_df),
    ('missing_wd:', missing_wd),
    ('w_wd:', w_wd),
    ('qid_editing_query_results:', qid_editing_query_results),
    ("editing_articles_qid_query:", editing_articles_qid_query),
    ('editing_df_qid:', editing_df_qid),
]
for stage_label, stage_frame in editing_stage_sizes:
    print(stage_label, len(stage_frame))

# of suggestions: 8393
missing_wd: 5360
w_wd: 3033
qid_editing_query_results: 7218
editing_articles_qid_query: 5360
editing_df_qid: 8393


## TRANSLATION SUBLIST 

#### QUERY FOR QITEMS ON ENWIKI

In [30]:
# Replace underscores with spaces in the title columns; the wb_items_per_site
# lookups below are made with this form.
for title_col in ('article_suggestion', 'Topic'):
    translation_topic_rec_df_CLEAN[title_col] = translation_topic_rec_df[title_col].str.replace('_', ' ')

# Independent working copy for the QID lookups.
t = translation_topic_rec_df_CLEAN.copy(deep=True)

In [33]:
# Wikidata items via article_suggestion.
# Only rows that actually have an article_suggestion are considered.
ttrd_not_null = t[t['article_suggestion'].notnull()] #check nulls

# Of those, the rows still missing a wikidata item.
t_missing_wd = ttrd_not_null[ttrd_not_null['QID'].isnull()]

titles_denormalized_translation_CLEAN = tuple(t_missing_wd['article_suggestion'])

# QIDs for the translation suggestions, looked up by enwiki title.
qid_en_sql = """
SELECT
  ips_site_page AS article_suggestion,
  ips_item_id AS QID
FROM  wb_items_per_site  
WHERE ips_site_id = 'enwiki' 
  AND ips_site_page IN {titles_denormalized_translation_CLEAN}
""".format(titles_denormalized_translation_CLEAN=titles_denormalized_translation_CLEAN)

qid_en_CLEAN_r = wmf.mariadb.run(qid_en_sql, "wikidatawiki")

qid_t_query_results = qid_en_CLEAN_r.copy()

In [34]:
# Left-join the queried QIDs onto the translation suggestions by title.
# After the merge, QID_x is the QID from the suggestion list and QID_y is the queried one.
t_w_t_query = t.merge(qid_t_query_results, how='left', on=['article_suggestion'])

In [35]:
# Check whether the list-provided QIDs (QID_x) and the queried QIDs (QID_y) are identical column-wise.
t_w_t_query['QID_x'].equals(t_w_t_query['QID_y'])

False

In [36]:
# Wherever the query returned a QID, it overrides the value from the suggestion list.
queried_qid_found = t_w_t_query["QID_y"].notnull()
t_w_t_query.loc[queried_qid_found, 'QID_x'] = t_w_t_query["QID_y"]

# Collapse back to a single QID column.
t_w_t_query = t_w_t_query.drop(columns=['QID_y']).rename(columns={'QID_x': 'QID'})

In [37]:
# Wikidata items via Topic, for rows that are still null after the article_suggestion lookup.
# Only rows that actually have a Topic are considered.
qrwdmq_not_null = t_w_t_query[t_w_t_query['Topic'].notnull()] #check nulls

# Of those, the rows still missing a wikidata item.
qrwdmq_missing_wd = qrwdmq_not_null[qrwdmq_not_null['QID'].isnull()]

topics_denormalized_translation_CLEAN = tuple(qrwdmq_missing_wd['Topic'])

# QIDs for the translation suggestions, looked up by enwiki title equal to the Topic.
qid_en_topics_sql = """
SELECT
  ips_site_page AS Topic,
  ips_item_id AS QID
FROM  wb_items_per_site  
WHERE ips_site_id = 'enwiki' 
  AND ips_site_page IN {topics_denormalized_translation_CLEAN}
""".format(topics_denormalized_translation_CLEAN=topics_denormalized_translation_CLEAN)

qid_en_topics_CLEAN_raw = wmf.mariadb.run(qid_en_topics_sql, "wikidatawiki")
qid_en_topics_CLEAN_r = qid_en_topics_CLEAN_raw.copy()

In [38]:
# Left-join the Topic-based QIDs; QID_x is the QID so far, QID_y the one found via Topic.
query_results_w_df_missing_qids2 = t_w_t_query.merge(qid_en_topics_CLEAN_r, how='left', on=['Topic'])

# Wherever the Topic lookup returned a QID, it overrides the existing value.
topic_qid_found = query_results_w_df_missing_qids2["QID_y"].notnull()
query_results_w_df_missing_qids2.loc[topic_qid_found, 'QID_x'] = query_results_w_df_missing_qids2["QID_y"]

# Collapse back to a single QID column.
query_results_w_df_missing_qids2 = (
    query_results_w_df_missing_qids2
    .drop(columns=['QID_y'])
    .rename(columns={'QID_x': 'QID'})
)

In [None]:
#deal with redirect suggestions - part 1: get redir titles
#change titles from denormalized (spaces) to normalized (underscore) for querying the page table etc.
# `translation_df_qid_base` was previously assigned into without ever being created (NameError
# on a fresh kernel); it is now built as a copy of the suggestions with normalized titles.
translation_df_qid_base = query_results_w_df_missing_qids2.assign(
    article_suggestion=query_results_w_df_missing_qids2['article_suggestion'].str.replace(' ', '_')
)

# suggestions that still have no wikidata item (titles kept with spaces for the later merge)
t_qid_nulls = query_results_w_df_missing_qids2[query_results_w_df_missing_qids2['QID'].isnull()]

# the page table is queried with the underscore form of those same rows; null titles are left out
article_suggestions_tuple = tuple(
    translation_df_qid_base.loc[t_qid_nulls.index, 'article_suggestion'].dropna()
)

#get ids for translation articles
t_ids_raw = wmf.mariadb.run("""
    SELECT 
       p1.page_title AS article_suggestion,
       DATABASE() AS database_code,
       p1.page_id  AS page_id,
       p1.page_is_redirect AS p1_is_redirect,
       p2.page_id AS rpage_id,
       p2.page_title AS rpage_title,
       p2.page_len rpage_len,
       p2.page_is_redirect AS is_double_redirect
    FROM page AS p1 
    LEFT JOIN redirect AS rd 
        ON p1.page_id=rd.rd_from 
    LEFT JOIN page AS p2 
        ON (rd_namespace = p2.page_namespace)
            AND rd_title = p2.page_title  
    WHERE p1.page_namespace = 0
        AND p1.page_title IN {article_suggestions_tuple}
""".format(article_suggestions_tuple=article_suggestions_tuple), 'enwiki')

t_ids_r = t_ids_raw.copy(deep=True)



#deal with redirect suggestions - part 2: get wiki data items
# Back to the spaced form for both title columns (used for the wikidata lookup and the later merges).
for redirect_title_col in ('rpage_title', 'article_suggestion'):
    t_ids_r[redirect_title_col] = t_ids_r[redirect_title_col].str.replace('_', ' ')

# Only rows whose page resolved to a redirect target.
r_page_titles = t_ids_r[t_ids_r['rpage_id'].notnull()]
r_page_title_tuple = tuple(r_page_titles['rpage_title'])

# QIDs of the redirect targets, looked up by enwiki title.
qid_en_from_rtitles_sql = """
SELECT
  ips_item_id AS QID,
  ips_site_page AS rpage_title
FROM  wb_items_per_site  
WHERE ips_site_id = 'enwiki' 
  AND ips_site_page IN {r_page_title_tuple}
""".format(r_page_title_tuple=r_page_title_tuple)

qid_en_from_rtitles_raw = wmf.mariadb.run(qid_en_from_rtitles_sql, "wikidatawiki")

qid_en_from_rtitles_r = qid_en_from_rtitles_raw.copy(deep=True)

In [None]:
# Redirect info per suggested title, with the QID of the redirect target attached.
redirect_suggs_info = t_ids_r.merge(qid_en_from_rtitles_r, how='left', on=['rpage_title'])

# Join onto the suggestions that still lack a QID; QID_x is the suggestion's own (missing) QID,
# QID_y the one found via the redirect target.
redirect_suggs_info_df = t_qid_nulls.merge(redirect_suggs_info, how='left', on=['article_suggestion'])

# Coalesce instead of adding the two columns: a sum would produce a meaningless
# number if both sides ever carried a QID.
redirect_suggs_info_df['QID'] = redirect_suggs_info_df['QID_x'].fillna(redirect_suggs_info_df['QID_y'])

# 0 is treated as "no item", as before
redirect_suggs_info_df['QID'] = redirect_suggs_info_df['QID'].replace(0,np.nan)

# Drop the helper columns and keep the suggestion's own database_code.
redirect_suggs_info_df = (
    redirect_suggs_info_df
    .drop(['QID_x', 'QID_y', 'database_code_y'], axis=1)
    .rename(columns={'database_code_x':'database_code'})
)

redirect_suggs_info_df['rpage_title'] = redirect_suggs_info_df['rpage_title'].str.replace(' ', '_')

In [None]:
# Redirect suggestions for which a QID was found, reduced to the merge keys plus the QID.
redirect_qid_found = redirect_suggs_info_df['QID'].notnull()
to_merge_in = redirect_suggs_info_df.loc[redirect_qid_found, ['Topic', 'article_suggestion', 'QID']]

In [None]:
# Bring the redirect-derived QIDs back into the full translation list.
translation_df_qid = query_results_w_df_missing_qids2.merge(
    to_merge_in,
    how='left',
    on=['Topic', 'article_suggestion'],
)

# Wherever a redirect-derived QID exists, it overrides the existing value.
redirect_qid_present = translation_df_qid["QID_y"].notnull()
translation_df_qid.loc[redirect_qid_present, 'QID_x'] = translation_df_qid["QID_y"]

# Collapse back to a single QID column.
translation_df_qid = translation_df_qid.drop(columns=['QID_y']).rename(columns={'QID_x': 'QID'})

In [None]:
# Suggestions that still have no QID after the redirect lookup.
still_without_qid = redirect_suggs_info_df['QID'].isnull()
left = redirect_suggs_info_df[still_without_qid]


# Their topics, as a tuple.
left_topics = left[left['QID'].isnull()]
left_topics_tuple = tuple(left_topics['Topic'])

# How many suggestions remain unresolved.
len(left)

In [None]:
#https://www.wikidata.org/w/api.php?action=wbgetentities&sites=enwiki&titles=Steam_(software)&normalize=1

import requests

S = requests.Session()

# wbgetentities is a Wikibase action, so the request goes to the Wikidata API
# (the URL in the comment above), not to mediawiki.org.
URL = "https://www.wikidata.org/w/api.php"

PARAMS = {
    "action": "wbgetentities",
    "sites": "enwiki",
    "titles": "Steam_(software)",
    "normalize": "1",
    "format": "json"
}

# a timeout keeps the cell from hanging indefinitely if the API does not respond
R = S.get(url=URL, params=PARAMS, timeout=30)
DATA = R.json()

In [None]:
# Display the parsed JSON response of the wbgetentities request.
DATA

In [None]:
# `t_w_wd` was printed without ever being defined (NameError on a fresh kernel).
# It is computed here as the complement of `t_missing_wd`, mirroring `w_wd` in the
# editing section -- NOTE(review): confirm this is the intended definition.
t_w_wd = ttrd_not_null[ttrd_not_null['QID'].notnull()]

# Row counts at each stage of the translation pipeline, as a sanity check.
print('translation_topic_rec_df_CLEAN', len(translation_topic_rec_df_CLEAN))
print('t_missing_wd:', len(t_missing_wd))
print('t_w_wd:', len(t_w_wd))
print('queried & found:', len(qid_t_query_results))
print('queried again and found redirect suggs:', len(redirect_suggs_info_df))
print('final df len:', len(translation_df_qid))

#### Query for sitelinks

In [None]:
# QIDs of the translation suggestions that have one (nulls excluded), as a tuple for the SQL IN clause.
t_clean = translation_df_qid[translation_df_qid['QID'].notnull()]#w/o nulls

translation_qids = tuple(t_clean['QID'])

# Add the tuple to the .format variables used when querying.
wd_vars['translation_qids'] = translation_qids

In [None]:
#https://www.wikidata.org/wiki/Help:Sitelinks
#https://www.wikidata.org/w/api.php?action=wbgetentities&ids=Q42&props=sitelinks

# One row per wikidata item: the sites it is linked from and the number of sitelinks.
translation_sitelinks_sql = """
SELECT
  ips_item_id AS QID,
  GROUP_CONCAT(ips_site_id SEPARATOR ', ') AS iwsites,
  COUNT(ips_site_page) AS iwsitelinks
FROM wb_items_per_site
      WHERE ips_item_id IN {translation_qids} 
GROUP BY QID
""".format(**wd_vars)

iwl_r = wmf.mariadb.run(translation_sitelinks_sql, "wikidatawiki")

In [None]:
#merge to get df with article_suggestion, QID, iwsites, iwsitelinks
# right join: every translation suggestion is kept, including those without a sitelink row
translation_df_qid_iwl = iwl_r.merge(translation_df_qid, how="right", on=['QID'])

In [None]:
#check for duplicates -- shows rows that are exact repeats (across all columns) of an earlier row
translation_df_qid_iwl[translation_df_qid_iwl.duplicated()]

## Compile and clean

In [None]:
# Stack the translation and editing results; the frames have different columns,
# see https://stackoverflow.com/questions/59124863/how-to-concat-or-merge-three-tables-with-different-number-of-columns-in-pandas
dfs = [
    translation_df_qid_iwl,
    editing_df_qid_iwl,
]

df = pd.concat(objs=dfs, sort=True)

In [None]:
# Compare the number of raw suggestions with the number of processed rows.
for compiled_label, compiled_frame in (('full_topic_rec_df:', full_topic_rec_df), ('processed df:', df)):
    print(compiled_label, len(compiled_frame))

In [None]:
# The same article is sometimes suggested for more than one wiki, so repeats on
# article_suggestion alone are expected; repeats on the full key are not.
repeated_suggestion = df.duplicated(subset=['article_suggestion'])
repeated_full_key = df.duplicated(subset=['article_suggestion', 'database_code', 'local_encoded_title'])

duplicated_translation_recs = df[repeated_suggestion]
dupe_check = df[repeated_full_key]

print("duplicated_translation_recs", len(duplicated_translation_recs))
print("dupes", len(dupe_check))

In [None]:
# Persist the compiled suggestions (with QIDs and sitelinks) for the downstream analysis.
rec_qids_iwls_path = "../../data/processed/query_results/topic_lists/indonesia/rec_qids_iwls.csv"
df.to_csv(rec_qids_iwls_path, sep=',', encoding='utf-8', index=False)