# Get Page Ids

# load_submitted_titles_get_survival_rate

In [1]:
import requests
import json
from pandas.io.json import json_normalize
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
import os
from glob import glob
import pandas as pd

import wmfdata as wmf
from wmfdata import charting, mariadb, hive, spark
from wmfdata.utils import pct_str, pd_display_all
import urllib
from urllib.parse import unquote

%run 2b_data_handling.ipynb

You are using wmfdata v1.0.3, but v1.0.4 is available.

To update, run `pip install --upgrade git+https://github.com/neilpquinn/wmfdata.git@release`.

To see the changes, refer to https://github.com/neilpquinn/wmfdata/blob/release/CHANGELOG.md


Stored 'query_vars' (dict)
Stored 'quality_vars' (dict)


In [3]:
df = pd.read_csv("../../data/raw/articles/2019/Indonesia/compiled/articles.csv")


def _decode_title(raw_title):
    '''Percent-decode a title; null values are passed through unchanged.'''
    # unquote() raises on the float NaN that pandas uses for missing titles
    return raw_title if pd.isnull(raw_title) else unquote(raw_title)


# Normalise titles in one pass: strip trailing underscores, percent-decode,
# then turn any spaces into underscores.
df['page_title'] = (
    df['page_title']
    .str.rstrip('_')
    .apply(_decode_title)
    .replace(' ', '_', regex=True)
)

#also, drop items after a hashtag. eg: https://min.wikipedia.org/wiki/Senja_di_Jakarta#Carito_singkek

## Data pull functions

In [4]:
# Module-level accumulator: every call to get_clean_ids_hive() appends its
# per-wiki result frames here (a later cell concatenates this list).
first_edit_timestamp_articles = []


def _hive_title_literals(titles):
    '''
    Render page titles as a comma-separated list of SQL string literals.

    Null titles are skipped and duplicates are collapsed (first-seen order
    is kept). Backslashes and single quotes are backslash-escaped so a title
    such as "Qur'an" cannot terminate the literal early.
    Returns an empty string when there is no usable title.
    '''
    unique_titles = dict.fromkeys(str(t) for t in titles if pd.notnull(t))
    return ", ".join(
        "'" + t.replace("\\", "\\\\").replace("'", "\\'") + "'"
        for t in unique_titles
    )


def get_clean_ids_hive(df, snapshot='2020-07'):

    '''
    Look up page ids in `wmf.mediawiki_history` for the titles in `df`.

    One query is run per distinct `database_code` in `df`, restricted to
    namespace 0 and to the `page_title` values listed for that wiki. Each
    result has the columns `page_title`, `database_code` and `page_id`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `database_code` and `page_title`.
    snapshot : str
        `mediawiki_history` snapshot to query (defaults to '2020-07').

    Returns
    -------
    list
        The module-level list `first_edit_timestamp_articles`, to which one
        query result per wiki has been appended. The list is NOT reset
        between calls, so calling this twice accumulates results.
    '''

    # A single IN (...) query covers both the one-title and many-title cases
    # because the literal list is built explicitly rather than from a Python
    # tuple repr (which yields `('x',)` for one element and a bare `nan` for
    # missing titles).
    page_id_query = """
    SELECT
        page_title,
        wiki_db AS database_code,
        page_id
    FROM wmf.mediawiki_history
    WHERE
        snapshot = '{snapshot}'
        AND page_namespace = 0
        AND wiki_db = '{wiki_db}'
        AND page_title IN ({title_literals})
    GROUP BY
        page_title, wiki_db, page_id
    """

    for wiki in df['database_code'].dropna().unique():
        wiki_titles = df.loc[df['database_code'] == wiki, 'page_title']
        title_literals = _hive_title_literals(wiki_titles)
        if not title_literals:
            # `IN ()` is not valid SQL; nothing to look up for this wiki
            continue
        date_results = spark.run(
            page_id_query.format(
                snapshot=snapshot,
                wiki_db=wiki,
                title_literals=title_literals,
            )
        )
        first_edit_timestamp_articles.append(date_results)
    return first_edit_timestamp_articles

In [5]:
# adapted from https://github.com/nettrom/suggestbot/blob/master/tool-labs/link-rec/link-recommender.py#L208
#https://www.mediawiki.org/wiki/Manual:Redirect_table
#https://www.mediawiki.org/wiki/Manual:Page_table
#https://www.mediawiki.org/wiki/Manual:Pagelinks_table
#--rd.redirect_id -- where is this field located? in which table can it be found?

# Module-level accumulator: every call to get_clean_ids_mariadb() appends its
# per-wiki result frames here.
articles = []


def _mariadb_title_literals(titles):
    '''
    Render page titles as a comma-separated list of SQL string literals.

    Null titles are skipped and duplicates are collapsed (first-seen order
    is kept). Backslashes and single quotes are backslash-escaped so a title
    containing a quote cannot terminate the literal early.
    Returns an empty string when there is no usable title.
    '''
    unique_titles = dict.fromkeys(str(t) for t in titles if pd.notnull(t))
    return ", ".join(
        "'" + t.replace("\\", "\\\\").replace("'", "\\'") + "'"
        for t in unique_titles
    )


def get_clean_ids_mariadb(df):

    '''
    Look up page ids in each wiki's MediaWiki `page` table for the titles
    in `df`.

    One query is run per distinct `database_code` in `df`, restricted to
    namespace 0 and to the `page_title` values listed for that wiki. Each
    result has the columns `page_title`, `database_code` and `page_id`.
    The query does not filter on redirect status.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `database_code` and `page_title`.

    Returns
    -------
    list
        The module-level list `articles`, to which one query result per
        wiki has been appended. The list is NOT reset between calls.
    '''

    # A single IN (...) query serves both one and many titles. The previous
    # multi-title query had no WHERE clause (`FROM page AS p1 AND ...`),
    # which is a syntax error and also dropped the namespace filter.
    clean_id_query = '''
    SELECT
       p1.page_title AS page_title,
       DATABASE() AS database_code,
       p1.page_id  AS page_id
    FROM page AS p1
    WHERE p1.page_namespace = 0
        AND p1.page_title IN ({title_literals})
    '''

    for wiki in df['database_code'].dropna().unique():
        print('***')
        print(wiki)

        wiki_titles = df.loc[df['database_code'] == wiki, 'page_title']
        title_literals = _mariadb_title_literals(wiki_titles)
        if not title_literals:
            # `IN ()` is not valid SQL; nothing to look up for this wiki
            continue

        redirects_r = mariadb.run(
            clean_id_query.format(title_literals=title_literals), wiki
        )
        articles.append(redirects_r)

    return articles

## Step 1: Read df and query using Hive

In [6]:
# Run the per-wiki page-id lookup. The function appends each wiki's result to
# the module-level list `first_edit_timestamp_articles`; the trailing `;`
# suppresses the cell's output.
get_clean_ids_hive(df);

In [7]:
# Stack the per-wiki query results into a single frame.
results = pd.concat(first_edit_timestamp_articles)

In [8]:
# Columns that identify one submitted article.
merge_keys = ["page_title", 'database_code']

# Keep one page_id per (title, wiki): after the ascending sort, `first()`
# picks the first non-null page_id of each group.
results_deduped_prep = (
    results
    .sort_values(by=merge_keys + ['page_id'], na_position='last')
    .groupby(merge_keys)["page_id"]
    .first()
    .reset_index()
)
results_deduped = results_deduped_prep.drop_duplicates(subset=merge_keys, keep='first')

# Right join so every submitted article in `df` is kept, with a null page_id
# where the query returned no match.
articles_pageids = results_deduped.merge(df, how="right", on=merge_keys)

In [None]:
# Submitted articles for which no page_id was found.
missing = articles_pageids.loc[articles_pageids['page_id'].isna()]
missing

In [11]:
# Manual review and data entry for titles whose page_id came back null from
# the query: (database_code, page_title, page_id found by hand).
manual_page_ids = [
    ('minwiki', 'Senja_di_Jakarta#Carito_singkek', 326575),
    ('jvwiki', 'Surabaya_Samator#Pemain_Surabaya_Samator_2019', 163855),
]

for wiki_code, title, manual_page_id in manual_page_ids:
    # NaN never compares equal to anything (not even itself), so the null
    # test has to be `.isna()`; `page_id == np.nan` matches no row at all.
    unresolved = (
        (articles_pageids['database_code'] == wiki_code)
        & (articles_pageids['page_title'] == title)
        & articles_pageids['page_id'].isna()
    )
    if not unresolved.any():
        # Also reached on a re-run, once the row has already been patched.
        print('No unresolved row for {}: {}'.format(wiki_code, title))
    # Rows are addressed by wiki + title rather than by hard-coded row labels,
    # which depend on the merge order and would silently add a new row if the
    # label did not exist.
    articles_pageids.loc[unresolved, 'page_id'] = manual_page_id

In [None]:
# Check: show the row(s) for the manually reviewed minwiki title.
articles_pageids[articles_pageids['page_title'].eq('Senja_di_Jakarta#Carito_singkek')]

In [13]:
# Write the title -> page_id mapping out for later notebooks.
articles_pageids.to_csv(
    "../../data/processed/query_results/content_quality/indonesia/articles_pageids_CLEAN.csv",
    sep=',',
    encoding='utf-8',
    index=False,
)

## Step 2: Query MariaDB for the remaining page IDs

In [None]:
# Look up the titles that had no page_id after the Hive step.
# NOTE(review): this previously passed `articles_df`, which no cell in this
# notebook defines; `missing` (the rows with a null page_id) is presumably
# what was meant -- confirm.
get_clean_ids_mariadb(missing);

In [None]:
# Stack the per-wiki MariaDB results collected in the module-level list
# `articles`. pd.concat() needs an iterable of frames; passing the single
# DataFrame `missing` raises a TypeError.
# NOTE(review): presumably the MariaDB results were the intended input -- confirm.
indonesia_articles = pd.concat(articles)