# GLOW  - Content Quality & Details

# Table of Contents  <a class="anchor" id="toc"></a>

* [Content](#top)
    1. Identify and filter out redirects
    [num editors/article](#nea)
    2. translation
    3. [pagelen & pagelen relative (composition)](#pagelen)
    4. wikidata item
    5. article creation date
    6. [pageviews(use/utility)](#pv)
    7. filter for new articles
    8. edits & editors per article
    9. edits & timestamps
    10. editors per article
    11. [talk page activity](#tpa) 
    12. article watch count
    13. [revert rate(use/utility)](#rr)
    14. [links(importance/integration)](#ol)   

In [189]:
# import wmfdata as wmf
import wmfdata as wmf
from wmfdata import charting, mariadb, hive, spark
from wmfdata.utils import pct_str, pd_display_all

import requests
import re
from urllib import request
import json

import logging
import gc
import weakref
from functools import reduce
from pathlib import Path
import requests
import pprint

#import jupyter_contrib_nbextensions
import pandas as pd
import numpy as np

import time
import datetime as dt 
from datetime import datetime, timedelta, date
import dateutil

#%load_ext sql_magic

In [190]:
# Before running: make sure the proper country and contest dates are selected in the content quality section of the data handling notebook
%run 2b_data_handling.ipynb
#%store -r IN_median_vi
%store -r median_values
#%run ./data_collection/collecting_articles.ipynb

Stored 'query_vars' (dict)
Stored 'quality_vars' (dict)


# 3. Content Quality<a class="anchor" id="stage1"></a>
[Back to Table of Contents](#toc)

### Get Data India

### Get Data Indonesia

In [191]:
# Load the Indonesia article / page-id CSV used as the basis for all queries below.
df = pd.read_csv(
    "../../data/processed/query_results/content_quality/indonesia/articles_pageids_CLEAN.csv",
    sep=",",
    encoding="utf-8",
)

In [192]:
# Cast page ids to integers and replace spaces with underscores in the file names.
df = df.assign(
    page_id=df['page_id'].astype(int),
    filename=df['filename'].replace(' ', '_', regex=True),
)

### set variables which will be used in the notebook for querying

In [5]:
# adapted from https://github.com/nettrom/suggestbot/blob/master/tool-labs/link-rec/link-recommender.py#L208
#https://www.mediawiki.org/wiki/Manual:Redirect_table
#https://www.mediawiki.org/wiki/Manual:Page_table
#https://www.mediawiki.org/wiki/Manual:Pagelinks_table
#--rd.redirect_id -- where is this field located? in which table can it be found?

page_lens_redir_articles = []

def get_page_lens_redirects_mariadb(df):
    '''
    Look up page length and redirect details for the articles in `df`.

    For every distinct `database_code` in `df`, the `page_id` values of that
    wiki are sent to `mariadb.run` in a single query against the `page` and
    `redirect` tables (namespace 0 only). Each result row carries the page's
    own title, length and redirect flag and, through the LEFT JOINs, the id,
    title, length and redirect flag of the redirect target when one exists.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `database_code` and `page_id`.

    Returns
    -------
    list
        The module-level `page_lens_redir_articles` list with one query
        result per wiki (the results are later passed to `pd.concat`).
        The list is emptied at the start of each call so that re-running
        the calling cell does not stack duplicate rows.
    '''

    redirect_info_query = '''
    SELECT
       p1.page_title AS page_title,
       DATABASE() AS database_code,
       p1.page_id  AS page_id,
       p1.page_len AS page_len,
       p1.page_is_redirect AS p1_is_redirect,
       p2.page_id AS rpage_id,
       p2.page_title AS rpage_title,
       p2.page_len rpage_len,
       p2.page_is_redirect AS is_double_redirect
    FROM page AS p1
    LEFT JOIN redirect AS rd
        ON p1.page_id=rd.rd_from
    LEFT JOIN page AS p2
        ON (rd_namespace = p2.page_namespace)
            AND rd_title = p2.page_title
    WHERE p1.page_namespace = 0
        AND p1.page_id IN {ids}
    '''

    # Mutate in place so the name other cells already reference keeps pointing
    # at the list that holds the fresh results.
    page_lens_redir_articles.clear()

    for wiki in df['database_code'].unique():
        wiki_ids = df.loc[df['database_code'] == wiki, 'page_id'].dropna().astype(int).unique()
        if len(wiki_ids) == 0:
            # "IN ()" is not valid SQL; nothing to look up for this wiki.
            continue
        # Build the IN list by hand: formatting a Python tuple yields "(5,)" for
        # a single id, which is why a second "= {ids}" query used to be needed.
        id_list = '(' + ', '.join(str(page_id) for page_id in wiki_ids) + ')'
        page_lens_redir_articles.append(
            mariadb.run(redirect_info_query.format(ids=id_list), wiki)
        )

    return page_lens_redir_articles



#""".format(start="2017-06", end="2018-06"), "wikishared")
#""".format(pa_articles_2018=pa_articles_2018)
# MIN(p1.page_touched) AS last_modified,

In [6]:
# Run the per-wiki redirect / page-length queries and stack the returned results.
redirects_r = pd.concat(get_page_lens_redirects_mariadb(df))

### Relative length

In [193]:
# Right join keeps every row of df; the per-wiki median values are then attached by database_code.
articles = (
    redirects_r
    .merge(df, how='right', on=['database_code', 'page_id', 'page_title'])
    .merge(median_values, how='left', on='database_code')
)

In [194]:
# Page length as a share of the FA_median_len column, capped at 1; the helper column is dropped afterwards.
articles['relative_page_len'] = (articles['page_len'] / articles['FA_median_len']).clip(upper=1)
articles = articles.drop(columns='FA_median_len')

## Translation

In [10]:
#mediawiki_page_history - https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Edits/Mediawiki_page_history
#fyi you need to truncate. Recommend 30 day period: https://meta.wikimedia.org/wiki/Research:Autoconfirmed_article_creation_trial#H14:_The_survival_rate_of_newly_created_articles_by_autoconfirmed_users_will_remain_stable
# Article deletion rates and article timespan - http://files.grouplens.org/papers/lam_group2009_wikipedia-longer-tail.pdf

#note - archive table in Mariadb holds revisions for pages that have been deleted

#snapshot = "{MWH_SNAPSHOT}"
#AND event_timestamp >="{contest_start}"
#AND event_timestamp <"{contest_end}"
        
at_edit_articles = []

def get_translation_edits_hive(df, snapshot='2020-07', start='2019-11-01', end='2020-07-01'):

    '''
    Collect revisions tagged `contenttranslation` for the articles in `df`.

    For every distinct `database_code` in `df`, one query is run through
    `spark.run` against `wmf.mediawiki_history`, restricted to namespace 0
    revision events in the `[start, end)` window that are neither identity
    reverted nor deleted together with their page.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `database_code` and `page_id`.
    snapshot : str
        Value compared against the `snapshot` column of the history table.
    start, end : str
        Inclusive lower / exclusive upper bound for `event_timestamp`.

    Returns
    -------
    list
        The module-level `at_edit_articles` list with one query result per
        wiki (later passed to `pd.concat`). It is emptied at the start of
        each call so that re-running the calling cell does not stack
        duplicate rows.
    '''

    #https://phabricator.wikimedia.org/T201539
    #get list of articles that have an edit associated with the content translation tool

    at_edits_query = """
    SELECT
        page_id,
        revision_tags AS at_edits,
        wiki_db AS database_code
    FROM wmf.mediawiki_history
    WHERE
        snapshot = '{snapshot}'
        AND event_timestamp >= '{start}'
        AND event_timestamp < '{end}'
        AND page_namespace = 0
        AND event_entity = 'revision'
        AND revision_is_identity_reverted = False
        AND revision_is_deleted_by_page_deletion = False
        AND array_contains(revision_tags, "contenttranslation")
        AND wiki_db = '{wiki_db}'
        AND page_id IN {ids}
    GROUP BY
        page_id, revision_tags, wiki_db
    """

    at_edit_articles.clear()

    for wiki in df['database_code'].unique():
        wiki_ids = df.loc[df['database_code'] == wiki, 'page_id'].dropna().astype(int).unique()
        if len(wiki_ids) == 0:
            # "IN ()" is not valid SQL; nothing to look up for this wiki.
            continue
        # Build the IN list by hand: formatting a Python tuple yields "(7,)"
        # for a wiki with a single article, which is not valid SQL.
        id_list = '(' + ', '.join(str(page_id) for page_id in wiki_ids) + ')'
        translation_edits = spark.run(
            at_edits_query.format(
                ids=id_list, wiki_db=wiki, snapshot=snapshot, start=start, end=end
            )
        )
        at_edit_articles.append(translation_edits)

    return at_edit_articles


In [11]:
at_create_articles = []

def get_translation_create_hive(df, snapshot='2020-07', start='2019-11-01', end='2020-07-01'):

    '''
    Collect page-creation events tagged `contenttranslation` for the articles in `df`.

    For every distinct `database_code` in `df`, one query is run through
    `spark.run` against `wmf.mediawiki_history`, restricted to namespace 0
    page `create` events in the `[start, end)` window.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `database_code` and `page_id`.
    snapshot : str
        Value compared against the `snapshot` column of the history table.
    start, end : str
        Inclusive lower / exclusive upper bound for `event_timestamp`.

    Returns
    -------
    list
        The module-level `at_create_articles` list with one query result
        per wiki (later passed to `pd.concat`). It is emptied at the start
        of each call so that re-running the calling cell does not stack
        duplicate rows.
    '''

    #https://phabricator.wikimedia.org/T201539
    #get list of articles that were created in step with use of the article translation tool
    # NOTE(review): the revision_* predicates below are applied to
    # event_entity = 'page' rows; if those columns are only populated for
    # revision events this filter may match nothing — confirm against the
    # mediawiki_history schema.
    at_create_query = """
    SELECT
        page_id,
        revision_tags AS at_create,
        wiki_db AS database_code
    FROM wmf.mediawiki_history
    WHERE
        snapshot = '{snapshot}'
        AND event_timestamp >= '{start}'
        AND event_timestamp < '{end}'
        AND page_namespace = 0
        AND event_entity = 'page'
        AND event_type = 'create'
        AND revision_is_identity_reverted = False
        AND revision_is_deleted_by_page_deletion = False
        AND array_contains(revision_tags, "contenttranslation")
        AND wiki_db = '{wiki_db}'
        AND page_id IN {ids}
    GROUP BY
        page_id, revision_tags, wiki_db
    """

    at_create_articles.clear()

    for wiki in df['database_code'].unique():
        wiki_ids = df.loc[df['database_code'] == wiki, 'page_id'].dropna().astype(int).unique()
        if len(wiki_ids) == 0:
            # "IN ()" is not valid SQL; nothing to look up for this wiki.
            continue
        # Build the IN list by hand: formatting a Python tuple yields "(7,)"
        # for a wiki with a single article, which is not valid SQL.
        id_list = '(' + ', '.join(str(page_id) for page_id in wiki_ids) + ')'
        translation_creates = spark.run(
            at_create_query.format(
                ids=id_list, wiki_db=wiki, snapshot=snapshot, start=start, end=end
            )
        )
        at_create_articles.append(translation_creates)

    return at_create_articles

In [12]:
# Query the translation-tagged revisions per wiki and stack the returned results.
at_edits = pd.concat(get_translation_edits_hive(articles))

In [13]:
# Query the translation-tagged page creations per wiki and stack the returned results.
at_create = pd.concat(get_translation_create_hive(articles))

In [196]:
# Keep every translation-edit row; missing creation info becomes False.
translation = at_edits.merge(at_create, on=['page_id', 'database_code'], how="left").fillna(False)

In [197]:
# Flag rows that carry any tag data in at_edits. The previous "> 1" test
# skipped rows whose tag list holds exactly one tag, even though every
# at_edits row comes from a revision tagged "contenttranslation".
translation['translation_tool'] = translation['at_edits'].str.len() > 0

In [198]:
# Attach the translation columns to the article table; articles without translation rows keep NaN.
articles = articles.merge(translation, on=['page_id', 'database_code'], how="left")

### wikidata Q item

In [24]:
#https://www.mediawiki.org/wiki/Wikibase/Schema/wb_items_per_site
#https://www.mediawiki.org/wiki/Manual:Page_table
#wb_items_per_site site:quarry.wmflabs.org

In [200]:
# Titles switch from the normalized form (underscores) to the denormalized form (spaces)
# for querying the wikidata table etc.
articles = articles.assign(
    page_title=articles['page_title'].str.replace('_', ' '),
    rpage_title=articles['rpage_title'].str.replace('_', ' '),
)

# Only the rows that have a redirect target id.
rtitles = articles[articles['rpage_id'].notnull()]

# Tuples of titles and wiki codes, formatted straight into the wikidata item queries.
editing_titles_denormalized_CLEAN = tuple(articles['page_title'])
editing_titles_denormalized_database_codes_CLEAN = tuple(articles['database_code'])

r_editing_titles_denormalized_CLEAN = tuple(rtitles['rpage_title'])
r_editing_titles_denormalized_database_codes_CLEAN = tuple(rtitles['database_code'])

# Values consumed through str.format(**wd_vars_3b) when querying.
wd_vars_3b = {
    'editing_titles_denormalized': editing_titles_denormalized_CLEAN,
    'editing_titles_denormalized_db_codes': editing_titles_denormalized_database_codes_CLEAN,
    'r_editing_titles_denormalized': r_editing_titles_denormalized_CLEAN,
    'r_editing_titles_denormalized_db_codes': r_editing_titles_denormalized_database_codes_CLEAN,
}

In [19]:
# Look up the wikidata item id for each article title / wiki code in wb_items_per_site.
qid_simple_r = wmf.mariadb.run(
    """
SELECT
      ips_site_page AS page_title,
      ips_item_id AS QID,
      ips_site_id AS database_code
FROM  wb_items_per_site  
WHERE ips_site_id IN {editing_titles_denormalized_db_codes} AND
      ips_site_page IN {editing_titles_denormalized}
""".format(**wd_vars_3b),
    "wikidatawiki",
)

In [20]:
# Same lookup for the redirect-target titles.
r_qid_simple_r = wmf.mariadb.run(
    """
SELECT
      ips_site_page AS rpage_title,
      ips_item_id AS QID,
      ips_site_id AS database_code
FROM  wb_items_per_site  
WHERE ips_site_id IN {r_editing_titles_denormalized_db_codes} AND
      ips_site_page IN {r_editing_titles_denormalized}
""".format(**wd_vars_3b),
    "wikidatawiki",
)

In [201]:
# Attach the item id found via the article title (QID_x) and via the redirect target title (QID_y).
articles = (
    articles
    .merge(qid_simple_r, how="left", on=['page_title', 'database_code'])
    .merge(r_qid_simple_r, how="left", on=['rpage_title', 'database_code'])
)
# Where the article title gave no item id, fall back to the redirect target's id.
articles['QID_x'] = articles['QID_x'].fillna(articles['QID_y'])
articles = articles.drop(columns=['QID_y']).rename(columns={'QID_x': 'QID'})

In [202]:
# Back from the denormalized form (spaces) to the normalized form (underscores).
articles = articles.assign(
    page_title=articles['page_title'].str.replace(' ', '_'),
    rpage_title=articles['rpage_title'].str.replace(' ', '_'),
)

# iwsitelinks

##### create qid list from column for pulling categories (use qid_simple_r from above)

In [24]:
#https://www.wikidata.org/wiki/Help:Sitelinks
#https://www.wikidata.org/w/api.php?action=wbgetentities&ids=Q42&props=sitelinks

ips_sites_articles = []
qids = tuple(list(articles['QID']))

def get_ips_sites_mariadb(df):

    '''
    Count the sitelinks of the wikidata items listed in `df`.

    For every distinct `database_code` in `df`, the `QID` values of that
    wiki are sent in one query to `wikidatawiki`. The query keeps the items
    that have a sitelink for that wiki and returns, per item, the
    concatenated site ids (`iwsites`) and the number of sitelinks
    (`iwsitelinks`).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `database_code` and `QID`.

    Returns
    -------
    list
        The module-level `ips_sites_articles` list with one query result
        per wiki (later passed to `pd.concat`). It is emptied at the start
        of each call so that re-running the calling cell does not stack
        duplicate rows.
    '''

    # Rows are grouped by the item id itself. The earlier version joined the
    # item id to page.page_id and grouped by that page_id, so every item
    # without a page of the same numeric id fell into one shared NULL group.
    # NOTE(review): GROUP_CONCAT output is subject to the server's
    # group_concat_max_len, so `iwsites` may be truncated — confirm.
    ips_sites_query = """
    SELECT
      linked_item.ips_item_id AS QID,
      GROUP_CONCAT(ips_site_id SEPARATOR ', ') AS iwsites,
      COUNT(ips_site_page) AS iwsitelinks
    FROM (
          SELECT ips_item_id
          FROM wb_items_per_site
          WHERE ips_site_id = '{wiki_db}'
          AND ips_item_id IN {qids}
        ) AS linked_item
    LEFT JOIN wb_items_per_site
      ON linked_item.ips_item_id = wb_items_per_site.ips_item_id
    GROUP BY linked_item.ips_item_id
    """

    ips_sites_articles.clear()

    for wiki in df['database_code'].unique():
        wiki_qids = df.loc[df['database_code'] == wiki, 'QID'].dropna().astype(int).unique()
        if len(wiki_qids) == 0:
            # "IN ()" is not valid SQL; nothing to look up for this wiki.
            continue
        # Build the IN list by hand: formatting a Python tuple yields "(42,)"
        # for a single item, which is not valid SQL.
        qid_list = '(' + ', '.join(str(qid) for qid in wiki_qids) + ')'
        ips_sites_query_results = wmf.mariadb.run(
            ips_sites_query.format(qids=qid_list, wiki_db=wiki), "wikidatawiki"
        )
        ips_sites_articles.append(ips_sites_query_results)

    return ips_sites_articles

In [25]:
# Run the sitelink queries for the items found above and stack the returned results.
ips_sites = pd.concat(get_ips_sites_mariadb(qid_simple_r))

In [203]:
# Attach the sitelink count and site list to each article through its item id.
articles = articles.merge(
    ips_sites.loc[:, ['iwsitelinks', 'iwsites', 'QID']],
    how="left",
    on='QID',
)

In [204]:
# Assign the filled column back. Calling fillna(inplace=True) on a column
# selection is chained assignment and may only modify a temporary copy,
# leaving the missing QIDs in `articles` untouched.
articles['QID'] = articles['QID'].fillna(0)

#### clean QID 

In [205]:
# Store the item ids as integers.
articles = articles.astype({'QID': int})

In [206]:
# Keep the first row for each (title, wiki, page id, item id) combination.
articles = articles.drop_duplicates(
    subset=['page_title', 'database_code', 'page_id', 'QID'],
)

# links <a class="anchor" id="links"></a>
[Back to Table of Contents](#toc)

In [29]:
# pagelinks and redirects examples
#https://github.com/nettrom/suggestbot/blob/master/tool-labs/link-rec/inlink-table-updater.py#L227

# incoming pagelinks: links from within the same wiki
#https://www.mediawiki.org/wiki/Manual:Pagelinks_table
#resource: https://github.com/nettrom/suggestbot/blob/master/tool-labs/link-rec/inlink-table-updater.py#L229
#	pl_from, pl_from_namespace #anchor
#	pl_namespace	pl_title #target

#linking
#https://en.wikipedia.org/wiki/Special:WhatLinksHere/Wikipedia:Manual_of_Style/Linking
#https://en.wikipedia.org/wiki/Wikipedia:Manual_of_Style/Linking#General_principles
#https://www.mediawiki.org/wiki/API:Links

#backlinks+
#https://dispenser.info.tm/~dispenser/cgi-bin/backlinkscount.py (backlinks)
#https://github.com/wikimedia/mediawiki-api-demos/blob/master/python/get_backlinks.py

#tables
#pagelinks contains links to other pages on the same wiki...provide cohesion and utility
#https://www.mediawiki.org/wiki/Manual:Pagelinks_table (internal links in the same wiki, from the page)

#externallinks contains links to elsewhere, outside of all wikis
#https://www.mediawiki.org/wiki/Manual:Externallinks_table (external links, from the page)

#interwikilinks links an article in one language to the same article in another language. For most articles these are stored on Wikidata. 
#https://en.wikipedia.org/wiki/Help:Interwiki_linking
#https://www.mediawiki.org/wiki/Manual:Iwlinks_table

#langlinks links that point to a page on another wiki (e.g. [[mw:Product Analytics]] links to the PA team’s page on MediaWiki-wiki).
#https://en.wikipedia.org/wiki/Help:Interlanguage_links#Local_links
#https://www.mediawiki.org/wiki/Manual:Langlinks_table

In [30]:
#TODO Handle entries where we didn't get page_id at the top (below queries rely on page_id)


oel_articles = []
ipl_articles = []
opl_articles = []

def get_links_info(df):

    '''
    Collect link data for the articles in `df`, one wiki at a time.

    For every distinct `database_code` in `df`, three queries are run
    through `wmf.mariadb.run` with that wiki's `page_id` values:

    * outgoing external links (`externallinks` rows whose `el_from` is one
      of the pages),
    * incoming page links (`pagelinks` rows in namespace 0 whose target
      is one of the pages),
    * outgoing page links (`pagelinks` rows whose `pl_from` is one of the
      pages, with the redirect target of each linked page when it has one).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `database_code` and `page_id`.

    Returns
    -------
    tuple of list
        The module-level lists `(oel_articles, ipl_articles, opl_articles)`
        with one query result per wiki each. They are emptied at the start
        of each call so that re-running the calling cell does not stack
        duplicate rows.
    '''

    opl_r = """
    SELECT
        DATABASE() AS database_code,
        pl.pl_from,
        link.page_id AS plpage,
        link.page_title AS plpage_title,
        redir.page_id AS rpage,
        redir.page_title AS rpage_title,
        redir.page_is_redirect AS is_double_redirect
    FROM pagelinks AS pl
    JOIN page AS link
        ON (pl.pl_namespace=link.page_namespace
        AND pl.pl_title=link.page_title)
    LEFT JOIN redirect AS rd
        ON link.page_id=rd.rd_from
    LEFT JOIN page AS redir
        ON (rd.rd_namespace=redir.page_namespace
        AND rd.rd_title=redir.page_title)
    WHERE pl.pl_from IN {ids}
    """

    ipl_r = """
    SELECT
        DATABASE() AS database_code,
        link.page_id AS page_id,
        pl.pl_title AS page_title,
        pl.pl_from AS in_pagelinks
    FROM pagelinks AS pl
    JOIN page AS link
        ON (pl.pl_namespace=link.page_namespace
        AND pl.pl_title=link.page_title)
    LEFT JOIN redirect AS rd
        ON link.page_id=rd.rd_from
    LEFT JOIN page AS redir
        ON (rd.rd_namespace=redir.page_namespace
        AND rd.rd_title=redir.page_title)
    WHERE pl.pl_namespace=0
        AND link.page_id IN {ids}
    """

    # database_code is included like in the two pagelinks queries, so the
    # external-link rows of different wikis can be told apart.
    oel_r = """
    SELECT DATABASE() AS database_code,
    el_from AS page_id,
    el_to AS oel_links
    FROM externallinks
    WHERE el_from IN {ids}
    """

    for collected in (oel_articles, ipl_articles, opl_articles):
        collected.clear()

    for wiki in df['database_code'].unique():
        wiki_ids = df.loc[df['database_code'] == wiki, 'page_id'].dropna().astype(int).unique()
        if len(wiki_ids) == 0:
            # "IN ()" is not valid SQL; nothing to look up for this wiki.
            continue
        # Build the IN list by hand: formatting a Python tuple yields "(5,)"
        # for a wiki with a single article, which is not valid SQL.
        id_list = '(' + ', '.join(str(page_id) for page_id in wiki_ids) + ')'

        oel_articles.append(wmf.mariadb.run(oel_r.format(ids=id_list), wiki))
        ipl_articles.append(wmf.mariadb.run(ipl_r.format(ids=id_list), wiki))
        opl_articles.append(wmf.mariadb.run(opl_r.format(ids=id_list), wiki))

    return (oel_articles, ipl_articles, opl_articles)

In [31]:
# Fills the module-level oel_articles, ipl_articles and opl_articles lists; the trailing ';' hides the returned tuple.
get_links_info(articles);

### pagelinks <a class="anchor" id="pagelinks"></a>
[Back to Table of Contents](#toc)

In [32]:
# pagelinks: linking to articles within the same wiki

In [208]:
# Stack the per-wiki outgoing-pagelink results into one frame.
opl = pd.concat(opl_articles)

In [209]:
# True when at least one row has is_double_redirect equal to 1.
opl['is_double_redirect'].eq(1).any()

False

In [210]:
# Show the rows where the same (pl_from, plpage) pair has already appeared,
# i.e. repeated instances of one link within a single page.
opl.loc[opl.duplicated(subset=['pl_from', 'plpage'])]

Unnamed: 0,database_code,pl_from,plpage,plpage_title,rpage,rpage_title,is_double_redirect


In [211]:
# Only the wiki code and the source page id are needed for counting.
opl = opl.loc[:, ['database_code', 'pl_from']]

In [212]:
# With no repeated targets per source page, the number of rows per (source page, wiki)
# is the number of outgoing page links.
opls = (
    opl.groupby(['pl_from', 'database_code'])
    .size()
    .reset_index(name='oplinks')
    .rename(columns={'pl_from': 'page_id'})
)

In [213]:
# Attach the outgoing page link counts; articles without any keep NaN.
articles = pd.merge(articles, opls, how="left", on=['page_id', 'database_code'])

### external links <a class="anchor" id="extlinks"></a>
[Back to Table of Contents](#toc)

In [214]:
# Stack the per-wiki external-link results into one frame.
oel = pd.concat(oel_articles)

In [215]:
# Number of distinct external link targets per page id.
oextlinks = oel.groupby('page_id')['oel_links'].nunique().reset_index()

In [216]:
# NOTE(review): this merge keys on page_id alone, unlike the other link merges, which also use
# database_code; presumably page ids can repeat across wikis — confirm.
articles = articles.merge(oextlinks, on=['page_id'], how="left")

### Incoming pagelinks <a class="anchor" id="inpagelinks"></a>
[Back to Table of Contents](#toc)

### NOTES FROM MORTEN:
Use backlinks query from here: https://github.com/nettrom/suggestbot/blob/master/tool-labs/link-rec/inlink-table-updater.py#L229 It already uses a list of page IDs as the basis for the query, but does limit links to within the article namespace.

Then I'd just remove `AS ilc_page_id` and change `AS ilc_numlinks` to `AS numlinks` or something. That query is pretty optimized, and also counts inlinks coming in through redirects. 

In [174]:
# Stack the per-wiki incoming-pagelink results into one frame.
ipl = pd.concat(ipl_articles)

In [217]:
# Number of distinct linking pages per (page id, wiki).
incoming_pagelinks = (
    ipl.groupby(['page_id', 'database_code'])['in_pagelinks']
    .nunique()
    .reset_index(name='ipl_count')
)

In [218]:
# Attach the incoming page link counts; articles without any keep NaN.
articles = pd.merge(articles, incoming_pagelinks, how="left", on=['page_id', 'database_code'])

### Article's talk page activity

In [219]:
# https://meta.wikimedia.org/wiki/Research:Usage_of_talk_pages/2019-11-11#arwiki
#https://meta.wikimedia.org/wiki/Research:Newsletter/2011/August
#http://jodischneider.com/pubs/sac2011.pdf
#https://meta.wikimedia.org/wiki/Research:Newsletter/2015/May#Editors_who_use_user_talk_pages_are_more_involved_in_high-quality_articles
#https://meta.wikimedia.org/wiki/Research:Newsletter/2017/May#cite_note-9
#https://www.opensym.org/wp-content/uploads/2018/07/OpenSym2018_paper_14.pdf
#https://phabricator.wikimedia.org/T214935 -- on talk page click through rates
#SQL:
#https://github.com/wikimedia-research/Talkcicity/blob/master/retrieve_talkpage_data.R
#https://github.com/x-tools/xtools/blob/master/src/AppBundle/Repository/ArticleInfoRepository.php#L221-L226
#https://github.com/wikimedia-research/2019-10-talk-pages-baseline-metrics/blob/master/2019-10-talk-page-contributors-analysis.ipynb


In [220]:
#https://www.wikidata.org/wiki/Help:Sitelinks
#https://www.wikidata.org/w/api.php?action=wbgetentities&ids=Q42&props=sitelinks

talk_page_articles = []
article_watch_count_articles = []
rr_replicas_articles = []

def get_talk_watch_rr_mariadb(df):

    '''
    Collect talk page activity, watch counts and revert counts for the
    articles in `df`, one wiki at a time.

    For every distinct `database_code` in `df`, three queries are run
    through `wmf.mariadb.run` with that wiki's `page_id` values:

    * `talk_page_edits`: revisions of the namespace 1 page that shares the
      article's title, skipping revisions with bit 4 of `rev_deleted` set,
    * `watch_count`: `watchlist` rows matching the article's title and
      namespace,
    * `revertrate`: distinct revisions of the page tagged `mw-rollback` or
      `mw-undo` (a count, despite the column name).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `database_code` and `page_id`.

    Returns
    -------
    tuple of list
        The module-level lists `(talk_page_articles,
        article_watch_count_articles, rr_replicas_articles)` with one query
        result per wiki each. They are emptied at the start of each call so
        that re-running the calling cell does not stack duplicate rows.
    '''

    # https://phabricator.wikimedia.org/source/mediawiki/browse/master/maintenance/tables.sql
    article_watch_count_query = """
    SELECT
           page_id,
           DATABASE() AS database_code,
           COUNT(*) AS watch_count
    FROM watchlist
    JOIN page
         ON (wl_title = page_title AND wl_namespace = page_namespace)
    WHERE page_namespace = 0
        AND page_id IN {ids}
    GROUP BY page_id
    """

    talk_page_edits_query = """
    SELECT
        DATABASE() AS database_code,
        pa.page_id,
        SUM(IF(rev_id IS NOT NULL, 1, 0)) AS talk_page_edits
    FROM page pa
    LEFT JOIN page pt
        ON pa.page_title = pt.page_title
        AND pt.page_namespace = 1
    LEFT JOIN revision
        ON pt.page_id = rev_page
    WHERE pa.page_id IN {ids}
        AND pt.page_namespace = 1
        AND (rev_deleted & 4) = 0
    GROUP BY pa.page_id
    """

    # see the revert rate (rr) notebook
    rr_replicas_query = """
    SELECT page_id,
           DATABASE() AS database_code,
           COUNT(DISTINCT rev_id) AS revertrate
    FROM revision
     JOIN change_tag
         ON ct_rev_id = rev_id
     JOIN change_tag_def
         ON ct_id = ctd_id
     JOIN page
         ON rev_page = page_id
    WHERE ctd_name IN ('mw-rollback', 'mw-undo')
        AND page_id IN {ids}
    GROUP BY page_id
    """

    for collected in (talk_page_articles, article_watch_count_articles, rr_replicas_articles):
        collected.clear()

    for wiki in df['database_code'].unique():
        wiki_ids = df.loc[df['database_code'] == wiki, 'page_id'].dropna().astype(int).unique()
        if len(wiki_ids) == 0:
            # "IN ()" is not valid SQL; nothing to look up for this wiki.
            continue
        # Build the IN list by hand: formatting a Python tuple yields "(5,)"
        # for a wiki with a single article, which is not valid SQL.
        id_list = '(' + ', '.join(str(page_id) for page_id in wiki_ids) + ')'

        talk_page_articles.append(
            wmf.mariadb.run(talk_page_edits_query.format(ids=id_list), wiki)
        )
        article_watch_count_articles.append(
            wmf.mariadb.run(article_watch_count_query.format(ids=id_list), wiki)
        )
        rr_replicas_articles.append(
            wmf.mariadb.run(rr_replicas_query.format(ids=id_list), wiki)
        )

    return (talk_page_articles, article_watch_count_articles, rr_replicas_articles)

In [221]:
# Fills the module-level talk_page_articles, article_watch_count_articles and rr_replicas_articles lists; the trailing ';' hides the returned tuple.
get_talk_watch_rr_mariadb(articles);

In [222]:
# Stack the per-wiki results of each of the three queries.
talk_page_edits, article_watch_count, rr_replicas = (
    pd.concat(per_wiki_results)
    for per_wiki_results in (talk_page_articles, article_watch_count_articles, rr_replicas_articles)
)

# Attach talk page edits, watch counts and revert counts to the article table, in that order.
articles = (
    articles
    .merge(talk_page_edits, how="left", on=['page_id', 'database_code'])
    .merge(article_watch_count, how="left", on=['page_id', 'database_code'])
    .merge(rr_replicas, how="left", on=['page_id', 'database_code'])
)

### Editors

In [80]:
tuepa_articles = []
tunbre_articles = []
tueme_articles = []
tuipe_articles = []

def get_editor_data_mariadb(df):

    '''
    Collect distinct-editor counts for the articles in `df`, one wiki at a time.

    For every distinct `database_code` in `df`, four queries are run
    through `wmf.mariadb.run` with that wiki's `page_id` values. All of
    them count distinct `revactor_actor` values per page and skip revisions
    with bit 4 of `rev_deleted` set:

    * `all_editors_of_all_edits`: every actor,
    * `editors_nm`: registered actors outside the "bot" user group, on
      revisions not flagged as minor,
    * `micro_editors`: actors on revisions flagged as minor,
    * `IP_editors`: actors without a user id.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `database_code` and `page_id`.

    Returns
    -------
    tuple of list
        The module-level lists `(tuepa_articles, tunbre_articles,
        tueme_articles, tuipe_articles)` with one query result per wiki
        each. They are emptied at the start of each call so that re-running
        the calling cell does not stack duplicate rows.
    '''

    # editors: total number of unique editors, including IP editors and bots...all editors of all edits, microcontributions
    #https://www.mediawiki.org/wiki/Manual:Page_table
    #https://www.mediawiki.org/wiki/Manual:Revision_actor_temp_table
    #https://www.mediawiki.org/wiki/Manual:Revision_table
    #https://www.mediawiki.org/wiki/Help:RevisionDelete
    #adapted from https://phabricator.wikimedia.org/T231598#5465711
    #questioned in https://phabricator.wikimedia.org/T234560#5545319
    #taken into account: rev_deleted to avoid leaking information re: how many distinct users were involved in revision-deleted edits

    tuepa_q = """
    SELECT
        page.page_id AS page_id,
        COUNT(DISTINCT revactor_actor) AS all_editors_of_all_edits,
        DATABASE() AS database_code
    FROM revision_actor_temp
    JOIN revision ON(revactor_rev = rev_id AND revactor_page = rev_page)
    JOIN page ON rev_page = page.page_id
    WHERE rev_page = page.page_id
    AND (rev_deleted & 4) = 0
    AND page.page_id IN {ids}
    GROUP BY page_id
    """

    # query post Morten review
    #editors: total, unique, non-bot, registered editors that made non-minor edits
    tunbre_q = """
    SELECT
        revision.rev_page AS page_id,
        COUNT(DISTINCT revactor_actor) AS editors_nm,
        DATABASE() AS database_code
    FROM revision_actor_temp
    JOIN revision ON (revactor_rev = rev_id)
    JOIN actor ON (revactor_actor = actor_id)
    WHERE (rev_deleted & 4) = 0
    AND rev_minor_edit = 0
    AND actor_user IS NOT NULL -- user cannot be non-registered
    AND actor_user NOT IN (SELECT ug_user FROM user_groups WHERE ug_group = "bot") -- not a bot
    AND revision.rev_page IN {ids}
    GROUP BY revision.rev_page
    """

    # editors: unique editors (of any kind) who made at least one edit flagged as minor
    # same tables and rev_deleted handling as the all-editors query above
    tueme_q = """
    SELECT
        page.page_id,
        COUNT(DISTINCT revactor_actor) AS micro_editors,
        DATABASE() AS database_code
    FROM revision_actor_temp
      JOIN revision ON(revactor_rev = rev_id AND revactor_page = rev_page)
      JOIN page ON rev_page = page.page_id
    WHERE rev_page = page.page_id
      AND (rev_deleted & 4) = 0
      AND page.page_id IN {ids}
      AND rev_minor_edit = 1
    GROUP BY page_id
    """

    # updated!
    #total unique IP editors,
    #see also: https://phabricator.wikimedia.org/T231605
    #https://meta.wikimedia.org/wiki/IP_Editing:_Privacy_Enhancement_and_Abuse_Mitigation/Research
    #https://github.com/nettrom/AHT-block-effectiveness-2018
    #https://github.com/wikimedia-research/AHT-IP-edits-2019/blob/master/edit_usefulness.ipynb
    #https://meta.wikimedia.org/wiki/User:Benjamin_Mako_Hill/Research_on_the_value_of_IP_Editing
    tuipe_q = """
    SELECT
      revision.rev_page AS page_id,
      COUNT(DISTINCT revactor_actor) AS IP_editors,
      DATABASE() AS database_code
    FROM revision
      JOIN revision_actor_temp ON (rev_id = revactor_rev)
      JOIN actor ON (revactor_actor = actor_id)
    WHERE (rev_deleted & 4) = 0
      AND actor_user IS NULL -- non-registered user
      AND revision.rev_page IN {ids}
    GROUP BY rev_page
    """

    for collected in (tuepa_articles, tunbre_articles, tueme_articles, tuipe_articles):
        collected.clear()

    for wiki in df['database_code'].unique():
        wiki_ids = df.loc[df['database_code'] == wiki, 'page_id'].dropna().astype(int).unique()
        if len(wiki_ids) == 0:
            # "IN ()" is not valid SQL; nothing to look up for this wiki.
            continue
        # Build the IN list by hand: formatting a Python tuple yields "(5,)"
        # for a wiki with a single article, which is not valid SQL.
        id_list = '(' + ', '.join(str(page_id) for page_id in wiki_ids) + ')'

        tuepa_articles.append(wmf.mariadb.run(tuepa_q.format(ids=id_list), wiki))
        tunbre_articles.append(wmf.mariadb.run(tunbre_q.format(ids=id_list), wiki))
        tueme_articles.append(wmf.mariadb.run(tueme_q.format(ids=id_list), wiki))
        tuipe_articles.append(wmf.mariadb.run(tuipe_q.format(ids=id_list), wiki))

    return (tuepa_articles, tunbre_articles, tueme_articles, tuipe_articles)

In [81]:
# Fills the module-level tuepa_articles, tunbre_articles, tueme_articles and tuipe_articles lists; the trailing ';' hides the returned tuple.
get_editor_data_mariadb(articles);

In [178]:
# Stack the per-wiki results of each of the four editor queries.
tuepa_r, tunbre_r, tueme_r, tuipe_r = (
    pd.concat(per_wiki_results)
    for per_wiki_results in (tuepa_articles, tunbre_articles, tueme_articles, tuipe_articles)
)

#### editor calculations

In [223]:
# Start from the all-editors counts and add the non-minor registered editor counts; gaps become 0.
editor_calculations = tuepa_r.merge(
    tunbre_r[['page_id', 'editors_nm', 'database_code']],
    how='left',
    on=['page_id', 'database_code'],
).fillna(0)

In [224]:
# Add the IP editor and minor-edit editor counts; pages missing from either result get 0.
editor_calculations = (
    editor_calculations
    .merge(tuipe_r[['page_id', 'IP_editors', 'database_code']], how='left', on=['page_id', 'database_code'])
    .merge(tueme_r[['page_id', 'micro_editors', 'database_code']], how='left', on=['page_id', 'database_code'])
    .fillna(0)
)

In [225]:
# Attach the editor counts to the article table.
articles = pd.merge(articles, editor_calculations, how="left", on=['page_id', 'database_code'])

In [182]:
# Background notes kept from the original analysis:
# "This timestamp is updated whenever the page changes in a way requiring it to be
# re-rendered, invalidating caches. Aside from editing, this includes permission
# changes, creation or deletion of linked pages, and alteration of contained templates."
# See [[mw:Manual:Revision_table]] and [[mw:Manual:Page_table]]. "Only show latest edits"
# does an inner join from the revision table to the page table on rev_id = page_latest.
# https://www.mediawiki.org/wiki/Manual:Revision_table
# https://github.com/x-tools/xtools/blob/master/src/AppBundle/Repository/ArticleInfoRepository.php#L162-L171
# https://xtools.wmflabs.org/articleinfo/pa.wikipedia.org/ਏਸ਼ੀਆ

# Module-level accumulators: one query result per wiki is appended to each list.
edits_articles = []
timestamps_articles = []

def get_edits_timestamps_mariadb(df):
    """Query every wiki in ``df`` for all-time edit counts and the latest-edit timestamp.

    For each distinct ``database_code`` the wiki's ``page_id`` values are
    substituted into two SQL statements that are run with ``wmf.mariadb.run``.
    The results are appended to the module-level lists ``edits_articles`` and
    ``timestamps_articles``; calling the function again without resetting those
    lists therefore accumulates duplicate results.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``database_code`` and ``page_id``.

    Returns
    -------
    tuple of list
        ``(edits_articles, timestamps_articles)``, i.e. the module-level lists
        after the new results have been appended.
    """

    edits_q = """
    SELECT 
        rev_page AS page_id, 
        DATABASE() AS database_code,
        COUNT(rev_id) AS num_edits_all_time,
        SUM(rev_minor_edit) AS minor_edits_all_time
    FROM revision
    JOIN page ON page_id = rev_page
    WHERE rev_page = page_id
        AND rev_timestamp > 0 
        AND rev_page IN {ids}
    GROUP BY rev_page
    """


    timestamps_q = """
    SELECT 
        rev_page AS page_id, 
        DATABASE() AS database_code,
        max(rev_timestamp) AS last_edited 
    FROM revision 
    JOIN page 
      ON page_id = rev_page
    WHERE rev_page IN {ids}
    AND rev_id = page_latest
    GROUP BY rev_page
    """

    for wiki in df['database_code'].dropna().unique():
        page_ids = df.loc[df['database_code'] == wiki, 'page_id'].dropna().astype(int).unique()
        if len(page_ids) == 0:
            # "IN ()" is not valid SQL, so there is nothing to ask this wiki.
            continue
        # Build the list by hand: formatting a Python tuple would render a
        # single id as "(5,)", which the database rejects.
        ids = '({})'.format(', '.join(str(page_id) for page_id in page_ids))

        edits_results = wmf.mariadb.run(edits_q.format(ids=ids), wiki)
        edits_articles.append(edits_results)

        timestamps_results = wmf.mariadb.run(timestamps_q.format(ids=ids), wiki)
        timestamps_articles.append(timestamps_results)

    return(edits_articles, timestamps_articles)

In [183]:
# Run both queries for every wiki and stack the per-wiki results. The function
# returns the edits list first and the timestamps list second.
edits, timestamps = (
    pd.concat(per_wiki_frames)
    for per_wiki_frames in get_edits_timestamps_mariadb(articles)
)



In [226]:
# Parse the last-edit timestamps and truncate them to midnight (date precision).
timestamps['last_edited'] = pd.to_datetime(timestamps['last_edited']).dt.normalize()

# Attach edit counts and last-edit dates to the article table.
articles = (
    articles
    .merge(edits, how="left", on=['page_id', 'database_code'])
    .merge(timestamps, how="left", on=['page_id', 'database_code'])
)

## Article Creation Date

In [101]:
# Alternative considered for the select list:
# DATE_FORMAT(rev_timestamp,"%y-%m-%d") AS first_edited

# Module-level accumulator: one query result per wiki is appended here.
first_edit_timestamp_articles = []

def get_first_edit_timestamp_mariadb(df):
    """Query every wiki in ``df`` for the timestamp of each page's earliest revision.

    For each distinct ``database_code`` the wiki's ``page_id`` values are
    substituted into the SQL statement, which is run with ``wmf.mariadb.run``.
    Each result is appended to the module-level list
    ``first_edit_timestamp_articles``; calling the function again without
    resetting that list accumulates duplicate results.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``database_code`` and ``page_id``.

    Returns
    -------
    list
        ``first_edit_timestamp_articles`` after the new results were appended.
    """

    # MIN() is required: a bare rev_timestamp under GROUP BY page_id lets the
    # server pick the timestamp of an arbitrary revision of the page.
    first_edit_timestamp_q = """
    SELECT 
        page_id, 
        DATABASE() AS database_code,
        MIN(rev_timestamp) AS first_edited
    FROM revision 
    JOIN page ON page_id = rev_page
    WHERE rev_page IN {ids}
    GROUP BY page_id
    """

    for wiki in df['database_code'].dropna().unique():
        page_ids = df.loc[df['database_code'] == wiki, 'page_id'].dropna().astype(int).unique()
        if len(page_ids) == 0:
            # "IN ()" is not valid SQL, so there is nothing to ask this wiki.
            continue
        # Build the list by hand: formatting a Python tuple would render a
        # single id as "(5,)", which the database rejects.
        ids = '({})'.format(', '.join(str(page_id) for page_id in page_ids))

        first_edit_timestamp_results = wmf.mariadb.run(first_edit_timestamp_q.format(ids=ids), wiki)
        first_edit_timestamp_articles.append(first_edit_timestamp_results)

    return(first_edit_timestamp_articles)

In [185]:
# Run the first-edit query for every wiki and stack the per-wiki results
# (the function returns the list it appends to).
first_edit_timestamp = pd.concat(get_first_edit_timestamp_mariadb(articles))

In [227]:
# Parse the first-edit timestamps and truncate them to midnight (date precision).
first_edit_timestamp['first_edited'] = (
    pd.to_datetime(first_edit_timestamp['first_edited']).dt.normalize()
)

In [228]:
# Attach each article's first-edit date; the left join keeps every article row.
articles = pd.merge(articles, first_edit_timestamp, how="left", on=['page_id', 'database_code'])

In [230]:
# Keep the first row for every combination of title, wiki, page id, QID and
# contest article type.
dedupe_keys = ['page_title', 'database_code', 'page_id', 'QID', 'contest_article_type']
articles = articles.drop_duplicates(subset=dedupe_keys, keep='first')

### Identify article types (expanded, new, post)

In [233]:
# Indonesia articles fall into three contest categories, each with its own start/end dates.
articles['filename'] = articles['filename'].replace(' ', '_', regex=True)
articles['first_edited'] = pd.to_datetime(articles['first_edited'], errors='coerce')

mc = ['monthly_challenge.csv']
wc = ['writing_contest.csv']
# Deliberately not named `gc`: that name is the garbage-collector module
# imported at the top of the notebook and must not be overwritten by a list.
grant_files = ['Indonesia_grant.csv', 'Jawa_grant.csv', 'Minangkabau_grant.csv',
               'Sunda_grant.csv']


def _label_article_type(frame, filenames, contest_start, contest_end):
    """Set ``article_type`` in place for the rows of ``frame`` that belong to one contest.

    Rows whose ``filename`` is in ``filenames`` are labelled by comparing
    ``first_edited`` with the contest window:

    * before ``contest_start``        -> 'expanded'
    * on or after ``contest_start``   -> 'new'
    * after ``contest_end``           -> 'post' (overrides 'new')

    Rows with a missing ``first_edited`` match none of the comparisons and are
    left untouched.
    """
    in_contest = frame["filename"].isin(filenames)
    created = frame["first_edited"]
    frame.loc[in_contest & (created < contest_start), "article_type"] = 'expanded'
    frame.loc[in_contest & (created >= contest_start), "article_type"] = 'new'
    # Must run after the 'new' rule: it re-labels the subset created after the contest ended.
    frame.loc[in_contest & (created > contest_end), "article_type"] = 'post'


_label_article_type(articles, mc,
                    contest_start_indonesia_monthly_challenge,
                    contest_end_indonesia_monthly_challenge)
_label_article_type(articles, wc,
                    contest_start_indonesia_writing_contest,
                    contest_end_indonesia_writing_contest)
_label_article_type(articles, grant_files,
                    contest_start_indonesia_grantee_comm_gathering,
                    contest_end_indonesia_grantee_comm_gathering)

### Edits & Editors per article for time filtering <a class="anchor" id="editors_active"></a>
[Back to Table of Contents](#toc)

In [234]:

# Keep only the articles labelled 'new' or 'post'; articles labelled
# 'expanded' are left out of the time-windowed metrics below.
select_list = ['new', 'post']
is_selected = articles['article_type'].isin(select_list)
articles_new = articles.loc[is_selected]

In [128]:
# Alternative considered for the select list:
# DATE_FORMAT(rev_timestamp,"%y-%m-%d") AS first_edited
# https://phabricator.wikimedia.org/T231598
# The (rev_deleted & 4) = 0 condition is to exclude revisions where the user has been
# RevDeled, as we don't want to leak information about how many such users there are.

# Module-level accumulators: one query result per wiki is appended to each list.
edits_editors_all_articles = []
edits_editors_reg_articles = []

def get_edits_editors_mariadb(df):
    """Query every wiki in ``df`` for one row per revision (page, edit date, actor).

    Two statements are run per wiki with ``wmf.mariadb.run``: one over all
    actors, and one restricted to actors with a non-NULL ``actor_user`` that is
    not in the ``bot`` user group. The results are appended to the module-level
    lists ``edits_editors_all_articles`` and ``edits_editors_reg_articles``;
    calling the function again without resetting those lists accumulates
    duplicate results.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``database_code`` and ``page_id``.

    Returns
    -------
    tuple of list
        ``(edits_editors_all_articles, edits_editors_reg_articles)`` after the
        new results were appended.
    """

    edits_editors_all_q = """
    SELECT 
        page_id,
        DATABASE() AS database_code,
        DATE_FORMAT(rev_timestamp,"%y-%m-%d") AS edit_date,    
        revactor_actor
    FROM revision_actor_temp
    JOIN revision ON(revactor_rev = rev_id AND revactor_page = rev_page)
    JOIN page ON rev_page = page.page_id
    WHERE rev_page = page_id
        AND rev_timestamp > 0 
        AND (rev_deleted & 4) = 0
        AND rev_page IN {ids}
    GROUP BY revactor_rev
    """

    edits_editors_reg_q = """
    SELECT 
        page_id,
        DATABASE() AS database_code,
        DATE_FORMAT(rev_timestamp,"%y-%m-%d") AS edit_date,    
        revactor_actor
    FROM revision_actor_temp
    JOIN revision ON(revactor_rev = rev_id AND revactor_page = rev_page)
    JOIN page ON rev_page = page.page_id
    JOIN actor ON (revactor_actor = actor_id)
    WHERE rev_page = page_id
        AND rev_timestamp > 0 
        AND (rev_deleted & 4) = 0
        AND actor_user IS NOT NULL -- user cannot be non-registered
        AND actor_user NOT IN (SELECT ug_user FROM user_groups WHERE ug_group = "bot") -- not a bot
        AND rev_page IN {ids}
    GROUP BY revactor_rev
    """

    for wiki in df['database_code'].dropna().unique():
        page_ids = df.loc[df['database_code'] == wiki, 'page_id'].dropna().astype(int).unique()
        if len(page_ids) == 0:
            # "IN ()" is not valid SQL, so there is nothing to ask this wiki.
            continue
        # Build the list by hand: formatting a Python tuple would render a
        # single id as "(5,)", which the database rejects.
        ids = '({})'.format(', '.join(str(page_id) for page_id in page_ids))

        edits_editors_all_results = wmf.mariadb.run(edits_editors_all_q.format(ids=ids), wiki)
        edits_editors_all_articles.append(edits_editors_all_results)

        edits_editors_reg_results = wmf.mariadb.run(edits_editors_reg_q.format(ids=ids), wiki)
        edits_editors_reg_articles.append(edits_editors_reg_results)

    return(edits_editors_all_articles, edits_editors_reg_articles)

In [129]:
# Fetch per-revision edit rows for the new/post articles and stack the
# per-wiki results: all actors first, registered non-bot actors second.
edits_editors_all, edits_editors_reg_r = (
    pd.concat(per_wiki_frames)
    for per_wiki_frames in get_edits_editors_mariadb(articles_new)
)

In [235]:
# The queries return edit_date as a two-digit-year "yy-mm-dd" string; parse it
# into datetimes in both edit logs.
for edit_log in (edits_editors_all, edits_editors_reg_r):
    edit_log['edit_date'] = pd.to_datetime(edit_log['edit_date'], format="%y-%m-%d")

In [236]:
# Attach each page's title and first-edit date to the per-revision edit rows.
# `articles` can hold several rows for the same (page_id, database_code), e.g.
# when a page is listed under more than one filename. Merging against it
# directly would repeat every edit once per such row and inflate the edit
# counts computed below, so the lookup is reduced to one row per page first.
page_lookup = (
    articles[['page_id', 'page_title', 'first_edited', 'database_code']]
    .drop_duplicates(subset=['page_id', 'database_code'], keep='first')
)

# all editors
ee_fe = pd.merge(edits_editors_all, page_lookup, on=['page_id', 'database_code'], how='left').fillna(0)
# registered, non-bot editors
eern_fe = pd.merge(edits_editors_reg_r, page_lookup, on=['page_id', 'database_code'], how='left').fillna(0)

In [237]:
# Time between each edit's date and the article's `first_edited` date.
eern_fe['edit_td'] = eern_fe['edit_date'] - eern_fe['first_edited']

# m1: edits made no more than 30 days after first_edited.
m1 = eern_fe.loc[eern_fe['edit_td'] <= pd.Timedelta(days=30)]

# m2: edits made no more than 60 days after first_edited (includes the m1 rows).
m2 = eern_fe.loc[eern_fe['edit_td'] <= pd.Timedelta(days=60)]

#### filter for edits by time period and use those in groupby counts below

In [238]:
# Number of distinct actors (revactor_actor values) among the edits kept in m1,
# i.e. across all articles, not per article.
m1['revactor_actor'].nunique()

214

In [239]:
# Same 30-day / 60-day filters on `edit_td` as applied when the column was
# created; re-running them here leaves m1 and m2 unchanged.
m1 = eern_fe.loc[eern_fe['edit_td'] <= pd.Timedelta(days=30)]
m2 = eern_fe.loc[eern_fe['edit_td'] <= pd.Timedelta(days=60)]

#### Editor counts per article within the first 30 days (1M) and first 60 days (2M) after the first edit

In [240]:
# Distinct actors per article among the first-30-day edits.
editors_1M_counts = (
    m1
    .groupby(['page_id', 'page_title', 'database_code'])['revactor_actor']
    .nunique()
    .reset_index(name='editors_1stM')
)

In [241]:
# Distinct actors per article among the first-60-day edits.
editors_2M_counts = (
    m2
    .groupby(['page_id', 'page_title', 'database_code'])['revactor_actor']
    .nunique()
    .reset_index(name='editors_2ndM')
)

In [242]:
# Put the 30-day and 60-day editor counts side by side, one row per article
# present in the 30-day counts; missing values become 0.
editors_by_M_calculations = editors_1M_counts.merge(
    editors_2M_counts[['page_id', 'editors_2ndM', 'database_code']],
    how='left',
    on=['page_id', 'database_code'],
).fillna(0)

#### Edit counts per article within the first 30 days (1M) and first 60 days (2M) after the first edit

In [243]:
# Number of edits per article among the first-30-day edits
# (counts non-null edit_date values).
edit_counts_1M = (
    m1
    .groupby(['page_id', 'page_title', 'database_code'])['edit_date']
    .count()
    .reset_index(name='edits_1M')
)

In [244]:
# Number of edits per article among the first-60-day edits
# (counts non-null edit_date values).
edit_counts_2M = (
    m2
    .groupby(['page_id', 'page_title', 'database_code'])['edit_date']
    .count()
    .reset_index(name='edits_2M')
)

In [245]:
# Put the 30-day and 60-day edit counts side by side, one row per article
# present in the 30-day counts; missing values become 0.
edits_by_M_calculations = edit_counts_1M.merge(
    edit_counts_2M[['page_id', 'edits_2M', 'database_code']],
    how='left',
    on=['page_id', 'database_code'],
).fillna(0)

In [246]:
#merge

In [247]:
# Attach the 30-day editor and edit counts to the article table. Articles
# without a matching row keep NaN in the new columns (no fillna here).
articles = (
    articles
    .merge(editors_1M_counts[['page_id', 'editors_1stM', 'database_code']],
           how='left', on=['page_id', 'database_code'])
    .merge(edit_counts_1M[['page_id', 'edits_1M', 'database_code']],
           how='left', on=['page_id', 'database_code'])
)

## Clean

In [248]:
# Keep the first row for every combination of title, wiki, page id and source file.
final_dedupe_keys = ['page_title', 'database_code', 'page_id', 'filename']
articles = articles.drop_duplicates(subset=final_dedupe_keys, keep='first')

In [None]:
# Show the rows whose QID equals 0.
articles[articles['QID'].eq(0)]

## Save data

In [249]:
# Write the final article table as a comma-separated UTF-8 file without the index column.
articles.to_csv(
    "../../data/processed/query_results/content_quality/indonesia/CQ_all_articles.csv",
    index=False,
    encoding='utf-8',
    sep=',',
)