# GLOW - Baselines

## Table of Contents  <a class="anchor" id="toc"></a>

* [GLOW Wiki Baselines](#top)

    1. [Editors](#editors)
        1. [Monthly Editors](#editors_monthly)
        2. [Monthly Active Editors](#editors_active)
        3. [Monthly New Editors](#editors_new)
        4. [New editor retention](#new_editor_retention)
    2. [Articles](#articles)
        1. [Articles Count by wiki](#articles_count) 
        2. [New Articles](#new_articles)
        3. [Edits to existing articles](#article_edits)
        4. [New articles: by date/exp/survival](#new_articles_filtered)
    3. [Readers](#readers)
        1. [Pageviews](#pageviews_detailed)
    4. [Geo](#stage1b)
        1. [Monthly Unique Devices](#devices)
        2. [Edits geolocated](#editors_activity_countries)
        3. [Editors geolocated](#editors_activity_countries)
        4. [Pageviews across countries & wikis](#pageviews)

In [1]:
from pyspark.sql.types import ArrayType, StringType

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

import datetime as dt

import pandas as pd
import numpy as np

In [2]:
# Obtain the SparkContext via getOrCreate(), then build the SparkSession
# used by the rest of the notebook on top of it.
sc = SparkContext.getOrCreate()
spark = SparkSession(sparkContext=sc)

# Monthly Averages Baseline

## Wikis<a class="anchor" id="wikis"></a>
[Back to Table of Contents](#toc)

In [9]:
# Query template: country name and ISO code from the canonical country list in Hive.
# Table source: https://github.com/wikimedia-research/canonical-data/blob/master/countries.csv
# `{glow_country_codes}` is a format placeholder that is not filled in within this cell;
# because it sits inside `in (...)` it presumably expands to a comma-separated list of
# quoted SQL string literals.
# NOTE(review): the placeholder is named "..._codes" but is compared against `name`,
# not `iso_code` — confirm which column the supplied values are meant to match.
countries_r = '''
SELECT
  name, 
  iso_code
FROM canonical_data.countries
WHERE name in ({glow_country_codes})
'''

In [14]:
# Query template: metadata (database code, project, language, English name, URL) for the
# wikis listed in `{glow_wiki_dbs}`, read from canonical_data.wikis.
# Only open, publicly visible, publicly editable wikis in the "mediawiki", "wikidata" or
# "wikipedia" database groups are kept.
# https://github.com/wikimedia-research/canonical-data/blob/master/countries.csv
# NOTE(review): the link above points at countries.csv although this query reads the
# wikis table — looks copy-pasted from the countries cell; confirm the intended source.
# `domain_name` is returned with an "https://" prefix, i.e. as a full URL.
wikis_r = '''
SELECT
  database_code,
  database_group AS project_code,
  language_code,
  language_name,
  english_name as wiki_name,
  CONCAT("https://", domain_name) AS domain_name
FROM canonical_data.wikis
WHERE
  database_group in ("mediawiki", "wikidata", "wikipedia") 
  AND status = "open" 
  AND visibility = "public" 
  AND editability = "public"
  AND database_code IN ({glow_wiki_dbs})
'''

## Editors<a class="anchor" id="editors"></a>
[Back to Table of Contents](#toc)

#### Monthly editors <a class="anchor" id="editors_monthly"></a>
[Back to Table of Contents](#toc)

In [None]:
# Query template: average monthly content (namespace-zero) editors per wiki.
# adapted from:
# https://github.com/wikimedia-research/wiki-segmentation
# https://github.com/wikimedia-research/Editing-movement-metrics
#
# Sums `namespace_zero_distinct_editors` from wmf.geoeditors_monthly for each wiki in
# `{glow_wiki_dbs}` over the half-open window [{Y_START_DATE}, {TODAY_DATE}) and divides by 12.
# NOTE(review): the hard-coded 12 assumes the window spans exactly twelve months — confirm
# against the values supplied for the date placeholders.
# NOTE(review): the sum runs over every row of a wiki in the window; if the table holds
# several rows per wiki and month, an editor appearing in more than one row is counted
# more than once — confirm this is acceptable for the baseline.

mce_r = '''
select
    wiki_db AS database_code,
    SUM(namespace_zero_distinct_editors) / 12 AS monthly_content_editors
from wmf.geoeditors_monthly
where 
    month >= "{Y_START_DATE}"
    AND month < "{TODAY_DATE}"
    AND wiki_db IN ({glow_wiki_dbs})
group by wiki_db
'''

#### Monthly Active Editors and New Active Editors <a class="anchor" id="editors_active"></a>
[Back to Table of Contents](#toc)

In [19]:
# Query template: monthly active editors and monthly new active editors per wiki.
# adapted from:
# https://github.com/wikimedia-research/wiki-segmentation
# https://github.com/wikimedia-research/Editing-movement-metrics
#
# An editor-month row counts as "active" when it has at least 5 content edits; it counts
# as "new active" when the user's registration month equals the activity month.
# Both totals over the [{Y_START_DATE_FULL}, {TODAY_DATE_FULL}) window are divided by 12,
# so the window is expected to span twelve months.
# Anonymous rows (user_id = 0), accounts flagged `bot_by_group`, and user names ending in
# "bot" (except three allow-listed human accounts) are excluded.
#
# The template is a RAW string on purpose: the SQL text must contain `"bot\\b"` (two
# backslashes) so that, after the SQL parser unescapes the string literal, the regex
# engine receives `bot\b` (word boundary). With a regular Python string the SQL literal
# became `"bot\b"`, which the parser turns into "bot" + backspace and never matches.
# NOTE(review): this query reads cchen.editor_month while the new-editors query reads
# neilpquinn.editor_month — confirm both tables are meant to be used.
mnae_r = r'''
SELECT
    wiki AS database_code,
    COUNT(*) / 12 AS monthly_active_editors,
    SUM(
        CAST(TRUNC(user_registration, 'MM') = TRUNC(month, 'MM') AS INT)
        )/ 12 AS monthly_new_active_editors
FROM cchen.editor_month
WHERE
    content_edits >= 5
    AND month >= "{Y_START_DATE_FULL}"
    AND month < "{TODAY_DATE_FULL}"
    AND wiki IN ({glow_wiki_dbs})
    AND user_id != 0
    AND bot_by_group = FALSE
    AND (
        user_name not regexp "bot\\b" or
        user_name in ("Paucabot", "Niabot", "Marbot")
    )
GROUP BY wiki
'''

#### Monthly New Editors <a class="anchor" id="editors_new"></a>
[Back to Table of Contents](#toc)

In [21]:
# Query template: monthly new editors per wiki.
#
# adapted from:
# https://github.com/wikimedia-research/wiki-segmentation
#
# Counts editor-month rows whose registration month equals the activity month, over the
# [{Y_START_DATE_FULL}, {TODAY_DATE_FULL}) window, and divides by 12 (so the window is
# expected to span twelve months).
# Anonymous rows (user_id = 0), accounts flagged `bot_by_group`, and user names ending in
# "bot" (except three allow-listed human accounts) are excluded.
#
# The template is a RAW string on purpose: the SQL text must contain `"bot\\b"` (two
# backslashes) so that, after the SQL parser unescapes the string literal, the regex
# engine receives `bot\b` (word boundary). With a regular Python string the SQL literal
# became `"bot\b"`, which the parser turns into "bot" + backspace and never matches.
# NOTE(review): this query reads neilpquinn.editor_month while the active-editors query
# reads cchen.editor_month — confirm both tables are meant to be used.
mne_r = r'''
select
    wiki AS database_code,
    sum(CAST(TRUNC(user_registration, 'MM') = TRUNC(month, 'MM') AS INT))/ 12 AS monthly_new_editors
from neilpquinn.editor_month
where
    month >= "{Y_START_DATE_FULL}"
    AND month < "{TODAY_DATE_FULL}"
    AND wiki IN ({glow_wiki_dbs})
    AND user_id != 0
    AND bot_by_group = 0 and (
        user_name not regexp "bot\\b" or
        user_name in ("Paucabot", "Niabot", "Marbot")
    )
group by wiki
'''

#### New editor retention <a class="anchor" id="new_editor_retention"></a>
[Back to Table of Contents](#toc)

In [23]:
# Query template: new-editor retention per wiki.
# adapted from:
# https://github.com/wikimedia-research/wiki-segmentation
# https://github.com/wikimedia-research/Editing-movement-metrics
#
# Retention = (new editors with at least one edit in their 2nd month)
#           / (new editors with at least one edit in their 1st month),
# over cohorts in the half-open window [{Y_START_DATE}, {TODAY_DATE}) and the wikis in
# `{glow_wiki_dbs}`.
# `limit 1000` caps the number of result rows (one row per wiki).
# NOTE(review): a wiki with no first-month editors in the window makes the denominator
# zero — confirm how the executing engine reports that (NULL vs error).

ner_r = '''
select 
    wiki AS database_code,
    sum(cast(2nd_month_edits >= 1 as int)) / sum(cast(1st_month_edits >= 1 as int)) AS new_editor_retention
from neilpquinn.new_editors
where 
    cohort >= "{Y_START_DATE}"
    AND cohort < "{TODAY_DATE}"
    AND wiki IN ({glow_wiki_dbs})
group by wiki
limit 1000
'''

### Language Switching <a class="anchor" id="language_switching"></a>
[Back to Table of Contents](#toc)

In [None]:
# Pulls one day ({YEAR}/{MONTH}/{DAY}) of Wikipedia traffic from wmf.webrequest and runs it
# through `hive.run`: namespace-0 pageviews plus requests whose query string looks like an
# edit attempt, restricted to agent_type "user".
# Each row carries a salted SHA-512 hash of user agent + client IP as the user key, the
# project as "<lang>wiki", the page title ("EDITATTEMPT" when no pageview title exists),
# page id, timestamp and country.
# Query adapted from https://github.com/geohci/language-switching
# NOTE(review): neither `hive` nor `query_vars` is defined in the cells visible here —
# confirm both are set up before this cell runs.
# NOTE(review): `{# FROM 0-9}` is parsed by str.format as a field literally named
# "# FROM 0-9"; unless query_vars contains exactly that key, .format() raises KeyError.
# It reads like a reminder to substitute a single digit 0-9 (sampling on the last
# character of the IP) — replace it with a properly named placeholder.
# NOTE(review): the hash uses `client_ip` while the sampling filter uses `ip` — confirm
# both columns are intended.
# NOTE(review): the salt is interpolated into the query text, so it will appear wherever
# the query string is logged — confirm that is acceptable.
language_switching = hive.run("""
SELECT reflect('org.apache.commons.codec.digest.DigestUtils', 'sha512Hex', CONCAT(user_agent, client_ip, "{SALT}")) AS user,
       concat(translate(normalized_host.project, '-', '_'), 'wiki') AS project,
       COALESCE(pageview_info["page_title"], "EDITATTEMPT") as page_title,
       page_id AS page_id,
       dt,
       geocoded_data['country'] AS country
  FROM wmf.webrequest 
 WHERE normalized_host.project_family = "wikipedia"
       AND ((is_pageview AND namespace_id = 0)
            OR (uri_query LIKE '%action=edit%' OR uri_query LIKE '%action=visualeditor%'
                OR uri_query LIKE '%&intestactions=edit&intestactionsdetail=full&uiprop=options%'))
       AND agent_type = "user" 
       AND year = {YEAR} AND month = {MONTH} AND day = {DAY}
       AND SUBSTR(ip, -1, 1) = {# FROM 0-9}
""".format(**query_vars))
           

## Readers<a class="anchor" id="readers"></a>
[Back to Table of Contents](#toc)

#### PageViews by referer_class and access_method <a class="anchor" id="pageviews_detailed"></a>
[Back to Table of Contents](#toc)

In [None]:
# Query template: average monthly user pageviews per country, project and referer class.
#
# Sums `view_count` from wmf.pageview_hourly over the half-open month window
# [{Y_START_DATE_pv}, {TODAY_DATE_pv}) (compared as YYYYMM) and divides by 12, so the
# window is expected to span twelve months.
#
# Fixes relative to the previous version of this cell:
#   * removed the trailing comma after `referer_class`, which made the statement invalid SQL;
#   * dropped `year, month` from GROUP BY: they were not selected, so the query returned
#     one indistinguishable row per month, each holding that month's views / 12, instead
#     of a single monthly average per (country_code, project, referer_class).
# NOTE(review): `country_code = {india_country_codes}` uses `=` with a plural placeholder —
# it only works when the placeholder expands to a single quoted code; confirm.
# NOTE(review): the section heading also mentions access_method, which this query does
# not return — confirm whether it should be added as a dimension.
pv_rc_r = '''
SELECT
  country_code,
  project,
  SUM(view_count)/12 as view_count,
  referer_class
FROM wmf.pageview_hourly
WHERE
  CONCAT(year,LPAD(month,2,'0')) >= {Y_START_DATE_pv}
  AND CONCAT(year,LPAD(month,2,'0')) < {TODAY_DATE_pv}
  AND agent_type='user'
  AND country_code = {india_country_codes}
  AND project IN ({india_wiki_projects})
GROUP BY
  country_code, project, referer_class
'''

## Devices <a class="anchor" id="article_detail"></a>
[Back to Table of Contents](#toc)

#### Monthly Unique Devices <a class="anchor" id="devices"></a>
[Back to Table of Contents](#toc)

In [None]:
# Query template: summed unique-device estimates per country and calendar year.
# Reads wmf.unique_devices_per_domain_monthly for the half-open month window
# [{Y_START_DATE_pv}, {TODAY_DATE_pv}) (compared as YYYYMM), restricted to the ISO codes
# in `{glow_iso_codes}`.
# NOTE(review): the summed column has no alias, so its name in the result is whatever the
# engine generates — confirm downstream code does not rely on a specific name.
# NOTE(review): rows are grouped by `year`, so a window that crosses a year boundary
# yields two rows per country; unlike the other baseline queries there is no division
# by 12 — confirm whether a monthly average was intended.
# NOTE(review): the sum adds estimates across every row of a country in the window;
# confirm that adding these per-row estimates is meaningful.
mca_uds_r = '''
select
    country,
    year,
    sum(uniques_estimate) 
from wmf.unique_devices_per_domain_monthly
WHERE 
    CONCAT(year,LPAD(month,2,'0')) >=  ({Y_START_DATE_pv})
    AND CONCAT(year,LPAD(month,2,'0')) < ({TODAY_DATE_pv})
    AND country_code IN ({glow_iso_codes})
group by country, year
''' 

## Articles<a class="anchor" id="articles"></a>
[Back to Table of Contents](#toc)

#### New articles <a class="anchor" id="new_articles"></a>
[Back to Table of Contents](#toc)

In [94]:
# adapted from https://github.com/wikimedia-research/2018-19-Language-annual-plan-metrics/blob/master/Language-metrics.ipynb
# https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Edits/Mediawiki_history
# 6 months ≈ 26 weeks = 182 days (the earlier note said 252 days, which is 36 weeks)
# Note carried over from the source analysis: period starts 2019/07. In this template the
# window is set by the {Y_START_DATE_FULL} / {TODAY_DATE_FULL} placeholders.

# Query template: revision-creation events per wiki in the window, divided by 12.
# (Original note: "Making the first edit to a page".)
# NOTE(review): the filter is event_entity = "revision", which matches every new revision,
# not only the first revision of a page — confirm this is the intended "new articles" count.
# NOTE(review): the LEFT JOIN to event_sanitized.serversideaccountcreation is not used by
# the SELECT list; any user name with several matching rows multiplies the counted
# revisions — confirm the join is needed.
# NOTE(review): `wiki` is unqualified; the sibling queries on wmf.mediawiki_history use
# `wiki_db` — confirm which table's column this resolves to.
# NOTE(review): this query uses `{india_wiki_dbs}` while the sibling article queries use
# `{glow_wiki_dbs}` — confirm.

m_new_article_counts_r = ''' 
select
    wiki,
    count(*)/12 as mon_new_articles
from wmf.mediawiki_history mh
left join event_sanitized.serversideaccountcreation ssac
on
    ssac.event.username = event_user_text and
    ssac.year >= 0
where
    mh.snapshot = "{MWH_SNAPSHOT}"
    AND mh.event_timestamp >= "{Y_START_DATE_FULL}"
    AND mh.event_timestamp < "{TODAY_DATE_FULL}" 
    AND event_entity = "revision"
    AND event_type = "create"
    AND wiki in ({india_wiki_dbs})
GROUP BY wiki
''' 

#### Average number of new articles edited (avg_num_new_articles_edited) <a class="anchor" id="new_articles_edited"></a>
[Back to Table of Contents](#toc)

In [None]:
# adapted from https://github.com/wikimedia-research/2018-19-Language-annual-plan-metrics/blob/master/Language-metrics.ipynb
# https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Edits/Mediawiki_history
# 6 months ≈ 26 weeks = 182 days (the earlier note said 252 days, which is 36 weeks)

# Query template: page-creation events per wiki, divided by 12.
# Counts rows of wmf.mediawiki_history (snapshot {MWH_SNAPSHOT}) with
# event_entity = "page" and event_type = "create" whose timestamp falls in
# [{Y_START_DATE_FULL}, {TODAY_DATE_FULL}), for the wikis in `{glow_wiki_dbs}`.
# (Original note: "Making the first edit to a page".)
# NOTE(review): the hard-coded 12 assumes a twelve-month window — confirm.
# NOTE(review): there is no namespace filter, so pages created in any namespace are
# counted — confirm whether only content pages were intended.
m_new_articles_edited_r = ''' 
SELECT
    wiki_db, 
    count(*)/12 AS mon_new_articles_edited
FROM wmf.mediawiki_history mh
WHERE
    mh.snapshot = "{MWH_SNAPSHOT}" 
    AND mh.event_timestamp >= "{Y_START_DATE_FULL}"
    AND mh.event_timestamp < "{TODAY_DATE_FULL}" 
    AND event_entity = "page"
    AND event_type = "create"
    AND wiki_db in ({glow_wiki_dbs})
GROUP BY wiki_db
''' 

# Leftover filter, not part of any query in this cell (it refers to an `ssac` alias that
# this query does not define):
#AND ssac.webhost LIKE '%wikipedia.org'

#### Existing articles, recently edited <a class="anchor" id="article_edits"></a>
[Back to Table of Contents](#toc)

In [None]:
# adapted from https://github.com/wikimedia-research/2018-19-Language-annual-plan-metrics/blob/master/Language-metrics.ipynb
# https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Edits/Mediawiki_history
# 6 months ≈ 26 weeks = 182 days (the earlier note said 252 days, which is 36 weeks)
# Note carried over from the source analysis: period starts 2019/07. In this template the
# window is set by the {Y_START_DATE} / {TODAY_DATE} placeholders.

# Query template: revision-creation events per wiki in the window, divided by 12.
# NOTE(review): the alias says "existing articles edited", but count(*) counts revisions
# (an article edited N times contributes N) and nothing excludes revisions on pages
# created inside the window — confirm the intended definition.
# NOTE(review): the window uses {Y_START_DATE}/{TODAY_DATE}, whereas the sibling queries
# on event_timestamp use the *_FULL variants — confirm the two formats compare as intended.
eae_r = ''' 
select
    wiki_db,
    count(*)/12 as avg_n_existing_articles_edited
from wmf.mediawiki_history mh
where
    mh.snapshot = "{MWH_SNAPSHOT}"  
    AND mh.event_timestamp >= "{Y_START_DATE}" 
    AND mh.event_timestamp < "{TODAY_DATE}" 
    AND event_entity = "revision"
    AND event_type = "create" 
    AND wiki_db in ({glow_wiki_dbs})
GROUP BY wiki_db
''' 

#### Daily revisions by wiki <a class="anchor" id="daily_wiki_revisions"></a>
[Back to Table of Contents](#toc)

In [None]:
## Daily revisions by wiki

# `dr` stands for "daily revisions".
# Query template: average monthly non-bot revisions per wiki, built from the
# "daily_edits" and "daily_edits_by_bot_users" metrics of wmf.mediawiki_metrics
# (snapshot {MWH_SNAPSHOT}) over the window [{Y_START_DATE_FULL}, {TODAY_DATE_FULL}).
#
# Fixes relative to the previous version of this cell:
#   * the subtraction is now parenthesised before dividing by 12; previously `/` bound
#     tighter than `-`, so only the bot total was divided and the result was
#     total_edits - bot_edits / 12;
#   * `dt` was dropped from GROUP BY: it was not selected, so the query returned one
#     indistinguishable row per wiki and day instead of one aggregate row per wiki.
# NOTE(review): the hard-coded 12 assumes a twelve-month window — confirm.
dra_r = '''
    select
        wiki_db AS wiki,
        (
            sum(if(metric = "daily_edits", value, 0))
            - sum(if(metric = "daily_edits_by_bot_users", value, 0))
        ) / 12 as nonbot_revs
    from wmf.mediawiki_metrics
    where
        snapshot = "{MWH_SNAPSHOT}"
        AND dt >= "{Y_START_DATE_FULL}"
        AND dt < "{TODAY_DATE_FULL}"
        AND metric in ("daily_edits", "daily_edits_by_bot_users")
        AND wiki_db IN ({glow_wiki_dbs})
    group by wiki_db
'''

see content notebook