# Collect Baselines - Monthly wikis

# Monthly Averages Baseline

## Wikis <a class="anchor" id="wikis"></a>
[Back to Table of Contents](#toc)

In [1]:
# Query template: fetch the name and ISO code of selected countries from the
# canonical countries table available in Hive.
# Canonical data source:
# https://github.com/wikimedia-research/canonical-data/blob/master/countries.csv
# Placeholder: {glow_countries} is spliced inside IN (...), so it is presumably
# a comma-separated list of quoted country names -- TODO confirm against the caller.
countries_r = '''
SELECT
  name, 
  iso_code
FROM canonical_data.countries
WHERE name in ({glow_countries})
'''

In [2]:
# Query template: gather the content wikis of interest from the canonical wikis
# table, returning database code, project group, language code/name, wiki name
# and the domain prefixed with "https://".
# Only open, publicly visible, publicly editable wikis in the "mediawiki",
# "wikidata" and "wikipedia" groups are kept.
# https://github.com/wikimedia-research/canonical-data/blob/master/countries.csv
# NOTE(review): the link above is the countries file (same URL as the countries
# query); the source for canonical_data.wikis is presumably a sibling file in
# that repository -- TODO confirm and update the link.
# Placeholder: {india_wiki_dbs} is spliced inside IN (...), presumably a
# comma-separated list of quoted database codes -- TODO confirm.
wikis_r = '''
SELECT
  database_code,
  database_group AS project_code,
  language_code,
  language_name,
  english_name as wiki_name,
  CONCAT("https://", domain_name) AS domain_name
FROM canonical_data.wikis
WHERE
  database_group in ("mediawiki", "wikidata", "wikipedia") 
  AND status = "open" 
  AND visibility = "public" 
  AND editability = "public"
  AND database_code IN ({india_wiki_dbs})
'''

## Devices <a class="anchor" id="article_detail"></a>
[Back to Table of Contents](#toc)

#### Monthly Unique Devices <a class="anchor" id="devices"></a>
[Back to Table of Contents](#toc)

In [None]:
# Query template: average monthly unique devices per project domain, restricted
# to rows with country_code 'IN'.
#
# The nested regexp_replace calls normalise the domain by removing "zero.",
# a leading "m." and an embedded ".m." so the stripped variants group under a
# single domain_name.
# Each regex dot is escaped with four backslashes in this (non-raw) Python
# string, which places two backslashes in the SQL text that is sent out.
#
# SUM(uniques_estimate) is divided by 12. The year-month window is
# [{contest_start_dt_12m_pre_pv}, {contest_start_dt_pv}); presumably twelve
# months wide and formatted like CONCAT(year, zero-padded month) -- TODO confirm.
# NOTE(review): `domain IN ({india_domains})` is applied to the raw domain,
# before normalisation, so mobile/zero hosts only contribute if that list
# names them explicitly -- verify against the caller.
mca_uds_r = '''
SELECT
    regexp_replace(
        regexp_replace(
            regexp_replace(domain, "zero\\\\.", ""),
        '^m\\\\.', ''),
    '\\\\.m\\\\.', '.') AS domain_name,
  SUM(uniques_estimate) / 12 AS monthly_unique_devices
FROM wmf.unique_devices_per_domain_monthly
WHERE 
    CONCAT(year,LPAD(month,2,'0')) >= ({contest_start_dt_12m_pre_pv})
    AND CONCAT(year,LPAD(month,2,'0')) < ({contest_start_dt_pv})
    AND country_code IN ('IN')
    AND domain IN ({india_domains})
GROUP BY    
    regexp_replace(
        regexp_replace(
            regexp_replace(domain, "zero\\\\.", ""),
        '^m\\\\.', ''),
    '\\\\.m\\\\.', '.')
''' 

## Editors<a class="anchor" id="editors"></a>
[Back to Table of Contents](#toc)

#### Monthly editors & monthly new <a class="anchor" id="editors_monthly"></a>
[Back to Table of Contents](#toc)

In [2]:
# Query template: average monthly editors and average monthly new editors per wiki.
# Adapted from:
# https://github.com/wikimedia-research/wiki-segmentation
# https://github.com/wikimedia-research/Editing-movement-metrics
#
# Each row of cchen.editor_month that passes the filters is counted once; the
# totals are divided by 12. "New" rows are those whose registration month equals
# the activity month.
# Placeholders: {contest_start_dt_12m_pre} (inclusive lower bound),
# {contest_start_dt} (exclusive upper bound), {india_wiki_dbs} (spliced inside
# IN (...), presumably quoted database codes -- TODO confirm).
#
# Escaping: the word-boundary regex is written with FOUR backslashes so that the
# SQL text carries `bot\\b`. Hive/Spark string literals consume one escape level,
# leaving the regex `bot\b`. With only two backslashes here the engine receives
# `\b`, which its literal parser turns into a backspace character, so the
# bot-name filter would never match.
mce_r = '''
SELECT
    wiki AS database_code,
    COUNT(*) / 12 AS monthly_editors,
    sum(CAST(TRUNC(user_registration, 'MM') = TRUNC(month, 'MM') AS INT))/ 12 AS monthly_new_editors
FROM cchen.editor_month
WHERE
    month >= "{contest_start_dt_12m_pre}"
    AND month < "{contest_start_dt}"
    AND wiki IN ({india_wiki_dbs})
    AND user_id != 0
    AND bot_by_group = FALSE
    AND (user_name not regexp "bot\\\\b" or user_name in ("Paucabot", "Niabot", "Marbot"))
GROUP BY wiki
'''

#### Monthly New Active Editors & monthly active editors <a class="anchor" id="editors_active"></a>
[Back to Table of Contents](#toc)

In [19]:
# Query template: average monthly active editors and average monthly new active
# editors per wiki. "Active" here means content_edits >= 5 in the month.
# Adapted from:
# https://github.com/wikimedia-research/wiki-segmentation
# https://github.com/wikimedia-research/Editing-movement-metrics
#
# Qualifying rows of cchen.editor_month are counted and divided by 12; "new" rows
# are those whose registration month equals the activity month.
# Placeholders: {contest_start_dt_12m_pre} (inclusive lower bound),
# {contest_start_dt} (exclusive upper bound), {india_wiki_dbs} (spliced inside
# IN (...), presumably quoted database codes -- TODO confirm).
#
# Escaping: the word-boundary regex is written with FOUR backslashes so that the
# SQL text carries `bot\\b`. Hive/Spark string literals consume one escape level,
# leaving the regex `bot\b`. With only two backslashes here the engine receives
# `\b`, which its literal parser turns into a backspace character, so the
# bot-name filter would never match.
mnae_r = '''
SELECT
    wiki AS database_code,
    COUNT(*) / 12 AS monthly_active_editors,
    SUM(
        CAST(TRUNC(user_registration, 'MM') = TRUNC(month, 'MM') AS INT)
        )/ 12 AS monthly_new_active_editors
FROM cchen.editor_month
WHERE
    content_edits >= 5
    AND month >= "{contest_start_dt_12m_pre}"
    AND month < "{contest_start_dt}"
    AND wiki IN ({india_wiki_dbs})
    AND user_id != 0
    AND bot_by_group = FALSE
    AND (user_name not regexp "bot\\\\b" or user_name in ("Paucabot", "Niabot", "Marbot"))
GROUP BY wiki
'''

#### Monthly Editors - including group of big wikis <a class="anchor" id="editors_big_wikis"></a>
[Back to Table of Contents](#toc)

In [3]:
# Query template: average monthly editors on the group of "big" wikis.
#
# Qualifying rows of cchen.editor_month are counted per wiki and divided by 12.
# Placeholders: {contest_start_dt_12m_pre} (inclusive lower bound),
# {contest_start_dt} (exclusive upper bound), {wikis_big}.
# NOTE(review): unlike the other queries, {wikis_big} is NOT wrapped in
# parentheses, so the caller presumably passes an already parenthesised list
# (e.g. a Python tuple rendering) -- TODO confirm.
# NOTE(review): the output column is named indic_editors_on_big_wikis_m, but
# nothing in this query restricts rows to a particular editor population;
# verify whether a further filter or join is applied downstream.
#
# Escaping: the word-boundary regex is written with FOUR backslashes so that the
# SQL text carries `bot\\b`. Hive/Spark string literals consume one escape level,
# leaving the regex `bot\b`. With only two backslashes here the engine receives
# `\b`, which its literal parser turns into a backspace character, so the
# bot-name filter would never match.
mae_r = '''
SELECT
    em.wiki AS database_code,
    COUNT(*) / 12 AS indic_editors_on_big_wikis_m
FROM cchen.editor_month em
WHERE
    em.month >= "{contest_start_dt_12m_pre}"
    AND em.month < "{contest_start_dt}"
    AND em.wiki IN {wikis_big}
    AND em.user_id != 0
    AND em.bot_by_group = FALSE
    AND (em.user_name not regexp "bot\\\\b" or em.user_name in ("Paucabot", "Niabot", "Marbot"))
GROUP BY em.wiki
'''

#### New editor retention <a class="anchor" id="new_editor_retention"></a>
[Back to Table of Contents](#toc)

In [None]:
# Query template: new-editor retention per wiki.
# Adapted from:
# https://github.com/wikimedia-research/wiki-segmentation
# https://github.com/wikimedia-research/Editing-movement-metrics
#
# Retention is computed as (rows with 2nd_month_edits >= 1) divided by
# (rows with 1st_month_edits >= 1) over the selected cohorts of
# neilpquinn.new_editors.
# Placeholders: {contest_start_dt_12m_pre} (inclusive lower cohort bound),
# {contest_start_dt} (exclusive upper cohort bound), {india_wiki_dbs} (spliced
# inside IN (...), presumably quoted database codes -- TODO confirm).
# NOTE(review): a wiki with no row having 1st_month_edits >= 1 yields a zero
# denominator; check how the engine and downstream code treat that result.

ner_r = '''
select 
    wiki AS database_code,
    sum(cast(2nd_month_edits >= 1 as int)) / sum(cast(1st_month_edits >= 1 as int)) AS new_editor_retention
from neilpquinn.new_editors
where 
    cohort >= "{contest_start_dt_12m_pre}"
    AND cohort < "{contest_start_dt}"
    AND wiki IN ({india_wiki_dbs})
group by wiki
'''

## Articles <a class="anchor" id="articles"></a>
[Back to Table of Contents](#toc)

#### New articles <a class="anchor" id="new_articles"></a>
[Back to Table of Contents](#toc)

In [94]:
# Query template: per-wiki count of revision-create events, divided by 12,
# exposed as mon_new_articles.
# Adapted from https://github.com/wikimedia-research/2018-19-Language-annual-plan-metrics/blob/master/Language-metrics.ipynb
# Table reference: https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Edits/Mediawiki_history
# In mediawiki_history, event_entity = "revision" with event_type = "create"
# means "making an edit".
#
# The time window is [{contest_start_dt_12m_pre_FULL}, {contest_start_dt_FULL});
# the count is divided by 12, so the window is presumably twelve months wide
# -- TODO confirm. (An earlier note here read "6 months ≈ 26 weeks = 252 days,
# period starts 2019/07"; 26 weeks is 182 days and the note does not match the
# placeholders, so it appears to be left over from the source notebook.)
#
# NOTE(review): the result is named mon_new_articles, yet the filter selects
# every revision creation, not only the first edit to a page -- verify intent.
# NOTE(review): `wiki` is unqualified, whereas the other mediawiki_history
# queries in this notebook use `wiki_db`; it presumably resolves to a column of
# the joined account-creation table -- TODO confirm which table supplies it.
# NOTE(review): the left join matches on user name only (ssac.year >= 0 looks
# like a partition predicate) and no ssac column is selected explicitly; if a
# user name matches several ssac rows, count(*) is multiplied -- confirm.

m_new_article_counts_r = ''' 
select
    wiki AS database_code,
    count(*)/12 as mon_new_articles
from wmf.mediawiki_history mh
left join event_sanitized.serversideaccountcreation ssac
on
    ssac.event.username = event_user_text and
    ssac.year >= 0
where
    mh.snapshot = "{MWH_SNAPSHOT}"
    AND mh.event_timestamp >= "{contest_start_dt_12m_pre_FULL}"
    AND mh.event_timestamp < "{contest_start_dt_FULL}" 
    AND event_entity = "revision"
    AND event_type = "create"
    AND wiki in ({india_wiki_dbs})
GROUP BY wiki
''' 

#### avg_num_new_articles_edited <a class="anchor" id="new_articles_edited"></a>
[Back to Table of Contents](#toc)

In [None]:
# Query template: per-wiki count of page-create events, divided by 12, exposed
# as mon_new_articles_edited.
# Adapted from https://github.com/wikimedia-research/2018-19-Language-annual-plan-metrics/blob/master/Language-metrics.ipynb
# Table reference: https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Edits/Mediawiki_history
# In mediawiki_history, event_entity = "page" with event_type = "create" means
# "making the first edit to a page".
#
# The time window is [{contest_start_dt_12m_pre_FULL}, {contest_start_dt_FULL});
# the count is divided by 12, so the window is presumably twelve months wide
# -- TODO confirm. (An earlier note here read "6 months ≈ 26 weeks = 252 days";
# 26 weeks is 182 days and the note does not match the placeholders.)
# NOTE(review): there is no namespace filter, so page creations in every
# namespace are counted, not only articles -- verify intent.
# Placeholders: {MWH_SNAPSHOT}, {contest_start_dt_12m_pre_FULL},
# {contest_start_dt_FULL}, {india_wiki_dbs} (spliced inside IN (...)).
m_new_articles_edited_r = ''' 
SELECT
    wiki_db AS database_code, 
    count(*)/12 AS mon_new_articles_edited
FROM wmf.mediawiki_history mh
WHERE
    mh.snapshot = "{MWH_SNAPSHOT}" 
    AND mh.event_timestamp >= "{contest_start_dt_12m_pre_FULL}"
    AND mh.event_timestamp < "{contest_start_dt_FULL}" 
    AND event_entity = "page"
    AND event_type = "create"
    AND wiki_db in ({india_wiki_dbs})
GROUP BY wiki_db
''' 

#AND ssac.webhost LIKE '%wikipedia.org'

#### Existing articles, recently edited <a class="anchor" id="article_edits"></a>
[Back to Table of Contents](#toc)

In [None]:
# Query template: per-wiki count of revision-create events, divided by 12,
# exposed as avg_n_existing_articles_edited.
# Adapted from https://github.com/wikimedia-research/2018-19-Language-annual-plan-metrics/blob/master/Language-metrics.ipynb
# Table reference: https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Edits/Mediawiki_history
#
# The time window is [{contest_start_dt_12m_pre_FULL}, {contest_start_dt_FULL});
# the count is divided by 12, so the window is presumably twelve months wide
# -- TODO confirm. (An earlier note here read "6 months ≈ 26 weeks = 252 days,
# period starts 2019/07"; 26 weeks is 182 days and the note does not match the
# placeholders.)
# NOTE(review): count(*) counts revisions, not distinct pages, and nothing here
# excludes newly created pages or non-article namespaces, although the output
# column is named "existing articles edited" -- verify intent.
# Placeholders: {MWH_SNAPSHOT}, {contest_start_dt_12m_pre_FULL},
# {contest_start_dt_FULL}, {india_wiki_dbs} (spliced inside IN (...)).

eae_r = ''' 
select
    wiki_db AS database_code, 
    count(*)/12 as avg_n_existing_articles_edited
from wmf.mediawiki_history mh
where
    mh.snapshot = "{MWH_SNAPSHOT}"  
    AND mh.event_timestamp >= "{contest_start_dt_12m_pre_FULL}"
    AND mh.event_timestamp < "{contest_start_dt_FULL}" 
    AND event_entity = "revision"
    AND event_type = "create" 
    AND wiki_db in ({india_wiki_dbs})
GROUP BY wiki_db
''' 

#### Daily revisions by wiki <a class="anchor" id="daily_wiki_revisions"></a>
[Back to Table of Contents](#toc)

In [None]:
# Query template: non-bot revisions per wiki, derived from wmf.mediawiki_metrics.
# `dr` stands for "daily revisions".
#
# The query sums the "daily_edits" metric, subtracts the summed
# "daily_edits_by_bot_users" metric, and divides the DIFFERENCE by 5.
# The subtraction is parenthesised on purpose: without the parentheses SQL
# operator precedence divides only the bot sum, giving total - bot/5.
# NOTE(review): the divisor 5 is kept as found; every other baseline query in
# this notebook divides by 12 -- TODO confirm which divisor is intended.
# Placeholders: {MWH_SNAPSHOT}, {contest_start_dt_12m_pre_FULL} (inclusive lower
# bound on dt), {contest_start_dt_FULL} (exclusive upper bound),
# {india_wiki_dbs} (spliced inside IN (...)).
dra_r = ''' 
    select
        wiki_db AS database_code, 
        (sum(if(metric = "daily_edits", value, 0)) - sum(if(metric = "daily_edits_by_bot_users", value, 0))) / 5 as nonbot_revs
    from wmf.mediawiki_metrics
    where
        snapshot = "{MWH_SNAPSHOT}" 
        AND dt >="{contest_start_dt_12m_pre_FULL}"
        AND dt <"{contest_start_dt_FULL}"
        AND metric in ("daily_edits", "daily_edits_by_bot_users")
        AND wiki_db IN ({india_wiki_dbs})
    group by wiki_db
''' 