In [18]:
import pandas as pd
import requests
import re
import json

import wmfdata as wmf

# List of wikis

In [2]:
# Catalogue of wikis in the listed project groups. The query reverses
# `site_domain` and strips the leading dot to build an https:// URL.
wikis_query = """
select
    site_global_key as code,
    site_group as project,
    site_language as language,
    concat("https://", trim(leading "." from reverse(site_domain))) as domain
from enwiki.sites
where site_group in (
    'commons', 'incubator', 'foundation', 'mediawiki', 'meta', 'sources',
    'species', 'wikibooks', 'wikidata', 'wikinews', 'wikipedia', 'wikiquote',
    'wikisource', 'wikiversity', 'wikivoyage', 'wiktionary'
    )
"""
wikis = wmf.mariadb.run(wikis_query)

In [3]:
# Preview the first ten rows of the wiki catalogue.
wikis.head(10)

Unnamed: 0,code,project,language,domain
0,aawiki,wikipedia,aa,https://aa.wikipedia.org
1,aawiktionary,wiktionary,aa,https://aa.wiktionary.org
2,aawikibooks,wikibooks,aa,https://aa.wikibooks.org
3,abwiki,wikipedia,ab,https://ab.wikipedia.org
4,abwiktionary,wiktionary,ab,https://ab.wiktionary.org
5,acewiki,wikipedia,ace,https://ace.wikipedia.org
6,afwiki,wikipedia,af,https://af.wikipedia.org
7,afwiktionary,wiktionary,af,https://af.wiktionary.org
8,afwikibooks,wikibooks,af,https://af.wikibooks.org
9,afwikiquote,wikiquote,af,https://af.wikiquote.org


In [4]:
# English language-name files from the MediaWiki cldr extension repository;
# both are downloaded and merged into a single code -> name mapping.
urls = [
    "https://raw.githubusercontent.com/wikimedia/mediawiki-extensions-cldr/master/" + php_path
    for php_path in ("CldrNames/CldrNamesEn.php", "LocalNames/LocalNamesEn.php")
]

def get_lang_names(url):
    """
    Download a PHP file and return its ``languageNames`` array as a dict.

    The first ``languageNames = [...]`` array literal found in the response
    body is rewritten into JSON by a sequence of regex substitutions and then
    parsed with ``json.loads``.

    Parameters
    ----------
    url : str
        Address of the PHP file to download.

    Returns
    -------
    dict
        The parsed array (language code -> language name).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the response contains no ``languageNames`` array, or the rewritten
        text is not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    # Fail fast on a hung connection or an HTTP error page instead of trying
    # to parse whatever came back.
    r = requests.get(url, timeout=30)
    r.raise_for_status()

    m = re.search(r"languageNames = (\[[\s\S]+?\])", r.text)
    if m is None:
        raise ValueError("No languageNames array found in {}".format(url))
    php_ln = m.group(1)

    # Applied strictly in this order; every pattern is a raw string so the
    # regex escapes are not interpreted (and warned about) as string escapes.
    repl = [
        # Convert from PHP array format to JSON
        (r" =>", ":"),
        (r"\[", "{"),
        (r"\]", "}"),
        # Trailing commas will cause problems
        (r",\n}", "\n}"),
        # ...so will single quotes
        (r"'", '"'),
        # ...and comments
        (r"/\*[\s\S]*?\*/", ""),
        (r"#(.*?)\n", ""),
        # One hack to deal with a single quote in a language name
        (r'O"odham', "O'odham"),
    ]

    json_ln = php_ln
    for old, new in repl:
        json_ln = re.sub(old, new, json_ln)

    return json.loads(json_ln)

# Merge the name tables in list order; a code that appears in more than one
# file ends up with the name from the later file.
langs = {
    code: name
    for url in urls
    for code, name in get_lang_names(url).items()
}

In [5]:
# Add languages not included in the CLDR files
langs.update({
    # On Wikimedia projects the "als" code is used by the Alemannic wikis,
    # not by Tosk Albanian (which is what ISO 639-3 assigns "als" to).
    "als": "Alemannic",
    "atj": "Atikamekw",
    "diq": "Zazaki",
    "fiu-vro": "Võro",
    "map-bms": "Banyumasan",
    "nah": "Nahuatl",
    "pih": "Norfuk-Pitkern",
    "rmy": "Vlax Romani",
    "simple": "Simple English"
})

In [6]:
# Project groups whose display name is fixed rather than built from
# "<language> <project>".
nonstandard_projects = dict(
    commons="Wikimedia Commons",
    foundation="Wikimedia Foundation website",
    mediawiki="MediaWiki.org",
    meta="Meta-Wiki",
    sources="Multilingual Wikisource",
    species="Wikispecies",
    wikidata="Wikidata",
)

def full_name(row):
    """
    Build the human-readable name of the wiki described by `row`.

    `row` must provide "project" and "language" entries. Projects listed in
    `nonstandard_projects` use their fixed name; every other project is named
    "<language name> <Project>", with the language name looked up in `langs`
    (a missing language code raises KeyError).
    """
    project_key = row["project"]

    special_name = nonstandard_projects.get(project_key)
    if special_name:
        return special_name

    language_name = langs[row["language"]]
    return " ".join((language_name, project_key.title()))

# Attach the display name computed row by row.
wikis = wikis.assign(name=wikis.apply(full_name, axis=1))

In [7]:
# Check that the new "name" column looks right.
wikis.head()

Unnamed: 0,code,project,language,domain,name
0,aawiki,wikipedia,aa,https://aa.wikipedia.org,Afar Wikipedia
1,aawiktionary,wiktionary,aa,https://aa.wiktionary.org,Afar Wiktionary
2,aawikibooks,wikibooks,aa,https://aa.wikibooks.org,Afar Wikibooks
3,abwiki,wikipedia,ab,https://ab.wikipedia.org,Abkhazian Wikipedia
4,abwiktionary,wiktionary,ab,https://ab.wiktionary.org,Abkhazian Wiktionary


In [8]:
# Order the catalogue alphabetically by display name, then renumber the rows.
wikis_by_name = wikis.sort_values("name")
wikis = wikis_by_name.reset_index(drop=True)

In [9]:
# Save the catalogue as a tab-separated file, without the row index.
wikis.to_csv("data/wikis.tsv", sep="\t", index=False)

# Data

In [10]:
# Start the metrics table from the catalogue; "code" becomes "wiki" so it
# matches the key column of the per-wiki metric queries.
# `rename` already returns a new frame, so `wikis` itself is left untouched.
data = wikis.rename(columns={"code": "wiki"})

## New editor retention

In [None]:
# The retention query lives in its own file and takes {start} and {end}
# placeholders.
with open("queries/new_editor_retention.hql") as query_file:
    q = query_file.read()

ner_query = q.format(start="2017-04", end="2018-04")
ner = wmf.hive.run(ner_query)

In [None]:
# Spot-check the last rows of the new-editor-retention result.
ner.tail()

In [None]:
# Left join keeps every wiki, including those with no retention figure.
data = data.merge(ner, how="left", on="wiki")

## Monthly active editors

In [None]:
# Per-wiki averages over the 12 months in [start, end):
#   * monthly_active_editors: editor-months with at least 5 content edits
#   * monthly_new_active_editors: the subset whose registration month equals
#     the activity month
# Anonymous rows (local_user_id = 0) and bots are excluded; a user counts as
# a bot if flagged or if the name matches "bot\b", except for three named
# accounts.
# The table is qualified as staging.editor_month, like the other queries in
# this notebook, so the result does not depend on the connection's default
# database.
mae = wmf.mariadb.run("""
select
    wiki,
    count(*) / 12 as monthly_active_editors,
    sum(
        extract(year_month from user_registration) = extract(year_month from month)
    ) / 12 as monthly_new_active_editors
from staging.editor_month
where
    content_edits >= 5 and
    month >= "{start}" and
    month < "{end}" and
    local_user_id != 0 and
    bot_flag = 0 and (
        convert(user_name using utf8) not regexp "bot\\\\b" or
        convert(user_name using utf8) in ("Paucabot", "Niabot", "Marbot")
    )
group by wiki;
""".format(start="2017-06-01", end="2018-06-01"))

In [None]:
# Spot-check the last rows of the monthly-active-editors result.
mae.tail()

In [None]:
# Left join keeps every wiki, including those with no active-editor figure.
data = data.merge(mae, how="left", on="wiki")

## Monthly unique devices

In [None]:
# Average monthly unique devices per domain for 2017-06 through 2018-05.
# The nested regexp_replace calls fold the "zero." and mobile ("m." prefix or
# ".m." infix) variants of a domain into the plain domain before grouping.
#
# The month is zero-padded before it is concatenated with the year.
# NOTE(review): this assumes `year` and `month` are unpadded integer partition
# columns -- confirm against the table schema. Without padding, January 2018
# becomes "20181", which sorts after "201806" and is dropped, while February
# 2017 becomes "20172", which sorts after "201706" and is wrongly included.
# If `month` is already a two-character string, lpad leaves it unchanged.
mud = wmf.hive.run("""
select
    regexp_replace(
        regexp_replace(
            regexp_replace(domain, "zero\\\\.", ""),
        '^m\\\\.', ''),
    '\\\\.m\\\\.', '.') as domain,
  sum(uniques_estimate) / 12 as monthly_unique_devices
FROM wmf.unique_devices_per_domain_monthly
WHERE
    concat(year, lpad(cast(month as string), 2, "0")) >= "201706" and
    concat(year, lpad(cast(month as string), 2, "0")) < "201806"
group by
    regexp_replace(
        regexp_replace(
            regexp_replace(domain, "zero\\\\.", ""),
        '^m\\\\.', ''),
    '\\\\.m\\\\.', '.')
""")

# Prefix the scheme so the values match the "domain" column of the wiki
# catalogue, which is the join key for this metric.
mud["domain"] = "https://" + mud["domain"]

In [None]:
# Look at 25 random rows to check the domain normalisation.
mud.sample(25)

In [None]:
# This metric is keyed by domain rather than by wiki code.
data = data.merge(mud, how="left", on="domain")

## Mobile editing

In [None]:
# Share of non-bot edits made through the mobile web site or the mobile apps,
# per wiki, for months in [start, end).
mep_query = """
    select 
        wiki,
        sum(mobile_web_edits + mobile_app_edits) / sum(edits) as mobile_editing_proportion
    from staging.editor_month
    where
        month >= "{start}" and
        month < "{end}" and
        -- A user is a bot if they have a matching name or have the bot flag on *any* wiki
        -- See https://meta.wikimedia.org/wiki/Research:Active_editor and https://meta.wikimedia.org/wiki/Research:Bot_user
        bot_flag = 0 and (
            -- Convert from BINARY to CHAR so that case-insentive regexes work
            convert(user_name using utf8) not regexp "bot\\\\b" or
            convert(user_name using utf8) in ("Paucabot", "Niabot", "Marbot")    
        )
    group by wiki;
""".format(start="2017-06-01", end="2018-06-01")
mep = wmf.mariadb.run(mep_query)

In [None]:
# Spot-check the last rows of the mobile-editing result.
mep.tail()

In [None]:
# Left join keeps every wiki, including those with no mobile-editing figure.
data = data.merge(mep, how="left", on="wiki")

## Bot editing

In [None]:
# Share of all edits made by bots, per wiki, for months in [start, end).
# An editor counts as a bot when flagged, or when the name matches "bot\b"
# and is not one of the three named accounts that the other queries in this
# notebook treat as human. Applying the same exceptions here keeps the bot
# definition consistent across metrics.
bep = wmf.mariadb.run("""
    select
        wiki,
        sum(
            if(
                bot_flag = 1 or (
                    convert(user_name using utf8) regexp "bot\\\\b" and
                    convert(user_name using utf8) not in ("Paucabot", "Niabot", "Marbot")
                ),
                edits,
                0
            )
        ) / sum(edits) as bot_editing_proportion
    from staging.editor_month
    where
        month >= "{start}" and
        month < "{end}"
    group by wiki;
""".format(start="2017-06-01", end="2018-06-01"))

In [None]:
# Spot-check the last rows of the bot-editing result.
bep.tail()

In [None]:
# Left join keeps every wiki, including those with no bot-editing figure.
data = data.merge(bep, how="left", on="wiki")

## Revert rate

In [None]:
# Fraction of revisions created in [start, end) that were identity-reverted,
# per wiki, leaving out users flagged as bots by name or by group.
rr_query = """
    select
        wiki_db as wiki,
        sum(cast(revision_is_identity_reverted as int)) / count(*) as revert_rate
    from
        wmf.mediawiki_history
    where
        event_entity = "revision" and
        event_type = "create" and
        snapshot = "{snapshot}" and
        event_timestamp >= "{start}" and
        event_timestamp < "{end}" and
        event_user_is_bot_by_name = false and
        array_contains(event_user_groups, "bot") = false
    group by wiki_db
""".format(snapshot="2018-05", start="2017-06", end="2018-06")
rr = wmf.hive.run(rr_query)

In [None]:
# Spot-check the last rows of the revert-rate result.
rr.tail()

In [None]:
# Left join keeps every wiki, including those with no revert-rate figure.
data = data.merge(rr, how="left", on="wiki")

## Anonymous editing proportion

In [None]:
# Share of non-bot edits made by rows with local_user_id = 0, per wiki, for
# months in [start, end).
aep_query = """
    select 
        wiki,
        sum(if(local_user_id = 0, edits, 0)) / sum(edits) as anonymous_editing_proportion
    from staging.editor_month
    where
        month >= "{start}" and
        month < "{end}" and
        -- A user is a bot if they have a matching name or have the bot flag on *any* wiki
        -- See https://meta.wikimedia.org/wiki/Research:Active_editor and https://meta.wikimedia.org/wiki/Research:Bot_user
        bot_flag = 0 and (
            -- Convert from BINARY to CHAR so that case-insentive regexes work
            convert(user_name using utf8) not regexp "bot\\\\b" or
            convert(user_name using utf8) in ("Paucabot", "Niabot", "Marbot")    
        )
    group by wiki;
""".format(start="2017-06-01", end="2018-06-01")
aep = wmf.mariadb.run(aep_query)

In [None]:
# Spot-check the last rows of the anonymous-editing result.
aep.tail()

In [None]:
# Add the anonymous-editing proportion to the metrics table. The left join
# keeps every wiki, including those with no figure.
data = pd.merge(data, aep, how="left", on="wiki")

## Monthly active administrators

In [17]:
# Monthly active administrators: for each wiki and calendar month, count the
# distinct users with at least one block, protect, delete or rights log
# entry, then average the monthly counts over 12 months.
#
# The result is stored as `maa` so that it no longer overwrites `mae`
# (monthly active editors), and it is merged into `data` like every other
# metric. Run this cell once per session: re-running it merges the column
# a second time.
maa = wmf.hive.run("""
    select
        wiki,
        sum(monthly_active_administrators) / 12 as monthly_active_administrators
    from (
        select
            wiki_db as wiki,
            substr(log_timestamp, 1, 6) as month,
            count(distinct log_user) as monthly_active_administrators
        from wmf_raw.mediawiki_logging
        where
            log_type in ("block", "protect", "delete", "rights") and
            log_timestamp >= "{start}" and
            log_timestamp < "{end}" and
            snapshot = "{snapshot}"
        group by wiki_db, substr(log_timestamp, 1, 6)
    ) mae
    group by wiki
""".format(start="201706", end="201806", snapshot="201805"))

data = pd.merge(data, maa, how="left", on="wiki")

# Readying for spreadsheet

In [None]:
# Alphabetise by wiki name, then replace missing metric values with 0.
data_by_name = data.sort_values("name")
data = data_by_name.fillna(0)

def print_ser(ser):
    """Print each value of `ser.values` on its own line; print nothing if empty."""
    rendered = [str(value) for value in ser.values]
    if rendered:
        print("\n".join(rendered))

In [None]:
# Print the revert-rate column one value per line.
print_ser(data["revert_rate"])