In [1]:
import numpy as np
import pandas as pd
import requests
import re
import json
import datetime
from imp import reload

import wmfdata as wmf

You can find the source for `wmfdata` at https://github.com/neilpquinn/wmfdata


# List of wikis

In [2]:
# Build the master list of wikis: one row per entry of `enwiki.sites` whose
# `site_group` is one of the project families listed in the query.
# `domain` is rebuilt from `site_domain` by reversing the string, trimming a
# leading "." and prefixing "https://" (site_domain is presumably stored
# reversed, e.g. "org.wikipedia.aa." -- TODO confirm against the table).
wikis = wmf.mariadb.run("""
select
    site_global_key as code,
    site_group as project,
    site_language as language,
    concat("https://", trim(leading "." from reverse(site_domain))) as domain
from enwiki.sites
where site_group in (
    'commons', 'incubator', 'foundation', 'mediawiki', 'meta', 'sources',
    'species', 'wikibooks', 'wikidata', 'wikinews', 'wikipedia', 'wikiquote',
    'wikisource', 'wikiversity', 'wikivoyage', 'wiktionary'
    )
""")

In [3]:
wikis.head(10)

Unnamed: 0,code,project,language,domain
0,aawiki,wikipedia,aa,https://aa.wikipedia.org
1,aawiktionary,wiktionary,aa,https://aa.wiktionary.org
2,aawikibooks,wikibooks,aa,https://aa.wikibooks.org
3,abwiki,wikipedia,ab,https://ab.wikipedia.org
4,abwiktionary,wiktionary,ab,https://ab.wiktionary.org
5,acewiki,wikipedia,ace,https://ace.wikipedia.org
6,afwiki,wikipedia,af,https://af.wikipedia.org
7,afwiktionary,wiktionary,af,https://af.wiktionary.org
8,afwikibooks,wikibooks,af,https://af.wikibooks.org
9,afwikiquote,wikiquote,af,https://af.wikiquote.org


In [4]:
urls = [
    "https://raw.githubusercontent.com/wikimedia/mediawiki-extensions-cldr/master/CldrNames/CldrNamesEn.php",
    "https://raw.githubusercontent.com/wikimedia/mediawiki-extensions-cldr/master/LocalNames/LocalNamesEn.php"
]

def get_lang_names(url):
    """Download a MediaWiki CLDR PHP file and return its language names.

    The file is expected to contain a PHP short-array assignment of the form
    ``languageNames = [ 'code' => 'Name', ... ]``. That array literal is
    rewritten into JSON with a series of regex substitutions and then parsed.

    Parameters
    ----------
    url : str
        Address of the raw PHP source file.

    Returns
    -------
    dict
        Mapping of language code to language name.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If no ``languageNames`` array is found in the response, or if the
        rewritten text is not valid JSON (``json.JSONDecodeError`` is a
        subclass of ``ValueError``).
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(url, timeout=30)
    # Fail here on 4xx/5xx instead of trying to parse an error page.
    r.raise_for_status()

    m = re.search(r"languageNames = (\[[\s\S]+?\])", r.text)
    if m is None:
        raise ValueError("No languageNames array found in {}".format(url))
    php_ln = m.group(1)

    # Each pair is (regex pattern, replacement), applied in this order.
    # Patterns containing backslash escapes meant for the regex engine are raw
    # strings, so Python does not warn about invalid escape sequences.
    repl = [
        # Convert from PHP array format to JSON
        (r" =>", ":"),
        (r"\[", "{"),
        (r"\]", "}"),
        # Trailing commas will cause problems
        (",\n}", "\n}"),
        # ...so will single quotes
        ("'", '"'),
        # ...and comments
        (r"/\*[\s\S]*?\*/", ""),
        (r"#(.*?)\n", ""),
        # One hack to deal with a single quote in a language name
        ('O"odham', "O'odham")
    ]

    json_ln = php_ln
    for old, new in repl:
        json_ln = re.sub(old, new, json_ln)

    return json.loads(json_ln)

langs = {}
for url in urls:
    langs.update(get_lang_names(url))

In [5]:
# Add languages not included in the CLDR files
langs.update({
    "als": "Tosk",
    "atj": "Atikamekw",
    "diq": "Zazaki",
    "fiu-vro": "Võro",
    "map-bms": "Banyumasan",
    "nah": "Nahuatl",
    "pih": "Norfuk-Pitkern",
    "rmy": "Vlax Romani",
    "simple": "Simple English"
})

In [6]:
nonstandard_projects = {
    "commons": "Wikimedia Commons",
    "foundation": "Wikimedia Foundation website",
    "mediawiki": "MediaWiki.org",
    "meta": "Meta-Wiki",
    "sources": "Multilingual Wikisource",
    "species": "Wikispecies",
    "wikidata": "Wikidata"
}

def full_name(row):
    """Return the human-readable name of the wiki described by `row`.

    `row` must provide "project" and "language" entries. Projects listed in
    `nonstandard_projects` use their fixed name; every other wiki is named
    "<language name> <Project>", with the language name looked up in `langs`
    (a missing language code raises KeyError).
    """
    project_key = row["project"]

    fixed_name = nonstandard_projects.get(project_key)
    if fixed_name:
        return fixed_name

    language_name = langs[row["language"]]
    return language_name + " " + project_key.title()

wikis["name"] = wikis.apply(full_name, axis=1)

In [7]:
wikis.head()

Unnamed: 0,code,project,language,domain,name
0,aawiki,wikipedia,aa,https://aa.wikipedia.org,Afar Wikipedia
1,aawiktionary,wiktionary,aa,https://aa.wiktionary.org,Afar Wiktionary
2,aawikibooks,wikibooks,aa,https://aa.wikibooks.org,Afar Wikibooks
3,abwiki,wikipedia,ab,https://ab.wikipedia.org,Abkhazian Wikipedia
4,abwiktionary,wiktionary,ab,https://ab.wiktionary.org,Abkhazian Wiktionary


In [8]:
wikis = wikis.sort_values("name").reset_index(drop=True)

In [9]:
wikis.to_csv("data/wikis.tsv", sep="\t", index=False)

# Data

In [10]:
# Create `data` from the wiki list only if it does not exist yet, so that
# re-running this cell keeps whatever has already been merged into it.
if "data" not in globals():
    data = wikis.copy().rename(columns={"code": "wiki"})

In [11]:
def merge_into_data(df, on="wiki"):
    """Left-join `df` onto the global `data` frame, matching on column `on`.

    The global `data` is rebound to the merged result. After the join every
    missing value in the whole frame (not only in the new columns) is
    replaced with 0.
    """
    global data
    merged = data.merge(df, how="left", on=on)
    data = merged.fillna(0)

In [12]:
def top_10(df, col):
    """Return the ten rows of `df` with the largest values in column `col`.

    Rows come back in descending order of `col`; a frame with fewer than ten
    rows is returned whole (sorted).
    """
    ranked = df.sort_values(by=col, ascending=False)
    return ranked.iloc[:10]

## Monthly active editors

In [13]:
# Average monthly active editors per wiki for months from `start` (inclusive)
# to `end` (exclusive). The `/ 12` divisors are tied to that 12-month window
# and must be changed together with the dates.
# Only editor_month rows with at least 5 content edits and a non-zero
# local_user_id are counted. Rows are excluded when the bot flag is set or the
# user name matches "bot\b", except for the three names listed in the query.
# `monthly_new_active_editors` counts the rows whose registration year-month
# equals the activity year-month.
# NOTE(review): the table is referenced as `editor_month` here, while other
# queries in this notebook use `staging.editor_month` -- presumably this relies
# on the connection's default database; confirm.
mae = wmf.mariadb.run("""
select
    wiki,
    count(*) / 12 as monthly_active_editors,
    sum(
        extract(year_month from user_registration) = extract(year_month from month)
    ) / 12 as monthly_new_active_editors
from editor_month
where
    content_edits >= 5 and
    month >= "{start}" and
    month < "{end}" and
    local_user_id != 0 and
    bot_flag = 0 and (
        convert(user_name using utf8) not regexp "bot\\\\b" or
        convert(user_name using utf8) in ("Paucabot", "Niabot", "Marbot")    
    )
group by wiki;
""".format(start="2017-06-01", end="2018-06-01"))

mae.tail()

Unnamed: 0,wiki,monthly_active_editors,monthly_new_active_editors
667,zhwikiquote,11.3333,1.3333
668,zhwikisource,35.5,2.9167
669,zhwikivoyage,9.4167,1.75
670,zhwiktionary,16.9167,2.25
671,zuwiki,2.9167,0.3333


In [14]:
data = pd.merge(data, mae, how="left", on="wiki")

## Monthly unique devices

In [15]:
# Average monthly unique devices per domain for 2017-06 (inclusive) to
# 2018-06 (exclusive); the `/ 12` divisor matches that 12-month window.
# The nested regexp_replace calls strip "zero." and the mobile "m." label so
# that mobile and Zero traffic is summed into the desktop domain.
# The month is zero-padded before being concatenated with the year: if `month`
# is stored unpadded (e.g. as an int), concat(year, month) gives "20176" for
# June 2017, and comparing that string with "201706" selects the wrong months.
# Padding is a no-op when the value is already two characters wide.
mud = wmf.hive.run("""
select
    regexp_replace(
        regexp_replace(
            regexp_replace(domain, "zero\\\\.", ""),
        '^m\\\\.', ''),
    '\\\\.m\\\\.', '.') as domain,
  sum(uniques_estimate) / 12 as monthly_unique_devices
FROM wmf.unique_devices_per_domain_monthly
WHERE 
    concat(cast(year as string), lpad(cast(month as string), 2, "0")) >= "201706" and
    concat(cast(year as string), lpad(cast(month as string), 2, "0")) < "201806"
group by    
    regexp_replace(
        regexp_replace(
            regexp_replace(domain, "zero\\\\.", ""),
        '^m\\\\.', ''),
    '\\\\.m\\\\.', '.')
""")

mud["domain"] = "https://" + mud["domain"]

mud.sample(20)

Unnamed: 0,domain,monthly_unique_devices
532,https://nl.wikimedia.org,4916.167
372,https://jbo.wikipedia.org,8239.5
310,https://hr.wikiquote.org,25444.5
472,https://mh.wikipedia.org,1915.75
450,https://ln.wikipedia.org,13006.5
192,https://en.wikibooks.org,4560447.0
185,https://el.wikipedia.org,4842761.0
408,https://koi.wikipedia.org,7286.333
563,https://or.wikisource.org,3398.75
632,https://ru.wikiversity.org,184171.7


In [16]:
data = pd.merge(data, mud, how="left", on="domain")

## Overall size rank

In [17]:
# Overall size is the geometric mean of monthly unique devices and monthly
# active editors (square root of their product).
size = np.sqrt(data["monthly_unique_devices"] * data["monthly_active_editors"])
# Rank 1 is the largest wiki (ascending=False); tied wikis share the lowest
# rank of their group (method="min"); wikis whose size is NaN are ranked last
# (na_option="bottom").
rank = size.rank(method="min", na_option="bottom", ascending=False)
data["overall_size_rank"] = rank

## New editor retention

In [None]:
with open("queries/new_editor_retention.hql") as f:
    q = f.read()

ner = wmf.hive.run(
    q.format(start = "2017-04", end = "2018-04")
)

In [None]:
ner.tail()

In [None]:
data = pd.merge(data, ner, how="left", on="wiki")

## Mobile editing

In [None]:
mep = wmf.mariadb.run("""
    select 
        wiki,
        sum(mobile_web_edits + mobile_app_edits) / sum(edits) as mobile_editing_proportion
    from staging.editor_month
    where
        month >= "{start}" and
        month < "{end}" and
        -- A user is a bot if they have a matching name or have the bot flag on *any* wiki
        -- See https://meta.wikimedia.org/wiki/Research:Active_editor and https://meta.wikimedia.org/wiki/Research:Bot_user
        bot_flag = 0 and (
            -- Convert from BINARY to CHAR so that case-insentive regexes work
            convert(user_name using utf8) not regexp "bot\\\\b" or
            convert(user_name using utf8) in ("Paucabot", "Niabot", "Marbot")    
        )
    group by wiki;
""".format(start="2017-06-01", end="2018-06-01"))

In [None]:
mep.tail()

In [None]:
data = pd.merge(data, mep, how="left", on="wiki")

## Bot editing proportion

In [None]:
# Proportion of all edits per wiki that were made by bots, for months from
# `start` (inclusive) to `end` (exclusive).
# An edit counts as a bot edit when the account has the bot flag, or when its
# name matches "bot\b" and is not one of the three accounts that the other
# queries in this notebook explicitly treat as human ("Paucabot", "Niabot",
# "Marbot"). Without that exemption those accounts would be counted as bots
# here but as non-bots everywhere else.
bep = wmf.mariadb.run("""
    select 
        wiki,
        sum(
            if(
                bot_flag = 1 or (
                    convert(user_name using utf8) regexp "bot\\\\b" and
                    convert(user_name using utf8) not in ("Paucabot", "Niabot", "Marbot")
                ),
                edits,
                0
            )
        ) / sum(edits) as bot_editing_proportion
    from staging.editor_month
    where
        month >= "{start}" and
        month < "{end}"
    group by wiki;
""".format(start="2017-06-01", end="2018-06-01"))

In [None]:
bep.tail()

In [None]:
data = pd.merge(data, bep, how="left", on="wiki")

## Anonymous editing proportion

In [None]:
aep = wmf.mariadb.run("""
    select 
        wiki,
        sum(if(local_user_id = 0, edits, 0)) / sum(edits) as anonymous_editing_proportion
    from staging.editor_month
    where
        month >= "{start}" and
        month < "{end}" and
        -- A user is a bot if they have a matching name or have the bot flag on *any* wiki
        -- See https://meta.wikimedia.org/wiki/Research:Active_editor and https://meta.wikimedia.org/wiki/Research:Bot_user
        bot_flag = 0 and (
            -- Convert from BINARY to CHAR so that case-insentive regexes work
            convert(user_name using utf8) not regexp "bot\\\\b" or
            convert(user_name using utf8) in ("Paucabot", "Niabot", "Marbot")    
        )
    group by wiki;
""".format(start="2017-06-01", end="2018-06-01"))

In [None]:
aep.sample(10)

In [None]:
data = pd.merge(data, aep, how="left", on="wiki")

## Majority-mobile editors proportion

In [None]:
mmep = wmf.mariadb.run("""
    select
        wiki,
        sum(mobile_editing_proportion > 0.5) / count(*) as majority_mobile_editors_proportion
    from (
        select
            wiki,
            sum(mobile_web_edits + mobile_app_edits) / sum(edits) as mobile_editing_proportion
        from staging.editor_month
        where
            month >= "{start}" and
            month < "{end}" and
            local_user_id != 0 and
            -- A user is a bot if they have a matching name or have the bot flag on *any* wiki
            -- See https://meta.wikimedia.org/wiki/Research:Active_editor and https://meta.wikimedia.org/wiki/Research:Bot_user
            bot_flag = 0 and (
                -- Convert from BINARY to CHAR so that case-insentive regexes work
                convert(user_name using utf8) not regexp "bot\\\\b" or
                convert(user_name using utf8) in ("Paucabot", "Niabot", "Marbot")    
            )
        group by wiki, user_name
    ) user_edits
    group by wiki
""".format(start="2017-06-01", end="2018-06-01"))

In [None]:
merge_into_data(mmep)

## Revert rate

In [None]:
rr = wmf.hive.run("""
    select
        wiki_db as wiki,
        sum(cast(revision_is_identity_reverted as int)) / count(*) as revert_rate
    from
        wmf.mediawiki_history
    where
        event_entity = "revision" and
        event_type = "create" and
        snapshot = "{snapshot}" and
        event_timestamp >= "{start}" and
        event_timestamp < "{end}" and
        event_user_is_bot_by_name = false and
        array_contains(event_user_groups, "bot") = false
    group by wiki_db
""".format(snapshot="2018-05", start="2017-06", end="2018-06")
)

In [None]:
rr.tail()

In [None]:
data = pd.merge(data, rr, how="left", on="wiki")

## Mobile pageviews proportion

In [None]:
# Share of user pageviews per domain that came from the mobile web and from
# the mobile apps, for 2017-06 (inclusive) to 2018-06 (exclusive).
# `domain` is built as "https://" + project + ".org" so it can be joined to the
# `domain` column of `data`.
# The month is zero-padded before being concatenated with the year: if `month`
# is stored unpadded (e.g. as an int), concat(year, month) gives "20176" for
# June 2017, and comparing that string with "201706" selects the wrong months.
# Padding is a no-op when the value is already two characters wide.
mpp = wmf.hive.run("""
select 
    concat("https://", project, ".org") as domain,
    sum(if(access_method = "mobile web", view_count, 0)) / sum(view_count) as mobile_web_pageviews_proportion,
    sum(if(access_method = "mobile app", view_count, 0)) / sum(view_count) as mobile_app_pageviews_proportion
from wmf.projectview_hourly
where
    agent_type = "user" and
    concat(cast(year as string), lpad(cast(month as string), 2, "0")) >= "201706" and
    concat(cast(year as string), lpad(cast(month as string), 2, "0")) < "201806"
group by concat("https://", project, ".org")
""")

mpp.sample(20)

In [None]:
merge_into_data(mpp, on="domain")

## Monthly active administrators

In [None]:
# Average monthly active administrators per wiki.
# Inner query: for each wiki and month (first six characters of
# log_timestamp), count the distinct `log_user` values that have a log entry
# of type block, protect, delete or rights within the window and snapshot.
# Outer query: sum those monthly counts and divide by 12; the divisor is tied
# to the 12-month window given by `start`/`end`.
# Note: the subquery alias `mae` is only a SQL alias and is unrelated to the
# `mae` DataFrame defined earlier in this notebook.
maa = wmf.hive.run("""
    select 
        wiki,
        sum(monthly_active_administrators) / 12 as monthly_active_administrators
    from (
        select
            wiki_db as wiki,
            substr(log_timestamp, 1, 6) as month,
            count(distinct log_user) as monthly_active_administrators
        from wmf_raw.mediawiki_logging
        where
            log_type in ("block", "protect", "delete", "rights") and
            log_timestamp >= "{start}" and
            log_timestamp < "{end}" and
            snapshot = "{snapshot}"
        group by wiki_db, substr(log_timestamp, 1, 6)
    ) mae
    group by wiki
""".format(start="201706", end="201806", snapshot="2018-05"))

In [None]:
maa.tail()

In [None]:
merge_into_data(maa)

## Monthly non-bot edits

In [None]:
mnbe = wmf.mariadb.run("""
    select 
        wiki,
        sum(edits) / 12 as monthly_nonbot_edits
    from staging.editor_month
    where
        month >= "{start}" and
        month < "{end}" and
        -- A user is a bot if they have a matching name or have the bot flag on *any* wiki
        -- See https://meta.wikimedia.org/wiki/Research:Active_editor and https://meta.wikimedia.org/wiki/Research:Bot_user
        bot_flag = 0 and (
            -- Convert from BINARY to CHAR so that case-insentive regexes work
            convert(user_name using utf8) not regexp "bot\\\\b" or
            convert(user_name using utf8) in ("Paucabot", "Niabot", "Marbot")    
        )
    group by wiki;
""".format(start="2017-06-01", end="2018-06-01"))

In [None]:
mnbe.sample(20)

In [None]:
merge_into_data(mnbe)

## Edits Gini coefficient

In [None]:
user_edits = wmf.hive.run("""
    select
        wiki_db as wiki,
        count(*) as user_edits
    from
        wmf.mediawiki_history
    where
        event_entity = "revision" and
        event_type = "create" and
        snapshot = "{snapshot}" and
        event_timestamp >= "{start}" and
        event_timestamp < "{end}" and
        event_user_is_bot_by_name = false and
        array_contains(event_user_groups, "bot") = false
    group by event_user_id, wiki_db
""".format(snapshot="2018-05", start="2017-06", end="2018-06"))

In [None]:
# from https://github.com/oliviaguest/gini
def gini(array):
    """Calculate the Gini coefficient of a collection of values.

    Based on the bottom equation of
    http://www.statsdirect.com/help/generatedimages/equations/equation154.svg
    from
    http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm

    Parameters
    ----------
    array : array-like
        Numeric values (numpy array, list, pandas Series, ...). Inputs with
        more than one dimension are flattened; all values are treated equally.
        The input is never modified.

    Returns
    -------
    float
        The Gini coefficient of the values.

    Raises
    ------
    ValueError
        If `array` contains no values.
    """
    # asarray accepts any array-like; flatten() always returns a copy, so the
    # in-place shift below cannot touch the caller's data.
    values = np.asarray(array, dtype=float).flatten()
    # Number of array elements:
    n = values.shape[0]
    if n == 0:
        raise ValueError("gini() requires at least one value")

    lowest = np.amin(values)
    if lowest < 0:
        # Values cannot be negative: shift so the minimum becomes 0.
        values -= lowest
    # Values cannot be 0, and they must be sorted:
    values = np.sort(values + 0.0000001)
    # Index per array element:
    index = np.arange(1, n + 1)
    # Gini coefficient:
    return np.sum((2 * index - n - 1) * values) / (n * np.sum(values))

In [None]:
egc = user_edits.groupby("wiki").apply(lambda g: gini(g["user_edits"].values)).reset_index()

In [None]:
egc.columns = ["wiki", "edits_Gini_coefficient"]

In [None]:
merge_into_data(egc)

## Monthly editors

In [None]:
me = wmf.mariadb.run("""
select
    wiki,
    count(*) / 12 as monthly_editors
from editor_month
where
    month >= "{start}" and
    month < "{end}" and
    local_user_id != 0 and
    bot_flag = 0 and (
        convert(user_name using utf8) not regexp "bot\\\\b" or
        convert(user_name using utf8) in ("Paucabot", "Niabot", "Marbot")    
    )
group by wiki;
""".format(start="2017-06-01", end="2018-06-01"))

In [None]:
top_10(me, "monthly_editors")

In [None]:
merge_into_data(me)

## Unique devices per editor

In [None]:
data["unique_devices_per_editor"] = data["monthly_unique_devices"] / data["monthly_editors"]

In [None]:
data = data.replace([np.inf], 0)

## Article count

In [18]:
wikis_list = data["wiki"].tolist()

In [None]:
ac = wmf.mariadb.multirun("""
select
    database() as wiki,
    ss_good_articles as article_count
from site_stats
""", wikis = wikis_list)

In [20]:
top_10(ac, "article_count")

Unnamed: 0,wiki,article_count
806,wikidatawiki,49101461
807,commonswiki,47451237
194,enwiki,5672350
199,enwiktionary,5653770
117,cebwiki,5382737
454,mgwiktionary,4722722
683,svwiki,3782109
236,frwiktionary,3308064
253,dewiki,2193979
231,frwiki,1995860


In [21]:
merge_into_data(ac)

## Cumulative content edits

In [23]:
ce = wmf.hive.run("""
    select
        wiki_db as wiki,
        count(*) as cumulative_content_edits
    from
        wmf.mediawiki_history
    where
        event_entity = "revision" and
        event_type = "create" and
        snapshot = "{snapshot}" and
        page_namespace_is_content = true and
        event_user_is_bot_by_name = false and
        array_contains(event_user_groups, "bot") = false
    group by wiki_db
""".format(snapshot="2018-05")
)

In [24]:
top_10(ce, "cumulative_content_edits")

Unnamed: 0,wiki,cumulative_content_edits
638,enwiki,501038078
20,wikidatawiki,215483470
490,commonswiki,149098365
392,dewiki,104154896
568,frwiki,81828624
191,eswiki,68501562
47,ruwiki,53871124
68,jawiki,51231382
518,itwiki,49099062
149,ptwiki,26961422


In [25]:
merge_into_data(ce)

## Edits per content page

In [27]:
# Cumulative content edits divided by the article count. A wiki with a zero
# article count would otherwise produce `inf`, so infinities are mapped to 0,
# as is done for unique_devices_per_editor.
data["edits_per_content_page"] = (
    data["cumulative_content_edits"] / data["article_count"]
).replace([np.inf], 0)

# Readying for spreadsheet

In [32]:
def print_col(col):
    """Print every value of column `col`, one per line, in size-rank order.

    Side effect: the global `data` is rebound to a copy sorted by
    "overall_size_rank" with all missing values replaced by 0.
    """
    global data
    ordered = data.sort_values("overall_size_rank")
    data = ordered.fillna(0)
    column_values = data[col].values
    for entry in column_values:
        print(entry)

In [33]:
print_col("edits_per_content_page")

88.32989466446887
48.08179896229911
47.47305967832874
46.11194197774662
40.99918030322768
36.39176092050927
3.142138633814752
33.96594347571631
26.563755922356
26.962635318589335
20.071537580482982
12.354275831912828
15.070133148099751
10.217721230282862
28.519951923076924
9.907233517777799
3.95081606585109
2.1552516285593506
13.920165097885905
4.388534793292607
19.269365510918036
50.716294666525286
6.831068726342225
30.763939776963568
21.492721865345274
22.278301940373446
31.317664179993496
17.78352854289544
9.863019854141273
26.345180885529157
13.276754936025275
18.354424939422458
9.581921039002431
9.137709296454727
17.19084180731733
1.6441450346788937
17.533564881417238
25.69095405569985
11.13897801240326
55.95894796309083
10.143256216904321
1.8530618777456456
31.237467693301745
4.656784730135382
13.996233189135104
14.805615221987315
3.0433446542356104
9.883415308945589
14.052008480565371
11.193252689741563
11.605823692176271
7.553523822120452
91.8349210813091
18.79978983463304
51.5