In [125]:
import numpy as np
import pandas as pd
import requests
import re
import json
import datetime

import wmfdata as wmf

# List of wikis

In [188]:
# Gather all content wikis
wikis = wmf.mariadb.run("""
select
    site_global_key as database_code,
    site_group as project_code,
    site_language as language_code,
    concat("https://", trim(leading "." from reverse(site_domain))) as domain_name
from enwiki.sites
where site_group in (
    'betawikiversity', 'commons', 'incubator', 'sources', 'species', 
    'wikibooks', 'wikidata', 'wikinews', 'wikipedia', 'wikiquote',
    'wikisource', 'wikiversity', 'wikivoyage', 'wiktionary'
)
""")

wikis = wikis.set_index("database_code")

In [190]:
urls = [
    "https://raw.githubusercontent.com/wikimedia/mediawiki-extensions-cldr/master/CldrNames/CldrNamesEn.php",
    "https://raw.githubusercontent.com/wikimedia/mediawiki-extensions-cldr/master/LocalNames/LocalNamesEn.php"
]

def get_lang_names(url):
    r = requests.get(url)
    m = re.search(r"languageNames = (\[[\s\S]+?\])", r.text)
    php_ln = m.group(1)
    
    repl = [
        # Convert from PHP array format to JSON
        (" =>", ":"),
        ("\[", "{"),
        ("\]", "}"),
        # Trailing commas will cause problems
        (",\n}", "\n}"),
        # ...so will single quotes
        ("'", '"'),
        # ...and comments
        (r"/\*[\s\S]*?\*/", ""),
        (r"#(.*?)\n", ""),
        # One hack to deal with a single quote in a language name
        ('O"odham', "O'odham")
    ]
    
    json_ln = php_ln
    for old, new in repl:
        json_ln = re.sub(old, new, json_ln)
    
    py_ln = json.loads(json_ln)
    return py_ln

langs = {}
for url in urls:
    langs.update(get_lang_names(url))

# Add languages not included in the CLDR files
langs.update({
    "als": "Tosk",
    "atj": "Atikamekw",
    "diq": "Zazaki",
    "fiu-vro": "Võro",
    "map-bms": "Banyumasan",
    "nah": "Nahuatl",
    "pih": "Norfuk-Pitkern",
    "rmy": "Vlax Romani",
    "simple": "Simple English"
})

In [191]:
# Projects not split by language into different editions
unified_projects = {
    "betawikiversity": "Wikiversity Beta",
    "commons": "Wikimedia Commons",
    "incubator": "Wikimedia Incubator",
    "labs": "Wikitech",
    "mediawiki": "MediaWiki.org",
    "meta": "Meta-Wiki",
    "outreach": "Outreach Wiki",
    "sources": "Multilingual Wikisource",
    "species": "Wikispecies",
    "wikidata": "Wikidata"
}

def lang_name(row):
    lang = row.loc["language_code"]
    return langs[lang]

def proj_name(row):
    proj = row.loc["project_code"]
    unified = unified_projects.get(proj)
    if unified:
        return unified
    else:
        return proj.title()

def wiki_name(row):
    proj = row.loc["project_code"]
    unified = unified_projects.get(proj)
    if unified:
        return unified
    else:
        return " ".join([lang_name(row), proj_name(row)])
    
wikis["language_name"] = wikis.apply(lang_name, axis=1)
wikis["project_name"] = wikis.apply(proj_name, axis=1)
wikis["wiki_name"] = wikis.apply(wiki_name, axis=1)

In [192]:
len(wikis)

830

In [193]:
# Remove closed wikis
closed_url = "https://raw.githubusercontent.com/wikimedia/operations-mediawiki-config/master/dblists/closed.dblist"
closed = pd.Series(requests.get(closed_url).text.split("\n"))
wikis = wikis.drop(closed, errors="ignore")

In [194]:
len(wikis)

725

In [195]:
wikis.sample(10)

Unnamed: 0_level_0,project_code,language_code,domain_name,language_name,project_name,wiki_name
database_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
sdwiktionary,wiktionary,sd,https://sd.wiktionary.org,Sindhi,Wiktionary,Sindhi Wiktionary
azwiktionary,wiktionary,az,https://az.wiktionary.org,Azerbaijani,Wiktionary,Azerbaijani Wiktionary
quwiktionary,wiktionary,qu,https://qu.wiktionary.org,Quechua,Wiktionary,Quechua Wiktionary
kaawiki,wikipedia,kaa,https://kaa.wikipedia.org,Kara-Kalpak,Wikipedia,Kara-Kalpak Wikipedia
ugwiktionary,wiktionary,ug,https://ug.wiktionary.org,Uyghur,Wiktionary,Uyghur Wiktionary
rwwiktionary,wiktionary,rw,https://rw.wiktionary.org,Kinyarwanda,Wiktionary,Kinyarwanda Wiktionary
bnwiktionary,wiktionary,bn,https://bn.wiktionary.org,Bangla,Wiktionary,Bangla Wiktionary
fiwikiquote,wikiquote,fi,https://fi.wikiquote.org,Finnish,Wikiquote,Finnish Wikiquote
lmowiki,wikipedia,lmo,https://lmo.wikipedia.org,Lombard,Wikipedia,Lombard Wikipedia
wikidatawiki,wikidata,en,https://www.wikidata.org,English,Wikidata,Wikidata


# Data

In [196]:
def merge_in(df, on="database_code"):
    global wikis
    wikis = pd.merge(wikis, df, how="left", on=on).fillna(0)

In [197]:
def top_10(df, col):
    return df.sort_values(col, ascending=False).head(10)

In [198]:
def rename_df(df):
    return df.rename({"wiki": "database_code", "domain": "domain_name"}, axis=1)

## Monthly active editors

In [None]:
mae = wmf.mariadb.run("""
select
    wiki as database_code,
    count(*) / 12 as monthly_active_editors,
    sum(
        extract(year_month from user_registration) = extract(year_month from month)
    ) / 12 as monthly_new_active_editors
from editor_month
where
    content_edits >= 5 and
    month >= "{start}" and
    month < "{end}" and
    local_user_id != 0 and
    bot_flag = 0 and (
        convert(user_name using utf8) not regexp "bot\\\\b" or
        convert(user_name using utf8) in ("Paucabot", "Niabot", "Marbot")    
    )
group by wiki;
""".format(start="2017-06-01", end="2018-06-01"))

In [199]:
merge_in(mae)

## Monthly unique devices

In [23]:
mud = wmf.hive.run("""
select
    regexp_replace(
        regexp_replace(
            regexp_replace(domain, "zero\\\\.", ""),
        '^m\\\\.', ''),
    '\\\\.m\\\\.', '.') as domain_name,
  sum(uniques_estimate) / 12 as monthly_unique_devices
FROM wmf.unique_devices_per_domain_monthly
WHERE 
    concat(year, month) >= "201706" and
    concat(year, month) < "201806"
group by    
    regexp_replace(
        regexp_replace(
            regexp_replace(domain, "zero\\\\.", ""),
        '^m\\\\.', ''),
    '\\\\.m\\\\.', '.')
""")

mud["domain"] = "https://" + mud["domain"]

top_10(mud, "monthly_unique_devices")

Unnamed: 0,domain,monthly_unique_devices
194,https://en.wikipedia.org,696456600.0
208,https://es.wikipedia.org,139841100.0
164,https://de.wikipedia.org,92964350.0
366,https://ja.wikipedia.org,84374000.0
629,https://ru.wikipedia.org,82848950.0
249,https://fr.wikipedia.org,76052840.0
598,https://pt.wikipedia.org,49912060.0
356,https://it.wikipedia.org,47468420.0
853,https://zh.wikipedia.org,33209570.0
32,https://ar.wikipedia.org,28593390.0


In [201]:
merge_in(mud, on="domain_name")

## Overall size rank

In [241]:
size = np.sqrt(wikis["monthly_unique_devices"] * wikis["monthly_active_editors"])
rank = size.rank(method="min", na_option="bottom", ascending=False)
wikis["overall_size_rank"] = rank

## New editor retention

In [31]:
with open("queries/new_editor_retention.hql") as f:
    q = f.read()

ner = wmf.hive.run(
    q.format(start = "2017-04", end = "2018-04")
)

In [173]:
top_10(ner, "new_editor_retention")

Unnamed: 0,database_code,new_editor_retention
379,stqwiki,1.0
133,nrmwiki,1.0
439,cdowiki,1.0
575,roa_rupwiki,1.0
22,bxrwiki,1.0
109,liwikisource,1.0
211,zeawiki,1.0
455,dsbwiki,1.0
57,frpwiki,1.0
207,vowiki,1.0


In [203]:
merge_in(ner)

## Mobile editing

In [34]:
mep = wmf.mariadb.run("""
    select 
        wiki as database_code,
        sum(mobile_web_edits + mobile_app_edits) / sum(edits) as mobile_editing_proportion
    from staging.editor_month
    where
        month >= "{start}" and
        month < "{end}" and
        -- A user is a bot if they have a matching name or have the bot flag on *any* wiki
        -- See https://meta.wikimedia.org/wiki/Research:Active_editor and https://meta.wikimedia.org/wiki/Research:Bot_user
        bot_flag = 0 and (
            -- Convert from BINARY to CHAR so that case-insentive regexes work
            convert(user_name using utf8) not regexp "bot\\\\b" or
            convert(user_name using utf8) in ("Paucabot", "Niabot", "Marbot")    
        )
    group by wiki;
""".format(start="2017-06-01", end="2018-06-01"))

In [205]:
top_10(mep, "mobile_editing_proportion")

Unnamed: 0,database_code,mobile_editing_proportion
372,lawikiquote,0.7894
522,rmywiki,0.6528
220,gawiktionary,0.6119
552,sawiktionary,0.5949
323,jvwiktionary,0.5217
557,sdwiki,0.4901
508,pswiki,0.4755
432,mswikibooks,0.4742
428,mrwikiquote,0.4636
510,pswiktionary,0.4474


In [206]:
merge_in(mep)

## Bot editing proportion

In [37]:
bep = wmf.mariadb.run("""
    select 
        wiki as database_code,
        sum(
            if(
                bot_flag = 1 or convert(user_name using utf8) regexp "bot\\\\b", 
                edits,
                0
            )
        ) / sum(edits) as bot_editing_proportion
    from staging.editor_month
    where
        month >= "{start}" and
        month < "{end}"
    group by wiki;
""".format(start="2017-06-01", end="2018-06-01"))

top_10(bep, "bot_editing_proportion")

Unnamed: 0,wiki,bot_editing_proportion
409,mgwiktionary,0.9994
88,cebwiki,0.9976
157,eowikinews,0.9887
656,trwikinews,0.9859
275,hywikiquote,0.9755
439,mywiktionary,0.9704
361,kuwiktionary,0.9608
112,cywiki,0.9574
89,cewiki,0.9465
701,vowiktionary,0.945


In [209]:
merge_in(bep)

## Anonymous editing proportion

In [39]:
aep = wmf.mariadb.run("""
    select 
        wiki as database_code,
        sum(if(local_user_id = 0, edits, 0)) / sum(edits) as anonymous_editing_proportion
    from staging.editor_month
    where
        month >= "{start}" and
        month < "{end}" and
        -- A user is a bot if they have a matching name or have the bot flag on *any* wiki
        -- See https://meta.wikimedia.org/wiki/Research:Active_editor and https://meta.wikimedia.org/wiki/Research:Bot_user
        bot_flag = 0 and (
            -- Convert from BINARY to CHAR so that case-insentive regexes work
            convert(user_name using utf8) not regexp "bot\\\\b" or
            convert(user_name using utf8) in ("Paucabot", "Niabot", "Marbot")    
        )
    group by wiki;
""".format(start="2017-06-01", end="2018-06-01"))

top_10(aep, "anonymous_editing_proportion")

Unnamed: 0,wiki,anonymous_editing_proportion
566,simplewikiquote,1.0
718,zawiktionary,1.0
607,suwikibooks,1.0
565,simplewikibooks,1.0
700,vowikibooks,1.0
291,ikwiktionary,1.0
723,zh_min_nanwikiquote,1.0
722,zh_min_nanwikibooks,1.0
435,mtwiktionary,0.9506
368,kywiktionary,0.937


In [211]:
merge_in(aep)

## Majority-mobile editors proportion

In [41]:
mmep = wmf.mariadb.run("""
    select
        wiki as database_code,
        sum(mobile_editing_proportion > 0.5) / count(*) as majority_mobile_editors_proportion
    from (
        select
            wiki,
            sum(mobile_web_edits + mobile_app_edits) / sum(edits) as mobile_editing_proportion
        from staging.editor_month
        where
            month >= "{start}" and
            month < "{end}" and
            local_user_id != 0 and
            -- A user is a bot if they have a matching name or have the bot flag on *any* wiki
            -- See https://meta.wikimedia.org/wiki/Research:Active_editor and https://meta.wikimedia.org/wiki/Research:Bot_user
            bot_flag = 0 and (
                -- Convert from BINARY to CHAR so that case-insentive regexes work
                convert(user_name using utf8) not regexp "bot\\\\b" or
                convert(user_name using utf8) in ("Paucabot", "Niabot", "Marbot")    
            )
        group by wiki, user_name
    ) user_edits
    group by wiki
""".format(start="2017-06-01", end="2018-06-01"))

top_10(mmep, "majority_mobile_editors_proportion")

Unnamed: 0,wiki,majority_mobile_editors_proportion
22,arwikisource,0.541
20,arwikinews,0.5
19,arwikibooks,0.493
61,bnwiki,0.4889
253,hiwiki,0.4741
254,hiwikibooks,0.4677
18,arwiki,0.4598
21,arwikiquote,0.4422
23,arwikiversity,0.4388
186,fawiktionary,0.438


In [214]:
merge_in(mmep)

## Revert rate

In [44]:
rr = wmf.hive.run("""
    select
        wiki_db as wiki as database_code,
        sum(cast(revision_is_identity_reverted as int)) / count(*) as revert_rate
    from
        wmf.mediawiki_history
    where
        event_entity = "revision" and
        event_type = "create" and
        snapshot = "{snapshot}" and
        event_timestamp >= "{start}" and
        event_timestamp < "{end}" and
        event_user_is_bot_by_name = false and
        array_contains(event_user_groups, "bot") = false
    group by wiki_db
""".format(snapshot="2018-05", start="2017-06", end="2018-06")
)

top_10(rr, "revert_rate")

Unnamed: 0,wiki,revert_rate
499,xalwiki,0.483673
357,dzwiki,0.36692
139,bgwikisource,0.362978
432,mgwikibooks,0.333333
390,kbdwiki,0.323529
370,ltwikisource,0.276786
642,adywiki,0.249664
28,nycwikimedia,0.244898
233,rowikisource,0.244752
396,bgwiktionary,0.236686


In [216]:
merge_in(rr)

## Mobile pageviews proportion

In [None]:
mpp = wmf.hive.run("""
select 
    concat("https://", project, ".org") as domain_name,
    sum(if(access_method = "mobile web", view_count, 0)) / sum(view_count) as mobile_web_pageviews_proportion,
    sum(if(access_method = "mobile app", view_count, 0)) / sum(view_count) as mobile_app_pageviews_proportion
from wmf.projectview_hourly
where
    agent_type = "user" and
    concat(year, month) >= "201706" and
    concat(year, month) < "201806"
group by concat("https://", project, ".org")
""")

In [346]:
mpp = mpp.replace("https://wikidata.org", "https://www.wikidata.org")

In [351]:
top_10(mpp, "mobile_web_pageviews_proportion")

Unnamed: 0,domain_name,mobile_web_pageviews_proportion,mobile_app_pageviews_proportion
620,https://wikipedia.org,1.0,0.0
668,https://hi.wikibooks.org,0.904597,0.000247
540,https://hi.wikipedia.org,0.864018,0.011292
550,https://jv.wiktionary.org,0.831347,0.0
291,https://id.wikibooks.org,0.823915,4.6e-05
10,https://bn.wikipedia.org,0.81308,0.013498
44,https://hi.wikiquote.org,0.810344,1.8e-05
176,https://id.wiktionary.org,0.808276,4e-06
49,https://ig.wikipedia.org,0.763,7.8e-05
545,https://id.wikiquote.org,0.751988,0.000398


In [352]:
merge_in(mpp, on="domain_name")

## Monthly active administrators

In [48]:
maa = wmf.hive.run("""
    select 
        wiki as database_code,
        sum(monthly_active_administrators) / 12 as monthly_active_administrators
    from (
        select
            wiki_db as wiki,
            substr(log_timestamp, 1, 6) as month,
            count(distinct log_user) as monthly_active_administrators
        from wmf_raw.mediawiki_logging
        where
            log_type in ("block", "protect", "delete", "rights") and
            log_timestamp >= "{start}" and
            log_timestamp < "{end}" and
            snapshot = "{snapshot}"
        group by wiki_db, substr(log_timestamp, 1, 6)
    ) mae
    group by wiki
""".format(start="201706", end="201806", snapshot="2018-05"))

top_10(maa, "monthly_active_administrators")

Unnamed: 0,wiki,monthly_active_administrators
390,enwiki,1649.5
643,ruwiki,609.0
521,dewiki,483.833333
504,fawiki,437.25
543,frwiki,279.083333
99,eswiki,187.5
512,commonswiki,185.666667
134,itwiki,160.916667
223,ptwiki,144.5
132,zhwiki,141.833333


In [220]:
merge_in(maa)

## Monthly non-bot edits

In [50]:
mnbe = wmf.mariadb.run("""
    select 
        wiki as database_code,
        sum(edits) / 12 as monthly_nonbot_edits
    from staging.editor_month
    where
        month >= "{start}" and
        month < "{end}" and
        -- A user is a bot if they have a matching name or have the bot flag on *any* wiki
        -- See https://meta.wikimedia.org/wiki/Research:Active_editor and https://meta.wikimedia.org/wiki/Research:Bot_user
        bot_flag = 0 and (
            -- Convert from BINARY to CHAR so that case-insentive regexes work
            convert(user_name using utf8) not regexp "bot\\\\b" or
            convert(user_name using utf8) in ("Paucabot", "Niabot", "Marbot")    
        )
    group by wiki;
""".format(start="2017-06-01", end="2018-06-01"))

top_10(mnbe, "monthly_nonbot_edits")

Unnamed: 0,wiki,monthly_nonbot_edits
705,wikidatawiki,6049303.0
147,enwiki,4180341.0
95,commonswiki,2844182.0
122,dewiki,794248.2
205,frwiki,705548.4
161,eswiki,561415.0
535,ruwiki,482159.8
302,itwiki,411943.9
313,jawiki,344451.9
727,zhwiki,342995.3


In [222]:
merge_in(mnbe)

## Edits Gini coefficient

In [52]:
user_edits = wmf.hive.run("""
    select
        wiki_db as wiki,
        count(*) as user_edits
    from
        wmf.mediawiki_history
    where
        event_entity = "revision" and
        event_type = "create" and
        snapshot = "{snapshot}" and
        event_timestamp >= "{start}" and
        event_timestamp < "{end}" and
        event_user_is_bot_by_name = false and
        array_contains(event_user_groups, "bot") = false
    group by event_user_id, wiki_db
""".format(snapshot="2018-05", start="2017-06", end="2018-06"))

In [53]:
# from https://github.com/oliviaguest/gini
def gini(array):
    """Calculate the Gini coefficient of a numpy array."""
    # based on bottom eq:
    # http://www.statsdirect.com/help/generatedimages/equations/equation154.svg
    # from:
    # http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm
    # All values are treated equally, arrays must be 1d:
    array = array.flatten()
    if np.amin(array) < 0:
        # Values cannot be negative:
        array -= np.amin(array)
    # Values cannot be 0:
    array = array + 0.0000001
    # Values must be sorted:
    array = np.sort(array)
    # Index per array element:
    index = np.arange(1,array.shape[0]+1)
    # Number of array elements:
    n = array.shape[0]
    # Gini coefficient:
    return ((np.sum((2 * index - n - 1) * array)) / (n * np.sum(array)))

In [54]:
egc = user_edits.groupby("wiki").apply(lambda g: gini(g["user_edits"].values)).reset_index()

In [223]:
egc.columns = ["database_code", "edits_Gini_coefficient"]

In [224]:
merge_in(egc)

## Monthly editors

In [225]:
me = wmf.mariadb.run("""
select
    wiki as database_code,
    count(*) / 12 as monthly_editors
from editor_month
where
    month >= "{start}" and
    month < "{end}" and
    local_user_id != 0 and
    bot_flag = 0 and (
        convert(user_name using utf8) not regexp "bot\\\\b" or
        convert(user_name using utf8) in ("Paucabot", "Niabot", "Marbot")    
    )
group by wiki;
""".format(start="2017-06-01", end="2018-06-01"))

top_10(me, "monthly_editors")

Unnamed: 0,database_code,monthly_editors
147,enwiki,134670.0833
95,commonswiki,34389.6667
122,dewiki,19794.5
700,wikidatawiki,18782.5833
205,frwiki,18165.8333
161,eswiki,17497.0
312,jawiki,13344.9167
534,ruwiki,11576.5833
301,itwiki,8860.9167
719,zhwiki,7930.8333


In [226]:
merge_in(me)

## Unique devices per editor

In [227]:
wikis["unique_devices_per_editor"] = wikis["monthly_unique_devices"] / wikis["monthly_editors"]

In [228]:
wikis = wikis.replace([np.inf], 0)

## Article count

In [62]:
wikis_list = wikis["wiki"].tolist()

In [None]:
ac = wmf.mariadb.multirun("""
select
    database() as wiki,
    ss_good_articles as article_count
from site_stats
""", wikis = wikis_list)

In [230]:
top_10(ac, "article_count")

Unnamed: 0,database_code,article_count
19,wikidatawiki,49123434
6,commonswiki,47507124
0,enwiki,5673696
17,enwiktionary,5656990
122,cebwiki,5382643
470,mgwiktionary,4724651
16,svwiki,3781924
35,frwiktionary,3310305
2,dewiki,2194921
4,frwiki,1996841


In [231]:
merge_in(ac)

## Cumulative content edits

In [66]:
cce = wmf.hive.run("""
    select
        wiki_db as wiki,
        count(*) as cumulative_content_edits
    from
        wmf.mediawiki_history
    where
        event_entity = "revision" and
        event_type = "create" and
        snapshot = "{snapshot}" and
        page_namespace_is_content = true and
        event_user_is_bot_by_name = false and
        array_contains(event_user_groups, "bot") = false
    group by wiki_db
""".format(snapshot="2018-05")
)

In [233]:
top_10(cce, "cumulative_content_edits")

Unnamed: 0,database_code,cumulative_content_edits
638,enwiki,501038078
20,wikidatawiki,215483470
490,commonswiki,149098365
392,dewiki,104154896
568,frwiki,81828624
191,eswiki,68501562
47,ruwiki,53871124
68,jawiki,51231382
518,itwiki,49099062
149,ptwiki,26961422


In [234]:
merge_in(cce)

## Edits per content page

In [235]:
wikis["edits_per_content_page"] = wikis["cumulative_content_edits"] / wikis["article_count"]

## Script direction

In [236]:
rtl_url = "https://noc.wikimedia.org/conf/dblists/rtl.dblist"
rtl_wikis = pd.Series(requests.get(rtl_url).text.split("\n"))
rtl = pd.DataFrame({"database_code": rtl_wikis, "script_direction": "right-to-left"})

merge_in(rtl)
wikis["script_direction"] = wikis["script_direction"].replace([0], "left-to-right")

## Monthly structured discussions messages

In [237]:
msdm = wmf.mariadb.run("""
select
    rev_user_wiki as database_code,
    count(*) / 12 as monthly_structured_discussions_messages
from flowdb.flow_revision
where
    rev_change_type in ("new-post", "reply") and
    date_format(from_unixtime(
        (conv(substring(hex(rev_id), 1, 12), 16, 10) >> 2) / 1000),
    "%Y-%m-%d %H:%i:%S") >= "{start}" and
    date_format(from_unixtime(
        (conv(substring(hex(rev_id), 1, 12), 16, 10) >> 2) / 1000),
    "%Y-%m-%d %H:%i:%S") < "{end}"
group by rev_user_wiki
""".format(start="2017-06", end="2018-06"))

top_10(msdm, "monthly_structured_discussions_messages")

Unnamed: 0,database_code,monthly_structured_discussions_messages
21,mediawikiwiki,3603.5
9,frwiki,3052.9167
37,zhwiki,1699.8333
0,arwiki,1220.5833
36,wikidatawiki,1212.1667
2,cawiki,749.4167
5,elwiki,328.5833
25,plwiki,223.25
16,hewiki,222.6667
6,fawiki,160.5833


In [238]:
merge_in(msdm)

## Visual edits

In [312]:
ve = wmf.mariadb.run("""
    select 
        wiki as database_code,
        sum(visual_edits) / sum(edits) as visual_edits
    from staging.editor_month
    where
        month >= "{start}" and
        month < "{end}" and
        -- A user is a bot if they have a matching name or have the bot flag on *any* wiki
        -- See https://meta.wikimedia.org/wiki/Research:Active_editor and https://meta.wikimedia.org/wiki/Research:Bot_user
        bot_flag = 0 and (
            -- Convert from BINARY to CHAR so that case-insentive regexes work
            convert(user_name using utf8) not regexp "bot\\\\b" or
            convert(user_name using utf8) in ("Paucabot", "Niabot", "Marbot")    
        )
    group by wiki;
""".format(start="2017-06-01", end="2018-06-01"))

top_10(ve, "visual_edits")

Unnamed: 0,database_code,visual_edits
12,angwiki,0.494
52,bgwikibooks,0.4817
634,tgwiki,0.416
356,kswiki,0.4143
233,gotwiki,0.4126
144,elwikivoyage,0.3419
130,dinwiki,0.3086
463,novwiki,0.3064
512,ptwikibooks,0.2764
666,twwiki,0.2753


In [313]:
merge_in(ve)

## Mobile unique devices

In [327]:
wmf.hive.run("""
select 
    regexp_replace(
        regexp_replace(
            regexp_replace(regexp_replace(domain, "www\\\\.", ""), "zero\\\\.", ""),
        '^m\\\\.', ''),
    '\\\\.m\\\\.', '.') as domain,
    sum(
        if((domain regexp '^m\\\\.' or  domain regexp '\\\\.m\\\\.'), uniques_estimate, 0)
    ) as mobile_uniques,
    sum(uniques_estimate) as total_uniques
from wmf.unique_devices_per_domain_monthly
where 
    domain like "%wikidata%" or domain like "%mediawiki%" and
    concat(year, month) >= "201706" and
    concat(year, month) < "201806"

group by    
    regexp_replace(
        regexp_replace(
            regexp_replace(regexp_replace(domain, "www\\\\.", ""), "zero\\\\.", ""),
        '^m\\\\.', ''),
    '\\\\.m\\\\.', '.')
""")

Unnamed: 0,domain,mobile_uniques,total_uniques
0,download.mediawiki.org,0,0
1,mediawiki.org,2890433,8578939
2,ru.wikidata.org,0,1
3,wikidata.org,25197333,52675070
4,zh.wikidata.org,1,1


In [355]:
mob_ud = wmf.hive.run("""
select
    regexp_replace(
        regexp_replace(
            regexp_replace(regexp_replace(domain, "www\\\\.", ""), "zero\\\\.", ""),
        '^m\\\\.', ''),
    '\\\\.m\\\\.', '.') as domain_name,
    sum(if((domain regexp '^m\\\\.' or  domain regexp '\\\\.m\\\\.'), uniques_estimate, 0)) as mobile_count,
    sum(uniques_estimate) as total_count,
    sum(
        if((domain regexp '^m\\\\.' or  domain regexp '\\\\.m\\\\.'), uniques_estimate, 0)
    ) / sum(uniques_estimate) as mobile_unique_devices
FROM wmf.unique_devices_per_domain_monthly
WHERE 
    concat(year, month) >= "201706" and
    concat(year, month) < "201806"
group by    
    regexp_replace(
        regexp_replace(
            regexp_replace(regexp_replace(domain, "www\\\\.", ""), "zero\\\\.", ""),
        '^m\\\\.', ''),
    '\\\\.m\\\\.', '.')
""")

mob_ud["domain_name"] = "https://" + mob_ud["domain_name"]

In [360]:
mob_ud = mob_ud.replace("https://wikidata.org", "https://www.wikidata.org")

In [362]:
top_10(mob_ud, "mobile_unique_devices")

Unnamed: 0,domain_name,mobile_count,total_count,mobile_unique_devices
340,https://ig.wikipedia.org,1576494,1659716,0.949858
299,https://hi.wikibooks.org,1385981,1495662,0.926667
304,https://hi.wiktionary.org,1184357,1317786,0.898748
300,https://hi.wikipedia.org,80130832,91233823,0.878302
375,https://jv.wiktionary.org,239642,274477,0.873086
642,https://sa.wiktionary.org,336046,385697,0.871269
301,https://hi.wikiquote.org,286321,330944,0.865164
91,https://bn.wikisource.org,527266,613176,0.859893
334,https://id.wikiquote.org,485423,575474,0.843519
332,https://id.wikibooks.org,5630417,6683891,0.842386


In [363]:
merge_in(mob_ud, on="domain_name")

# Readying for spreadsheet

In [308]:
def print_col(cols):
    global wikis
    wikis = wikis.sort_values("overall_size_rank").fillna(0).reset_index(drop=True)
    
    for i in range(len(wikis)):
        row_vals = []
        for col in cols:
            cell_val = wikis[col][i]
            if type(cell_val) != str:
                cell_val = repr(cell_val).format(":f")
            row_vals.append(cell_val)
        print(",".join(row_vals))

In [None]:
print_col([
    
])