In [1]:
import numpy as np
import pandas as pd
import requests
import re
import json
import datetime

import wmfdata as wmf
from wmfdata import charting, mariadb, hive
from wmfdata.utils import pct_str, pd_display_all

You can find the source for `wmfdata` at https://github.com/neilpquinn/wmfdata


In [2]:
query_vars = dict(
    snapshot = "2019-09",
    start= "2017-06-01", 
    end="2018-06-01",
    
    pv_start = "201706",
    pv_end = "201806",
    
    ner_start = "2017-04",
    ner_end = "2018-04"
)

# List of wikis

In [3]:
# Gather all content wikis
wikis = wmf.hive.run("""
SELECT
    database_code,
    database_group AS project_code,
    language_code,
    CONCAT("https://",domain_name) AS domain_name
FROM canonical_data.wikis
WHERE database_group in (
        "commons", "incubator", "foundation", "mediawiki", "meta", "sources",
        "species","wikibooks", "wikidata", "wikinews", "wikipedia", "wikiquote",
        "wikisource", "wikiversity", "wikivoyage", "wiktionary"
    )
""")

wikis = wikis.set_index("database_code")

In [5]:
urls = [
    "https://raw.githubusercontent.com/wikimedia/mediawiki-extensions-cldr/master/CldrNames/CldrNamesEn.php",
    "https://raw.githubusercontent.com/wikimedia/mediawiki-extensions-cldr/master/LocalNames/LocalNamesEn.php"
]

def get_lang_names(url):
    r = requests.get(url)
    m = re.search(r"languageNames = (\[[\s\S]+?\])", r.text)
    php_ln = m.group(1)
    
    repl = [
        # Convert FROM PHP array format to JSON
        (" =>", ":"),
        ("\[", "{"),
        ("\]", "}"),
        # Trailing commas will cause problems
        (",\n}", "\n}"),
        # ...so will single quotes
        ("'", '"'),
        # ...AND comments
        (r"/\*[\s\S]*?\*/", ""),
        (r"#(.*?)\n", ""),
        # One hack to deal with a single quote in a language name
        ('O"odham', "O'odham")
    ]
    
    json_ln = php_ln
    for old, new in repl:
        json_ln = re.sub(old, new, json_ln)
    
    py_ln = json.loads(json_ln)
    return py_ln

langs = {}
for url in urls:
    langs.update(get_lang_names(url))

# Add languages not included in the CLDR files
langs.update({
    "als": "Tosk",
    "atj": "Atikamekw",
    "diq": "Zazaki",
    "fiu-vro": "Võro",
    "map-bms": "Banyumasan",
    "nah": "Nahuatl",
    "pih": "Norfuk-Pitkern",
    "rmy": "Vlax Romani",
    "simple": "Simple English"
})

In [6]:
# Projects not split by language into different editions
unified_projects = {
    "betawikiversity": "Wikiversity Beta",
    "commons": "Wikimedia Commons",
    "incubator": "Wikimedia Incubator",
    "labs": "Wikitech",
    "mediawiki": "MediaWiki.org",
    "meta": "Meta-Wiki",
    "outreach": "Outreach Wiki",
    "sources": "Multilingual Wikisource",
    "species": "Wikispecies",
    "wikidata": "Wikidata"
}

def lang_name(row):
    lang = row.loc["language_code"]
    return langs[lang]

def proj_name(row):
    proj = row.loc["project_code"]
    unified = unified_projects.get(proj)
    if unified:
        return unified
    else:
        return proj.title()

def wiki_name(row):
    proj = row.loc["project_code"]
    unified = unified_projects.get(proj)
    if unified:
        return unified
    else:
        return " ".join([lang_name(row), proj_name(row)])
    
wikis["language_name"] = wikis.apply(lang_name, axis=1)
wikis["project_name"] = wikis.apply(proj_name, axis=1)
wikis["wiki_name"] = wikis.apply(wiki_name, axis=1)

In [7]:
len(wikis)

837

In [8]:
# Remove closed wikis
closed_url = "https://raw.githubusercontent.com/wikimedia/operations-mediawiki-config/master/dblists/closed.dblist"
closed = pd.Series(requests.get(closed_url).text.split("\n"))
wikis = wikis.drop(closed, errors="ignore")

In [9]:
len(wikis)

731

In [10]:
wikis.sample(10)

Unnamed: 0_level_0,project_code,language_code,domain_name,language_name,project_name,wiki_name
database_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
suwiktionary,wiktionary,su,https://su.wiktionary.org,Sundanese,Wiktionary,Sundanese Wiktionary
kmwiktionary,wiktionary,km,https://km.wiktionary.org,Khmer,Wiktionary,Khmer Wiktionary
ukwiktionary,wiktionary,uk,https://uk.wiktionary.org,Ukrainian,Wiktionary,Ukrainian Wiktionary
crhwiki,wikipedia,crh,https://crh.wikipedia.org,Crimean Turkish,Wikipedia,Crimean Turkish Wikipedia
rmwiki,wikipedia,rm,https://rm.wikipedia.org,Romansh,Wikipedia,Romansh Wikipedia
shnwiki,wikipedia,shn,https://shn.wikipedia.org,Shan,Wikipedia,Shan Wikipedia
bgwikibooks,wikibooks,bg,https://bg.wikibooks.org,Bulgarian,Wikibooks,Bulgarian Wikibooks
viwiktionary,wiktionary,vi,https://vi.wiktionary.org,Vietnamese,Wiktionary,Vietnamese Wiktionary
vecwiki,wikipedia,vec,https://vec.wikipedia.org,Venetian,Wikipedia,Venetian Wikipedia
zuwiktionary,wiktionary,zu,https://zu.wiktionary.org,Zulu,Wiktionary,Zulu Wiktionary


In [11]:
wikis = wikis.reset_index()

# Data

In [12]:
def merge_in(df, on="database_code"):
    global wikis
    wikis = pd.merge(wikis, df, how="left", on=on).fillna(0)

In [13]:
def top_10(df, col):
    return df.sort_values(col, ascending=False).head(10)

In [14]:
def rename_df(df):
    return df.rename({"wiki": "database_code", "domain": "domain_name"}, axis=1)

In [15]:
wikis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 7 columns):
database_code    731 non-null object
project_code     731 non-null object
language_code    731 non-null object
domain_name      731 non-null object
language_name    731 non-null object
project_name     731 non-null object
wiki_name        731 non-null object
dtypes: object(7)
memory usage: 40.1+ KB


## Monthly active editors

In [16]:
#formerly a mariadb query using the editor_month table and the bot_flag column, as well as the following to filter bot editors 
#convert(user_name using utf8) not regexp "bot\\\\b" or convert(user_name using utf8) in ("Paucabot", "Niabot", "Marbot")

mae = wmf.hive.run("""
SELECT
    wiki AS database_code,
    COUNT(*) / 12 AS monthly_active_editors,
    SUM(
        CAST(TRUNC(user_registration, 'MM') = TRUNC(month, 'MM') AS INT)
        )/ 12 AS monthly_new_active_editors
FROM neilpquinn.editor_month
WHERE
    content_edits >= 5 
    AND month >= "{start}" 
    AND month < "{end}" 
    AND user_id != 0 
    --AND user_id IS NOT NULL
    AND bot_by_group = FALSE 
    AND (
        user_name not regexp "bot\\b" or
        user_name in ("Paucabot", "Niabot", "Marbot")    
    )    
GROUP BY wiki
""".format(**query_vars))

In [20]:
merge_in(mae)

## Monthly unique devices

In [21]:
mud = wmf.hive.run("""
SELECT
    regexp_replace(
        regexp_replace(
            regexp_replace(domain, "zero\\\\.", ""),
        '^m\\\\.', ''),
    '\\\\.m\\\\.', '.') AS domain_name,
  SUM(uniques_estimate) / 12 AS monthly_unique_devices
FROM wmf.unique_devices_per_domain_monthly
WHERE 
    CONCAT(year, month) >= "{pv_start}" and
    CONCAT(year, month) < "{pv_end}"
GROUP BY    
    regexp_replace(
        regexp_replace(
            regexp_replace(domain, "zero\\\\.", ""),
        '^m\\\\.', ''),
    '\\\\.m\\\\.', '.')
""".format(**query_vars))

mud["domain_name"] = "https://" + mud["domain_name"]

top_10(mud, "monthly_unique_devices")

Unnamed: 0,domain_name,monthly_unique_devices
194,https://en.wikipedia.org,696456600.0
208,https://es.wikipedia.org,139841100.0
164,https://de.wikipedia.org,92964350.0
366,https://ja.wikipedia.org,84374000.0
629,https://ru.wikipedia.org,82848950.0
249,https://fr.wikipedia.org,76052840.0
598,https://pt.wikipedia.org,49912060.0
356,https://it.wikipedia.org,47468420.0
853,https://zh.wikipedia.org,33209570.0
32,https://ar.wikipedia.org,28593390.0


In [29]:
merge_in(mud, on="domain_name")

## Overall SIZE rank

In [31]:
SIZE = np.sqrt(wikis["monthly_unique_devices"] * wikis["monthly_active_editors"])
rank = SIZE.rank(method="min", na_option="bottom", ascending=False)
wikis["overall_SIZE_rank"] = rank

## New editor retention

In [32]:
with open("queries/new_editor_retention.hql") as f:
    q = f.read()

ner = wmf.hive.run(
    q.format(start = "{ner_start}", end = "{ner_end}").format(**query_vars))

In [33]:
top_10(ner, "new_editor_retention")

Unnamed: 0,database_code,new_editor_retention
134,stqwiki,1.0
497,zeawiki,1.0
35,emlwiki,1.0
277,nrmwiki,1.0
353,csbwiki,1.0
514,bmwiki,1.0
283,piwiki,1.0
45,frpwiki,1.0
303,szlwiki,1.0
581,liwikiquote,1.0


In [34]:
merge_in(ner)

## Mobile editing proportion

In [82]:
#formerly a mariadb query using the staging.editor_month table and the bot_flag column, as well as the following to filter bot editors 
#convert(user_name using utf8) not regexp "bot\\\\b" or convert(user_name using utf8) in ("Paucabot", "Niabot", "Marbot")

mep = wmf.hive.run(
"""
SELECT wiki_db AS database_code,
       (mobile_web_edits + app_edits) / all_edits AS mobile_editing_proportion
FROM
    (select
        wiki_db,
        SUM(CAST(ARRAY_CONTAINS(revision_tags, "mobile web edit") AS INT)) mobile_web_edits, 
        SUM(CAST(ARRAY_CONTAINS(revision_tags, "mobile app edit") AS INT)) app_edits,
        count(*) AS all_edits     
    from wmf.mediawiki_history 
    where
        event_timestamp >= "{start}" 
        AND event_timestamp < "{end}" 
        AND event_user_id != 0 
        -- event_user_id IS NOT NULL 
        AND (
            SIZE(event_user_is_bot_by) = 0 OR 
            SIZE(event_user_is_bot_by_historical) = 0
        )
        AND (event_user_id not regexp "bot\\b" OR event_user_id in ("Paucabot", "Niabot", "Marbot"))
        -- A user is a bot if they have a matching name or have the bot flag on *any* wiki
        -- See https://meta.wikimedia.org/wiki/Research:Active_editor AND https://meta.wikimedia.org/wiki/Research:Bot_user
        AND snapshot = "{snapshot}"
    group by wiki_db) user_edits_raw
""".format(**query_vars))

In [83]:
top_10(mep, "mobile_editing_proportion")

Unnamed: 0,database_code,mobile_editing_proportion
644,rmywiki,0.575652
452,lawikiquote,0.366563
522,lowiki,0.329622
29,sdwiki,0.327111
747,pswiki,0.297043
51,gawiktionary,0.295072
7,lmowiki,0.286574
231,kmwiki,0.219235
592,pihwiki,0.204759
577,xmfwiki,0.187151


In [84]:
merge_in(mep)

## Bot editing proportion

In [35]:
#formerly a mariadb query using the staging.editor_month table and the bot_flag column, as well as the following to filter bot editors 
#bot_flag = 1 or convert(user_name using utf8) regexp "bot\\\\b"
bep = wmf.hive.run("""
    SELECT 
        wiki AS database_code,
        SUM(if((bot_by_group = TRUE OR user_name regexp "bot\\b"), edits, 0))/ SUM(edits) AS bot_editing_proportion
        --SUM(if(bot_flag = 1 or convert(user_name using utf8) regexp "bot\\\\b", edits,0)) / SUM(edits) AS bot_editing_proportion
    FROM neilpquinn.editor_month
    WHERE
        month >= "{start}" 
        AND month < "{end}"
    GROUP BY wiki
""".format(**query_vars))

top_10(bep, "bot_editing_proportion")

Unnamed: 0,database_code,bot_editing_proportion
497,mgwiktionary,0.999308
8,cebwiki,0.997587
460,eowikinews,0.988675
161,trwikinews,0.985927
288,hywikiquote,0.975538
679,mywiktionary,0.97043
544,cywiki,0.9554
180,cewiki,0.946461
485,kuwiktionary,0.945699
435,vowiktionary,0.945034


In [36]:
merge_in(bep)

## Anonymous editing proportion

In [37]:
#formerly a mariadb query using the staging.editor_month table and the bot_flag column, as well as the following to filter bot editors 
#convert(user_name using utf8) not regexp "bot\\\\b" or convert(user_name using utf8) in ("Paucabot", "Niabot", "Marbot")

aep = wmf.hive.run("""
SELECT
   wiki AS database_code,
   SUM(IF(user_name IS NULL, edits, 1)) / SUM(edits)  AS anonymous_editing_proportion
FROM neilpquinn.editor_month
WHERE
   month >= "{start}" 
   AND month < "{end}" 
   AND bot_by_group = FALSE
GROUP BY wiki
""".format(**query_vars))

top_10(aep, "anonymous_editing_proportion")

Unnamed: 0,database_code,anonymous_editing_proportion
437,yuewiktionary,1.0
41,liwikinews,1.0
83,zhwikiversity,1.0
603,satwiki,1.0
151,shnwiki,1.0
225,mtwiktionary,0.962246
248,tlwikibooks,0.949813
128,kywiktionary,0.938104
356,cywikibooks,0.929078
129,lbwiktionary,0.915773


In [38]:
merge_in(aep)

## Majority-mobile editors proportion

In [85]:
#formerly a mariadb query using the staging.editor_month table and the bot_flag column, as well as the following to filter bot editors 
#convert(user_name using utf8) not regexp "bot\\\\b" or convert(user_name using utf8) in ("Paucabot", "Niabot", "Marbot")
mmep = wmf.hive.run("""
SELECT
    wiki_db as database_code,
    --SUM(IF(mobile_editing_proportion > 0.5, 1, 0))  / COUNT(*) AS majority_mobile_editors_proportion
    --SUM(CAST(ARRAY_CONTAINS(revision_tags, "mobile edit") AS INT)) wikipedia_mobile_edits,
    --SUM(if(access_method = "mobile web", view_COUNT, 0)) / SUM(view_COUNT)
    SUM(CAST(IF(mobile_editing_proportion > 0.5, 1, 0) AS INT)) / COUNT(*) AS majority_mobile_editors_proportion
FROM(
        SELECT wiki_db,
               (mobile_web_edits + app_edits) / all_edits AS mobile_editing_proportion
        FROM
                (select
                    wiki_db,
                    SUM(CAST(ARRAY_CONTAINS(revision_tags, "mobile web edit") AS INT)) mobile_web_edits, 
                    SUM(CAST(ARRAY_CONTAINS(revision_tags, "mobile app edit") AS INT)) app_edits,
                    count(*) AS all_edits     
                from wmf.mediawiki_history 
                where
                    event_timestamp >= "{start}" 
                    AND event_timestamp < "{end}" 
                    AND event_user_id != 0 
                    AND (
                        SIZE(event_user_is_bot_by) = 0 OR size(event_user_is_bot_by_historical) = 0
                        )
                    -- A user is a bot if they have a matching name or have the bot flag on *any* wiki
                    -- See https://meta.wikimedia.org/wiki/Research:Active_editor AND https://meta.wikimedia.org/wiki/Research:Bot_user
                    AND (event_user_id not regexp "bot\\b" OR event_user_id in ("Paucabot", "Niabot", "Marbot"))
                    AND snapshot = "{snapshot}"
                group by wiki_db) user_edits_raw
    ) mobile_edits_proportion
GROUP BY wiki_db  
""".format(**query_vars))
top_10(mmep, "majority_mobile_editors_proportion")




#SUM(CAST(ARRAY_CONTAINS(revision_tags, "visualeditor") AS INT))/count(*) AS visual_edits


Unnamed: 0,database_code,majority_mobile_editors_proportion
644,rmywiki,1.0
0,pnbwiki,0.0
536,tewikibooks,0.0
526,commonswiki,0.0
527,tnwiki,0.0
528,cowikiquote,0.0
529,hewiki,0.0
530,frwikiversity,0.0
531,iawikibooks,0.0
532,sawikibooks,0.0


In [86]:
merge_in(mmep)

## Revert rate

In [39]:
#updated to change how revision_is_identity_reverted is handled
rr = wmf.hive.run("""
    SELECT
        wiki_db AS database_code,
        SUM(IF(revision_is_identity_reverted, 1, 0)) / COUNT(*) AS revert_rate
        --SUM(CAST(revision_is_identity_reverted AS INT)) / COUNT(*) AS revert_rate
    FROM
        wmf.mediawiki_history
    WHERE
        event_entity = "revision" 
        AND event_type = "create" 
        AND snapshot = "{snapshot}" 
        AND event_timestamp >= "{start}" 
        AND event_timestamp < "{end}" 
        AND SIZE(event_user_is_bot_by_historical) = 0 
    GROUP BY wiki_db
""".format(**query_vars))

top_10(rr, "revert_rate")

Unnamed: 0,database_code,revert_rate
523,xalwiki,0.483597
579,tlwikibooks,0.414582
557,jvwiktionary,0.405941
780,viwikiquote,0.370804
378,dzwiki,0.366853
152,bgwikisource,0.361786
456,mgwikibooks,0.355556
339,sawiktionary,0.336585
413,kbdwiki,0.323583
391,ltwikisource,0.285714


In [40]:
merge_in(rr)

## Monthly pageviews

In [41]:
pageviews = hive.run("""
SELECT CONCAT("https://", project, ".org") AS domain_name, 
       SUM(view_count)/12 AS monthly_average_pageviews
FROM wmf.projectview_hourly
WHERE
    agent_type = "user" 
    AND CONCAT(year, month) >= "{pv_start}" 
    AND CONCAT(year, month) < "{pv_end}"
GROUP BY CONCAT("https://", project, ".org")
""".format(**query_vars))
top_10(pageviews, "monthly_average_pageviews")

Unnamed: 0,domain_name,monthly_average_pageviews
653,https://en.wikipedia.org,7617446000.0
272,https://es.wikipedia.org,1095616000.0
422,https://ja.wikipedia.org,1055091000.0
20,https://de.wikipedia.org,975220300.0
99,https://ru.wikipedia.org,918834600.0
531,https://fr.wikipedia.org,686521900.0
50,https://it.wikipedia.org,513289700.0
497,https://zh.wikipedia.org,368315900.0
585,https://pt.wikipedia.org,337922200.0
206,https://pl.wikipedia.org,247648100.0


In [42]:
pageviews = pageviews.replace("https://wikidata.org", "https://www.wikidata.org")

In [43]:
merge_in(pageviews, on="domain_name")

## Mobile pageviews proportion

In [46]:
mpp = wmf.hive.run("""
SELECT 
    CONCAT("https://", project, ".org") AS domain_name,
    SUM(if(access_method = "mobile web", view_COUNT, 0)) / SUM(view_COUNT) AS mobile_web_pageviews_proportion,
    SUM(if(access_method = "mobile app", view_COUNT, 0)) / SUM(view_COUNT) AS mobile_app_pageviews_proportion
FROM wmf.projectview_hourly
WHERE
    agent_type = "user" 
    AND CONCAT(year, month) >= "{pv_start}" 
    AND CONCAT(year, month) < "{pv_end}" 
GROUP BY CONCAT("https://", project, ".org")
""".format(**query_vars))

In [47]:
mpp = mpp.replace("https://wikidata.org", "https://www.wikidata.org")

In [48]:
top_10(mpp, "mobile_web_pageviews_proportion")

Unnamed: 0,domain_name,mobile_web_pageviews_proportion,mobile_app_pageviews_proportion
620,https://wikipedia.org,1.0,0.0
668,https://hi.wikibooks.org,0.904597,0.000247
540,https://hi.wikipedia.org,0.864018,0.011292
550,https://jv.wiktionary.org,0.831347,0.0
291,https://id.wikibooks.org,0.823915,4.6e-05
10,https://bn.wikipedia.org,0.81308,0.013498
44,https://hi.wikiquote.org,0.810344,1.8e-05
176,https://id.wiktionary.org,0.808276,4e-06
49,https://ig.wikipedia.org,0.763,7.8e-05
545,https://id.wikiquote.org,0.751988,0.000398


In [49]:
merge_in(mpp, on="domain_name")

## Monthly active administrators

In [50]:
maa = hive.run("""
SELECT
    wiki as database_code,
    sum(monthly_active_administrators) / 12 as monthly_active_administrators
FROM (
    SELECT
        wiki_db as wiki,
        substr(log_timestamp, 1, 6) as month,
        count(distinct log_actor) as monthly_active_administrators
    from wmf_raw.mediawiki_logging
    WHERE
        log_type in ("block", "delete", "protect", "rights")
        -- Omit the "delete_redir", "move_prot", and "autopromote" actions, which can be done by regular users
        AND log_action not in ("autopromote", "delete_redir", "move_prot")
        AND log_timestamp >= "{start}" 
        AND log_timestamp < "{end}" 
        AND snapshot = "{snapshot}"
    GROUP BY wiki_db, substr(log_timestamp, 1, 6)
) mae
GROUP BY wiki
""".format(**query_vars))

top_10(maa, "monthly_active_administrators")

Unnamed: 0,database_code,monthly_active_administrators
96,enwiki,428.833333
127,commonswiki,166.833333
660,dewiki,137.75
410,frwiki,106.083333
741,ruwiki,103.916667
675,itwiki,98.166667
180,ptwiki,85.166667
460,plwiki,74.166667
130,metawiki,55.75
29,eswiki,52.833333


In [51]:
merge_in(maa)

## Monthly non-bot edits

In [52]:
#formerly a mariadb query pulling from staging.editor_month and using bot_flag = 0 to identify bots, as well as the following to filter bot editors 
#convert(user_name using utf8) not regexp "bot\\\\b" or convert(user_name using utf8) in ("Paucabot", "Niabot", "Marbot")
mnbe = wmf.hive.run("""
    SELECT 
        wiki AS database_code,
        SUM(edits) / 12 AS monthly_nonbot_edits
    FROM neilpquinn.editor_month
    WHERE
        month >= "{start}" 
        AND month < "{end}" 
        -- A user is a bot if they have a matching name or have the bot flag on *any* wiki
        -- See https://meta.wikimedia.org/wiki/Research:Active_editor AND https://meta.wikimedia.org/wiki/Research:Bot_user
        AND bot_by_group = FALSE
        AND (user_name not regexp "bot\\b" or user_name in ("Paucabot", "Niabot", "Marbot"))
    GROUP BY wiki
""".format(**query_vars))

top_10(mnbe, "monthly_nonbot_edits")

Unnamed: 0,database_code,monthly_nonbot_edits
252,wikidatawiki,5988898.0
102,enwiki,3353922.0
10,commonswiki,2813452.0
98,dewiki,716323.1
368,frwiki,600207.5
312,ruwiki,385957.1
544,eswiki,385138.0
289,itwiki,296275.2
620,zhwiki,268719.9
656,jawiki,253945.8


In [53]:
merge_in(mnbe)

## Edits Gini coefficient

In [54]:
user_edits = wmf.hive.run("""
    SELECT
        wiki_db AS wiki,
        COUNT(*) AS user_edits
    FROM
        wmf.mediawiki_history
    WHERE
        event_entity = "revision" 
        AND event_type = "create" 
        AND snapshot = "{snapshot}" 
        AND event_timestamp >= "{start}" 
        AND event_timestamp < "{end}" 
        AND SIZE(event_user_is_bot_by_historical) = 0
        --event_user_is_bot_by_name = false 
        --array_contains(event_user_groups, "bot") = false
        --array_contains(event_user_is_bot_by, "NULL")= false
    GROUP BY event_user_id, wiki_db
""".format(**query_vars))

In [55]:
# FROM https://github.com/oliviaguest/gini
def gini(array):
    """Calculate the Gini coefficient of a numpy array."""
    # based on bottom eq:
    # http://www.statsdirect.com/help/generatedimages/equations/equation154.svg
    # FROM:
    # http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm
    # All values are treated equally, arrays must be 1d:
    array = array.flatten()
    if np.amin(array) < 0:
        # Values cannot be negative:
        array -= np.amin(array)
    # Values cannot be 0:
    array = array + 0.0000001
    # Values must be sorted:
    array = np.sort(array)
    # Index per array element:
    index = np.arange(1,array.shape[0]+1)
    # Number of array elements:
    n = array.shape[0]
    # Gini coefficient:
    return ((np.sum((2 * index - n - 1) * array)) / (n * np.sum(array)))

In [56]:
egc = user_edits.groupby("wiki").apply(lambda g: gini(g["user_edits"].values)).reset_index()

In [57]:
egc.columns = ["database_code", "edits_Gini_coefficient"]

In [58]:
merge_in(egc)

## Monthly editors

In [59]:
#formerly a mariadb query pulling from staging.editor_month and using bot_flag = 0 to identify bots, as well as the following to filter bot editors 
#convert(user_name using utf8) not regexp "bot\\\\b" or convert(user_name using utf8) in ("Paucabot", "Niabot", "Marbot")

me = wmf.hive.run("""
SELECT
    wiki AS database_code,
    COUNT(*) / 12 AS monthly_editors
FROM neilpquinn.editor_month
WHERE
    month >= "{start}" 
    AND month < "{end}" 
    --local_user_id != 0 
    AND bot_by_group = FALSE
    AND (user_name not regexp "bot\\b" or user_name in ("Paucabot", "Niabot", "Marbot"))
GROUP BY wiki
""".format(**query_vars))

top_10(me, "monthly_editors")

Unnamed: 0,database_code,monthly_editors
102,enwiki,134582.0
10,commonswiki,34391.666667
98,dewiki,20318.0
252,wikidatawiki,18779.416667
368,frwiki,18163.0
544,eswiki,17499.583333
656,jawiki,13335.0
312,ruwiki,11571.75
289,itwiki,8859.333333
620,zhwiki,7928.5


In [60]:
merge_in(me)

## Unique devices per editor

In [61]:
wikis["unique_devices_per_editor"] = wikis["monthly_unique_devices"] / wikis["monthly_editors"]

In [62]:
wikis = wikis.replace([np.inf], 0)

## Article COUNT

In [63]:
#wikis_list = wikis["wiki"].tolist()
wikis_list = wikis["database_code"].tolist()

In [64]:
#as of 09/19
##wikis_list_not_working = ['alswiktionary', 'alswikibooks', 'alswikiquote', 'mowiki', 'mowiktionary']

In [65]:
#wikis_list_clean = [x for x in wikis_list if x not in wikis_list_not_working]

In [66]:
ac = wmf.mariadb.run("""
SELECT
    database() AS database_code,
    ss_good_articles AS article_COUNT
FROM site_stats
""", (wikis_list))

In [67]:
top_10(ac, "article_COUNT")

Unnamed: 0,database_code,article_COUNT
696,wikidatawiki,65066940
92,commonswiki,56130655
151,enwiktionary,6146211
144,enwiki,5960275
403,mgwiktionary,5835447
85,cebwiki,5378850
603,svwiki,3747919
209,frwiktionary,3615370
119,dewiki,2358511
202,frwiki,2149876


In [68]:
merge_in(ac)

## Cumulative content edits

In [69]:
#updated with new snapshot
#updated with differently filter for bot identification. Previously used `event_user_is_bot_by_name = false`. 
#Now using `SIZE(event_user_is_bot_by_historical) = 0`

cce = wmf.hive.run("""
    SELECT
        wiki_db AS database_code,
        COUNT(*) AS cumulative_content_edits
    FROM
        wmf.mediawiki_history
    WHERE
        event_entity = "revision" 
        AND event_type = "create" 
        AND snapshot = "{snapshot}" 
        AND page_namespace_is_content = true 
        AND SIZE(event_user_is_bot_by_historical) = 0 
        AND array_contains(event_user_groups, "bot") = false
    GROUP BY wiki_db
""".format(**query_vars))

In [70]:
top_10(cce, "cumulative_content_edits")

Unnamed: 0,database_code,cumulative_content_edits
658,enwiki,566201296
22,wikidatawiki,372857088
505,commonswiki,198248629
407,dewiki,116621022
586,frwiki,92946573
200,eswiki,79143135
50,ruwiki,61670956
71,jawiki,56818195
534,itwiki,55942300
517,zhwiki,32255056


In [71]:
merge_in(cce)

## Edits per content page

In [72]:
wikis["edits_per_content_page"] = wikis["cumulative_content_edits"] / wikis["article_COUNT"]

## Script direction

In [73]:
rtl_url = "https://noc.wikimedia.org/conf/dblists/rtl.dblist"
rtl_wikis = pd.Series(requests.get(rtl_url).text.split("\n"))
rtl = pd.DataFrame({"database_code": rtl_wikis, "script_direction": "right-to-left"})

merge_in(rtl)
wikis["script_direction"] = wikis["script_direction"].replace([0], "left-to-right")

## Monthly structured discussions messages

In [74]:
msdm = wmf.mariadb.run(
"""
SELECT
    rev_user_wiki AS database_code,
    COUNT(*) / 12 AS monthly_structured_discussions_messages
FROM flowdb.flow_revision
WHERE
    rev_change_type in ("new-post", "reply") 
    AND date_format(FROM_unixtime(
        (conv(substring(hex(rev_id), 1, 12), 16, 10) >> 2) / 1000),
        "%Y-%m-%d %H:%i:%S") >= "{start}" 
    AND date_format(FROM_unixtime(
        (conv(substring(hex(rev_id), 1, 12), 16, 10) >> 2) / 1000),
        "%Y-%m-%d %H:%i:%S") < "{end}"
GROUP BY rev_user_wiki
""".format(**query_vars), "wikishared")

top_10(msdm, "monthly_structured_discussions_messages")

Unnamed: 0,database_code,monthly_structured_discussions_messages
21,mediawikiwiki,3603.5
9,frwiki,3052.9167
37,zhwiki,1699.8333
0,arwiki,1220.5833
36,wikidatawiki,1212.1667
2,cawiki,749.4167
5,elwiki,328.5833
25,plwiki,223.25
16,hewiki,222.6667
6,fawiki,160.5833


In [75]:
merge_in(msdm)

## Visual edits

In [76]:
#formerly a mariadb query pulling from staging.editor_month and using bot_flag = 0 to identify bots, as well as the following to filter bot editors 
#convert(user_name using utf8) not regexp "bot\\\\b" or convert(user_name using utf8) in ("Paucabot", "Niabot", "Marbot")

ve = wmf.hive.run("""
SELECT
    wiki_db AS database_code,
    SUM(CAST(ARRAY_CONTAINS(revision_tags, "visualeditor") AS INT))/count(*) AS visual_edits
FROM wmf.mediawiki_history
WHERE
    event_timestamp >= "{start}" 
    AND event_timestamp < "{end}"
    -- A user is a bot if they have a matching name or have the bot flag on *any* wiki
    -- See https://meta.wikimedia.org/wiki/Research:Active_editor AND https://meta.wikimedia.org/wiki/Research:Bot_user
    AND (
        SIZE(event_user_is_bot_by) = 0 OR size(event_user_is_bot_by_historical) = 0
        )
    AND (event_user_id not regexp "bot\\b" OR event_user_id in ("Paucabot", "Niabot", "Marbot"))
    AND snapshot = "{snapshot}"
group by wiki_db
""".format(**query_vars))

top_10(ve, "visual_edits")

Unnamed: 0,database_code,visual_edits
781,cnwikimedia,0.5
751,bgwikibooks,0.391144
498,angwiki,0.166667
693,suwiki,0.163495
311,ptwikiversity,0.151054
128,dinwiki,0.150012
137,fiwikibooks,0.14301
448,etwiki,0.130574
271,novwiki,0.128736
478,cawiki,0.119218


In [77]:
merge_in(ve)

## Mobile unique devices

In [78]:
mob_ud = wmf.hive.run("""
SELECT
    regexp_replace(
        regexp_replace(
            regexp_replace(regexp_replace(domain, "www\\\\.", ""), "zero\\\\.", ""),
        '^m\\\\.', ''),
    '\\\\.m\\\\.', '.') AS domain_name,
    SUM(if((domain regexp '^m\\\\.' or  domain regexp '\\\\.m\\\\.'), uniques_estimate, 0)) AS mobile_COUNT,
    SUM(uniques_estimate) AS total_COUNT,
    SUM(
        IF((domain regexp '^m\\\\.' or  domain regexp '\\\\.m\\\\.'), uniques_estimate, 0)
    ) / SUM(uniques_estimate) AS mobile_unique_devices
FROM wmf.unique_devices_per_domain_monthly
WHERE 
    CONCAT(year, month) >= "{pv_start}" AND
    CONCAT(year, month) < "{pv_end}"
    
    
    
GROUP BY    
    regexp_replace(
        regexp_replace(
            regexp_replace(regexp_replace(domain, "www\\\\.", ""), "zero\\\\.", ""),
        '^m\\\\.', ''),
    '\\\\.m\\\\.', '.')
""".format(**query_vars))

mob_ud["domain_name"] = "https://" + mob_ud["domain_name"]

In [79]:
mob_ud = mob_ud.replace("https://wikidata.org", "https://www.wikidata.org")

In [80]:
top_10(mob_ud, "mobile_unique_devices")

Unnamed: 0,domain_name,mobile_count,total_count,mobile_unique_devices
340,https://ig.wikipedia.org,1576494,1659716,0.949858
299,https://hi.wikibooks.org,1385981,1495662,0.926667
304,https://hi.wiktionary.org,1184357,1317786,0.898748
300,https://hi.wikipedia.org,80130832,91233823,0.878302
375,https://jv.wiktionary.org,239642,274477,0.873086
642,https://sa.wiktionary.org,336046,385697,0.871269
301,https://hi.wikiquote.org,286321,330944,0.865164
91,https://bn.wikisource.org,527266,613176,0.859893
334,https://id.wikiquote.org,485423,575474,0.843519
332,https://id.wikibooks.org,5630417,6683891,0.842386


In [81]:
merge_in(mob_ud, on="domain_name")

# Readying for spreadsheet

In [87]:
wikis.columns.tolist()

['database_code',
 'project_code',
 'language_code',
 'domain_name',
 'language_name',
 'project_name',
 'wiki_name',
 'monthly_active_editors',
 'monthly_new_active_editors',
 'monthly_unique_devices',
 'overall_SIZE_rank',
 'new_editor_retention',
 'bot_editing_proportion',
 'anonymous_editing_proportion',
 'revert_rate',
 'monthly_average_pageviews',
 'mobile_web_pageviews_proportion',
 'mobile_app_pageviews_proportion',
 'monthly_active_administrators',
 'monthly_nonbot_edits',
 'edits_Gini_coefficient',
 'monthly_editors',
 'unique_devices_per_editor',
 'article_COUNT',
 'cumulative_content_edits',
 'edits_per_content_page',
 'script_direction',
 'monthly_structured_discussions_messages',
 'visual_edits',
 'mobile_count',
 'total_count',
 'mobile_unique_devices',
 'mobile_editing_proportion',
 'majority_mobile_editors_proportion']

In [88]:
wikis = wikis[[
    'overall_SIZE_rank',
    'monthly_unique_devices',
    'mobile_unique_devices',
    'mobile_web_pageviews_proportion',
    'mobile_app_pageviews_proportion',
    'unique_devices_per_editor',
    'monthly_editors',
    'monthly_active_editors',
    'monthly_active_administrators',
    'majority_mobile_editors_proportion',
    'monthly_new_active_editors',
    'new_editor_retention',
    'monthly_nonbot_edits',
    'bot_editing_proportion',
    'mobile_editing_proportion',
    'visual_edits',
    'anonymous_editing_proportion',
    'revert_rate',
    'edits_Gini_coefficient',
    'monthly_structured_discussions_messages',
    'article_COUNT',
    'cumulative_content_edits',
    'edits_per_content_page',
    'script_direction',
    'database_code',
    'project_code',
    'language_code',
    'domain_name',
    'language_name',
    'project_name',
    'wiki_name',
]]

In [89]:
wikis.rename(columns={
    'article_COUNT':'content_pages',
    'anonymous_editing_proportion': 'anonymous_edits',
    'mobile_editing_proportion': 'mobile_edits',
    'bot_editing_proportion':'bot_edits',
    'new_editor_retention':'second_month_editor_retention',
    'majority_mobile_editors_proportion':'majority_mobile_editors',
    'mobile_app_pageviews_proportion':'mobile_app_pageviews',
    'mobile_web_pageviews_proportion':'mobile_web_pageviews'
}, inplace=True)

In [90]:
wikis.columns = wikis.columns.str.replace('_', ' ')

In [91]:
wikis.head()

Unnamed: 0,overall SIZE rank,monthly unique devices,mobile unique devices,mobile web pageviews,mobile app pageviews,unique devices per editor,monthly editors,monthly active editors,monthly active administrators,majority mobile editors,...,cumulative content edits,edits per content page,script direction,database code,project code,language code,domain name,language name,project name,wiki name
0,396.0,9795.166667,0.150372,0.049021,0.013506,763.25974,12.833333,2.166667,1.75,0.0,...,26423.0,4.421519,left-to-right,abwiki,wikipedia,ab,https://ab.wikipedia.org,Abkhazian,Wikipedia,Abkhazian Wikipedia
1,312.0,19162.166667,0.489045,0.22216,0.006298,1054.798165,18.166667,3.416667,1.75,0.0,...,39510.0,4.277363,left-to-right,acewiki,wikipedia,ace,https://ace.wikipedia.org,Achinese,Wikipedia,Achinese Wikipedia
2,569.0,5189.583333,0.107154,0.045236,0.006015,506.300813,10.25,0.333333,1.0,0.0,...,5211.0,12.556627,left-to-right,adywiki,wikipedia,ady,https://ady.wikipedia.org,Adyghe,Wikipedia,Adyghe Wikipedia
3,87.0,416171.916667,0.580397,0.349981,0.004144,3043.304692,136.75,33.416667,7.0,0.0,...,845199.0,9.765555,left-to-right,afwiki,wikipedia,af,https://af.wikipedia.org,Afrikaans,Wikipedia,Afrikaans Wikipedia
4,659.0,1103.666667,0.26933,0.206711,3.7e-05,662.2,1.666667,0.0,0.416667,0.0,...,653.0,28.391304,left-to-right,afwikibooks,wikibooks,af,https://af.wikibooks.org,Afrikaans,Wikibooks,Afrikaans Wikibooks


In [92]:
wikis.to_csv("wikis_test.csv", sep=',', encoding = 'utf-8', index=False)