In [12]:
import pandas as pd
import numpy as np
import pylab
%matplotlib inline

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

# Load data

In [13]:
# Read the per-wiki statistics export.
df = pd.read_csv("../data/20181019-wikia_stats_users_birthdate.csv")

# Parse the birth dates; values that cannot be parsed become NaT (errors='coerce').
birthDates = pd.to_datetime(df['datetime.birthDate'], infer_datetime_format=True, errors='coerce')
df['datetime.birthDate'] = birthDates

# Index the rows by birth date while keeping the column itself available.
df = df.set_index('datetime.birthDate', drop=False)
df.head()

Unnamed: 0_level_0,url,creation_date,domain,founding_user_id,headline,hub,id,lang,language,name,...,stats.nonarticles,users_1,users_5,users_10,users_20,users_50,users_100,bots,birthDate,datetime.birthDate
datetime.birthDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-05-01 14:14:00,http://spellmagotm.wikia.com,2012-05-01 13:58:13,spellmagotm.wikia.com,5069110.0,,Games,529058.0,en,en,Spellmagotm Wiki,...,221.0,5,1,1,1,1,0,5,"14:14, May 1, 2012",2012-05-01 14:14:00
2017-07-24 22:35:00,http://2017-monster-energy-nascar-cup-series.w...,2017-07-24 22:35:31,2017-monster-energy-nascar-cup-series.wikia.com,32529801.0,,TV,1601247.0,en,en,2017 Monster Energy NASCAR Cup Series Wiki,...,108.0,6,3,1,1,1,1,4,"22:35, July 24, 2017",2017-07-24 22:35:00
2009-09-15 23:21:00,http://10low46japreligion.wikia.com,2009-09-15 23:21:34,10low46japreligion.wikia.com,1602876.0,,Lifestyle,52061.0,en,en,Ancient Japanese Religion (Daramalan Assignmen...,...,239.0,7,2,1,1,1,1,5,"23:21, September 15, 2009",2009-09-15 23:21:00
2014-05-30 15:43:00,http://indigo-showdown.wikia.com,2014-05-30 15:43:23,indigo-showdown.wikia.com,25001469.0,,Games,982346.0,en,en,Indigo showdown Wiki,...,139.0,9,4,3,3,1,0,5,"15:43, May 30, 2014",2014-05-30 15:43:00
2011-02-18 23:15:00,http://animewiki2.wikia.com,2011-02-18 23:15:14,animewiki2.wikia.com,1160460.0,,TV,221590.0,en,en,Animewiki2 Wiki,...,5203.0,33,21,16,14,9,6,3,"23:15, February 18, 2011",2011-02-18 23:15:00


# Number of wikis over the years

In [14]:
# Count the non-null wiki ids in each calendar-year bucket of the birth-date index.
byYear = df['id'].resample('y').count()
byYear

datetime.birthDate
1998-12-31      133
1999-12-31        4
2000-12-31        0
2001-12-31        2
2002-12-31        0
2003-12-31        1
2004-12-31       30
2005-12-31      438
2006-12-31      950
2007-12-31      934
2008-12-31     2669
2009-12-31     8927
2010-12-31    14432
2011-12-31    16746
2012-12-31    24404
2013-12-31    26663
2014-12-31    25167
2015-12-31    22074
2016-12-31    19324
2017-12-31    40105
2018-12-31    74791
Name: id, dtype: int64

The data shows an anomaly: some wikis have birth dates before 2004, even though Wikia was created in 2004. A deeper analysis of some of the anomalous wikis shows that some fake birth dates were introduced by a maintenance script (see [the earliest edit of this wiki, as an example](http://blasterman.wikia.com/wiki/Blasterman_Wiki?dir=prev&action=history)).

Additionally, wikis created in 2018 are very young (no more than two months old), so they will also be removed.


In [15]:
# Keep only wikis born between 2004 and 2017 (both years included).
# A boolean mask on the index year replaces the partial-string slice
# df['2004':'2017']: the index is not sorted and the slice bounds are not
# actual index values, which newer pandas releases reject for a
# non-monotonic DatetimeIndex. The mask keeps the original row order and
# drops rows whose birth date is NaT (their year compares as False).
birthYear = df.index.year
dfClean = df[(birthYear >= 2004) & (birthYear <= 2017)].copy()
byYear = dfClean.resample('y').count()['id']

# Active Wikis: at least one active user in the last 30 days
# (the filter requires stats.activeUsers >= 1 and users_1 > 0).
isActive = (dfClean['stats.activeUsers'] >= 1) & (dfClean['users_1'] > 0)
activeByYear = dfClean[isActive].resample('y').count()['id']
activeByYear

datetime.birthDate
2004-12-31      21
2005-12-31     322
2006-12-31     636
2007-12-31     564
2008-12-31    1217
2009-12-31    1887
2010-12-31    1968
2011-12-31    2142
2012-12-31    2471
2013-12-31    2710
2014-12-31    2684
2015-12-31    2898
2016-12-31    3198
2017-12-31    5511
Freq: A-DEC, Name: id, dtype: int64

In [16]:
# Grouped bar chart: total vs. active wikis per birth year.
traceTotal = go.Bar(x=byYear.index.year, y=byYear.values, name="Total wikis")
traceActive = go.Bar(x=activeByYear.index.year, y=activeByYear.values, name="Active wikis")
layout = go.Layout(
    legend=dict(x=0.1, y=0.85),
    xaxis=dict(
        title="Year",
        # tickmode='linear' is the current spelling of the legacy
        # autotick=False flag, which validated graph objects do not accept.
        tickmode='linear',
        tickangle=30
    ),
    yaxis=dict(title="Number of wikis")
)
iplot(go.Figure(data=[traceTotal, traceActive], layout=layout), filename='byYear')

## Wiki age

Instead of showing the number of wikis over the years, we will focus on the age of the active wikis in order to visualize the population pyramid.

In [17]:
def computeAge(birthDate, referenceDate=pd.Timestamp(2018, 2, 20)):
    """Return the age of a wiki in whole years at ``referenceDate``.

    Parameters
    ----------
    birthDate : pd.Timestamp
        Creation timestamp of the wiki.
    referenceDate : pd.Timestamp, optional
        Moment at which the age is measured. Defaults to 2018-02-20, the
        date that was previously hard-coded in this function.

    Returns
    -------
    int
        Number of elapsed whole days divided by 365, truncated toward zero.
        Leap days are not compensated for.
    """
    timeSinceBirth = referenceDate - birthDate
    return int(timeSinceBirth.days/365)
# Derive each wiki's age by passing its birth date through computeAge.
dfClean['age'] = dfClean['datetime.birthDate'].map(computeAge)

In [18]:
# Both masks are written out explicitly instead of negating one of them;
# comparisons against missing values evaluate to False in either mask.
activeMask = dfClean['stats.activeUsers'].ge(1) & dfClean['users_1'].gt(0)
inactiveMask = dfClean['stats.activeUsers'].lt(1) | dfClean['users_1'].eq(0)

activeWikis = dfClean[activeMask]
inactiveWikis = dfClean[inactiveMask]

In [19]:
# Number of wikis (non-null urls) per age, for each of the two groups.
activeByAge = activeWikis.groupby('age')['url'].count()
inactiveByAge = inactiveWikis.groupby('age')['url'].count()

In [20]:
def _ageAreaTrace(countsByAge, label):
    """Build a thin-lined, filled line trace from a count-per-age Series."""
    return go.Scatter(
        x=countsByAge.index.values,
        y=countsByAge.values,
        mode='lines',
        name=label,
        line=dict(width=0.5),
        fill='tonexty'
    )


trace0 = _ageAreaTrace(activeByAge, "Active wikis")
# trace1 is built but not passed to the figure below; only trace0 is plotted.
trace1 = _ageAreaTrace(inactiveByAge, "Inactive wikis")

# The x axis takes the left half of the figure, with one tick per year of age.
ageAxis = dict(
    domain=[0,0.5],
    tickmode='array',
    tickvals=list(range(0,20)),
    title="Age (in years)"
)
layout = go.Layout(
    yaxis=dict(title='Number of active wikis'),
    xaxis=ageAxis,
    legend=dict(x=0.4)
)

fig = go.Figure(data=[trace0], layout=layout)
iplot(fig, filename='stacked-area-plot')