In [34]:
import datetime
import os
import urllib
import urllib.parse

import pandas as pd
import requests
import wikipedia
from dateutil.relativedelta import relativedelta

In [62]:
articles = 'data/articles.tsv'

# Read with an explicit encoding so the result does not depend on the
# platform's default locale encoding.
with open(articles, 'r', encoding='utf-8') as f:
    # The first 12 lines are skipped -- presumably the file's comment header
    # (TODO confirm against data/articles.tsv).
    raw_names = f.read().strip().split('\n')[12:]

# Entries are percent-encoded in the file; decode them to readable titles.
# `urllib.parse` must be imported explicitly: a bare `import urllib` does not
# guarantee that the `parse` submodule is loaded.
article_names = [urllib.parse.unquote(name) for name in raw_names]
article_names[:10]

['Áedán_mac_Gabráin',
 'Åland',
 'Édouard_Manet',
 'Éire',
 'Óengus_I_of_the_Picts',
 '€2_commemorative_coins',
 '10th_century',
 '11th_century',
 '12th_century',
 '13th_century']

# USER ACTIVITY

Documentation and example of GET requests available at https://wikimedia.org/api/rest_v1/#!/Pageviews_data/

Requests library documentation: http://docs.python-requests.org/en/master/

In [39]:
def getUserActivity(article, granularity, start, end, project ="en.wikipedia.org",
                    access="all-access", agent="user",dateformat="iso"):
    """
    Fetch the page-view counts of one article over a period of time.

    article: title of the Wikipedia article, NOT percent-encoded (spaces or
        underscores are both accepted; the title is encoded here)
    granularity: time granularity of the activity, either 'monthly' or 'daily'
    start: start date of the period as a datetime.datetime object
    end: end date of the period as a datetime.datetime object
    project: domain of the Wikimedia project to query (by default en.wikipedia.org)
    access: access method filter, one of desktop, mobile-app or mobile-web
        (by default all-access)
    agent: agent type filter, one of user, bot or spider (by default user)
    dateformat: format of the dates in the result, one of 'iso' (ISO 8601
        string), 'ordinal' (proleptic Gregorian ordinal) or 'datetime'
        (datetime.datetime object; also used for any unrecognised value)
    return:
        a list of [views, date] pairs, one per item returned by the API, e.g.
        [[user_activity_value1, date1], [user_activity_value2, date2]]
    raises:
        requests.HTTPError if the API answers with an error status,
        KeyError if the response body has no 'items' entry.
    """
    dstart = start.strftime("%Y%m%d")
    dend = end.strftime("%Y%m%d")

    # The title is one URL path segment: use underscores for spaces and
    # percent-encode everything else (safe='' so that '/' is encoded too),
    # otherwise titles containing '/', '?', '#' or '%' corrupt the URL.
    title = urllib.parse.quote(article.replace(' ', '_'), safe='')
    path = ("https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"+project
            +"/"+access+"/"+agent+"/"+title+"/"+granularity+"/"+dstart+"/"+dend)

    # A timeout keeps a stalled connection from blocking the caller forever.
    r = requests.get(path, timeout=30)
    r.raise_for_status()

    # Decode the JSON body once instead of re-parsing it on every iteration.
    items = r.json()['items']

    res = []
    for item in items:
        # Label each value with the timestamp the API reports for it
        # (format YYYYMMDDHH) rather than with `start + i`: the latter
        # mislabels values whenever the API skips a period or its first
        # bucket does not coincide with `start`.
        time_label = datetime.datetime.strptime(item['timestamp'], "%Y%m%d%H")
        if dateformat == 'iso':
            time_label = time_label.isoformat()
        elif dateformat == 'ordinal':
            time_label = time_label.toordinal()

        res.append([item['views'], time_label])
    return res

Example: collect the categories and the daily number of views during 2017 for every article in the list (titles that could not be processed are printed below)

In [63]:
# Period covered by the page-view queries.
s = datetime.datetime(year=2017,month=1,day=1)
e = datetime.datetime(year=2017,month=12,day=31)

# Turn file-style names into readable titles: '__' becomes ': ' and '_' a space.
article_names = list(map(lambda x: x.replace('__', ': ').replace('_', ' '), article_names))

# Collect one record per article and build the DataFrame once at the end.
# Appending to a DataFrame inside the loop copies the whole frame on every
# iteration, and DataFrame.append no longer exists in pandas >= 2.0.
records = []
for article in article_names:
    try:
        p = wikipedia.page(article)
        res = getUserActivity(article=article, granularity='daily', start=s, end=e, dateformat='iso')
        records.append({'article': article, 'categories': p.categories, 'traffic': res})
    except Exception:
        # Best effort: report the title that could not be processed and move on.
        print(article)

# `columns=` keeps the expected schema even when no article succeeded.
article_df = pd.DataFrame(records, columns=['article', 'categories', 'traffic'])



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Aggregator
Amur
Bantu
Battles of the Mexican-American War
Beet
Bjørnøya
Blackbird
Capital
Catherine II of Russia
Cocoa
Conflict
Cubeb
Dark Ages
Directdebit
Doom
Durham
Effect of Hurricane Katrina on New Orleans
Forth
Friend Directdebit
Gallery of the Kings and Queens of England
Garage (dance music)
Global
Helen
Macedon
Market
Newmarket
Newshounds
Poetry of the United States
Prehistoric man
RER
Race
Recorder
Sandur
Set
Sponsorship Directdebit
Terik
Weakest Link
Weymouth
Wikipedia Text of the GNU Free Documentation License
William Gilbert
William and Mary
Woodruff
Wowpurchase
Zulu


In [64]:
# Serialize the collected table to a pickle file in the working directory.
article_df.to_pickle(path='wiki_data.pkl')

In [85]:
# Flatten the per-article category lists into a single list (duplicates kept).
# A single pass over the column avoids rebuilding the whole list on every
# iteration, which `categories = categories + ...` did.
categories = [
    category
    for article_categories in article_df['categories']
    for category in article_categories
]

len(categories)

122941

In [97]:
# Distinct category names across all collected articles.
{*categories}

{'Use dmy dates from September 2014',
 'Former Roman Catholics',
 'Spain',
 'Articles with unsourced statements from August 2014',
 'British racecar constructors',
 'Members of the House of Commons of Canada from Alberta',
 'Military units and formations established in 1942',
 'Films based on musicals',
 'People from Rumson, New Jersey',
 '1724 births',
 'People from LaRue County, Kentucky',
 'American non-fiction outdoors writers',
 'Fossil taxa described in 1854',
 'Writers from Missouri',
 'Strontium',
 '1020s births',
 'Anglo-Saxon pagans',
 'Army Black Knights football players',
 '9th-century illuminated manuscripts',
 'Articles that may contain original research from September 2015',
 'American people of the Indian Wars',
 'African-American golfers',
 'Statistical deviation and dispersion',
 'African-American Buddhists',
 'People from Votkinsk',
 'OpenDomesday',
 'Treaties of Namibia',
 'Claverack College alumni',
 'Mayors of places in Tennessee',
 'Rivers of San Juan County, Uta