In [1]:
import urllib
import requests
import json
import datetime
from urllib.parse import quote, urlencode

import numpy as np
import pandas as pd
import pickle

In [2]:
# Base URL of the api.php endpoint on uk.wikipedia.org that GetRevisions queries.
HOST = "https://uk.wikipedia.org/w/api.php"

In [15]:
def GetRevisions(pageTitle, timestamp):
    """
    Fetch the revision authors of a page up to a given moment.

    Queries the API at HOST for the revisions of ``pageTitle`` starting at
    ``timestamp`` and follows the continuation tokens until the API reports
    no more results.

    params: pageTitle: title of the page to query
    params: timestamp: moment to start from, formatted as '%Y-%m-%dT%H:%M:%SZ'
    return: (users, age_days): ``users`` holds one entry per returned revision,
            in the order the API returned them (None when the revision carries
            no "user" field, e.g. a hidden author); ``age_days`` is the number
            of whole days between the last returned revision and ``timestamp``
    raises: KeyError: when the response has no "query" section or no revisions
            were returned for the page
    """
    # `import urllib` alone does not guarantee the `request` submodule is loaded.
    import urllib.request

    time_format = '%Y-%m-%dT%H:%M:%SZ'
    params = {
        "action": "query",
        "format": "json",
        "prop": "revisions",
        # Only the fields used below are requested, to keep responses small.
        "rvprop": "user|timestamp",
        # "max" asks for the largest page size the API allows for this client.
        "rvlimit": "max",
        "rvstart": timestamp,  # It goes down (towards older revisions)
    }
    title_arg = "&titles=" + quote(pageTitle)

    users = []
    oldest_timestamp = None

    while True:
        url = HOST + "?" + urlencode(params) + title_arg
        # The context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(url) as response:
            dataJson = json.loads(response.read().decode('utf-8'))

        pages = dataJson["query"]["pages"]
        page = next(iter(pages.values()), {})
        revisions = page.get("revisions", [])

        # A revision with a hidden author has no "user" key; keep a None
        # placeholder so that len(users) still equals the revision count.
        users.extend(rev.get("user") for rev in revisions)
        if revisions:
            oldest_timestamp = revisions[-1]["timestamp"]

        continuation = dataJson.get("continue")
        if not continuation:
            break
        # Forward every continuation parameter, not only "rvcontinue".
        params.update(continuation)

    if oldest_timestamp is None:
        raise KeyError(
            "no revisions returned for page {!r} starting at {}".format(pageTitle, timestamp)
        )

    page_born = datetime.datetime.strptime(oldest_timestamp, time_format)
    timestamp_datetime = datetime.datetime.strptime(timestamp, time_format)
    delta = timestamp_datetime - page_born
    return users, delta.days
        
    
    
    

In [4]:
def create_timeseries(page_titles, str_date_from, number_of_days):
    """
    Create timeseries start from given date and go older
    params: page_titles: which pages you want data for
    params: str_date_from: this and number_of_days before will be taken,
            formatted as "%Y-%m-%dT%H:%M:%SZ"
    params: number_of_days: how many days back from start should we look
    return: df: created dataframe with timeseries, one row per (day, page)
            for which GetRevisions succeeded; pages that fail are reported
            on stdout and skipped
    """
    time_format = "%Y-%m-%dT%H:%M:%SZ"
    columns = ['timestamp', 'page_name', 'revisions_count',
               'contributors_count', 'age_of_page_days']

    datetime_start = datetime.datetime.strptime(str_date_from, time_format)

    # Rows are collected in a plain list and turned into a DataFrame once:
    # DataFrame.append no longer exists in pandas 2.x and was quadratic anyway.
    records = []
    for i in range(number_of_days):
        time_query = datetime_start - datetime.timedelta(days=i)
        time_query_str = time_query.strftime(time_format)
        print("Run for {}, when took {} days back".format(time_query_str, i))
        for page_name in page_titles:
            try:
                users_updated, age_of_page_days = GetRevisions(page_name, time_query_str)
            except Exception as error:
                # Best effort: keep going for the other pages, but show the
                # cause. `Exception` (not a bare except) lets Ctrl-C through.
                print("Error for page {} on {} day before: {!r}".format(page_name, i, error))
                continue
            records.append({'timestamp': time_query_str,
                            'page_name': page_name,
                            'revisions_count': len(users_updated),
                            'contributors_count': len(set(users_updated)),
                            'age_of_page_days': age_of_page_days})

    # Passing `columns` keeps the expected schema even when no row was collected.
    return pd.DataFrame(records, columns=columns)

In [None]:
#Demo how function works
users_updated, age_of_page_days = GetRevisions("Львів","2018-01-15T14:56:00Z")
revisions = len(users_updated)
contributors = len(set(users_updated))

# Each message needs a "{}" placeholder, otherwise format() drops the value.
print("Revisions: {}".format(revisions))
print("Contributors: {}".format(contributors))
print("Age of page (days): {}".format(age_of_page_days))

In [5]:
#Load Yurii data
# NOTE: unpickling can execute arbitrary code; only load 'pages.pkl' from a trusted source.
with open('pages.pkl', 'rb') as pkl_file:  # closes the file once loading is done
    data1 = pickle.load(pkl_file)

In [6]:
# only Ukr titles: first element of every record, with spaces turned into underscores
titles = [record[0].replace(" ", "_") for record in data1]

In [None]:
# Define the page subset right here so this cell does not rely on a cell
# further down having been executed first (it would fail on a fresh kernel).
test = titles[-100:]
my_df = create_timeseries(test, "2018-06-07T12:00:00Z", 40)

Run for 2018-06-07T12:00:00Z, when took 0 days back
Run for 2018-06-06T12:00:00Z, when took 1 days back
Run for 2018-06-05T12:00:00Z, when took 2 days back
Run for 2018-06-04T12:00:00Z, when took 3 days back
Run for 2018-06-03T12:00:00Z, when took 4 days back
Run for 2018-06-02T12:00:00Z, when took 5 days back
Run for 2018-06-01T12:00:00Z, when took 6 days back
Run for 2018-05-31T12:00:00Z, when took 7 days back
Run for 2018-05-30T12:00:00Z, when took 8 days back
Run for 2018-05-29T12:00:00Z, when took 9 days back
Run for 2018-05-28T12:00:00Z, when took 10 days back
Run for 2018-05-27T12:00:00Z, when took 11 days back
Run for 2018-05-26T12:00:00Z, when took 12 days back
Run for 2018-05-25T12:00:00Z, when took 13 days back
Run for 2018-05-24T12:00:00Z, when took 14 days back
Run for 2018-05-23T12:00:00Z, when took 15 days back
Run for 2018-05-22T12:00:00Z, when took 16 days back
Run for 2018-05-21T12:00:00Z, when took 17 days back
Run for 2018-05-20T12:00:00Z, when took 18 days back
Run

In [10]:
# Display the frame returned by create_timeseries.
my_df

Unnamed: 0,timestamp,page_name,revisions_count,contributors_count,age_of_page_days
0,2018-06-07T12:00:00Z,Himantolophus_borealis,5,1,1323
1,2018-06-07T12:00:00Z,Himantolophus_brevirostris,6,2,1323
2,2018-06-07T12:00:00Z,Himantolophus_compressus,7,3,1323
3,2018-06-07T12:00:00Z,Himantolophus_cornifer,9,2,1323
4,2018-06-07T12:00:00Z,Himantolophus_crinitus,4,2,1323
5,2018-06-07T12:00:00Z,Himantolophus_danae,5,1,1323
6,2018-06-07T12:00:00Z,Himantolophus_litoceras,5,2,1323
7,2018-06-07T12:00:00Z,Himantolophus_macroceras,7,2,1323
8,2018-06-07T12:00:00Z,Himantolophus_macroceratoides,3,1,1323
9,2018-06-07T12:00:00Z,Himantolophus_mauli,5,2,1323


In [16]:
# Restrict the run to the last 100 entries of `titles`.
test = titles[-100:]

In [8]:
# Show the selected page titles.
test

['Himantolophus_borealis',
 'Himantolophus_brevirostris',
 'Himantolophus_compressus',
 'Himantolophus_cornifer',
 'Himantolophus_crinitus',
 'Himantolophus_danae',
 'Himantolophus_litoceras',
 'Himantolophus_macroceras',
 'Himantolophus_macroceratoides',
 'Himantolophus_mauli',
 'Himantolophus_melanolophus',
 'Himantolophus_multifurcatus',
 'Himantolophus_nigricornis',
 'Himantolophus_paucifilosus',
 'Himantolophus_pseudalbinares',
 'Himantolophus_stewarti',
 'Hipposideros_commersoni',
 'Hipposideros_griffini',
 'Hipposideros_khaokhouayensis',
 'Hipposideros_pelingensis',
 'Hipposideros_rotalis',
 'Hipposideros_scutinares',
 'Histiotus_magellanicus',
 'Historia_Naturalis_Brasiliae',
 'Hoegaarden',
 'Hold_On_(пісня_Young_Buck)',
 'Homestead',
 'Honda_FCX',
 'Honda_Odyssey',
 'Honda_RCV1000R',
 'Honeymoon',
 'Hoplostethus_druzhinini',
 'Hoplostethus_mento',
 'Humanoid_(альбом)',
 'Hybognathus_regius',
 'Hydac',
 'Hylomys_megalotis',
 'Hypsugo_dolichodon',
 'Hyundai_Xcient',
 'Hyundai_ix

In [13]:
# Persist the collected time series to a CSV file in the working directory.
my_df.to_csv('40days100pages.csv')