In [3]:
import urllib
import requests
import json
import datetime
from urllib.parse import quote, urlencode

import numpy as np
import pandas as pd
import pickle

In [2]:
# MediaWiki API endpoint (api.php) of the Ukrainian Wikipedia; used as the base URL for revision queries.
HOST = "https://uk.wikipedia.org/w/api.php"

In [25]:
def getViews(page, start, end, project='uk.wikipedia'):
    """
    Get the daily number of views of one page between start and end.

    params: page: page title, either raw or already percent-encoded
    params: start: first day of the range, formatted YYYYMMDDHH (e.g. "2018060700")
    params: end: last day of the range, same format as start
    params: project: which wiki (e.g. 'uk.wikipedia')
    return: df: DataFrame with columns 'views' and 'timestamp' (parsed to datetime),
                one row per item returned by the API
    raises: whatever urllib.request.urlopen raises when the request fails
    """
    # `import urllib` alone does not guarantee that the `request` submodule is loaded.
    import urllib.request

    # Percent-encode the title so non-ASCII names produce a valid URL. '%' is kept
    # as-is so titles that the caller already encoded are not encoded twice.
    base_url = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/%s/all-access/all-agents/%s/daily/%s/%s" % (
        quote(project), quote(page, safe='%'), start, end)

    # Close the HTTP response deterministically instead of leaking the connection.
    with urllib.request.urlopen(base_url) as response:
        items = json.loads(response.read().decode('utf-8'))['items']

    # Selecting columns at construction time also yields a well-formed (empty)
    # frame when the API returns no items.
    df = pd.DataFrame(items, columns=['views', 'timestamp'])
    df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y%m%d%H')
    return df

In [39]:
# Scratch check: find the first j in 0..9 at which the running total reaches 5.
temp_sum = 0
for j in range(10):
    temp_sum = temp_sum + j
    if temp_sum < 5:
        continue
    # Threshold reached: remember the index and stop scanning.
    eps = j
    break

print(eps)

3


In [4]:
def GetRevisions_andAge(pageTitle, timestamp):
    """
    Get the authors of all revisions of a page made at or before a given moment,
    together with the age of the page at that moment.

    params: pageTitle: which page you want data for
    params: timestamp: moment to look back from, formatted '%Y-%m-%dT%H:%M:%SZ'
    return: users: list with the user name of every revision (newest first)
    return: delta.days: age of page in days (from first revision till timestamp)
    raises: KeyError when the API returns no revisions for the page
    """
    # `import urllib` alone does not guarantee that the `request` submodule is loaded.
    import urllib.request

    params = {
        "action": "query",
        "format": "json",
        "prop": "revisions",
        # "max" asks for the largest batch the API allows for this client.
        "rvlimit": "max",
        "rvstart": timestamp,  # It goes down (new on top)
        "titles": pageTitle,
    }
    users = []
    oldest_timestamp = None

    while True:
        # Close each HTTP response deterministically instead of leaking it.
        with urllib.request.urlopen(HOST + "?" + urlencode(params)) as response:
            dataJson = json.loads(response.read().decode('utf-8'))

        pages = dataJson["query"]["pages"]
        # Only one title is requested, so only the first page entry is relevant.
        page = next(iter(pages.values()), {})
        revisions = page.get('revisions', [])
        users.extend(rev["user"] for rev in revisions)
        if revisions:
            # Results are ordered newest first, so the last one seen is the oldest.
            oldest_timestamp = revisions[-1]['timestamp']

        if "continue" not in dataJson:
            break
        # Forward every continuation value the API handed back, not just rvcontinue.
        params.update(dataJson["continue"])

    if oldest_timestamp is None:
        raise KeyError("no revisions found for page {!r} at or before {}".format(pageTitle, timestamp))

    page_born = datetime.datetime.strptime(oldest_timestamp, '%Y-%m-%dT%H:%M:%SZ')
    timestamp_datetime = datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%SZ')

    delta = timestamp_datetime - page_born
    return users, delta.days

In [34]:
def create_timeseries(page_titles, str_date_from, number_of_days):
    """
    Create a per-page, per-day timeseries starting from the given date and going back in time.

    params: page_titles: which pages you want data for
    params: str_date_from: first (newest) moment, formatted '%Y-%m-%dT%H:%M:%SZ'
    params: number_of_days: how many days to cover, counting str_date_from itself
    return: df: DataFrame with one row per (day, page) and the columns
                timestamp, page_name, revisions_count, contributors_count,
                age_of_page_days, num_of_views
    Pages whose revision lookup fails are reported on stdout and skipped for that day.
    """
    columns = ['timestamp', 'page_name', 'revisions_count',
               'contributors_count', 'age_of_page_days', 'num_of_views']
    datetime_start = datetime.datetime.strptime(str_date_from, "%Y-%m-%dT%H:%M:%SZ")

    # Rows are collected in a plain list and turned into a frame once at the end;
    # growing a DataFrame row by row is quadratic and DataFrame.append no longer
    # exists in pandas 2.x.
    records = []
    for i in range(number_of_days):
        time_query = datetime_start - datetime.timedelta(days=i)
        time_query_revs_str = time_query.strftime("%Y-%m-%dT%H:%M:%SZ")
        time_query_views_str = time_query.strftime("%Y%m%d00")

        print("Run for {}, when took {} days back".format(time_query_revs_str, i))
        for page_name in page_titles:
            try:
                users_updated, age_of_page_days = GetRevisions_andAge(page_name, time_query_revs_str)
                try:
                    # Same day as both start and end: the single returned row is that day's views.
                    num_of_views = int(getViews(page_name, time_query_views_str, time_query_views_str).iloc[0]['views'])
                except Exception:
                    # Best effort: a failed or empty page-view lookup is recorded as zero
                    # views. `Exception` (not a bare except) keeps Ctrl-C working.
                    num_of_views = 0
                records.append({'timestamp': time_query_revs_str,
                                'page_name': page_name,
                                'revisions_count': len(users_updated),
                                'contributors_count': len(set(users_updated)),
                                'age_of_page_days': age_of_page_days,
                                'num_of_views': num_of_views})
            except Exception as e:
                print("Error for page {} on {} day before".format(page_name, i))
                print("Error message: {}".format(e))
    return pd.DataFrame(records, columns=columns)

In [4]:
# Number of page titles to keep (the last TEST_SUBSET entries of the loaded title list).
TEST_SUBSET = 100
# Number of days to look back from the reference date when collecting data.
DAYS = 40

In [7]:
# Inspect the page titles. NOTE(review): `titles` is only assigned in cells that appear further down this file.
titles

['Himantolophus_borealis',
 'Himantolophus_brevirostris',
 'Himantolophus_compressus',
 'Himantolophus_cornifer',
 'Himantolophus_crinitus',
 'Himantolophus_danae',
 'Himantolophus_litoceras',
 'Himantolophus_macroceras',
 'Himantolophus_macroceratoides',
 'Himantolophus_mauli',
 'Himantolophus_melanolophus',
 'Himantolophus_multifurcatus',
 'Himantolophus_nigricornis',
 'Himantolophus_paucifilosus',
 'Himantolophus_pseudalbinares',
 'Himantolophus_stewarti',
 'Hipposideros_commersoni',
 'Hipposideros_griffini',
 'Hipposideros_khaokhouayensis',
 'Hipposideros_pelingensis',
 'Hipposideros_rotalis',
 'Hipposideros_scutinares',
 'Histiotus_magellanicus',
 'Historia_Naturalis_Brasiliae',
 'Hoegaarden',
 'Hold_On_(пісня_Young_Buck)',
 'Homestead',
 'Honda_FCX',
 'Honda_Odyssey',
 'Honda_RCV1000R',
 'Honeymoon',
 'Hoplostethus_druzhinini',
 'Hoplostethus_mento',
 'Humanoid_(альбом)',
 'Hybognathus_regius',
 'Hydac',
 'Hylomys_megalotis',
 'Hypsugo_dolichodon',
 'Hyundai_Xcient',
 'Hyundai_ix

In [30]:
import timeit

str_date_from = "2018-06-07T12:00:00Z"
datetime_start = datetime.datetime.strptime(str_date_from, "%Y-%m-%dT%H:%M:%SZ")
# Page-view window: from DAYS days before str_date_from up to str_date_from,
# both formatted as YYYYMMDD00 strings.
start = (datetime_start - datetime.timedelta(days=DAYS)).strftime("%Y%m%d00")
end = datetime_start.strftime("%Y%m%d00")

# Fetch the whole window with one request per page and time the full run.
test = {}
start_time = timeit.default_timer()
for page_name in titles:
    try:
        test[page_name] = getViews(quote(page_name), start, end, project='uk.wikipedia')
    except Exception as e:
        # Report why the page failed; `Exception` (not a bare except) keeps Ctrl-C working.
        print("Failed for page {}".format(page_name))
        print("Error message: {}".format(e))
stop = timeit.default_timer()
print("Elapsed time: {} s".format(stop-start_time))

Failed for page INCA1
Elapsed time: 24.4233717256295 s


In [27]:
# Display the dict that maps each page title to its page-views DataFrame.
test

{'Himantolophus_borealis':     views  timestamp
 0       1 2018-04-28
 1       1 2018-04-30
 2       1 2018-05-01
 3       2 2018-05-03
 4       1 2018-05-04
 5       1 2018-05-05
 6       1 2018-05-06
 7       1 2018-05-07
 8       1 2018-05-10
 9       1 2018-05-12
 10      1 2018-05-13
 11      1 2018-05-14
 12      1 2018-05-15
 13      1 2018-05-16
 14      1 2018-05-17
 15      1 2018-05-18
 16      1 2018-05-19
 17      3 2018-05-20
 18      1 2018-05-22
 19      1 2018-05-23
 20      1 2018-05-26
 21      1 2018-05-27
 22      1 2018-05-28
 23      1 2018-05-29
 24      1 2018-05-30
 25      1 2018-05-31
 26      1 2018-06-02
 27      2 2018-06-03
 28      1 2018-06-05
 29      2 2018-06-06
 30      1 2018-06-07, 'Himantolophus_brevirostris':    views  timestamp
 0      1 2018-05-09
 1      1 2018-05-12
 2      1 2018-05-16
 3      1 2018-05-19
 4      2 2018-05-27
 5      1 2018-06-03
 6      1 2018-06-06, 'Himantolophus_compressus':     views  timestamp
 0       1 2018-04-29


In [6]:
#Load Yurii data

# NOTE(review): pickle.load can execute arbitrary code; only load 'pages.pkl' from a trusted source.
# The context manager closes the file handle once the data is loaded.
with open('pages.pkl', 'rb') as pkl_file:
    data1 = pickle.load(pkl_file)

titles = [i[0].replace(" ", "_") for i in data1] # only Ukr titles
# Keep only the last TEST_SUBSET titles.
titles = titles[-TEST_SUBSET:]
#my_df = create_timeseries(titles, "2018-06-07T12:00:00Z", DAYS)
#my_df.to_csv('{}days{}pages.csv'.format(DAYS,TEST_SUBSET))

In [37]:
# Display the collected timeseries. NOTE(review): `my_df` is only assigned in commented-out code in this file — confirm where it comes from.
my_df

Unnamed: 0,timestamp,page_name,revisions_count,contributors_count,age_of_page_days,num_of_views
0,2018-06-07T12:00:00Z,"Dubai_Duty_Free_Women's_Open_2002,_одиночний_р...",4,2,164,0.0
1,2018-06-07T12:00:00Z,"Dubai_Tennis_Championships_2004,_жінки,_одиноч...",2,1,164,0.0
2,2018-06-07T12:00:00Z,Durban_Roodepoort_Deep,10,6,3974,0.0
3,2018-06-07T12:00:00Z,Dymchuk_Gallery,19,3,1048,1.0
4,2018-06-07T12:00:00Z,Dysphoria,18,7,1245,0.0
5,2018-06-07T12:00:00Z,E!,3,3,3460,1.0
6,2018-06-07T12:00:00Z,ECTS-оцінки,12,12,4330,0.0
7,2018-06-07T12:00:00Z,EEPD1,1,1,261,1.0
8,2018-06-07T12:00:00Z,EIFL,18,12,4492,1.0
9,2018-06-07T12:00:00Z,EJ_675,97,25,2243,7.0


In [None]:
# Not translated pages
df = pd.read_csv("C:/Users/kzorina/Studing/MMDS/not_eng_titles_500.csv")
# Take entries 50..99 of the 'uk_title' column and swap spaces for underscores.
titles = [name.replace(" ", "_") for name in df['uk_title'].tolist()[50:100]]
# Build the timeseries for this slice and store it as CSV.
not_eng_df = create_timeseries(titles, "2018-06-07T12:00:00Z", DAYS)
not_eng_df.to_csv('Not_translated{}days{}pages.csv'.format(DAYS,100))

### Drafts

In [None]:
#Demo how function works
users_updated, age_of_page_days = GetRevisions("Львів","2018-01-15T14:56:00Z")
revisions = len(users_updated)
contributors = len(set(users_updated))

print("Revisions: ".format(revisions))
print("contributors".format(contributors))
print("Age of page".format(age_of_page_days))

In [None]:
# In development....
def get_lang():
    """
    For every title find its translations (among 10 lang for now)
    + get their first revision
    """
    