In [1]:
import urllib
import requests
import json
import datetime
from urllib.parse import quote, urlencode

import numpy as np
import pandas as pd
import pickle

In [2]:
HOST = "https://uk.wikipedia.org/w/api.php"

In [3]:
def getViews(page, start, end, project='uk.wikipedia'):
    """
    Get number of Views for certain page for every day between start and end. 
    params: page: which pages you want data for
    params: start: starting from this date
    params: end: end on this date
    params: project: which wiki
    return: df: created dataframe with timeseries
    
    """
    base_url = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/%s/all-access/all-agents/%s/daily/%s/%s" % (quote(project),page,start,end)
    data = urllib.request.urlopen(base_url)
    dataJson = json.loads(data.read().decode('utf-8'))['items']
    df = pd.DataFrame(dataJson) [['views','timestamp']]
    df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y%m%d%H')
    return df

In [4]:
def GetRevisions_andAge(pageTitle, timestamp):
    """
    Get number of Revisions for certain day 
    params: pageTitle: which pages you want data for
    params: timestamp: on which date
    return: users: all user revisions
    return: delta.days: age of page in days (from first revision till timestamp)
    """
    params = {}
    params["action"] = "query"
    params["format"] = "json"
    params["prop"] = "revisions"
    params["rvlimit"] = "1000"
    params["rvstart"] = timestamp #It goes down (new on top)
    users = []
    revs = []
    
    while True:
        response = urllib.request.urlopen(HOST + "?" + urlencode(params) + "&titles=" + quote(pageTitle)).read().decode('utf-8')
        dataJson = json.loads(response)
        pages = dataJson["query"]["pages"]
        key = list(pages.keys())[0]
        revisions = pages[key]['revisions']
        users = users + [rev["user"] for rev in revisions]
        revs = revs + [rev for rev in revisions]
        
        if "continue" in dataJson.keys():
            cont = dataJson["continue"]["rvcontinue"]
            params["rvcontinue"] = cont
        else:
            break
    last_revision = revs[-1]
    page_born = datetime.datetime.strptime(last_revision['timestamp'], '%Y-%m-%dT%H:%M:%SZ')
    timestamp_datetime = datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%SZ')

    # today = datetime.datetime.today()
    delta = timestamp_datetime - page_born
    return users, delta.days

In [5]:
def create_timeseries(page_titles, str_date_from, number_of_days):
    """
    Create timeseries start from given date and go older
    params: page_titles: which pages you want data for
    params: str_date_from: this and number_of_days before will be taken
    params: number_of_days: how many days back from start should we look
    return: df: created dataframe with timeseries
    """
    datetime_start = datetime.datetime.strptime(str_date_from, "%Y-%m-%dT%H:%M:%SZ")
    df = pd.DataFrame(columns=['timestamp','page_name','revisions_count','contributors_count','age_of_page_days'])
    for i in range(number_of_days):
        time_query  = datetime_start - datetime.timedelta(days=i)
        time_query_revs_str = time_query.strftime("%Y-%m-%dT%H:%M:%SZ")
        time_query_views_str = time_query.strftime("%Y%m%d00")
        print(time_query_revs_str)
        
        print("Run for {}, when took {} days back".format(time_query_revs_str, i))
        for page_name in page_titles:
            try:
                users_updated, age_of_page_days = GetRevisions_andAge(page_name,time_query_revs_str)
                revisions = len(users_updated)
                contributors = len(set(users_updated))
                try:
                    num_of_views = int(getViews(page_name,time_query_views_str,time_query_views_str).iloc[0]['views'])
                except:
                    num_of_views = 0
                df = df.append({'timestamp':time_query_revs_str,
                            'page_name':page_name,
                            'revisions_count':revisions,
                            'contributors_count':contributors,
                            'age_of_page_days':age_of_page_days,
                            'num_of_views':num_of_views
                            }, ignore_index=True)
            except  Exception as e:
                print("Error for page {} on {} day before".format(page_name, i))
                print("Error message: {}".format(e))
    return df

In [32]:
def create_timeseries_from_translate_date(page_titles, translate_dates_str, number_of_days):
    """
    Create timeseries start from day of translation date and go older
    params: page_titles: which pages you want data for
    params: translate_dates: dim = dim(page_titles), corresponding translation datetimes
    params: number_of_days: how many days back from start should we look
    return: df: created dataframe with timeseries
    """
    
    start_time = timeit.default_timer()
    df = pd.DataFrame(columns=['timestamp','page_name','revisions_count','contributors_count','age_of_page_days'])
    
    for index, page_name in enumerate(page_titles):
        datetime_start = datetime.datetime.strptime(translation_time[index], "b'%Y%m%d%H%M%S'")
        if index % 100 == 0:
            stop_time = timeit.default_timer()
            print("On {} record elapsed time: {} s".format(index, stop_time-start_time))
        
        for i in range(number_of_days):
            time_query  = datetime_start - datetime.timedelta(days=i)
            time_query_revs_str = time_query.strftime("%Y-%m-%dT%H:%M:%SZ")
            time_query_views_str = time_query.strftime("%Y%m%d00")
            try:
                users_updated, age_of_page_days = GetRevisions_andAge(page_name,time_query_revs_str)
                revisions = len(users_updated)
                contributors = len(set(users_updated))
                try:
                    num_of_views = int(getViews(page_name,time_query_views_str,time_query_views_str).iloc[0]['views'])
                except:
                    num_of_views = 0
                df = df.append({'timestamp':time_query_revs_str,
                            'page_name':page_name,
                            'revisions_count':revisions,
                            'contributors_count':contributors,
                            'age_of_page_days':age_of_page_days,
                            'num_of_views':num_of_views
                            }, ignore_index=True)
            except  Exception as e:
                print("Error for page {} on {} day before".format(page_name, i))
                print("Error message: {}".format(e))
    
    return df

In [6]:
df = pd.read_csv("data/df_clean.csv")


In [10]:
titles = df['uk_title'].tolist()
translation_time = df['en_timestamp'].tolist()
##titles = [i.replace(" ", "_") for i in titles]
#titles = titles[50:100]
#not_eng_df = create_timeseries(titles, "2018-06-07T12:00:00Z", DAYS)
#not_eng_df.to_csv('Not_translated{}days{}pages.csv'.format(DAYS,100))

In [73]:
#HOST = "https://uk.wikipedia.org/w/api.php"
HOST = "https://en.wikipedia.org/w/api.php"

def GetCreator(pageTitle):
    """
    Get number of Revisions for certain day 
    params: pageTitle: which pages you want data for
    params: timestamp: on which date
    return: users: all user revisions
    return: delta.days: age of page in days (from first revision till timestamp)
    """
    params = {}
    params["action"] = "query"
    params["format"] = "json"
    params["prop"] = "revisions"
    params["rvlimit"] = "1000"
    #params["rvstart"] = timestamp #It goes down (new on top)
    users = []
    revs = []
    
    while True:
        response = urllib.request.urlopen(HOST + "?" + urlencode(params) + "&titles=" + quote(pageTitle)).read().decode('utf-8')
        dataJson = json.loads(response)
        pages = dataJson["query"]["pages"]
        key = list(pages.keys())[0]
        revisions = pages[key]['revisions']
        users = users + [rev["user"] for rev in revisions]
        revs = revs + [rev for rev in revisions]
        
        if "continue" in dataJson.keys():
            cont = dataJson["continue"]["rvcontinue"]
            params["rvcontinue"] = cont
        else:
            break
    last_revision = revs[-1]
    creator = users[-1]
    #page_born = datetime.datetime.strptime(last_revision['timestamp'], '%Y-%m-%dT%H:%M:%SZ')
    #timestamp_datetime = datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%SZ')

    # today = datetime.datetime.today()
    #delta = timestamp_datetime - page_born
    return creator, last_revision

In [None]:
creator, last_revision = GetCreator('Ауста')

In [65]:
some_titles = pd.read_csv("data/save.csv")

In [None]:
new_some_titles = some_titles
new_some_titles["en_title"] = title
new_some_titles.sort_values(["Eng_time"]) 

In [71]:
page_names = new_some_titles["en_title"].tolist()


In [None]:
creator_list = []
for page in page_names:
    creator, last_revision = GetCreator(page)
    print(creator)
    creator_list.append(creator)

In [75]:
d = {x:creator_list.count(x) for x in creator_list}
print(d)

{'Inwind': 495, 'Arbustum': 1, 'Fimatic': 1, "Chase me ladies, I'm the Cavalry": 1}


In [19]:
TEST_SUBSET = 100
DAYS = 30

In [None]:
timeseries_df = create_timeseries_from_translate_date(titles, translation_time, DAYS)

In [34]:
timeseries_df.to_csv('Clean_df_timeseries.csv')

In [None]:
df_not_tr = pd.read_csv("data/df_uk_lang_not_translated")
page_titles = df_not_tr['title'].tolist()
page_titles = [eval(i).decode('UTF-8') for i in titles]

In [None]:
df_not_tr_10 = create_timeseries(page_titles[20:30, "%Y-%m-%dT%H:%M:%SZ", 30)

In [6]:
#Load Yurii data

pkl_file = open('pages.pkl', 'rb')
data1 = pickle.load(pkl_file)

titles = [i[0].replace(" ", "_") for i in data1] # only Ukr titles
titles = titles[-TEST_SUBSET:]
#my_df = create_timeseries(titles, "2018-06-07T12:00:00Z", DAYS)
#my_df.to_csv('{}days{}pages.csv'.format(DAYS,TEST_SUBSET))

In [37]:
my_df

Unnamed: 0,timestamp,page_name,revisions_count,contributors_count,age_of_page_days,num_of_views
0,2018-06-07T12:00:00Z,"Dubai_Duty_Free_Women's_Open_2002,_одиночний_р...",4,2,164,0.0
1,2018-06-07T12:00:00Z,"Dubai_Tennis_Championships_2004,_жінки,_одиноч...",2,1,164,0.0
2,2018-06-07T12:00:00Z,Durban_Roodepoort_Deep,10,6,3974,0.0
3,2018-06-07T12:00:00Z,Dymchuk_Gallery,19,3,1048,1.0
4,2018-06-07T12:00:00Z,Dysphoria,18,7,1245,0.0
5,2018-06-07T12:00:00Z,E!,3,3,3460,1.0
6,2018-06-07T12:00:00Z,ECTS-оцінки,12,12,4330,0.0
7,2018-06-07T12:00:00Z,EEPD1,1,1,261,1.0
8,2018-06-07T12:00:00Z,EIFL,18,12,4492,1.0
9,2018-06-07T12:00:00Z,EJ_675,97,25,2243,7.0


In [None]:
# Not translated pages
df = pd.read_csv("C:/Users/kzorina/Studing/MMDS/not_eng_titles_500.csv")
titles = df['uk_title'].tolist()
titles = [i.replace(" ", "_") for i in titles]
titles = titles[50:100]
not_eng_df = create_timeseries(titles, "2018-06-07T12:00:00Z", DAYS)
not_eng_df.to_csv('Not_translated{}days{}pages.csv'.format(DAYS,100))