In [73]:
import requests
from datetime import datetime
from urllib import parse
import pandas as pd
import json
import os
import numpy as np
import glob

In [15]:
# Base endpoint of the Wikimedia REST API; request paths are appended to it.
URL = "https://wikimedia.org/api/rest_v1/"

# Headers passed explicitly to the SESSION.get calls in this notebook.
# NOTE(review): the user-agent is a placeholder address - confirm it against
# Wikimedia's User-Agent policy before running this at scale.
HEADERS = {
    "Accept": "application/json",
    "user-agent": "test123@gmail.com",
}

# A single requests session shared by every API helper below.
SESSION = requests.Session()

<requests.sessions.Session at 0x26f2412f3a0>

In [14]:
def get_pageviews(article: str, start: datetime, end: datetime, project="de.wikipedia.org",
                  access="all-access", agent="user", granularity="daily"):
    """
    Fetch the per-article pageview counts of one article from the Wikimedia REST API.

        args:
            article: The title of the article. Spaces are replaced with underscores and
                     the title is fully percent-encoded (including "/") before it is
                     placed in the URL path.
            start: First date of the requested range (sent as YYYYMMDD)
            end: Last date of the requested range (sent as YYYYMMDD)
            project: The domain, default de.wikipedia.org
            access: Type of the device, default all-access. other options(desktop, mobile-app, mobile-web)
            agent: Type of the agent, default user. other options(all-agents, spider, automated)
            granularity: The time unit, default daily. Other options(monthly)

        returns:
            The list stored under the "items" key of the JSON response
            (one dict per time unit).

        raises:
            requests.HTTPError: if the API answers with an error status
                                (e.g. unknown article or no data for the range).
    """
    # A title is a single path segment: "/" and "?" must not be left unescaped,
    # hence safe="". The API stores titles with underscores instead of spaces.
    encoded_article = parse.quote(article.replace(" ", "_"), safe="")
    params = [
        "metrics",
        "pageviews",
        "per-article",
        project.lower(),  # canonical lowercase domain, e.g. "de.wikipedia.org"
        access,
        agent,
        encoded_article,
        granularity,
        start.strftime("%Y%m%d"),
        end.strftime("%Y%m%d")
    ]
    url = URL + "/".join(params)
    response = SESSION.get(url, headers=HEADERS)
    # Fail with the HTTP status instead of an opaque KeyError on "items".
    response.raise_for_status()
    return response.json()["items"]

[{'project': 'de.wikipedia',
  'article': 'Minecraft',
  'granularity': 'daily',
  'timestamp': '2020010100',
  'access': 'all-access',
  'agent': 'user',
  'views': 1458},
 {'project': 'de.wikipedia',
  'article': 'Minecraft',
  'granularity': 'daily',
  'timestamp': '2020010200',
  'access': 'all-access',
  'agent': 'user',
  'views': 1400},
 {'project': 'de.wikipedia',
  'article': 'Minecraft',
  'granularity': 'daily',
  'timestamp': '2020010300',
  'access': 'all-access',
  'agent': 'user',
  'views': 1474},
 {'project': 'de.wikipedia',
  'article': 'Minecraft',
  'granularity': 'daily',
  'timestamp': '2020010400',
  'access': 'all-access',
  'agent': 'user',
  'views': 1448},
 {'project': 'de.wikipedia',
  'article': 'Minecraft',
  'granularity': 'daily',
  'timestamp': '2020010500',
  'access': 'all-access',
  'agent': 'user',
  'views': 1572},
 {'project': 'de.wikipedia',
  'article': 'Minecraft',
  'granularity': 'daily',
  'timestamp': '2020010600',
  'access': 'all-access',

In [10]:
def store_pageviews(title, genre: str, data):
    """
    Write the pageview records of one article to Data/<genre>/<title>.xlsx.

        args:
            title: Name of the article; used as the file name (without extension)
            genre: options (Games,Film and TV, Literary, Music), checked
                   case-insensitively; used as given for the sub-folder name
            data: Records accepted by pd.DataFrame that contain a "timestamp"
                  field formatted as YYYYMMDDHH

        raises:
            ValueError: if genre is not one of the supported options
    """
    if genre.lower() not in ["games", "film and tv", "literary", "music"]:
        raise ValueError(
            "Genre must be an element of [Games, Film and TV, Literary, Music]")
    df = pd.DataFrame(data)
    df["timestamp"] = pd.to_datetime(df["timestamp"], format="%Y%m%d%H")
    # Create the genre sub-folder too (not just "Data"); otherwise the write
    # below fails on a fresh checkout. exist_ok avoids a check-then-create race.
    target_dir = "Data/" + genre
    os.makedirs(target_dir, exist_ok=True)
    df.to_excel(target_dir + "/" + title + ".xlsx", index=False)


def fetch_data_sources(page, filename):
    """
    Download the table of a Wikipedia page via wikitable2json and save it
    as Data/<filename>.xlsx.

        args:
            page: Page name appended to the wikitable2json API URL
            filename: Name of the Excel file to write (without extension)

        raises:
            requests.HTTPError: if the service answers with an error status
    """
    url = "https://www.wikitable2json.com/api/" + page
    params = {"cleanRef": "true", "lang": "en"}

    response = SESSION.get(url=url, params=params, headers=HEADERS)
    response.raise_for_status()

    # assumes the JSON squeezes to one 2-D table whose first row holds the
    # column names - TODO confirm for pages with several tables
    table = np.squeeze(response.json())
    header = table[0]
    data = {name: table[1:, col].tolist() for col, name in enumerate(header)}
    df = pd.DataFrame(data)

    target = "Data/" + filename + ".xlsx"
    # The output folder may not exist yet on a fresh run.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    df.to_excel(target, index=False)

# Download the "best video games" list and store it as Data/Games_sources.xlsx.
fetch_data_sources(
    page="List_of_video_games_considered_the_best",
    filename="Games_sources",
)

In [None]:
# df = pd.read_excel("Data/Games_sources.xlsx")
# df = df[df["Year"] >= 2000 ]
# for year,game in df.values[:,0:2]:
#     print(game,year)
#     if year < 2005:
#         year = 2005
#     data = get_pageviews(game,datetime(year,1,1),datetime(2020,1,1))
#     store_pageviews(game,"Games",data)

# dfs = []
# for file in os.listdir("Data/Games"):
#     if file.endswith(".xlsx"):
#         df = pd.read_excel("Data/Games/"+file)
#         dfs.append(df)
# new_df = pd.concat(dfs)
# new_df.to_excel("Data/All_Games.xlsx")

In [100]:
def get_related_titles(article: str, project="en.wikipedia.org"):
    """
    Return the titles of the pages the REST API lists as related to an article.

        args:
            article: The title of the article. Spaces are replaced with underscores
                     and the title is fully percent-encoded before it is placed in
                     the URL path.
            project: The domain whose REST API is queried, default en.wikipedia.org

        returns:
            A list with the "title" value of every entry under the "pages" key
            of the JSON response.

        raises:
            requests.HTTPError: if the API answers with an error status
    """
    # safe="" so that "/", "?" or "#" inside a title cannot alter the URL structure.
    encoded_article = parse.quote(article.replace(" ", "_"), safe="")
    url = "https://" + project + "/api/rest_v1/page/related/" + encoded_article
    response = SESSION.get(url, headers=HEADERS)
    response.raise_for_status()
    return [page["title"] for page in response.json()["pages"]]

In [103]:
# Every stored game workbook is named "<article title>.xlsx"; recover the titles.
# Sorted because os.listdir returns entries in arbitrary order.
gameNames = sorted(
    os.path.splitext(file)[0]
    for file in os.listdir("Data/Games")
    if file.endswith(".xlsx")
)

# The output folder is not created anywhere else, so make sure it exists.
os.makedirs("Data/Games/related", exist_ok=True)

# Write one workbook of related titles per game.
for game in gameNames:
    titles = get_related_titles(game)
    df = pd.DataFrame(titles, columns=["related titles"])
    df.to_excel("Data/Games/related/related to " + game + ".xlsx")