# Data collection

To collect the relevant Wikipedia pages for our project, we define the dataclass `Node`. The collection relies on the open-source tool [PetScan](https://petscan.wmflabs.org/), which, given a list of Wikipedia categories, returns the names of the corresponding pages. We furthermore specify the depth of our PetScan query, which controls how many levels of subcategories below each category are included. As the list of pages grows exponentially with the depth, we set the parameter to 1.

In [1]:
import json
import pickle
from collections import Counter
from dataclasses import dataclass
from itertools import chain
from typing import List

import pandas as pd
import requests
import wikipedia
from tqdm.notebook import tqdm

In [12]:
# Node is the record we keep for every collected page:
# name, parent, depth, edges, text and categories.
@dataclass
class Node:
    """One collected Wikipedia page.

    Only ``name`` and ``parent`` are required; ``depth`` defaults to 0 and
    the remaining fields default to ``None``.
    """

    # Title of the page
    name: str
    # Category the page was collected under
    parent: str
    # Depth of the query that produced the page
    depth: int = 0
    # Links of the page
    edges: List[tuple] = None
    # Text of the page
    text: str = None
    # Categories of the page
    categories: List[str] = None
    

def collect_nodes(parents: list, depth: int = 0) -> List[Node]:
    """
    Collect the Wikipedia pages that belong to each category in ``parents``.

    For every category a PetScan query is issued with the given ``depth``.
    Each page in the response is then fetched through the ``wikipedia``
    package and stored as a ``Node`` holding its title, the category it was
    found under, the depth, its links, its text and its categories.

    Categories whose PetScan response does not have the expected structure,
    and pages that cannot be fetched (e.g. because the title is ambiguous),
    are reported with ``print`` and skipped.

    Args:
        parents: Category names to query.
        depth: Depth passed to the PetScan query.

    Returns:
        The list of successfully collected nodes.
    """
    nodes = []
    base_url = 'https://petscan.wmflabs.org/?ns%5B0%5D=1&'
    params = {'project': 'wikipedia',
              'language': 'en',
              'format': 'json',
              'interface_language': 'en',
              'depth': str(depth),
              'doit': ''}

    # Loop through all the parent categories
    for cat in tqdm(parents):

        params['categories'] = cat  # Query one category at a time
        resp = requests.get(url=base_url, params=params).json()

        # Only the lookup of the page list is guarded here, so that an
        # unexpected response shape is not confused with per-page failures.
        try:
            pages = resp['*'][0]['a']['*']
        except (KeyError, IndexError):
            print(f'Category {cat} could not be collected...')
            continue

        for node in tqdm(pages, leave=False):
            title = node['title']  # PetScan titles use underscores
            try:
                article = wikipedia.page(title.replace('_', ' '))
                text = article.content
                edges = article.links
                categories = article.categories
            except Exception:
                # Best effort: some titles are ambiguous or cannot be
                # resolved. ``Exception`` (rather than a bare ``except``)
                # keeps Ctrl-C able to interrupt a long crawl.
                print(f'Node {title} could not be collected...')
                continue

            nodes.append(Node(name=title,
                              parent=cat,
                              depth=depth,
                              edges=edges,
                              text=text,
                              categories=categories))

    return nodes

In [15]:
# Run the actual query: specify the parent categories and the depth(s),
# then gather the nodes of every depth into a single list.
parent_categories = [
    'political_science',
    'economics',
    'sociology',
    'anthropology',
    'psychology',
]
depths = [1]
nodes = []
for d in tqdm(depths):
    nodes.extend(collect_nodes(parent_categories, d))

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/892 [00:00<?, ?it/s]

Node {'id': 21754, 'len': 22738, 'metadata': {'wikidata': 'Q6266'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q6266', 'title': 'Nation', 'touched': '20211111144259'} could not be collected...
Node {'id': 408891, 'len': 4910, 'metadata': {'wikidata': 'Q5172479'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q5172479', 'title': 'Corporate_nationalism', 'touched': '20210919002625'} could not be collected...
Node {'id': 454402, 'len': 25655, 'metadata': {'wikidata': 'Q617473'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q617473', 'title': 'Likert_scale', 'touched': '20211016074109'} could not be collected...
Node {'id': 467982, 'len': 12654, 'metadata': {'wikidata': 'Q1761743'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q1761743', 'title': 'Red_tape', 'touched': '20211107200515'} could not be collected...
Node {'id': 503474, 'len': 25323, 'metadata': {'wikidata': 'Q1752412'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q1752412', 'title': 'Nation-building', 'to

  0%|          | 0/987 [00:00<?, ?it/s]

Node {'id': 18135, 'len': 13410, 'metadata': {'wikidata': 'Q430378'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q430378', 'title': 'Lorenz_curve', 'touched': '20210727141002'} could not be collected...
Node {'id': 667117, 'len': 3350, 'metadata': {'wikidata': 'Q772524'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q772524', 'title': 'Eonia', 'touched': '20210304200335'} could not be collected...
Node {'id': 1531457, 'len': 85744, 'metadata': {'wikidata': 'Q72596'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q72596', 'title': 'Resource_curse', 'touched': '20211108180045'} could not be collected...
Node {'id': 1808848, 'len': 1329, 'metadata': {'wikidata': 'Q6407549'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q6407549', 'title': 'Killer_bees_(business)', 'touched': '20210416065935'} could not be collected...
Node {'id': 2110105, 'len': 13658, 'metadata': {'wikidata': 'Q1161046'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q1161046', 'title': 'MONIAC', 'tou

Node {'id': 34945002, 'len': 8635, 'metadata': {'wikidata': 'Q5121659'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q5121659', 'title': 'Circular_cumulative_causation', 'touched': '20210905233335'} could not be collected...
Node {'id': 35186422, 'len': 1675, 'metadata': {'wikidata': 'Q7714941'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q7714941', 'title': 'Austrian_Economics_Newsletter', 'touched': '20211007051939'} could not be collected...
Node {'id': 35626853, 'len': 11091, 'metadata': {'wikidata': 'Q4895912'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q4895912', 'title': 'Bertrand–Edgeworth_model', 'touched': '20201231080227'} could not be collected...
Node {'id': 35675580, 'len': 1167, 'metadata': {'wikidata': 'Q5421563'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q5421563', 'title': 'Export_parity_price', 'touched': '20170417195220'} could not be collected...
Node {'id': 36881492, 'len': 1066, 'metadata': {'wikidata': 'Q5347438'}, 'n': 'page', 'namespa

Node {'id': 43103489, 'len': 10321, 'metadata': {'wikidata': 'Q18392216'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q18392216', 'title': 'Galor–Zeira_model', 'touched': '20210925003958'} could not be collected...
Node {'id': 43334696, 'len': 23308, 'metadata': {'wikidata': 'Q18385613'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q18385613', 'title': 'Stock-flow_consistent_model', 'touched': '20210930074336'} could not be collected...
Node {'id': 43446060, 'len': 2587, 'metadata': {'wikidata': 'Q18356548'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q18356548', 'title': 'Secular_inflation', 'touched': '20190314201204'} could not be collected...
Node {'id': 43482230, 'len': 422, 'metadata': {'wikidata': 'Q18344877'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q18344877', 'title': 'Milestone_fee', 'touched': '20210423204034'} could not be collected...
Node {'id': 44013878, 'len': 1390, 'metadata': {'wikidata': 'Q18391886'}, 'n': 'page', 'namespace': 0, 'nstext': 

  0%|          | 0/1083 [00:00<?, ?it/s]

Node {'id': 51067, 'len': 19805, 'metadata': {'wikidata': 'Q17930'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q17930', 'title': 'Anomie', 'touched': '20211031212126'} could not be collected...
Node {'id': 56285, 'len': 102518, 'metadata': {'wikidata': 'Q12131'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q12131', 'title': 'Disability', 'touched': '20211114003956'} could not be collected...
Node {'id': 161973, 'len': 72682, 'metadata': {'wikidata': 'Q2798912'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q2798912', 'title': 'Accountability', 'touched': '20210920185813'} could not be collected...
Node {'id': 161975, 'len': 1057, 'metadata': {'wikidata': 'Q1274115'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q1274115', 'title': 'Responsibility', 'touched': '20211111160409'} could not be collected...
Node {'id': 323779, 'len': 50892, 'metadata': {'wikidata': 'Q831725'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q831725', 'title': 'Structural_functionalism',

  0%|          | 0/1843 [00:00<?, ?it/s]

Node {'id': 16743, 'len': 192133, 'metadata': {'wikidata': 'Q9061'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q9061', 'title': 'Karl_Marx', 'touched': '20211118222647'} could not be collected...
Node {'id': 17547, 'len': 69654, 'metadata': {'wikidata': 'Q316'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q316', 'title': 'Love', 'touched': '20211105005337'} could not be collected...
Node {'id': 30299, 'len': 120720, 'metadata': {'wikidata': 'Q194100'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q194100', 'title': 'Transhumanism', 'touched': '20211117175004'} could not be collected...
Node {'id': 50024, 'len': 31673, 'metadata': {'wikidata': 'Q6498477'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q6498477', 'title': 'Lust', 'touched': '20211108212706'} could not be collected...
Node {'id': 154147, 'len': 37499, 'metadata': {'wikidata': 'Q1210705'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q1210705', 'title': 'Limerence', 'touched': '20211017073138'} could

Node {'id': 26266653, 'len': 9629, 'metadata': {'wikidata': 'Q192242'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q192242', 'title': 'Frustration', 'touched': '20210928184310'} could not be collected...
Node {'id': 27431805, 'len': 8831, 'metadata': {'wikidata': 'Q7495646'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q7495646', 'title': 'Shetani', 'touched': '20210523204424'} could not be collected...
Node {'id': 29331449, 'len': 18281, 'metadata': {'wikidata': 'Q14623169'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q14623169', 'title': 'Nations_and_IQ', 'touched': '20211014025022'} could not be collected...
Node {'id': 34104355, 'len': 50917, 'metadata': {'wikidata': 'Q7456400'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q7456400', 'title': 'Subjective_well-being', 'touched': '20211108130206'} could not be collected...
Node {'id': 36196531, 'len': 5387, 'metadata': {'wikidata': 'Q5508847'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q5508847', 'title':

  0%|          | 0/1374 [00:00<?, ?it/s]

Node {'id': 10332, 'len': 69285, 'metadata': {'wikidata': 'Q59157'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q59157', 'title': 'Educational_psychology', 'touched': '20211114002059'} could not be collected...
Node {'id': 28297, 'len': 94037, 'metadata': {'wikidata': 'Q9165'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q9165', 'title': 'Soul', 'touched': '20211118164536'} could not be collected...
Node {'id': 36770, 'len': 9557, 'metadata': {'wikidata': 'Q324925'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q324925', 'title': 'Tony_Buzan', 'touched': '20210623075754'} could not be collected...
Node {'id': 102883, 'len': 96377, 'metadata': {'wikidata': 'Q34394'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q34394', 'title': 'Belief', 'touched': '20211021183808'} could not be collected...
Node {'id': 238212, 'len': 62446, 'metadata': {'wikidata': 'Q1987782'}, 'n': 'page', 'namespace': 0, 'nstext': '', 'q': 'Q1987782', 'title': 'Abnormal_psychology', 'touched': '20

Following the collection of pages, we gather them in a dataframe and an edgelist for future use. To reduce the size of the edgelist, we already at this stage remove edges that point to pages we have not collected. This means that we only keep edges that link to other pages in one of the five categories.

In [129]:
# Based on our nodes we can now create our df for future use
def create_df(nodes=None):
    """
    Build a DataFrame with one row per node.

    The columns are ``name``, ``parent``, ``depth``, ``edges``, ``text`` and
    ``categories``, each filled from the attribute of the same name.

    Args:
        nodes: Objects exposing the six attributes above. When omitted, the
            notebook-level ``nodes`` list is used.

    Returns:
        A ``pandas.DataFrame`` with the six columns in the order listed.
    """
    if nodes is None:
        # Look the global up at call time: a default bound in the signature
        # would keep pointing at the old list after ``nodes`` is rebound.
        nodes = globals()["nodes"]

    columns = ("name", "parent", "depth", "edges", "text", "categories")
    return pd.DataFrame({column: [getattr(node, column) for node in nodes]
                         for column in columns})

# Build the dataframe from the collected nodes and pickle it to "df.obj"
df = create_df()
df.to_pickle("df.obj")

In [130]:
# Based on our nodes we can now create an edgelist for future use
def create_edgelist(nodes=None):
    """
    Build the list of ``(source, target)`` edges between collected nodes.

    An edge is kept only when its target is itself one of ``nodes``. Both
    ends of an edge are reported as node names.

    Args:
        nodes: Objects exposing ``name`` and ``edges``; a node whose
            ``edges`` is ``None`` contributes no edges. When omitted, the
            notebook-level ``nodes`` list is used.

    Returns:
        A flat list of ``(source_name, target_name)`` tuples.
    """
    if nodes is None:
        # Look the global up at call time: a default bound in the signature
        # would keep pointing at the old list after ``nodes`` is rebound.
        nodes = globals()["nodes"]

    # Node names are stored with underscores, whereas link titles are
    # expected to use spaces (the page lookup converts underscores to spaces
    # for the same reason). Comparing both in the space form lets links to
    # multi-word pages match; the dict also makes each lookup O(1).
    known = {node.name.replace('_', ' '): node.name for node in nodes}

    edgelist = []
    for node in tqdm(nodes):
        for edge in node.edges or ():
            target = known.get(edge.replace('_', ' '))
            if target is not None:
                edgelist.append((node.name, target))
    return edgelist

# Build the edgelist from the collected nodes and pickle it to "edgelist.obj"
edgelist = create_edgelist()
with open('edgelist.obj', 'wb') as f:  # pickle writes bytes, hence binary mode
    pickle.dump(edgelist, f)