In [108]:
import requests, re
import urllib.parse as urlp

from graphviz import Digraph
from bs4 import BeautifulSoup as bs
from palettable.tableau import Tableau_20

In [109]:
base_url = "https://en.wikipedia.org/"
redirects = {}
DEBUG_LEVEL=2
house_color_map = Tableau_20.hex_colors

In [110]:
predecessor_labels = ["Preceded", "Predecessor"]
sucessor_labels = ["Successor", "Succeeded"]
issue_labels = ["Child", "Issue"]
parents = ["Parent", "Mother", "Father"]
house_labels = ["Allegiance","Political party", "House"]
spouse_labels = ["Husband", "Wife", "Spouse"]

In [144]:
redify = lambda s: "\x1b[31m"+s+"\x1b[0m"
flattened = lambda l: [item for sublist in l for item in sublist]
log = lambda *args: print(*args) if DEBUG_LEVEL > 1 else None
error = lambda *args: print(*args) if DEBUG_LEVEL > 0 else None

In [145]:
def setdebuglevel(level): 
    DEBUG_LEVEL=level

In [112]:
class Person:
    def __init__(self, name, path):
        self.name = name
        self.path = path
        self.titles = None
        self.parents = []
        self.spouse = []
        self.issue = []
        self.successor = []
        self.predecessor = []
        self.house = None
    def __repr__(self):
        return "Person(%s, %s)" % (self.name, self.path)
    def __eq__(self, other):
        if isinstance(other, Person):
            return (self.path.split(',')[0].lower() == other.path.split(',')[0].lower())
        else:
            return False
    def __ne__(self, other):
        return (not self.__eq__(other))
    def __hash__(self):
        return hash(self.path)
        
class Peon:
    def __init__(self, name, path = ''):
        self.name = name
        self.path = path
    def __repr__(self):
        return "Peon(%s, %s)" % (self.name, self.path)
    def __eq__(self, other):
        if isinstance(other, Peon):
            return (self.path.split(',')[0].lower() == other.path.split(',')[0].lower())
        else:
            return False
    def __ne__(self, other):
        return (not self.__eq__(other))
    def __hash__(self):
        return hash(self.path)

In [113]:
class Queue:
    
    def __init__(self):
        self.to_create = set()
        self.new = set()
        self.finished = set()
    def stage(self, p):
        self.new.add(p)
    def commit(self):
        tmp = self.new.difference(self.finished)
        self.to_create.update(tmp)
        self.new = set()
    def push(self, p):
        if p not in self.finished:
            self.to_create.add(p)
    def pop(self):
        offer = self.to_create.pop()
        self.finished.add(offer)
        return offer
    def refresh(self):
        self.new = set()

In [114]:
def make_singleton(elem, queue):
    refs = elem.find_all('a')
    if len(refs) >= 1:
        ret = urlp.unquote(refs[0]['href'])
        peon = Peon(refs[0].text, ret)
    else:
        ret = elem.text
        peon = Peon(ret)
    queue.stage(peon)
    return ret
    
def make_list(elem, queue):
    refs = elem.find_all('a')
    ret = []
    for ref in refs:
        r = urlp.unquote(ref['href'])
        queue.stage(Peon(ref.text, r))
        ret.append(r)
    return ret
        

In [115]:
def get_infocard(path):
    resp = requests.get(base_url + path)
    txt = resp.text
    s = bs(txt)
    p = r'\"wgPageName\":\"(.+?)\"'
    endpath = '/wiki/' + re.search(p, txt).group(1)
    infocard = s.find_all("table", class_ = "infobox vcard")[0]
    name = s.title.string.replace(' - Wikipedia', '')
    return name, infocard, endpath

In [116]:
def get_person(path, name, people, queue, onlytitled = False):
    queue.refresh()
    if path is None:
        log(redify("No path supplied:"), name)
        return False
    try:
        name, info, endpath = get_infocard(path)
        if endpath != path:
            redirects[path] = endpath
            path = endpath
            log(redify("Redirect:"), path, '->', endpath)
        person = Person(name, path)
        if person in people:
            log(redify("Already Exists:"), name, path)
            return True
        rows = info.find_all("tr")
        if 'img' in [x.name for x in rows[1].descendants]:
            person.titles = rows[2].text
        elif onlytitled:
            raise Exception('Untitled.')
        for r in rows:
            if len(r.contents) == 2 and r.contents[0].name == 'th':
                if any([word in r.contents[0].text for word in parents]):
                    person.mother = make_list(r.contents[1], queue)
                elif any([word in r.contents[0].text for word in issue_labels]):
                    person.issue = make_list(r.contents[1], queue)
                elif any([word in r.contents[0].text for word in spouse_labels]):
                    person.spouse = make_list(r.contents[1], queue)
                elif any([word in r.contents[0].text for word in house_labels]):
                    person.house = make_singleton(r.contents[1], queue)
                elif any([word in r.contents[0].text for word in predecessor_labels]):
                    person.predecessor = make_list(r.contents[1], queue)
                elif any([word in r.contents[0].text for word in sucessor_labels]):
                    person.successor = make_list(r.contents[1], queue)
        people.add(person)
        log("Adding", len(queue.new), "new elements")
        queue.commit()
        return True
    except Exception as e:
        error("    ", redify("Error - "), path + ":", e)
        return False
    

In [117]:
def collect_people(basis_person_name, basis_person_link, count=150):
    people = set()
    queue = Queue()
    queue.stage(Peon(basis_person_name, basis_person_link))
    queue.commit()
    for i in range(count):
        peon = queue.pop()
        worked = get_person(peon.path, peon.name, people, queue)
        if not worked:
            people.add(peon)
        log(peon.name)
    return people

In [118]:
# Remove extra marriage link (helps for removing edges)
def one_marriage_to_rule_them_all(marriage):
    cleaned = []
    for bond in marriage:
        if bond not in cleaned and (bond[1], bond[0]) not in cleaned:
            cleaned.append(bond)
    return cleaned

# Get actual link from redirected links
def final(x):
    return redirects[x] if x in redirects else x

# Transform list of people into graph of relations
# List of people (nodes) and lists of edges (child relationships, marriage, crown pathways)
def make_nodegraph_parts(people):
    nodes = []
    childrels = set()
    marriage = set()
    crown = set()

    relevant_people = [p for p in people if isinstance(p, Person)]
    for person in relevant_people:
        nodes.append((person.path, person.name, person.house))
        childrels.update({(person.path, final(x)) for x in person.issue})
        marriage.update({(person.path, final(x)) for x in person.spouse})
        crown.update({(person.path, final(x)) for x in person.successor})
        crown.update({(final(x),person.path) for x in person.predecessor})
        childrels.update({(final(x),person.path) for x in person.parents})

    nodelinks = [x[0] for x in nodes]
    houses = [x[2] for x in nodes]
    crown = [x for x in crown if x[0] in nodelinks and x[1] in nodelinks]
    marriage = [x for x in marriage if x[0] in nodelinks and x[1] in nodelinks]
    childrels = [x for x in childrels if x[0] in nodelinks and x[1] in nodelinks]

    marriage = one_marriage_to_rule_them_all(marriage)
    
    return nodes, childrels, marriage, crown, houses

In [131]:
# Functions to remove non-cycle elements, so that the circular ones are really highlighted
# TODO: can probably be improved by using find-cycles graph functions instead of recursing on find-leafs

# Find unimportant people (with only one relation; graph edges)
def get_leafs(nodes, crown, marriage, childrels):
    nodepaths = crown + marriage + childrels
    relinstances = flattened(nodepaths)
    relinstances_notkids = flattened(crown + marriage) + [x[0] for x in childrels]
    leafs = [x[0] for x in nodes if relinstances.count(x[0]) < 2]
    leafs += [x for x in relinstances_notkids if relinstances_notkids.count(x) == 0]
    return list(set(leafs))

# Remove unimportant people
def remove_leafs(nodes, crown, marriage, childrels, leafs):
    nodes = [x for x in nodes if x[0] not in leafs]
    crown = [x for x in crown if x[0] not in leafs and x[1] not in leafs]
    marriage = [x for x in marriage if x[0] not in leafs and x[1] not in leafs]
    childrels = [x for x in childrels if x[0] not in leafs and x[1] not in leafs]
    return nodes, crown, marriage, childrels
    
# Recursively find and remove unimportant people
def prune(nodes, crown, marriage, childrels, backoff_limit=100):
    leafs = get_leafs(nodes, crown, marriage, childrels)
    while len(leafs)>0 and backoff_limit > 0:
        nodes, crown, marriage, childrels = remove_leafs(nodes, crown, marriage, childrels, leafs)
        log("Removed", len(leafs), "leaves")
        leafs = get_leafs(nodes, crown, marriage, childrels)
    return nodes, crown, marriage, childrels

In [132]:
def get_graph(basis_person_name, basis_person_link, count=150, pruneit = True):
    people = collect_people(basis_person_name, basis_person_link, count)
    log("Collected", len(people), "people")
    nodes, childrels, marriage, crown, houses = make_nodegraph_parts(people)
    log("Turned into", len(nodes), "nodes")
    if pruneit:
        nodes, childrels, marriage, crown = prune(nodes, childrels, marriage, crown)
        log("Pruned into", len(nodes), "nodes")
    return nodes, childrels, marriage, crown, houses

In [133]:
def get_house_colors(houses):
    top_houses = sorted([(x, houses.count(x)) for x in set(houses) if x is not None], 
                        key=lambda x: x[1], reverse=True)[:20]
    house_colors = {top_houses[i][0]:house_color_map[i] for i in range(len(top_houses))}
    return house_colors

def create_diagraph(nodes, childrels, marriage, crown, houses, name="WikiRelations"):
    house_colors = get_house_colors(houses)
    dot = Digraph(name=name)

    for p, n, h in nodes:
        dot.node(p, n, color = house_colors.get(h, "black"))

    for a, b in list(marriage):
        dot.edge(a, b, color = 'red', style = 'dashed', arrowhead = 'none')
    for a, b in list(crown):
        dot.edge(a, b, color = 'gold')
    for a, b in list(childrels):
        dot.edge(a, b, color = 'black')
        
    return dot

In [138]:
def wiki_relationship_diagraph(basis_person_name, basis_person_link, count=150,
                               name="WikiRelations", trim=True, save=True, save_format=False):
    nodes, childrels, marriage, crown, houses = get_graph(basis_person_name, basis_person_link, count, trim)
    dot = create_diagraph(nodes, childrels, marriage, crown, houses)
    
    if save_format:
        dot.format = save_format
    if save:
        dot.render(name)
    return dot

In [140]:
dot = wiki_relationship_diagraph("Indira Gandhi", "wiki/Indira_Gandhi", 500, 
                                 name="Indira_Gandhi", save_format='png')

[31mRedirect:[0m /wiki/Indira_Gandhi -> /wiki/Indira_Gandhi
Adding 20 new elements
Indira Gandhi
Adding 23 new elements
P. V. Narasimha Rao
Adding 11 new elements
Pramod Mahajan
[31mRedirect:[0m /wiki/Dinesh_Singh_(politician) -> /wiki/Dinesh_Singh_(politician)
Adding 7 new elements
Dinesh Singh
Adding 9 new elements
Swaran Singh
[31mRedirect:[0m /wiki/Vishwanath_Pratap_Singh -> /wiki/Vishwanath_Pratap_Singh
Adding 18 new elements
V. P. Singh
Adding 2 new elements
Raja Awadhesh Singh
Adding 4 new elements
Abhay Pratap Singh
Adding 4 new elements
Feroze Gandhi
     [31mError - [0m /wiki/President's_rule: list index out of range
President's rule
Adding 21 new elements
Ram Vilas Paswan
[31mRedirect:[0m /wiki/G._Prathap_Reddy -> /wiki/G._Prathap_Reddy
Adding 4 new elements
Gangula Prathapa Reddy
Adding 2 new elements
Bojja Venkata Reddy
Adding 10 new elements
Chidambaram Subramaniam
Adding 14 new elements
Yashwantrao Chavan
[31mRedirect:[0m /wiki/C._K._Jaffer_Sharief -> /wiki/C

Adding 7 new elements
Ramesh Pokhriyal
Adding 2 new elements
K. T. Rama Rao
Adding 8 new elements
Saiyid Nurul Hasan
Adding 11 new elements
Madhu Dandavate
Adding 0 new elements
Ramashish Rai
Adding 3 new elements
Chandrakanta Goyal
Adding 2 new elements
Bharatiya Lok Dal
[31mRedirect:[0m /wiki/Tejaswini_Ananth_Kumar -> /wiki/Tejaswini_Ananth_Kumar
Adding 2 new elements
Tejaswini Kumar
Adding 8 new elements
Vasantdada Patil
Adding 6 new elements
B. D. Sharma
Adding 6 new elements
Govind Ballabh Pant
Adding 10 new elements
Pinarayi Vijayan
Adding 3 new elements
Tejasvi Surya
Adding 7 new elements
Nitin Gadkari
Adding 1 new elements
Janeshwar Mishra
Adding 2 new elements
Krishna Chandra Pant
Adding 7 new elements
Lal Bahadur Shastri
Adding 15 new elements
H. D. Deve Gowda
     [31mError - [0m /wiki/Shiv_Bhanu_Singh_Solanki: list index out of range
Shiv Bhanu Singh Solanki
Adding 14 new elements
Mulayam Singh Yadav
Adding 2 new elements
Dharampal Singh
Adding 0 new elements
Janata Dal

     [31mError - [0m /wiki/Yashodhara_Dasappa: list index out of range
Yashodhara Dasappa
Adding 6 new elements
Rao Inderjit Singh
Adding 2 new elements
P. K. Sawant
Adding 3 new elements
Vidyawati Chaturvedi
Adding 6 new elements
Anant Geete
     [31mError - [0m /wiki/Vijay_Nambisan: list index out of range
Vijay Nambisan
Adding 9 new elements
Krishan Kant Paul
Adding 6 new elements
Tirath Singh Rawat
     [31mError - [0m /w/index.php?title=Sushilabai_Patil_Nilangekar&action=edit&redlink=1: list index out of range
Sushilabai Patil Nilangekar
Adding 6 new elements
Ram Nath Kovind
Adding 2 new elements
Damodaram Sanjivayya
     [31mError - [0m /wiki/Minister_of_Human_Resource_Development: list index out of range
Minister of Human Resource Development
Adding 3 new elements
Devisingh Ransingh Shekhawat
Adding 0 new elements
Swatantra Party
Adding 1 new elements
Ranbir Singh Parjapati
Adding 7 new elements
S. M. Krishna
Adding 11 new elements
B. D. Jatti
Adding 3 new elements
Adhur

Adding 10 new elements
Suraj Bhan
Adding 1 new elements
Ramkrishna Kusmaria
Adding 3 new elements
Vishnu Kant Shastri
[31mRedirect:[0m /wiki/D._K._Barooah -> /wiki/D._K._Barooah
Adding 3 new elements
Devakanta Barua
Adding 3 new elements
Arun Shourie
Adding 6 new elements
Asaf Ali
Adding 10 new elements
Virbhadra Singh
Adding 3 new elements
K. Venkatagiri Gowda
Adding 1 new elements
Tiruvellore Thattai Krishnamachariar
[31mRedirect:[0m /wiki/Eelco_van_Kleffens -> /wiki/Eelco_van_Kleffens
Adding 6 new elements
Eelco N. van Kleffens
Adding 5 new elements
S. Obul Reddy
[31mRedirect:[0m /wiki/Kona_Prabhakara_Rao -> /wiki/Kona_Prabhakara_Rao
Adding 1 new elements
Kona Prabhakar Rao
[31mAlready Exists:[0m Ramaswamy Venkataraman /wiki/Ramaswamy_Venkataraman
R. Venkataraman
Adding 8 new elements
Harcharan Singh Brar
     [31mError - [0m /wiki/Kaushik_Patel_(politician): list index out of range
Kaushik Patel
Adding 8 new elements
K. R. Narayanan
[31mRedirect:[0m /wiki/Jugalji_Thakor