In [1]:
import pandas as pd


In [2]:
hero_network_df = pd.read_csv("./hero-network.csv")

hero_network_df

Unnamed: 0,hero1,hero2
0,"LITTLE, ABNER",PRINCESS ZANDA
1,"LITTLE, ABNER",BLACK PANTHER/T'CHAL
2,BLACK PANTHER/T'CHAL,PRINCESS ZANDA
3,"LITTLE, ABNER",PRINCESS ZANDA
4,"LITTLE, ABNER",BLACK PANTHER/T'CHAL
...,...,...
574462,COLOSSUS II/PETER RA,CALLISTO
574463,CALLISTO,ROGUE /
574464,CALLISTO,CALIBAN/
574465,CALIBAN/,ROGUE /


In [3]:
# strip spaces from hero names
hero_network_df["hero1"] = hero_network_df["hero1"].apply(lambda x: x.strip())
hero_network_df["hero2"] = hero_network_df["hero2"].apply(lambda x: x.strip())

hero_network_df

Unnamed: 0,hero1,hero2
0,"LITTLE, ABNER",PRINCESS ZANDA
1,"LITTLE, ABNER",BLACK PANTHER/T'CHAL
2,BLACK PANTHER/T'CHAL,PRINCESS ZANDA
3,"LITTLE, ABNER",PRINCESS ZANDA
4,"LITTLE, ABNER",BLACK PANTHER/T'CHAL
...,...,...
574462,COLOSSUS II/PETER RA,CALLISTO
574463,CALLISTO,ROGUE /
574464,CALLISTO,CALIBAN/
574465,CALIBAN/,ROGUE /


In [4]:
# all_heroes contains each hero name once
all_heroes = set()

for row in hero_network_df.index:
    all_heroes.add(hero_network_df["hero1"][row])
    all_heroes.add(hero_network_df["hero2"][row])
    
print(len(all_heroes))
all_heroes

6421


{'BETA RAY BILL',
 'HELLSCOUT/KENNETH TE',
 'OLD SKULL',
 "O'SHAUGNESSY, COLLEE",
 'BOMBSHELL/',
 'STONE, JOHN',
 'YELLOWJACKET II/RITA',
 'MALICE III',
 'MORPH',
 'MARKED MAN',
 'CLASS CLOWN/GLASS',
 'KO-GAR',
 'NUKLO/ROBERT FRANK J',
 'ZARATHOS',
 'AKAFE',
 'CHASE, TREVOR',
 'GRANDE DAME',
 'KALA',
 'CARNAGE/CLETUS KASAD',
 'SUZIE',
 'BLESSING, MORGANA',
 'REILLY, MAVIS',
 'BERKELEY, TRAVIS',
 'METEORITE',
 'APE-X/XINA',
 'PRINCESS PYTHON/ZELD',
 'MASON, LOUISE',
 'ROSE II/SGT. BLUME',
 'DAVIS, PRINCIPAL',
 'MACKAY, AILSA',
 'SET',
 'CUMMINGS, DIANE',
 'THOMAS, BECKY',
 'OXBOW/',
 'WALKER, SHERIFF CHAN',
 'FOGELSTROM, HIRAM',
 'PALADIN/PAUL DENNIS',
 'PSYLOCKE 2013',
 'WIDDOWS, SGT.',
 'MARTINEZ, MRS.',
 'TALISMAN III/NAHITA',
 'CROSSBOW',
 'VAN NUYS, PROF. RODE',
 'FORTUNATO, GIACOMO J',
 'SAGITTARIUS/HARLAN V',
 'ULYSSES',
 'MASTIFF',
 'PAYNE, EDWARD',
 'DR. MIDAS/',
 'RISQUE/GLORIA',
 'INFERNO',
 'ANGEL DOPPELGANGER',
 'DARK',
 'COOPER, DR. VALERIE',
 'STEED, JOHN',
 'POWERSURGE/I

In [5]:
from collections import defaultdict

# create an undirected graph (adjacency list) for use with BFS
# this does not show edge weights (no count of edges between characters)
undir_hero_map = defaultdict(set)

for row in hero_network_df.index:
    hero1 = hero_network_df["hero1"][row]
    hero2 = hero_network_df["hero2"][row]
    
    undir_hero_map[hero1].add(hero2)
    undir_hero_map[hero2].add(hero1)
    
print(len(undir_hero_map.keys()))
undir_hero_map

6421


defaultdict(set,
            {'LITTLE, ABNER': {'BINARY/CAROL DANVERS',
              "BLACK PANTHER/T'CHAL",
              'CARNIVORE/COUNT ANDR',
              'DECAY II/YOSHIRO HAC',
              'DIXON, GENERAL',
              'FIN FANG FOOM/MIDGAR',
              'FUJIKAWA, RUMIKO',
              'GOLDEN-BLADE',
              'HOGAN, VIRGINIA PEPP',
              'INFERNO III/SAMANTHA',
              'IRON MAN IV/JAMES R.',
              'IRON MAN/TONY STARK',
              'JACOBS, GLENDA',
              'JARVIS, EDWIN',
              'JOCASTA',
              'MADAME MENACE/SUNSET',
              'MANN, DR. J. VERNON',
              'PRINCESS ZANDA',
              'SAPPER',
              'TEMPEST II/NICOLETTE',
              'THOR/DR. DONALD BLAK',
              'WAR MACHINE II/PARNE'},
             'PRINCESS ZANDA': {"BLACK PANTHER/T'CHAL",
              'CARNIVORE/COUNT ANDR',
              'DECAY II/YOSHIRO HAC',
              'HOGAN, VIRGINIA PEPP',
              'INFERNO II

In [6]:
# get the number of edges/links (not weighted)
# from undirected graph
total_num_edges = 0

for hero in undir_hero_map.keys():
    edge_length = len(undir_hero_map[hero])
    total_num_edges += edge_length
    
total_num_edges /= 2

total_num_edges

167106.0

In [7]:
# export basic csv with hero name to edge count (first-degree relations)

ordered_heroes = list(all_heroes)

first_deg_df = pd.DataFrame(data={"hero":[hero for hero in ordered_heroes], "count":[len(undir_hero_map[hero]) for hero in ordered_heroes]})

first_deg_df.to_csv("./first_degree.csv", index=False)

first_deg_df

Unnamed: 0,hero,count
0,BETA RAY BILL,238
1,HELLSCOUT/KENNETH TE,18
2,OLD SKULL,29
3,"O'SHAUGNESSY, COLLEE",2
4,BOMBSHELL/,82
...,...,...
6416,"HOGAN, VIRGINIA PEPP",260
6417,MASTERMIND/JASON WYN,142
6418,KHAN,2
6419,ORPHAN-MAKER/PETER,73


In [8]:
from collections import deque

# basic connectivity test - can do now that we have 
# the undirected adjacency list 

# test how many groups of heroes there are (connectivity)

# helper function that gives all heroes connected to a given hero
def basicBFS(hero, graph_map):
    queue = deque([hero])
    seen = set([hero])
    
    while(len(queue) > 0):
        curr_hero = queue.popleft()
        
        # add all first-degree heroes not in seen
        for adjacent_hero in graph_map[curr_hero]:
            if(adjacent_hero not in seen):
                queue.append(adjacent_hero)
                seen.add(adjacent_hero)
            
    
    return seen

# connectivity function - returns the number of heroes in each group and number of unconnected groups
def connectivity(hero_set, graph_map):
    all_groups = []
    all_seen = set()
    count = 0
    
    for hero in graph_map.keys():
        count += 1
        if(hero not in all_seen):
            hero_group = basicBFS(hero, graph_map)
            all_groups.append(len(hero_group))
            for connected_hero in hero_group:
                all_seen.add(connected_hero)
                
    print("Number of Groupings and Hero Count:", all_groups)
    print("Total Number of Heroes Seen (should match total hero count):", count)
                
    return all_groups

print(connectivity(all_heroes, undir_hero_map))

Number of Groupings and Hero Count: [6403, 9, 7, 2]
Total Number of Heroes Seen (should match total hero count): 6421
[6403, 9, 7, 2]


In [9]:
# basic BFS for getting hero degree of separation
# includes information about the links between the heroes specified

def hero_BFS(hero1, hero2, graph_map):    
    queue = deque()
    queue.append((hero1, [hero1]))
    seen = set([hero1])
    
    while(len(queue) > 0):
        curr_hero, hero_chain = queue.popleft()
        
        # if curr_hero is hero2, end loop
        if(curr_hero == hero2):
            return hero_chain
        
        # otherwise, add all unseen heroes to queue, with chain
        for new_hero in graph_map[curr_hero]:
            if(new_hero not in seen):
                new_hero_chain = hero_chain.copy()
                new_hero_chain.append(new_hero)
                
                queue.append((new_hero, new_hero_chain))
                
                seen.add(new_hero)
#     print(seen)
    return ["Not connected!"]
            
# test
hero_BFS('IRON MAN/TONY STARK', "EMPRESS S'BYLL [SKRU", undir_hero_map)

['IRON MAN/TONY STARK', 'HULK/DR. ROBERT BRUC', "EMPRESS S'BYLL [SKRU"]

In [10]:
# max degrees of separation for a character

def maxSeparation(hero, graph_map):
    queue = deque()
    queue.append((hero, 0))
    seen = set([hero])
    
    while(len(queue) > 0):
        curr_hero, curr_distance = queue.popleft()
        
        # add all first-degree heroes not in seen
        for adjacent_hero in graph_map[curr_hero]:
            if(adjacent_hero not in seen):
                queue.append((adjacent_hero, curr_distance+1))
                seen.add(adjacent_hero)                
    
    return curr_distance

# test
maxSeparation("FAITH", undir_hero_map)

4

In [12]:
# calculate the maximum distance in the graph (width) - 
# by calculating the max number of degrees of separation for each character
# takes some time, so usually commented out after first file creation

# NOTE: the separate groups are not split up here - could look into that in the future!

hero_dist_map = defaultdict(int)

for hero in all_heroes:
    max_dist = maxSeparation(hero, undir_hero_map)
    hero_dist_map[hero] = max_dist
    
hero_dist_map

defaultdict(int,
            {'BETA RAY BILL': 3,
             'HELLSCOUT/KENNETH TE': 4,
             'OLD SKULL': 4,
             "O'SHAUGNESSY, COLLEE": 4,
             'BOMBSHELL/': 4,
             'STONE, JOHN': 4,
             'YELLOWJACKET II/RITA': 4,
             'MALICE III': 4,
             'MORPH': 4,
             'MARKED MAN': 4,
             'CLASS CLOWN/GLASS': 4,
             'KO-GAR': 4,
             'NUKLO/ROBERT FRANK J': 4,
             'ZARATHOS': 4,
             'AKAFE': 4,
             'CHASE, TREVOR': 4,
             'GRANDE DAME': 5,
             'KALA': 4,
             'CARNAGE/CLETUS KASAD': 4,
             'SUZIE': 4,
             'BLESSING, MORGANA': 4,
             'REILLY, MAVIS': 4,
             'BERKELEY, TRAVIS': 4,
             'METEORITE': 4,
             'APE-X/XINA': 4,
             'PRINCESS PYTHON/ZELD': 4,
             'MASON, LOUISE': 4,
             'ROSE II/SGT. BLUME': 4,
             'DAVIS, PRINCIPAL': 4,
             'MACKAY, AILSA': 4,
 

In [22]:
# max degree of separation for Marvel characters
print("The maximum degree of separation in this network is:", max(hero_dist_map.values()))
print("The smallest maximum degree of separation in this network is:", min(hero_dist_map.values()))
print("The average maximum degree of separation in this network is:", sum(hero_dist_map[hero] for hero in all_heroes)/len(all_heroes))

# save graph distances to a file

max_dist_df = pd.DataFrame(data={"hero":ordered_heroes, "max_dist":[hero_dist_map[hero] for hero in ordered_heroes]})

max_dist_df.to_csv("./max_distances.csv", index=False)

max_dist_df

The maximum degree of separation in this network is: 5
The smallest maximum degree of separation in this network is: 1
The average maximum degree of separation in this network is: 3.9788194985204797


Unnamed: 0,hero,max_dist
0,BETA RAY BILL,3
1,HELLSCOUT/KENNETH TE,4
2,OLD SKULL,4
3,"O'SHAUGNESSY, COLLEE",4
4,BOMBSHELL/,4
...,...,...
6416,"HOGAN, VIRGINIA PEPP",4
6417,MASTERMIND/JASON WYN,4
6418,KHAN,4
6419,ORPHAN-MAKER/PETER,4


In [24]:
# heroes not part of the main network comprise the minimum max distances
# - 3 disconnected networksfrom main network!

max_dist_df.loc[max_dist_df["max_dist"] == 1]

Unnamed: 0,hero,max_dist
377,HOFFMAN,1
541,STERLING,1
797,"DARLEGUNG, GEN.",1
937,"ASHER, MICHAEL",1
1577,MANT/ERNEST,1
1694,FAGIN,1
2318,ORWELL,1
2357,OSWALD,1
2947,PANTHER CUB/,1
3251,STEEL SPIDER/OLLIE O,1


In [28]:
print(hero_BFS("AMAZO-MAXI-WOMAN/", "FAITH", undir_hero_map))

['Not connected!']
