# Data collection, cleaning and preprocessing
This notebook contains all the steps in our data collection, cleaning and preprocessing described in the explainer notebook. The notebook is divided into different parts. The first parts defines all necessary function applied in order to collect data from fandom. The next part provides and overview of how the dataframes and dictionaries used for the analysis was created using the functions from the previous part. Afterwards it can be seen how the orginal MCU character was created graph using the collected data. 
In the last two parts the teams network graphs and the organisation network graphs are created and the node and egde attributes are defined.


1. [Functions to find all characters, movies, tv-series, teams and organisations](#functions)
2. [Apply functions and create dataframes](#dataframes)
3. [Create Marvel Cinematic Universe network](#create_MCU_network)
    1. [Scraping character pages ](#character_pages)
    2. [Extract all character links](#links)
    1. [Create links and attributes](#links)
    2. [MCU network](#MCU_network)
4. [Create team network](#team_network)
    1. [Scraping team pages](#team_scraping)
    2. [Extract all characters in teams](#Extract_characters)
    3. [Dictionaries for visualisation of teams](#dict_for_team_viz)
    4. [Attributes for team graph](#graph_attributes)
5. [Create organisation network](#organisation_network)
    1. [Scraping organisation pages](#org_scraping)
    2. [Extracting characters in organisations](#org_extract_characters)
    3. [Organisation dictionaries for visualisation](#org_dict_for_viz)
    4. [Attributes for organisation graph](#org_graph_attr)




In [None]:
import urllib.request
import json
import pandas as pd
import numpy as np
import pandas as pd
import requests
import urllib.request
import json
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import re

# Functions to find all characters, movies, tv-series, teams and organisations <a name="functions"></a>
Following functions have the same purpose to extract all characters, movies, tv-sereis, teams and organisations from the fandom wiki page. However they are adapted to different urls explaining the different functions. 

In [None]:
def get_character():
    '''
    This function finds all characters on the marvel universe character page. 
    
    '''
    individuals = []
    not_in_df = []
    baseurl = "https://marvelcinematicuniverse.fandom.com/api.php?"

    action = "action=query&list=categorymembers"
    content = "prop=revisions&rvprop=content&rvslots=*"
    limit = "cmlimit=3000"  # number of category items returned (max is 500)
    dataformat ="format=json"
    cmtitle = 'cmtitle=Category:Characters'
    q = "{}{}&{}&{}&{}&{}".format(baseurl, action, content,cmtitle,limit, dataformat)
    wikiresponse = urllib.request.urlopen(q)
    wikidata = wikiresponse.read()
    query = json.loads(wikidata.decode('utf-8'))
    matches = ["Category", "Captain America/", "Captain America (Fiction)", "Captain Marvel/", 
               "Iron Man/","Doctor Strange/", "Framework", "Avengers Assassinated", "/Age of Ultron", "Decoy", 
               "Chronicom", "/Ravager T'Challa", "/", "(S.H.I.E.L.D.)", "(S.S.R.)", "(Fiction)"]
    #matches = ["Category"]   
    for page in query['query']['categorymembers']:
        individuals.append(page['title'])

        while True: 
            try:
                contin = 'cmcontinue={}'.format(query['continue']['cmcontinue'])
            except:
                break

            continue_q = "{}{}&{}&{}&{}&{}&{}".format(baseurl, action, content,contin,cmtitle,limit,dataformat)
            #print(contin)
            wikiresponse = urllib.request.urlopen(continue_q)
            wikidata = wikiresponse.read()
            query = json.loads(wikidata.decode('utf-8'))

            #print(query)
            for page in query['query']['categorymembers']:
                #print(page['title'])
                #if any(x in page['title'] for x in matches):
                # This ensures that recurent characters are only mentioned once
                # (any(individuals in page['title'] for individuals in individuals)) |  - Fjernede for mange :
                if (any(x in page['title'] for x in matches)):
                    not_in_df.append(page['title'])
                    None
                else:
                    individuals.append(page['title'])
    return individuals, not_in_df

In [None]:
def get_movie(category_name):
    '''
    This function finds all movies on the marvel universe fandoms movie page
    '''
    individuals = []
    baseurl = "https://marvelcinematicuniverse.fandom.com/api.php?"

    action = "action=query&list=categorymembers"
    content = "prop=revisions&rvprop=content&rvslots=*"
    limit = "cmlimit=3000"  # number of category items returned (max is 500)
    dataformat ="format=json"
    #cmtitle = 'cmtitle=Category:Characters'.format()
    
    cmtitle = 'cmtitle=Category:{}'.format(category_name)
    #cmtitle = ""
    
    q = "{}{}&{}&{}&{}&{}".format(baseurl, action, content,cmtitle,limit, dataformat)
    wikiresponse = urllib.request.urlopen(q)
    wikidata = wikiresponse.read()
    query = json.loads(wikidata.decode('utf-8'))

    for page in query['query']['categorymembers']:
        individuals.append(page['title'])

        while True: 
            try:
                contin = 'cmcontinue={}'.format(query['continue']['cmcontinue'])
            except:
                break

            continue_q = "{}{}&{}&{}&{}&{}&{}".format(baseurl, action, content,contin,cmtitle,limit,dataformat)
            #print(contin)
            wikiresponse = urllib.request.urlopen(continue_q)
            wikidata = wikiresponse.read()
            query = json.loads(wikidata.decode('utf-8'))

            #print(query)
            for page in query['query']['categorymembers']:
                individuals.append(page['title'])
    return individuals

In [None]:
def get_tv():
    '''
    The function finds all tv-sereies on the fandom page
    '''
    individuals = []
    baseurl = "https://marvelcinematicuniverse.fandom.com/api.php?"

    action = "action=query&list=categorymembers"
    content = "prop=revisions&rvprop=content&rvslots=*"
    limit = "cmlimit=3000"  # number of category items returned (max is 500)
    dataformat ="format=json"
    #cmtitle = 'cmtitle=Category:Characters'.format()
    
    cmtitle = 'cmtitle=Category:TV_Series'
    #cmtitle = ""
    
    q = "{}{}&{}&{}&{}&{}".format(baseurl, action, content,cmtitle,limit, dataformat)
    wikiresponse = urllib.request.urlopen(q)
    wikidata = wikiresponse.read()
    query = json.loads(wikidata.decode('utf-8'))
    
    matches = ["Category:"]
    
    for page in query['query']['categorymembers']:
        if (any(x in page['title'] for x in matches)):
            None
        else:
            individuals.append(page['title'])
    return individuals

In [None]:
def team():
    '''
    This function finds all team on the fandom marvel page
    '''
    individuals = []
    baseurl = "https://marvelcinematicuniverse.fandom.com/api.php?"

    action = "action=query&list=categorymembers"
    content = "prop=revisions&rvprop=content&rvslots=*"
    limit = "cmlimit=3000"  # number of category items returned (max is 500)
    dataformat ="format=json"
    cmtitle = 'cmtitle=Category:Teams'.format()
    
    q = "{}{}&{}&{}&{}&{}".format(baseurl, action, content,cmtitle,limit, dataformat)
    wikiresponse = urllib.request.urlopen(q)
    wikidata = wikiresponse.read()
    query = json.loads(wikidata.decode('utf-8'))

    for page in query['query']['categorymembers']:
        individuals.append(page['title'])

        while True: 
            try:
                contin = 'cmcontinue={}'.format(query['continue']['cmcontinue'])
            except:
                break

            continue_q = "{}{}&{}&{}&{}&{}&{}".format(baseurl, action, content,contin,cmtitle,limit,dataformat)
            #print(contin)
            wikiresponse = urllib.request.urlopen(continue_q)
            wikidata = wikiresponse.read()
            query = json.loads(wikidata.decode('utf-8'))

            #print(query)
            for page in query['query']['categorymembers']:
                individuals.append(page['title'])
    return individuals

In [1]:
def organisation():
    '''
    This function finds all organisations from the fandom page
    '''
    individuals = []
    baseurl = "https://marvelcinematicuniverse.fandom.com/api.php?"

    action = "action=query&list=categorymembers"
    content = "prop=revisions&rvprop=content&rvslots=*"
    limit = "cmlimit=3000"  # number of category items returned (max is 500)
    dataformat ="format=json"
    cmtitle = 'cmtitle=Category:Organizations'.format()
    
    q = "{}{}&{}&{}&{}&{}".format(baseurl, action, content,cmtitle,limit, dataformat)
    wikiresponse = urllib.request.urlopen(q)
    wikidata = wikiresponse.read()
    query = json.loads(wikidata.decode('utf-8'))

    for page in query['query']['categorymembers']:
        individuals.append(page['title'])

        while True: 
            try:
                contin = 'cmcontinue={}'.format(query['continue']['cmcontinue'])
            except:
                break

            continue_q = "{}{}&{}&{}&{}&{}&{}".format(baseurl, action, content,contin,cmtitle,limit,dataformat)
            wikiresponse = urllib.request.urlopen(continue_q)
            wikidata = wikiresponse.read()
            query = json.loads(wikidata.decode('utf-8'))

            for page in query['query']['categorymembers']:
                individuals.append(page['title'])
    return individuals

## Create dataframes and dictionaries <a name="dataframes"></a>


Following section concatenates the information provided from the different functions into different dataframes and dictionaries.

For the movie dataframe we have defined the phases for the movies when calling the function to extract all movies

In [6]:
movie_phase =["Phase_One_Movies", "Phase_Two_Movies","Phase_Three_Movies","Phase_Four_Movies"]
movie_list = {}
for movies in movie_phase:
    movie_list[movies] = {'movies': get_movie(movies), 'Phase': movies}


Create movie dataframe

In [7]:
df_movies = pd.DataFrame(movie_list.values())
df_movies = df_movies.explode('movies')

Create character dataframe. When creating the dataframe it has been necessary to consider change in names in the urls, thus the regex.replace has been applied.

In [180]:
characters, not_in_df = get_character()
df_characters = pd.DataFrame(characters, columns = ['name'])
df_characters['_name'] = df_characters['name'].replace([" ","\'"], ["_","%27"], regex = True)

Create tv-series dataframe

In [102]:
tv = get_tv()
df_tv = pd.DataFrame(tv, columns = ['tv_series'])
df_tv['_tv_series'] = df_tv['tv_series'].replace([" "], ["_"], regex = True)

Create team dataframe

In [None]:
teams = team()
df_teams = pd.DataFrame(teams,columns = ['Name'])
df_teams = df_teams.reset_index(drop=True)

Create organisation dataframe

In [None]:
organisations = organisation()
df_organisation = pd.DataFrame(organisations,columns = ['Name']) 

## Create MCU character network <a name="create_MCU_network"></a>

The following function are used to create the final MCU character network. It uses the dataframes for tv-series, movies in order to add attributes. Furthermore it uses the function link to create a link between two characters.

## Scraping character page  <a name="character_pages"></a>


Next section extract the character pages.

In [115]:
def get_page(dicts, char_names):
    baseurl = "https://marvelcinematicuniverse.fandom.com/api.php?"
    action = "action=query"
    content = "prop=revisions&rvprop=content&rvslots=*"
    dataformat ="format=json"

    # Looping over every characters name using the same API logic and rexex as above
    for name in char_names:
        #print(name)
        #name = char_names[idx]
        title = "titles="+name
        link = "{}{}&{}&{}&{}".format(baseurl, action, content, title, dataformat)

        wikiresponse = requests.get(link)
        wikitext = wikiresponse.text
        wikijson = json.loads(wikitext)

        page_id = list(wikijson["query"]["pages"].keys())[0] # The page id for each character

        name = name.replace("/", "%")

        if len(wikijson["query"]["pages"][page_id]['revisions']) > 1:
            print("Revisions has more than one entry")
        else:
            text = wikijson["query"]["pages"][page_id]['revisions'][0]['slots']['main']['*']
            with open(dicts+name+'.txt', 'w') as f: # save the text for each character in the folder
                f.write(text)

In [189]:
# list of character names - replacing the space with _ such that one can locate the right title
char_names = df_characters["_name"]
dicts = 'marvel_characters/'

get_page(dicts, char_names)

### Create dictionary with the raw text for characters
So we dont have to load every time

In [191]:
dict_raw_text = {}

name = df_characters['_name']
char_names = df_characters["_name"]

for name in char_names:
    name = name.replace("/", "%")
    with open('marvel_characters/'+ name +'.txt', 'r') as f:
        text = f.read()
        name = name.replace("%27","\'")
        name = name.replace("%","/")
        name = name.replace("_", " ")
        dict_raw_text[name] = text

## Get movie page <a name="movie_pages"></a>


In [193]:
# replace so movie title fits the url title with _
df_movies['_movies'] = df_movies['movies'].replace([" "], ["_"], regex = True)

dicts = 'marvel_movies/'
char_names =  df_movies["_movies"]
get_page(dicts, char_names)


### Create dictionaries with raw text for movies

In [194]:
### Create dictionaries with raw text for movies

dict_raw_text_movies = {}

for name in movie_names:
    with open('marvel_movies/'+ name +'.txt', 'r') as f:
        text = f.read()
        dict_raw_text_movies[name] = text

### Store all relevant files

In [None]:
file_to_store = open("marvel_movies_raw_text.pickle", "wb")
pickle.dump(dict_raw_text_movies, file_to_store)
file_to_store.close()

file_to_store = open("marvel_characters_raw_text.pickle", "wb")
pickle.dump(dict_raw_text, file_to_store)
file_to_store.close()

file_to_store = open("dict_raw_text_tv_series.pickle", "wb")
pickle.dump(dict_raw_text_tv_series, file_to_store)
file_to_store.close()

file_to_store_df = open("df_characters.pickle", "wb")
pickle.dump(df_characters, file_to_store_df)
file_to_store_df.close()

file_to_store_df = open("df_movies.pickle", "wb")
pickle.dump(df_movies, file_to_store_df)
file_to_store_df.close()

file_to_store_df = open("df_tv.pickle", "wb")
pickle.dump(df_tv, file_to_store_df)
file_to_store_df.close()

## Get and attributes links between characters <a name="links"></a>

In [None]:
file_to_read = open("marvel_movies_raw_text.pickle", "rb")
movies_raw_text = pickle.load(file_to_read)
file_to_read.close()

file_to_read = open("marvel_characters_raw_text.pickle", "rb")
characters_raw_text = pickle.load(file_to_read)
file_to_read.close()

file_to_read_characters = open("df_characters.pickle", "rb")
df_characters = pickle.load(file_to_read_characters)
file_to_read_characters.close()

file_to_read_movies = open("df_movies.pickle", "rb")
df_movies = pickle.load(file_to_read_movies)
file_to_read_movies.close()

file_to_read_movies = open("df_tv.pickle", "rb")
df_tv = pickle.load(file_to_read_movies)
file_to_read_movies.close()

In [5]:
print("length of movies: {}".format(len(df_movies)))
print("Number of characters: {}".format(len(df_characters)))

length of movies: 35
Number of characters: 3308


**Function that searches the characters page to get all attribtues using different regex pattern**

In [34]:
def get_attributes(text, df):
    '''
    Function that finds all names in the wiki page. 
    '''
    movies = []
    tv = []
    movie_pattern = r"(?s)\[\[((?:(?!]]).)*)]](?!(?:''|\))\s*</?small>)"
    specie_pattern = r'species = \[\[(.*?)\]\]'
    citizenship_pattern = r'citizenship = \{\{(.*?)\}\}'
    status_pattern = r'status = (.*?)\}\}'
    
    tv_pattern = r'\'\'\[\[(.*?)\]\]\'\''

    #print(text)
    #find match 
    
    
    match_citezen = re.findall(citizenship_pattern, text)
    match_specie = re.findall(specie_pattern, text)
    match_status = re.findall(status_pattern, text)
    
    pattern = r'actor ='
    text_movies = re.split(pattern, text, maxsplit=2)[0]
    match_movies = re.findall(movie_pattern, text_movies)
    
    match_tv = re.findall(tv_pattern, text_movies)

    for match in match_tv:
        match = re.sub(r'\|.*',"",match)
        if (match in list(df_tv['tv_series'].values)):
            tv.append(match)  
    
    for match in match_movies:
        #print(match)
        match = re.sub(r'\|.*',"",match)
        #print(match)
        if (match in list(df['movies'].values)):
            movies.append(match)     
    
    return list(np.unique(movies)), ' '.join(match_citezen), ' '.join(match_specie) ,  ' '.join(match_status), tv

**Function that gets all links for a character**

In [44]:
def get_links(text, name, df):
    '''
    Function that finds all names in the wiki page. 
    '''
    links = []

    pattern = r'\[\[(.*?)\]\]'
    
    # find match 
    matches = re.findall(pattern, text)
    
    for match in matches:
        if (match in list(df['name'].values)) & (match != name):
            links.append(match)
    
            
    return links

**Function that defines attributes for the characters**

In [37]:
def def_attributes(df_movies, dictionary):
    '''
    Function that extract all atributes and returns a dictionary. 
    '''
    attributes ={}
    # get_attributes(characters_raw_text['Iron Man'], df_movies)
    for keys, values in dictionary.items():
        #print(keys)
        match_movies, match_citezen, match_specie, match_status, match_tv = get_attributes(values, df_movies)
        
        words = re.findall(r'\w+',values)
        if "Deceased" in match_status:
            col_status = 'red'
        elif  "Alive" in match_status:
            col_status = 'green'
        else:
            col_status = 'blue'

        attributes[keys] = {'movies': match_movies,'tv-serie':match_tv ,'citizen':match_citezen, 
                            'specie': match_specie,'status' : match_status,
                            'length_of_content': len(words), 'No_movies': len(match_movies), 
                            'color': col_status}
    return attributes

##### get all links


In [38]:
def get_all_links(df, dictionary):
    '''
    Funciton that finds the links. 
    If there are less than 3 links, they will not be considered as an edge.
    For the links a dict with the nummber of times a name is mentioned in the text is returned as well. 
    
    '''
    links_no = {}
    links_no_new = {}
    links = {}
    for keys, values in dictionary.items():
        link = get_links(values, keys, df)
        
        link_ = {i:link.count(i) for i in link}
        links_no[keys] = {k:v for (k,v) in link_.items() if v > 3}
        links[keys] = list(links_no[keys].keys())
    return links,links_no

In [45]:
links,links_no = get_all_links(df_characters,characters_raw_text)
attributes = def_attributes(df_movies, characters_raw_text)

The following function add an edge attribute that defined how many times a character is mentioned on another characters page. 

In [41]:
def get_edge_att(G):
    '''
    Function that returns edge attributes
    '''
    edge_att ={}
    for u,v,_ in list(G.edges(data = True)):
        edge_att[(u,v)]={'value': links_no[u][v]}
    return edge_att


In [48]:
file_to_store_l = open("links.pickle", "wb")
pickle.dump(links, file_to_store_l)
file_to_store_l.close()

file_to_store_a = open("attributes.pickle", "wb")
pickle.dump(attributes, file_to_store_a)
file_to_store_a.close()

## MCU character network <a name="MCU_network"></a>
now we can create the MCU character network applying the functions created above

In [49]:
print("links: {}".format(len(links))) # check
print('attributes: {}'.format(len(attributes.keys())))

links: 3308
attributes: 3308


In [None]:
def create_graph(links):
    G = nx.DiGraph(links)
    print("Number of nodes in total graph: {}".format(G.number_of_nodes()))
    print("Number of edges in total graph: {}".format(G.number_of_edges()))
    G.remove_nodes_from(list(nx.isolates(G)))
    
    # set attributes
    nx.set_node_attributes(G,attributes)
    
    # create giant connected component
    GCC = G.subgraph(max(nx.weakly_connected_components(G), key=len)) # the biggest component
    print("Number of nodes in GCC: {}".format(GCC.number_of_nodes()))
    print("Number of edges in GCC: {}".format(GCC.number_of_edges()))
    
    return G, GCC


In [None]:
G, GCC = create_graph(links)

edge_att = get_edge_att(G)
nx.set_edge_attributes(G, edge_att)


In [None]:
### SAVE GRAPH For analysis

file_to_store = open("graph_G.pickle", "wb")
pickle.dump(G, file_to_store)
file_to_store.close()

# Create Teams network <a name="team_network"></a>



## Scrapping text for each team <a name="team_scraping"></a>

In [None]:
baseurl = "https://marvelcinematicuniverse.fandom.com/api.php?"
action = "action=query"
content = "prop=revisions&rvprop=content&rvslots=*"
dataformat ="format=json"

# list of character names - replacing the space with _ such that one can locate the right title
team_names = df_teams["Name"]

# Looping over every characters name using the same API logic and rexex as above
for name in team_names:
    title = "titles="+name
    link = "{}{}&{}&{}&{}".format(baseurl, action, content, title, dataformat)
    
    wikiresponse = requests.get(link)
    wikitext = wikiresponse.text
    wikijson = json.loads(wikitext)

    page_id = list(wikijson["query"]["pages"].keys())[0] # The page id for each character
    name = name.replace("/", "%")
    
    if len(wikijson["query"]["pages"][page_id]['revisions']) > 1:
        print("Revisions has more than one entry")
    else:
        text = wikijson["query"]["pages"][page_id]['revisions'][0]['slots']['main']['*']
        with open('Teams/'+name+'.txt', 'w', encoding="utf-8") as f: # save the text for each character in the folder
            f.write(text)

#### Save raw text in a dictionary

In [43]:
dict_raw_team_text = {}

team_names = df_teams["Name"]

for name in team_names:
    name = name.replace("/", "%")
    with open('Teams/'+name+'.txt', 'r',encoding="utf-8") as f:
        text = f.read()
        name = name.replace("%","/")
        dict_raw_team_text[name] = text

## Extracting all characters within each teams   <a name="Extract_characters"></a>

In [11]:
def get_links(text, name, df):
    
    links_all = {}
    
    #founders
    founders = []
    founders_text = re.search(r'\|founder (.*?)\n', text)
    if founders_text:
        founders_text = founders_text.group(1)
        founders_names = r'\[\[(.*?)\]\]'
        founders = re.findall(founders_names, founders_text)
        founders = [re.sub(r'\/.*',"",member) for member in founders]
        founders = [re.sub(r'.*\|',"",member) for member in founders]
    
    #leaders
    leaders = []
    leaders_text = re.search(r'\|leader (.*?)\n', text)
    if leaders_text:
        leaders_text = leaders_text.group(1)
        leaders_names = r'\[\[(.*?)\]\]'
        leaders = re.findall(leaders_names, leaders_text)
        leaders = [re.sub(r'\/.*',"",member) for member in leaders]
        leaders = [re.sub(r'.*\|',"",member) for member in leaders]
    
    #former leaders
    formleaders = []
    formerleaders_text = re.search(r'\|formerleaders (.*?)\n', text)
    if formerleaders_text:
        formerleaders_text = formerleaders_text.group(1)
        formerleaders_names = r'\[\[(.*?)\]\]'
        formleaders = re.findall(formerleaders_names, formerleaders_text)
        formleaders = [re.sub(r'\/.*',"",member) for member in formleaders]
        formleaders = [re.sub(r'.*\|',"",member) for member in formleaders]
    
    
    #members
    members = []
    members_text = re.search(r'\|members (.*?)\n', text)
    if members_text:
        members_text = members_text.group(1)
        members_names = r'\[\[(.*?)\]\]'
        members = re.findall(members_names, members_text)
        members = [re.sub(r'\/.*',"",member) for member in members]
        members = [re.sub(r'.*\|',"",member) for member in members]
    
    #former members
    formermembers = []
    formermembers_text = re.search(r'\|formermembers (.*?)\n', text)
    if formermembers_text:
        formermembers_text = formermembers_text.group(1)
        formermembers_names = r'\[\[(.*?)\]\]'
        formermembers = re.findall(formermembers_names, formermembers_text)
        formermembers = [re.sub(r'\/.*',"",member) for member in formermembers]
        formermembers = [re.sub(r'.*\|',"",member) for member in formermembers]
        
                                   
    links_all = {"Founders": founders, "Leaders": leaders, "Former leaders": formleaders, 
                       "Members": members, "Former members": formermembers}
    
    links_org =[]
    links_org.append(founders)
    links_org.append(leaders)
    links_org.append(formleaders)
    links_org.append(members)
    links_org.append(formermembers)
    links_org = np.unique(links_org)
    links_org = [item for sublist in links_org for item in sublist]
    
    return links_all, links_org

In [12]:
def get_all_links(dictionary):
    links_role = {}
    links_all = {}
    for keys, values in dictionary.items():
        link_role, link_org = get_links(values, keys, df_characters)
        links_role[keys] = link_role
        links_all[keys] = link_org
    return links_role, links_all

In [13]:
team_roles, team_all = get_all_links(teams_raw_text)

  return array(a, dtype, copy=False, order=order, subok=True)


### Gathering teams with more than one name

In [None]:
#removing all teams with Categories or numbers in name
remove = []
for team in df_team.Name:
    if ("Category" in team) or (team.isdigit())  :
        df_team = df_team[df_team["Name"] != team]
        remove.append(team)

In [17]:
#Identify teams with several names
remove = [x for x in df_team['Name'] if ("/" in x) or ("fiction" in x)]
remove.append("STRIKE Team: Delta")
remove.append("Avengers (Fiction)")

In [18]:
#gather all characters in same team if team have multiple names
team_all_new = team_all.copy()
for keys, values in team_all.items():
    if keys in remove:
        if keys == 'Avengers (Fiction)' or keys == 'STRIKE Team: Delta':
            input_key = re.sub(r'\s.*',"",keys)
        else:
            input_key = re.sub(r'\/.*',"",keys)
        if input_key in team_all.keys(): #check if org name without / is in dict
            for name in values: 
                if name not in team_all_new[input_key]: #check if name is already in organisation
                    team_all_new[input_key].append(name)
            team_all_new.pop(keys) #remove old key
        else:
            team_all_new[input_key] = team_all_new.pop(keys)      

In [19]:
#gather all characters roles in same team if team have multiple names
team_roles_new = team_roles.copy()
for keys, values in team_roles.items():
    if keys in remove:
        
        if keys == 'Avengers (Fiction)' or keys == 'STRIKE Team: Delta':
            input_key = re.sub(r'\s.*',"",keys)
        
        else:
            input_key = re.sub(r'\/.*',"",keys)

        if input_key in team_roles.keys(): #check if org name without / is in dict 
            for name in team_roles[keys]["Founders"]: 
                if name not in team_roles_new[input_key]["Founders"]: #check if name is already in organisation
                    team_roles_new[input_key]["Founders"].append(name)
            
            for name in team_roles[keys]["Leaders"]: 
                if name not in team_roles_new[input_key]["Leaders"]: #check if name is already in organisation
                    team_roles_new[input_key]["Leaders"].append(name)
                    
            for name in team_roles[keys]["Former leaders"]: 
                if name not in team_roles_new[input_key]["Former leaders"]: #check if name is already in organisation
                    team_roles_new[input_key]["Former leaders"].append(name)
                    
            for name in team_roles[keys]["Members"]: 
                if name not in team_roles_new[input_key]["Members"]: #check if name is already in organisation
                    team_roles_new[input_key]["Members"].append(name)
                    
            for name in team_roles[keys]["Former members"]: 
                if name not in team_roles_new[input_key]["Former members"]: #check if name is already in organisation
                    team_roles_new[input_key]["Former members"].append(name)
              
            team_roles_new.pop(keys) #remove old key
        else:
            team_roles_new[input_key] = team_roles_new.pop(keys)      

### Dictionaries for graph visualisation  <a name="dict_for_team_viz"></a>


In [569]:
#dictionary with characters as keys and team as values
dict_team_character = {}
for name in df_characters["name"]:
    orgs = []
    for keys, values in team_all_new.items():
        if name in values:
            orgs.append(keys)
    dict_team_character[name] = orgs

In [570]:
#dictionary with character and links to other characters within same team
dict_team_org = {}
for keys, values in dict_team_character.items():
    characters = []
    if values:
        for org in values:
            for names in team_all_new[org]: 
                if names != keys: #ensure that we do not create a link to the character itself
                    characters.append(names)
    dict_team_org[keys] = characters

In [571]:
#dictionary with teams and links to other teams if they share characters
dict_links_team_name = {}
for keys, values in team_all_new.items():
    teams = []
    count = 0
    for i in range(len(team_all_new.keys())):
        try:
            next_key = list(team_all_new)[list(team_all_new).index(keys) + (i+1)]
            for names in values:
                if names in team_all_new[next_key]:
                    teams.append(next_key)
        except (ValueError, IndexError):
            count += 1
                
    dict_links_team_name[keys] = teams

In [572]:
#removing all teams with no links to other organisations
team_links_removed = {k: v for k, v in dict_links_team_name.items() if v}

In [573]:
#removing characters that are not part of an organisation
team_removed = {k: v for k, v in team_all_new.items() if v}

### Saving dictionaries as pickles

In [38]:
import pickle
file_to_store = open("team_shared.pickle", "wb")
pickle.dump(team_links_removed, file_to_store)
file_to_store.close()

In [39]:
import pickle
file_to_store = open("team_characters.pickle", "wb")
pickle.dump(team_removed, file_to_store)
file_to_store.close()

## Graph attributes  <a name="graph_attributes"></a>


In [574]:
attributes_character_team = {}

for keys1, values1 in dict_team_character.items():
    orgs = []
    founder = []
    leader = []
    form_leader = []
    member = []
    form_member = []
    col = []
    for keys2, values2 in team_roles_new.items():


        teams = values1

        if keys1 in team_roles_new[keys2]['Founders']: #extract founder
            founder.append(keys2)

        if keys1 in team_roles_new[keys2]['Leaders']: #extract leader
            leader.append(keys2)

        if keys1 in team_roles_new[keys2]['Former leaders']: #extract former leader
            form_leader.append(keys2)

        if keys1 in team_roles_new[keys2]['Members']: #extract member
            member.append(keys2)

        if keys1 in team_roles_new[keys2]['Former members']: #extract former member
            form_member.append(keys2)


        col = "purple"

        attributes_character_team[keys1] = {"Name": keys1, "Team(s)": teams, "Founder": founder, "Leader": leader,
                                    "Former leader": form_leader, "Member":member, "Former member":form_member,
                                    "Color": col}

In [742]:
import random
def team_att(dictionary):
    attributes_team = {}
    
    from random import randint
    colors = []
    n = len(dictionary)
    for i in range(n):
        
        colors.append('#%06X' % randint(0, 0xFFFFFF))
    n=0
    for keys, values in dictionary.items(): 
        attributes_team[keys] = {"Name": keys , "color": str(colors[n]), "shape": "diamond"}
        n = n+1
    return attributes_team

In [None]:
from collections import Counter
#number of times each organisation is mentioned in another orginasation
links_no_team = team_links_removed.copy()
for keys, values in team_links_removed.items():
    counts = Counter(values)
    links_no_team[keys] = counts

In [None]:
import pickle
file_to_store = open("links_no_team.pickle", "wb")
pickle.dump(links_no_team, file_to_store)
file_to_store.close()

In [764]:
def get_edge_att(G):
    '''
    Function that returns edge attributes
    '''
    edge_att ={}
    for u,v,_ in list(G.edges(data = True)):
        
        no_links = links_no[u][v]
        
        edge_att[(u,v)]={'value': no_links, 'title': 'No. of shared characters:' + str(no_links)}
    return edge_att

## Converting names

As the characters within the teams have different aliases for the same character, all aliases are converted to their original hero name.

In [551]:
def convert_characters(dictionary):
    removes =[]
    for keys, values in dictionary.items():
    
        #Loki
        lokis = ['Loki Variants','Classic Loki','Kid Loki','Boastful Loki','President Loki']
        values = ['Loki' if i in lokis else i for i in values]
        team_removed[keys] = np.unique(values)
        
        #Thor
        thors = ['Bruce Banner','Thor Odinson']
        values = ['Thor' if i in thors else i for i in values]
        team_removed[keys] = np.unique(values)
        
        #Iron Man
        IronMans = ['Tony Stark']
        values = ['Iron Man' if i in IronMans else i for i in values]
        team_removed[keys] = np.unique(values)
        
        #Captain America
        CA = ["Sam Wilson"]
        values = ['Captain America' if i in CA else i for i in values]
        team_removed[keys] = np.unique(values)
                
        #Black Widow
        Black_widows = ['Natasha Romanoff']
        values = ['Black Widow' if i in Black_widows else i for i in values]
        team_removed[keys] = np.unique(values)
        
        #Hawkeye
        Hawkeyes = ['Clint Barton']
        values = ['Hawkeye' if i in Hawkeyes else i for i in values]
        team_removed[keys] = np.unique(values)
        
        #War Machine
        WarMachines = ['James Rhodes']
        values = ['War Machine' if i in WarMachines else i for i in values]
        team_removed[keys] = np.unique(values)
        
        #Spyder Man
        SpyderMans = ['Peter Parker']
        values = ['Spider-Man' if i in SpyderMans else i for i in values]
        team_removed[keys] = np.unique(values)
        
        #Scarlet Witch
        Wandas = ['Wanda Maximoff']
        values = ['Scarlet Witch' if i in Wandas else i for i in values]
        team_removed[keys] = np.unique(values)
        
        #Quicksilver 
        Quicksilver = ['Pietro Maximoff']
        values = ['Quicksilver' if i in Quicksilver else i for i in values]
        team_removed[keys] = np.unique(values)
        
        #Captain Marvel
        CaptainMarvel = ['Carol Danvers']
        values = ['Captain Marvel' if i in CaptainMarvel else i for i in values]
        team_removed[keys] = np.unique(values)
        
        #Ant-Man
        AntMan = ['Scott Lang']
        values = ['Ant-Man' if i in AntMan else i for i in values]
        team_removed[keys] = np.unique(values)
    
    lists = [lokis,thors,IronMans,CA,Black_widows,Hawkeyes,WarMachines,SpyderMans,Wandas,Quicksilver,CaptainMarvel,AntMan]
    for i in lists:
        removes.append(i)
    
    removes = [item for sublist in removes for item in sublist]
    
    return dictionary, removes

Adding the teams and roles to the original hero name that were assigned to the removed aliases.

In [552]:
def update_attributes(attributes_character_team): 

    #update node attributes 
    attributes_character_team['Loki']['Team(s)'].append('Loki Variant Army')
    attributes_character_team['Loki']['Former leader'].append('Loki Variant Army')
    attributes_character_team['Captain America'] = {'Name':'Captain America', 'Team(s)':['Avengers'], 'Founder':[], 'Leader':["Avengers"], 'Former leader':[], 'Member':["Avengers"], 'Former member':[], 'Color':"purple"}
    attributes_character_team['Scarlet Witch'] = {'Name':'Scarlet Witch', 'Team(s)':['Avengers'], 'Founder':[], 'Leader':[], 'Former leader':[], 'Member':["Avengers"], 'Former member':[], 'Color':"purple"}
    attributes_character_team['Quicksilver'] = {'Name':'Quicksilver', 'Team(s)':['Avengers'], 'Founder':[], 'Leader':[], 'Former leader':[], 'Member':["Avengers"], 'Former member':[], 'Color':"purple"}
    attributes_character_team['War Machine'] = {'Name':'War Machine', 'Team(s)':['Avengers'], 'Founder':[], 'Leader':[], 'Former leader':[], 'Member':["Avengers"], 'Former member':[], 'Color':"purple"}
    attributes_character_team['Sam Wilson'] = {'Name':'Sam Wilson', 'Team(s)':['Avengers'], 'Founder':[], 'Leader':[], 'Former leader':[], 'Member':["Avengers"], 'Former member':[], 'Color':"purple"}
    attributes_character_team['Spider-Man'] = {'Name':'Spyder-Man', 'Team(s)':['Avengers'], 'Founder':[], 'Leader':[], 'Former leader':[], 'Member':["Avengers"], 'Former member':[], 'Color':"purple"}
    attributes_character_team['Captain Marvel'] = {'Name':'Captain Marvel', 'Team(s)':['Avengers','Sparrows'], 'Founder':[], 'Leader':[], 'Former leader':['Sparrows'], 'Member':["Avengers"], 'Former member':['Sparrows'], 'Color':"purple"}
    attributes_character_team['Ant-Man']['Member'].append('Avengers')
    attributes_character_team['Ant-Man']['Team(s)'].append('Avengers')
    
    return attributes_character_team

In [553]:
team_removed,remove_from_att = convert_characters(team_removed)

In [554]:
attributes_character_all = update_attributes(attributes_character_team)

In [556]:
attributes_character_team = attributes_character_all.copy()

Removing aliases from the attributes used for the graph.

In [557]:
[attributes_character_team.pop(key) for key in remove_from_att if key in attributes_character_team.keys()]

[{'Name': 'Classic Loki',
  'Team(s)': ['Asgardian Royal Family'],
  'Founder': [],
  'Leader': [],
  'Former leader': [],
  'Member': [],
  'Former member': ['Asgardian Royal Family'],
  'Color': 'purple'},
 {'Name': 'Kid Loki',
  'Team(s)': ['Asgardian Royal Family'],
  'Founder': [],
  'Leader': [],
  'Former leader': [],
  'Member': [],
  'Former member': ['Asgardian Royal Family'],
  'Color': 'purple'},
 {'Name': 'Boastful Loki',
  'Team(s)': ['Asgardian Royal Family'],
  'Founder': [],
  'Leader': [],
  'Former leader': [],
  'Member': [],
  'Former member': ['Asgardian Royal Family'],
  'Color': 'purple'},
 {'Name': 'President Loki',
  'Team(s)': ['Asgardian Royal Family', 'Loki Variant Army'],
  'Founder': [],
  'Leader': [],
  'Former leader': ['Loki Variant Army'],
  'Member': [],
  'Former member': ['Asgardian Royal Family'],
  'Color': 'purple'},
 {'Name': 'Sam Wilson',
  'Team(s)': ['Avengers'],
  'Founder': [],
  'Leader': [],
  'Former leader': [],
  'Member': ['Avengers

In [743]:
attributes_team = team_att(team_removed)

# Organisation network <a name="organisation_network"></a>


## Scrapping text for each orginsation <a name="org_scraping"></a>

In [None]:
baseurl = "https://marvelcinematicuniverse.fandom.com/api.php?"
action = "action=query"
content = "prop=revisions&rvprop=content&rvslots=*"
dataformat ="format=json"


# list of character names - replacing the space with _ such that one can locate the right title
organisations_names = df_organisation["Name"]

# Looping over every characters name using the same API logic and rexex as above
for name in organisations_names:
    if name == "Duncan + Dotter Design":
        name = "Duncan_%2B_Dotter_Design"
    
    if name == "M&R Credit Union":
        name = "M%26R_Credit_Union"
    
    name = name.replace(" & ", "_%26_")

    title = "titles="+name
    link = "{}{}&{}&{}&{}".format(baseurl, action, content, title, dataformat)
    
    wikiresponse = requests.get(link)
    wikitext = wikiresponse.text
    wikijson = json.loads(wikitext)

    page_id = list(wikijson["query"]["pages"].keys())[0] # The page id for each character
    name = name.replace("/", "%")
    
    if len(wikijson["query"]["pages"][page_id]['revisions']) > 1:
        print("Revisions has more than one entry")
    else:
        text = wikijson["query"]["pages"][page_id]['revisions'][0]['slots']['main']['*']
        with open('Organisations/'+name+'.txt', 'w', encoding="utf-8") as f: # save the text for each character in the folder
            f.write(text)

### Save raw text in a dicctionary

In [None]:
dict_raw_org_text = {}

organisations_names = df_organisation["Name"]

for name in organisations_names:
    if name == "Duncan + Dotter Design":
        name = "Duncan_%2B_Dotter_Design"
    if name == "M&R Credit Union":
        name = "M%26R_Credit_Union"
    name = name.replace(" & ", "_%26_")
    name = name.replace("/", "%")
    with open('Organisations/'+name+'.txt', 'r',encoding="utf-8") as f:
        text = f.read()
        name = name.replace("Duncan_%2B_Dotter_Design", "Duncan + Dotter Design")
        name = name.replace("M%26R_Credit_Union", "M&R Credit Union")
        name = name.replace("%","/")
        name = name.replace("_%26_", " & ")
        dict_raw_org_text[name] = text

## Extracting all characters within each organisation <a name="org_extract_characters"></a>

In [8]:
def get_links(text, name, df):
    
    pattern_link = r'\[\[(.*?)\]\]'
    
    liste = []
    links = re.findall(pattern_link, text)
    
    for l in links:
        if (l in df["name"].values) & (l != name):
            liste.append(l)
    
    return list(np.unique(liste))

In [9]:
def get_links(text, name, df):
    
    links_all = {}
    
    #founders
    founders = []
    founders_text = re.search(r'\|founder (.*?)\n', text)
    if founders_text:
        founders_text = founders_text.group(1)
        founders_names = r'\[\[(.*?)\]\]'
        founders = re.findall(founders_names, founders_text)
        founders = [re.sub(r'\/.*',"",member) for member in founders]
        founders = [re.sub(r'.*\|',"",member) for member in founders]
    
    #leaders
    leaders = []
    leaders_text = re.search(r'\|leader (.*?)\n', text)
    if leaders_text:
        leaders_text = leaders_text.group(1)
        leaders_names = r'\[\[(.*?)\]\]'
        leaders = re.findall(leaders_names, leaders_text)
        leaders = [re.sub(r'\/.*',"",member) for member in leaders]
        leaders = [re.sub(r'.*\|',"",member) for member in leaders]
    
    #former leaders
    formleaders = []
    formerleaders_text = re.search(r'\|formerleaders (.*?)\n', text)
    if formerleaders_text:
        formerleaders_text = formerleaders_text.group(1)
        formerleaders_names = r'\[\[(.*?)\]\]'
        formleaders = re.findall(formerleaders_names, formerleaders_text)
        formleaders = [re.sub(r'\/.*',"",member) for member in formleaders]
        formleaders = [re.sub(r'.*\|',"",member) for member in formleaders]
    
    
    #members
    members = []
    members_text = re.search(r'\|members (.*?)\n', text)
    if members_text:
        members_text = members_text.group(1)
        members_names = r'\[\[(.*?)\]\]'
        members = re.findall(members_names, members_text)
        members = [re.sub(r'\/.*',"",member) for member in members]
        members = [re.sub(r'.*\|',"",member) for member in members]
    
    #former members
    formermembers = []
    formermembers_text = re.search(r'\|formermembers (.*?)\n', text)
    if formermembers_text:
        formermembers_text = formermembers_text.group(1)
        formermembers_names = r'\[\[(.*?)\]\]'
        formermembers = re.findall(formermembers_names, formermembers_text)
        formermembers = [re.sub(r'\/.*',"",member) for member in formermembers]
        formermembers = [re.sub(r'.*\|',"",member) for member in formermembers]
        
                                   
    links_all = {"Founders": founders, "Leaders": leaders, "Former leaders": formleaders, 
                       "Members": members, "Former members": formermembers}
    
    links_org =[]
    links_org.append(founders)
    links_org.append(leaders)
    links_org.append(formleaders)
    links_org.append(members)
    links_org.append(formermembers)
    links_org = np.unique(links_org)
    links_org = [item for sublist in links_org for item in sublist]
    
    return links_all, links_org

In [10]:
def get_all_links(dictionary):
    links_role = {}
    links_all = {}
    for keys, values in dictionary.items():
        link_role, link_org = get_links(values, keys, df_characters)
        links_role[keys] = link_role
        links_all[keys] = link_org
    return links_role, links_all

In [11]:
org_roles, org_all = get_all_links(dict_raw_org_text)

  return array(a, dtype, copy=False, order=order, subok=True)


## Preprocessing organisation data

Remove organisation if name contains "Category" as these are repititions or if they contain digits as the represent newspapers.

In [12]:
remove = []
for org in df_organisation.Name:
    if ("Category" in org) or (org.isdigit())  :
        df_organisation = df_organisation[df_organisation["Name"] != org]
        remove.append(org)

In [13]:
df_organisation = df_organisation.iloc[3: , :]
df_organisation = df_organisation.reset_index(drop=True)

### Gathering organisations and characters if organisations have several names

Idenitifying organisations with several names.

In [14]:
remove = [x for x in df_organisation['Name'] if "/" in x]

In [15]:
df_organisation = df_organisation.loc[df_organisation["Name"]!= "Avengers/Avengers Assassinated" ]

Gather all characters roles in same organisation if organisation have multiple names

In [32]:
#gather all characters roles in same organisation if organisation have multiple names
org_roles_new = org_roles.copy()
for keys, values in org_roles.items():
    if keys in remove:
        
        input_key = re.sub(r'\/.*',"",keys)

        if input_key in org_roles.keys(): #check if org name without / is in dict 
            for name in org_roles[keys]["Founders"]: 
                if name not in org_roles_new[input_key]["Founders"]: #check if name is already in organisation
                    org_roles_new[input_key]["Founders"].append(name)
            
            for name in org_roles[keys]["Leaders"]: 
                if name not in org_roles_new[input_key]["Leaders"]: #check if name is already in organisation
                    org_roles_new[input_key]["Leaders"].append(name)
                    
            for name in org_roles[keys]["Former leaders"]: 
                if name not in org_roles_new[input_key]["Former leaders"]: #check if name is already in organisation
                    org_roles_new[input_key]["Former leaders"].append(name)
                    
            for name in org_roles[keys]["Members"]: 
                if name not in org_roles_new[input_key]["Members"]: #check if name is already in organisation
                    org_roles_new[input_key]["Members"].append(name)
                    
            for name in org_roles[keys]["Former members"]: 
                if name not in org_roles_new[input_key]["Former members"]: #check if name is already in organisation
                    org_roles_new[input_key]["Former members"].append(name)
              
            org_roles_new.pop(keys) #remove old key
        else:
            org_roles_new[input_key] = org_roles_new.pop(keys)      

In [17]:
#gather all characters in same organisation if organisation have multiple names
org_all_new = org_all.copy()
for keys, values in org_all.items():
    if keys in remove:
        input_key = re.sub(r'\/.*',"",keys)
        if input_key in org_all.keys(): #check if org name without / is in dict
            for name in values: 
                if name not in org_all_new[input_key]: #check if name is already in organisation
                    org_all_new[input_key].append(name)
            org_all_new.pop(keys) #remove old key
        else:
            org_all_new[input_key] = org_all_new.pop(keys)      

### Create dictionaries used for graph representations  <a name="org_dict_for_viz"></a>


In [30]:
#dictionary with characters as keys and organisations as values
dict_org_character = {}
for name in df_characters["name"]:
    orgs = []
    for keys, values in org_all_new.items():
        if name in values:
            orgs.append(keys)
    dict_org_character[name] = orgs

In [18]:
#dictionary with organisations and links to other organisation if they share characters
dict_links_org_name = {}
for keys, values in org_all_new.items():
    orgs = []
    count = 0
    for i in range(len(org_all_new.keys())):
        try:
            next_key = list(org_all_new)[list(org_all_new).index(keys) + (i+1)]
            for names in values:
                if names in org_all_new[next_key]:
                    orgs.append(next_key)
        except (ValueError, IndexError):
            count += 1
                
    dict_links_org_name[keys] = orgs

In [19]:
#removing all organisation with no links to other organisations
org_links_removed = {k: v for k, v in dict_links_org_name.items() if v}

In [20]:
#removing characters that are not part of an organisation
org_removed = {k: v for k, v in org_all_new.items() if v}

### Saving dictionaries as pickles

In [38]:
import pickle
file_to_store = open("org_shared.pickle", "wb")
pickle.dump(org_links_removed, file_to_store)
file_to_store.close()

In [39]:
import pickle
file_to_store = open("org_characters.pickle", "wb")
pickle.dump(org_removed, file_to_store)
file_to_store.close()

## Graph attributes <a name="org_graph_attr"></a>


In [33]:
attributes_character_org = {}

for keys1, values1 in dict_org_character.items():
    orgs = []
    founder = []
    leader = []
    form_leader = []
    member = []
    form_member = []
    col = []
    for keys2, values2 in org_roles_new.items():


        orgs = values1

        if keys1 in org_roles_new[keys2]['Founders']: #extract founder
            founder.append(keys2)

        if keys1 in org_roles_new[keys2]['Leaders']: #extract leader
            leader.append(keys2)

        if keys1 in org_roles_new[keys2]['Former leaders']: #extract former leader
            form_leader.append(keys2)

        if keys1 in org_roles_new[keys2]['Members']: #extract member
            member.append(keys2)

        if keys1 in org_roles_new[keys2]['Former members']: #extract former member
            form_member.append(keys2)


        col = "purple"

        attributes_character_org[keys1] = {"Name": keys1, "Organisation(s)": orgs, "Founder": founder, "Leader": leader,
                                    "Former leader": form_leader, "Member":member, "Former member":form_member,
                                    "Color": col}

In [34]:
import pickle
file_to_store = open("attributes_character_org.pickle", "wb")
pickle.dump(attributes_character_org, file_to_store)
file_to_store.close()

In [None]:
from collections import Counter
#number of times each organisation is mentioned in another orginasation
links_no = org_links_removed.copy()
for keys, values in org_links_removed.items():
    counts = Counter(values)
    links_no[keys] = counts

In [63]:
import pickle
file_to_store = open("links_no.pickle", "wb")
pickle.dump(links_no, file_to_store)
file_to_store.close()