# 1 Data Collection

The data for analyzing social graphs and interactions was collected by scraping the Fandom Wiki of the Grey's Anatomy universe in the following three main steps.

1. Scraping the list of Grey's Anatomy characters

2. Scraping and cleaning the character pages of every character in the character list

3. Scraping the episode and season summaries

In [11]:
!pip install networkx

import warnings
warnings.filterwarnings("ignore")

import json
import pandas as pd
import urllib.request
import re
import networkx as nx
import numpy as np
import urllib.request
import requests
from sklearn import preprocessing
import matplotlib.pyplot as plt
%matplotlib inline

## 1.1 Scraping the character list

The first step of the data gathering is to get the list of all characters from the Fandom page.

In [12]:
# one shared HTTP session, reused by every request made through `requests` below
session = requests.Session()

# api.php endpoint of the Grey's Anatomy Fandom wiki; all API queries go here
url = "https://greysanatomy.fandom.com/api.php"

In [None]:
# query for all members of Category:Characters, 500 entries per request
params = {
    "format": "json",
    "list": "categorymembers",
    "action": "query",
    "cmtitle": "Category:Characters",
    "cmlimit": "500",
    "cmcontinue": ""
}

# keep requesting result pages until the API stops returning a 'continue' token
pages = []
while True:
    request = session.get(url=url, params=params)
    data = request.json()
    pages.extend(data["query"]["categorymembers"])
    if 'continue' not in data:
        break
    # resume the listing where the previous response ended
    params["cmcontinue"] = data["continue"]["cmcontinue"]

# persist the complete character list as readable UTF-8 JSON
with open('characters.json', 'w', encoding='utf-8') as file:
    json.dump(pages, file, ensure_ascii=False, indent=4)

## 1.2 Scraping and cleaning the character pages

The scraping and cleaning of the character pages contains the following main steps.

1. Scrape and save the character wikipages in a folder based on the character list

2. Get the occurrences of each character in every show (Grey's Anatomy, Private Practice, Station 19), normalized by the number of seasons

3. Define the main universe of the character based on the occurrences

4. Define categories of characters (Doctors, Nurses, Patients or other such as Family or Friends)

5. Extract the list of aliases for characters and the status (dead or alive) from the infobox

6. Scrape the clean description for each character

7. Get the history for each character

In [4]:
# open the list of characters and save to a dataframe
# (characters.json is written as UTF-8, so it is read back with the same
# encoding; the context manager closes the handle once the JSON is parsed)
with open('characters.json', encoding='utf-8') as f:
    data = json.load(f)
df = pd.DataFrame(data)
df.sample(10)

In [None]:
# drop the ns column as it does not contain any valuable information
# (inplace=True modifies df itself instead of returning a new dataframe)
df.drop(columns = ['ns'], inplace=True)

In [None]:
# create a column for the file name to store the character page based on the character name in title
df['file'] = df['title']

# make the name file-system friendly: spaces and slashes become underscores,
# double quotes and question marks are removed. The keys are regular
# expressions, so the question mark is escaped; a raw string keeps "\?" from
# being an invalid string escape sequence.
df = df.replace({"file": {" ": "_", "/": "_", "\"": "", r"\?": ""}}, regex=True)

# drop pages that are not individual characters (collection, user and category pages)
non_character_pages = "Unnamed_Characters|Unseen|User|Category"
df = df[~df.file.str.contains(non_character_pages)]

In [None]:
# scrape the character pages and save them in a folder called character wikipages
# NOTE: the "Character Wikipages" folder has to exist already; open() does not create it
for index, row in df.iterrows():
    pageid = row['pageid']
    title = row['file']

    # request the latest revision content of the page, addressed by its page id
    query = "https://greysanatomy.fandom.com/api.php?action=query&pageids={}&prop=revisions&rvprop=content&format=json".format(pageid)

    # the context manager closes the HTTP response even if reading fails
    with urllib.request.urlopen(query) as wikiresponse:
        wikidata = wikiresponse.read()
    wikitext = wikidata.decode('utf-8')
    data_json = json.loads(wikitext)

    # one JSON file per character, named after the sanitized title
    with open("Character Wikipages/" + title+'.json', 'w') as f:
        json.dump(data_json, f)

KeyboardInterrupt: 

In [None]:
# get the outlinks for each character page for the universe assignment and connection between characters
# the column stores one Python list per row, so it is created with object dtype first
df['outlinks_clean'] = ""
df['outlinks_clean'] = df['outlinks_clean'].astype('object')

# wiki links are written as [[target]] or [[target|label]]
link_pattern = re.compile(r"\[\[.*?\]\]")

for index, row in df.iterrows():
    title = row['file']

    # close the page file as soon as it is parsed instead of leaking the handle
    with open("Character Wikipages/" + title + ".json") as f:
        data = json.load(f)
    # the links are searched in the re-serialized JSON text of the whole response
    text = json.dumps(data)

    res = link_pattern.findall(text)
    # str.strip() takes a set of characters, not a prefix: remove every
    # surrounding "[" and then every surrounding "]"
    res_clean = [element.strip("[").strip("]") for element in res]

    df.at[index, 'outlinks_clean'] = res_clean

In [None]:
# count the occurrences of a character in the seasons and normalize by the number of seasons
for index, row in df.iterrows():
    # only category links carry the season markers ("GA S", "PP S", "S19 S")
    category_links = [link for link in row['outlinks_clean'] if link.startswith("Category")]

    ga_count = sum(link.count("GA S") for link in category_links)
    pp_count = sum(link.count("PP S") for link in category_links)
    s19_count = sum(link.count("S19 S") for link in category_links)

    # divide by the number of seasons of each show (19 / 6 / 6)
    df.at[index, "ga_occurences"] = ga_count/19
    df.at[index, "pp_occurences"] = pp_count/6
    df.at[index, "s19_occurences"] = s19_count/6

In [None]:
# define a main universe based on the occurrences and drop characters without main universe
# ties are resolved in favour of Grey's Anatomy first, then Private Practice
for index, row in df.iterrows():
    ga = row['ga_occurences']
    pp = row['pp_occurences']
    s19 = row['s19_occurences']

    if ga >= pp and ga >= s19 and ga > 0:
        df.at[index, 'main_universe'] = "Grey's Anatomy"
    elif pp > ga and pp >= s19 and pp > 0:
        df.at[index, 'main_universe'] = "Private Practice"
    # Station 19 has to beat both other shows (the second comparison
    # previously repeated the check against Grey's Anatomy)
    elif s19 > ga and s19 > pp and s19 > 0:
        df.at[index, 'main_universe'] = "Station 19"

# rows that matched no branch never got a value and are removed here
df = df[~df.main_universe.isnull()]

In [None]:
# create a column with the encoded universe
# (the fitted encoder is kept in `le`; fit_transform turns each main_universe
# label into its integer code)
le = preprocessing.LabelEncoder()
df['universe_encoded'] = le.fit_transform(df['main_universe'])

In [None]:
# define categories of characters (Doctors, Nurses, Patients or other such as Family or Friends)
# the pairs are checked in this order; the first category link found on the page wins
category_by_link = (
    ('Category:Doctors', "Doctor"),
    ('Category:Firefighters', "Firefighter"),
    ('Category:Nurses', "Nurses"),
    ('Category:Patients', "Patient"),
)

for index, row in df.iterrows():
    page_links = row['outlinks_clean']
    # fall back to "Other" when none of the known category links is present
    df.at[index, 'category'] = next(
        (label for link, label in category_by_link if link in page_links),
        "Other",
    )

In [None]:
# get the status of each character whether the character is alive or dead and the list of aliases
# an infobox template looks like {{<Word> Infobox ... }}; the pattern is the same for every page
pattern_infobox = r'\{\{\w+\sInfobox.*?\}\}'

for index, row in df.iterrows():
    title = row['file']

    # close the page file as soon as it is parsed instead of leaking the handle
    with open("Character Wikipages/" + title + ".json") as f:
        data = json.load(f)
    # the infobox is searched in the re-serialized JSON, where a line break
    # shows up as the two characters backslash + n
    text = json.dumps(data)

    infobox = re.findall(pattern_infobox, text)
    if len(infobox) > 0:
        # only the first infobox on the page is used
        infobox = infobox[0]
        # remove the surrounding braces (str.strip() takes a set of characters)
        # and put every "|key = value" field on its own line
        infobox = infobox.strip("{").strip("}").replace('\\n|', '\n')
        character_info = {}
        for line in infobox.split('\n'):
            if 'Infobox' not in line:
                info = line.split(' = ')
                if len(info) > 1:
                    character_info[info[0]] = info[1]

        if "status" in character_info:
            df.at[index, 'status'] = character_info['status']

        if "alias" in character_info:
            # the alias value holds one alias per (escaped) line
            alias_list = character_info['alias'].split("\\n")

            df.at[index, 'alias_list'] = alias_list

In [None]:
# get a description of each character
for index, row in df.iterrows():
    pageid = row['pageid']
    title = row['file']

    # ask the API for the page properties of this page id
    query = "https://greysanatomy.fandom.com/api.php?action=query&pageids={}&prop=pageprops&format=json".format(pageid)

    # the context manager closes the HTTP response even if reading fails
    with urllib.request.urlopen(query) as wikiresponse:
        wikidata = wikiresponse.read()
    wikitext = wikidata.decode('utf-8')
    data_json = json.loads(wikitext)

    # a page entry may come back without a 'pageprops' key; treat that like a
    # page without description instead of failing with a KeyError
    pageprops = data_json['query']['pages'][str(pageid)].get('pageprops', {})
    if "fandomdescription" in pageprops:
        df.at[index, 'description'] = pageprops['fandomdescription']

In [None]:
# get an overview of the scraped and cleaned data (10 random rows)
df.sample(10)

Unnamed: 0,pageid,title,file,outlinks_clean,character_links,ga_occurences,pp_occurences,s19_occurences,main_universe,category,status,alias_list,description
206,50052,Corinne Bennett,Corinne_Bennett,"[Sam Bennett, Dee Bennett, Raymond McCray, Nao...","[Dee Bennett, Naomi Bennett, Raymond McCray, O...",0.0,0.166667,0.0,Private Practice,Patient,,,
1317,81918,Anthony Hughes,Anthony_Hughes,"[Lenya Hughes, Victoria Hughes, Marion Hughes,...","[Theo Ruiz, Lenya Hughes, Marion Hughes, Victo...",0.0,0.0,0.333333,Station 19,Other,,,Anthony Hughes is the father of Victoria Hughe...
999,34286,Gary,Gary,"[Roseridge Home for Extended Care, If Only You...",[Adele Webber],0.052632,0.0,0.0,Grey's Anatomy,Other,,,
360,7213,Alana Cahill,Alana_Cahill,"[Seattle Grace Mercy West Hospital, Walking on...","[Owen Hunt, Harper Avery, Roberta Thompson]",0.052632,0.0,0.0,Grey's Anatomy,Doctor,Alive,"[*Dr. Pantsuit, *The Efficiency Fairy]",
1296,51224,Patrick Hoffman,Patrick_Hoffman,"[Cooper Freedman, Amelia Shepherd, Dennis Hoff...","[Amelia Shepherd, Cooper Freedman, Dennis Hoff...",0.0,0.166667,0.0,Private Practice,Patient,Alive,,
2473,57054,Bob Reeves,Bob_Reeves,"[April Kepner, Jackson Avery, Ben Warren, Andr...","[Laura Reeves, Robbie Reeves, Ben Warren, Andr...",0.052632,0.0,0.0,Grey's Anatomy,Patient,Alive,,
2972,52816,Terrence,Terrence,"[Good Fries Are Hard to Come By, Private Pract...",[],0.0,0.166667,0.0,Private Practice,Other,,,
1748,31971,Phil Leggett,Phil_Leggett,"[Dana Leggett, James Leggett, The End is the B...","[Derek Shepherd, James Leggett, Mark Sloan, Da...",0.052632,0.0,0.0,Grey's Anatomy,Other,Alive,,
1749,74923,Len,Len,"[Alex Karev, Owen Hunt, Doug Miller, Tanya, It...","[Tanya, Owen Hunt, Alex Karev, Doug Miller]",0.052632,0.0,0.0,Grey's Anatomy,Patient,Alive,,
2268,50719,Oliver,Oliver,"[Cooper Freedman, Sheldon Wallace, Kelly (Seco...","[Sheldon Wallace, Cooper Freedman, Scott Nelson]",0.0,0.166667,0.0,Private Practice,Patient,Alive,,


In [None]:
# save the data to a characters file
# NOTE(review): the index is written as well, which presumably is where the
# "Unnamed: 0" column comes from when the file is read back - confirm
df.to_csv("characters.csv")

In [13]:
#define helper functions to get a characters history and clean it 

def clean_text(text):
    """Turn raw wikitext into plain text on a single line.

    The substitutions run in the listed order: piped links keep their label,
    plain links keep their target, image/file links with a caption are
    dropped, ``==heading==`` markers and ``<ref>...</ref>`` notes are removed.
    Afterwards line breaks become spaces and any remaining ``*`` and ``=``
    characters are deleted.
    """
    substitutions = (
        (r"\[\[(?:[^\]\]|:]+?)\|([^(\|)]+?)\]\]", r"\1"),
        (r"\[\[([^(\|)]+?)\]\]", r"\1"),
        (r"\[\[(?:Image|File).+?\|([^\|]+?)\]\]", r""),
        (r"==.+?==", r""),
        (r"<ref>.*?<\/ref>", r""),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    for character, substitute in (("\n", " "), ("*", ""), ("=", "")):
        text = text.replace(character, substitute)
    return text

def get_character_history(title):
    """Return the cleaned wikitext of section 1 of the page called *title*.

    The page is requested through the module-level ``session`` and ``url``.
    An empty string is returned when the response has no ``parse`` entry.
    """
    response = session.get(
        url=url,
        params={
            "format": "json",
            "page": title,
            "action": "parse",
            "prop": "wikitext",
            "section": 1,
            "disabletoc": 1
        },
    )
    payload = response.json()
    if 'parse' not in payload:
        return ""
    return clean_text(payload['parse']['wikitext']['*'])

# Set up characters dataframe from the previously saved characters.csv,
# so the history scraping can run without repeating the steps above
df = pd.read_csv("characters.csv")

In [14]:
# get character history
# the history is keyed by the sanitized file name, but the page is requested
# by its real title: the file name has quotes and question marks removed and
# slashes replaced, so it does not always name an existing page
character_history = {}
for index, row in df.iterrows():
    title = row['file']
    character_history[title] = get_character_history(row['title'])

# store all histories as readable UTF-8 JSON
with open('characters_history.json', 'w', encoding='utf-8') as file:
    json.dump(character_history, file, ensure_ascii=False, indent=4)

## 1.3 Scraping the episode and season summaries

The last step of the data scraping is getting the episode and season data from Fandom.

In [None]:
# per show abbreviation: number of seasons to scrape and the show name used
# in the season page titles ("Season <n> (<name>)")
shows_dict = {
    'GA': {'seasons': 19, 'name': "Grey's Anatomy"},
    'S19': {'seasons': 6, 'name': 'Station 19'},
    'PP': {'seasons': 6, 'name': 'Private Practice'}
}

def get_season_data(show, season):
    """Collect the season text and all episode summaries of one season.

    *show* is a key of ``shows_dict`` and *season* the season number. The
    result is a dict with the keys ``nr``, ``show``, ``summary_and_plots``
    (cleaned text of sections 1 and 2 of the season page) and ``episodes``
    (the members of the season's episode category, each reduced to ``title``,
    ``summary`` and ``nr`` and sorted by ``nr``).
    """
    def fetch_season_section(section):
        # raw wikitext of one section of the season page
        response = session.get(url=url, params={
            "format": "json",
            "page": "Season {} ({})".format(season, shows_dict[show]['name']),
            "action": "parse",
            "prop": "wikitext",
            "section": section,
            "disabletoc": 1
        })
        return response.json()['parse']['wikitext']['*']

    def by_episode_number(episode):
        return episode['nr']

    summary = fetch_season_section(1)
    plots = fetch_season_section(2)
    season_data = {
        "nr": season,
        "show": show,
        'summary_and_plots': clean_text(summary + plots),
    }

    # the episodes of a season are the members of its episode category
    response = session.get(url=url, params={
        "format": "json",
        "list": "categorymembers",
        "action": "query",
        "cmtitle": "Category:{} S{} Episodes".format(show, season),
        "cmlimit": 50
    })
    episodes = response.json()['query']['categorymembers']
    for episode in episodes:
        episode_title = episode['title']
        episode['summary'] = clean_text(get_episode_summary(episode_title))
        episode['nr'] = get_episode_number(episode_title)
        # the namespace and page id are not needed in the output
        del episode['ns']
        del episode['pageid']

    season_data['episodes'] = sorted(episodes, key=by_episode_number)
    return season_data

def get_episode_summary(title):
    """Return the cleaned summary text of the episode page *title*.

    Section 2 of the page is used. When that section contains the text
    "Episode in detail." or consists only of the "==Full Summary==" heading,
    section 1 is fetched and used instead.
    """
    episode_params = {
        "format": "json",
        "prop": "wikitext",
        "action": "parse",
        "page": title,
        "section": 2,
        "disabletoc": 1
    }
    wikitext = session.get(url=url, params=episode_params).json()['parse']['wikitext']['*']

    needs_fallback = "Episode in detail." in wikitext or wikitext == "==Full Summary=="
    if needs_fallback:
        episode_params['section'] = 1
        wikitext = session.get(url=url, params=episode_params).json()['parse']['wikitext']['*']

    return clean_text(wikitext)

def get_episode_number(title):
    """Return the episode number of the episode page *title* as an int.

    The number is read from the first ``episode = <digits>`` entry in
    section 0 of the page. A page without such an entry makes the lookup
    fail with an AttributeError, because the search result is ``None``.
    """
    response = session.get(url=url, params={
        "format": "json",
        "prop": "wikitext",
        "action": "parse",
        "page": title,
        "section": 0,
        "disabletoc": 1
    })
    lead_wikitext = response.json()['parse']['wikitext']['*']
    number_match = re.search(r"episode\s*=\s*(\d+)", lead_wikitext)
    return int(number_match.group(1))

In [None]:
def get_all_data(show):
    """Return the season data of every season of *show*, in season order.

    The number of seasons is taken from ``shows_dict``; seasons are numbered
    starting at 1.
    """
    season_count = shows_dict[show]['seasons']
    return [get_season_data(show, number) for number in range(1, season_count + 1)]

# scrape all three shows into one list: Grey's Anatomy first, then
# Private Practice, then Station 19
seasons_data = []
for show in ('GA', 'PP', 'S19'):
    seasons_data.extend(get_all_data(show))

In [None]:
# store the season and episode data as readable UTF-8 JSON
# (ensure_ascii=False keeps non-ASCII characters as they are)
with open('episodes.json', 'w', encoding='utf-8') as file:
    json.dump(seasons_data, file, ensure_ascii=False, indent=4)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=6c50234c-0fc5-40eb-b0ef-2d1cda57d893' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>