In [1]:
import networkx as nx
import pandas as pd
import plotly.express as px
from IPython.display import display, HTML
from pyvis.network import Network


# Network Basics

### Visualising a Network
Often when we think about network analysis, the first thing that comes to mind is the visual of a network. We find it easiest to imagine them as pictures. When it comes to analysis, the visual aspect of a network can be extremely helpful for understanding patterns in relational data.

Later we will consider how networks are also models, representations that can be measured in different ways.

For now let's get to grips with how we can visualise network data.

In [2]:
#*
# This helper displays a saved network HTML file inline in Colab.
# If you are working locally in a different IDE (e.g. Visual Studio Code), you may find it
# easier to open the created HTML file in a web browser like Chrome, Firefox, Safari or Edge.
def show_graph(file_name: str) -> None:
    """Render the HTML file at `file_name` inline in the notebook output.

    Args:
        file_name: Path to an HTML file on disk, e.g. 'friends.html'.
    """
    # Pass the path via `filename=` so IPython always reads the file from disk;
    # a positional string can be interpreted as raw HTML markup instead.
    display(HTML(filename=file_name))

In [3]:
#*
# Some completely random names I came up with.
# The three lists below are matched up by position when the nodes are added,
# so the order of entries must stay in step across all of them.
friends = [
    'Rachel Green',
    'Joey Tribbiani',
    'Chandler Bing',
    'Monica Geller',
    'Ross Geller',
    'Phoebe Buffay',
]

# Random phrases for no reason (one per name above)
quotes = [
    "Oh. I'm sorry. Did my back hurt your knife?",
    "If I had to, I'd pee on any one of you!",
    "If I don't input those numbers, it doesn't make much of a difference.",  # relatable
    "I'm Monica, I'm disgusting, I stalk guys and keep their underpants.",
    "Pivot!",
    "Oh, I wish I could, but I don't want to.",
]

# One hex colour per name above
gender_colors = [
    '#0000FF',
    '#74E08B',
    '#74E08B',
    '#0000FF',
    '#74E08B',
    '#0000FF',
]

Let's create our network visualisation. For now it contains only the nodes: the circles that represent the 'things' in the network. In our case, some people. We'll color the nodes by gender and give them 'titles' (a piece of text attached to each node; here, a quote).

In [4]:
# Nodes only for now: one node per character, with a colour and a quote as its title.
net = Network(cdn_resources='remote', notebook=True)
net.add_nodes(friends, color=gender_colors, title=quotes)
# Alternative once the file has been written: show_graph('friends.html')
net.show('friends.html')


friends.html


What makes a network interesting is the relations between the nodes. These are called different things in different disciplines but generally 'edges' is a broadly accepted term.

An edge has three pieces of information.
1. Where it starts
2. Where it ends
3. The weight of the edge. I.e. how strong the relation is, but exactly what this means differs depending on what is being related and what the relation represents.

For our edges we're going to weight them by the number of times a pair of characters shared a scene across the series, which I'm sure we all know as standard.

In [5]:
#*
# Don't know the numbers off by heart? Load this cell to cheat.
# The first column of the CSV is used as the row index.
friend_edges = pd.read_csv(
    "https://github.com/Minyall/sc207_290_public/raw/refs/heads/main/data/friends_edges.csv",
    index_col=0,
)
friend_edges


Unnamed: 0,source,target,weight
0,Chandler Bing,Joey Tribbiani,920
1,Chandler Bing,Monica Geller,974
2,Chandler Bing,Phoebe Buffay,701
3,Chandler Bing,Rachel Green,686
4,Chandler Bing,Ross Geller,750
5,Joey Tribbiani,Monica Geller,724
6,Joey Tribbiani,Phoebe Buffay,710
7,Joey Tribbiani,Rachel Green,725
8,Joey Tribbiani,Ross Geller,720
9,Monica Geller,Phoebe Buffay,820


In [6]:
#*
# ...and in case you can't recall how many scenes each character was in.
friend_nodes = pd.read_csv(
    "https://github.com/Minyall/sc207_290_public/raw/refs/heads/main/data/friends_nodes.csv",
    index_col=0,
)
# Character name -> scene count, so a count can be looked up by name.
scene_count_lookup = (
    friend_nodes
    .set_index('name')
    .loc[:, 'n_scenes']
    .to_dict()
)
scene_count_lookup


{'Chandler Bing': 1508,
 'Joey Tribbiani': 1451,
 'Monica Geller': 1440,
 'Phoebe Buffay': 1341,
 'Rachel Green': 1461,
 'Ross Geller': 1416}

In [7]:
# Convert the edge table to records without the index, so the next cell can
# loop over it and unpack each row as (source, target, weight).
edges = friend_edges.to_records(index=False)


In [8]:
# Rebuild the network from scratch, this time also sizing each node by the
# character's scene count (scaled down by 100).
net = Network(cdn_resources='remote', notebook=True)
net.add_nodes(
    friends,
    color=gender_colors,
    title=quotes,
    size=[scene_count_lookup[name] / 100 for name in friends],
)

# pyvis generally works best with weights between 0 and 1.
# Our weights are all in the hundreds, so dividing by 1000 brings every one of them under 1.
# Only the relative differences between the weights matter.
weight_scaler = 1000
for source, target, weight in edges:
    # The scaled figure is stored as the edge weight; the raw count is shown as its label.
    net.add_edge(source, target, weight=weight / weight_scaler, label=str(weight))

# Add the physics controls to the page, then write and show the visualisation.
net.show_buttons(filter_=['physics'])
net.show('friends.html')

friends.html


# Scaling up
So far our network has just been 6 nodes, and as every character has a relation with every other character in the network it's limited in what it could tell us.

Let's use a fuller dataset covering every character from the show across its full run. This will give us an opportunity to also think about filtering and managing large networks.

In [9]:
# Fuller dataset: one table of characters (nodes) and one of character pairs (edges).
# In both files the first CSV column is used as the row index.
all_friends_nodes = pd.read_csv(
    'https://github.com/Minyall/sc207_290_public/raw/refs/heads/main/data/all_friends_nodes.csv',
    index_col=0,
)
all_friends_edges = pd.read_csv(
    'https://github.com/Minyall/sc207_290_public/raw/refs/heads/main/data/all_friends_edges.csv',
    index_col=0,
)

In [10]:
# Summarise the node table: row count, column names, non-null counts and dtypes.
all_friends_nodes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 695 entries, 0 to 694
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      695 non-null    object
 1   n_scenes  695 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 16.3+ KB


In [11]:
# Summarise the edge table: row count, column names, non-null counts and dtypes.
all_friends_edges.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3107 entries, 0 to 3106
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   source  3107 non-null   object
 1   target  3107 non-null   object
 2   weight  3107 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 97.1+ KB


Each edge represents a pair of characters that have been co-present in at least one scene across the show's run. The weight indicates the number of scenes they've been co-present in. This includes even incidental generic characters.

In [14]:
# Look at the first edge (selected by position) to see what a single row holds.
all_friends_edges.iloc[0]

source     1st Customer
target    Monica Geller
weight                1
Name: 0, dtype: object

If we examine the weights of the edges we'll see how many are just one-off co-occurrences.

In [None]:
# Box plot of the edge weights on a log-scaled y axis. The source and target
# columns are added to the hover information so individual points can be identified.
# `px` is plotly.express, imported with the other libraries in the first cell.
px.box(
    data_frame=all_friends_edges,
    y='weight',
    log_y=True,
    hover_data=['source', 'target'],
    title='Distribution of edge weights (scenes shared per pair of characters)',
)

The box plot shows us that the vast majority of edges represent a max of 2 scenes across the series. The major outliers at the top are our core characters, and the outliers between weight of 4 and 60 are recurring side characters.

In [None]:
# Keep only the edges with a weight above 4, i.e. drop the one-off and incidental pairings.
all_friends_edges = all_friends_edges.loc[all_friends_edges['weight'] > 4]

Now we've taken those edges out, there will be nodes in our nodes dataset that no longer exist in the edges. Graph filtering can get a bit complicated because as edges get removed nodes become redundant, and vice versa.

At this point it's best if we start managing our network in a special object designed for this purpose.

## NetworkX
`NetworkX` is a library designed specifically for representing and exploring networks, also known as Graphs (Graph is the mathematical term for a network). We'll need NetworkX for other tools later once we get into more advanced analysis, but for now it can be helpful for us to manage filtering. Like Pandas, it is a well regarded library and so many other network related libraries integrate with it, including `pyvis`.

In [29]:
# networkx is already imported in the first cell; `nx` is its conventional alias.
# Build the graph from the edge table: each row becomes an edge between its
# 'source' and 'target', carrying the 'weight' column as an edge attribute.
G = nx.from_pandas_edgelist(
    all_friends_edges,
    source='source',
    target='target',
    edge_attr='weight',
)
G.edges(data=True)

EdgeDataView([('Alice Knight', 'Frank Buffay Jr.', {'weight': 8}), ('Alice Knight', 'Phoebe Buffay', {'weight': 10}), ('Frank Buffay Jr.', 'Chandler Bing', {'weight': 9}), ('Frank Buffay Jr.', 'Joey Tribbiani', {'weight': 7}), ('Frank Buffay Jr.', 'Monica Geller', {'weight': 8}), ('Frank Buffay Jr.', 'Phoebe Buffay', {'weight': 23}), ('Frank Buffay Jr.', 'Rachel Green', {'weight': 9}), ('Frank Buffay Jr.', 'Ross Geller', {'weight': 8}), ('Phoebe Buffay', 'Ben Geller', {'weight': 7}), ('Phoebe Buffay', 'Bobby Rush', {'weight': 5}), ('Phoebe Buffay', 'Bonnie', {'weight': 5}), ('Phoebe Buffay', 'Both', {'weight': 5}), ('Phoebe Buffay', 'Carol Willick', {'weight': 8}), ('Phoebe Buffay', 'Chandler Bing', {'weight': 701}), ('Phoebe Buffay', 'Charlie Wheeler', {'weight': 11}), ('Phoebe Buffay', 'David', {'weight': 15}), ('Phoebe Buffay', 'Director', {'weight': 5}), ('Phoebe Buffay', 'Dr. Harad', {'weight': 5}), ('Phoebe Buffay', 'Emily Waltham', {'weight': 14}), ('Phoebe Buffay', 'Eric', {'we

Building the network from the edge list means NetworkX has inferred the nodes automatically.

In [30]:
# List the nodes currently in the graph.
G.nodes

NodeView(('Alice Knight', 'Frank Buffay Jr.', 'Phoebe Buffay', 'Amy Green', 'Joey Tribbiani', 'Rachel Green', 'Ross Geller', 'Andrea Waltham', 'Chandler Bing', 'Judy Geller', 'Monica Geller', 'Stephen Waltham', 'Barry Farber', 'Ben Geller', 'Carol Willick', "Bob (Chandler's coworker)", 'Bobby Rush', 'Bonnie', 'Both', 'Susan Bunch', 'Cecilia', 'Charlie Wheeler', 'David', 'Doug', 'Eddie Menuek', 'Emily Waltham', 'Erica', 'Gary', 'Gunther', 'Guy', 'Guys', 'Jack Geller', 'Janice Litman Goralnik', 'Janine Lecroix', 'Jill Goodacre', 'Joanna', 'Julie', 'Kathy', 'Man', 'Mike Hannigan', 'Mona', 'Nora Tyler Bing', 'Nurse', 'Peter Becker', 'Richard Burke', 'Sandra Green', 'Tag Jones', 'The Girls', 'The Guys', 'The Interviewer', 'Woman', 'Chloe', 'Customer', 'Danny', 'Dina', 'Director', 'Dr. Drake Ramoray', 'Dr. Harad', 'Dr. Long', 'Elizabeth Stevens', 'Paul Stevens', 'Eric', 'Erin', 'Estelle Leonard', 'Gavin Mitchell', 'Girl', 'Jill Green', 'Sophie', 'Joey Tribbiani Sr.', 'Kate Miller', 'Lauren',

In [53]:
# Add a 'size' column to the node table: the scene count scaled down by 100.
all_friends_nodes['size'] = all_friends_nodes['n_scenes'] / 100
# Reshape to {name: {column: value, ...}} and attach those values to the nodes of G.
node_attrs = (
    all_friends_nodes
    .set_index('name')
    .to_dict(orient='index')
)
nx.set_node_attributes(G, node_attrs)

In [54]:
# Show each node together with the attributes now attached to it.
G.nodes(data=True)

NodeDataView({'Alice Knight': {'size': 0.1, 'n_scenes': 10}, 'Frank Buffay Jr.': {'size': 0.26, 'n_scenes': 26}, 'Phoebe Buffay': {'size': 13.41, 'n_scenes': 1341}, 'Amy Green': {'size': 0.1, 'n_scenes': 10}, 'Joey Tribbiani': {'size': 14.51, 'n_scenes': 1451}, 'Rachel Green': {'size': 14.61, 'n_scenes': 1461}, 'Ross Geller': {'size': 14.16, 'n_scenes': 1416}, 'Andrea Waltham': {'size': 0.09, 'n_scenes': 9}, 'Chandler Bing': {'size': 15.08, 'n_scenes': 1508}, 'Judy Geller': {'size': 0.52, 'n_scenes': 52}, 'Monica Geller': {'size': 14.4, 'n_scenes': 1440}, 'Stephen Waltham': {'size': 0.12, 'n_scenes': 12}, 'Barry Farber': {'size': 0.09, 'n_scenes': 9}, 'Ben Geller': {'size': 0.17, 'n_scenes': 17}, 'Carol Willick': {'size': 0.44, 'n_scenes': 44}, "Bob (Chandler's coworker)": {'size': 0.05, 'n_scenes': 5}, 'Bobby Rush': {'size': 0.07, 'n_scenes': 7}, 'Bonnie': {'size': 0.09, 'n_scenes': 9}, 'Both': {'size': 0.15, 'n_scenes': 15}, 'Susan Bunch': {'size': 0.32, 'n_scenes': 32}, 'Cecilia': {

In [63]:
# Export the graph, including its node and edge attributes, as a GEXF file.
# '.gexf' is the standard extension for this format.
nx.write_gexf(G, 'friends.gexf')

In [60]:
# Hand the whole NetworkX graph over to pyvis, dividing every edge weight by 1000 on the way.
net = Network()
net.from_nx(G, edge_weight_transf=lambda weight: weight / 1000)
# Add the physics and node controls to the page.
net.show_buttons(filter_=['physics', 'nodes'])
# Only write the HTML file here; open all_friends.html in a browser to explore it.
net.save_graph('all_friends.html')

In [None]:
# Convert the filtered edge table to records without the index.
# NOTE(review): `edge_records` is not used by any later cell visible in this notebook - confirm it is still needed.
edge_records = all_friends_edges.to_records(index=False)

AssertionError: non existent node 'Alice Knight'

In [13]:
# Same workflow on a second dataset.
# NOTE(review): GOT_1.csv is read from the working directory and is not created anywhere
# in this notebook; it presumably has 'source', 'target' and 'weight' columns - confirm.
got = pd.read_csv('GOT_1.csv')
got_G = nx.from_pandas_edgelist(
    got,
    source='source',
    target='target',
    edge_attr='weight',
)

# Convert to pyvis with edge scaling switched on, add the physics controls, write the file.
vis = Network()
vis.from_nx(got_G, edge_scaling=True)
vis.show_buttons(filter_=['physics'])
vis.save_graph('got.html')


In [None]:
# Attach the per-character attributes to the graph and show the result.
# `node_attrs` is the {name: attributes} dict built from all_friends_nodes earlier in the notebook.
nx.set_node_attributes(G, node_attrs)
G.nodes(data=True)

In [None]:
# NOTE(review): this cell does not look runnable as the notebook stands. `plt` is never
# imported in the visible cells, and the attributes attached to G earlier are 'n_scenes'
# and 'size', not 'name' / 'occupation'. Presumably left over from a different dataset - confirm.
# Build one two-line label per node from its 'name' and 'occupation' attributes.
labels = {node: f"{data['name']} \n {data['occupation']}" for node, data in G.nodes(data=True)}

# Draw the graph using those labels.
nx.draw(G, labels=labels, with_labels=True)
plt.tight_layout()
plt.show()

In [None]:
# Compute a Fruchterman-Reingold layout for G with a fixed seed.
pos = nx.fruchterman_reingold_layout(G, seed=1)
# Put the positions in a DataFrame, then transpose it and join it to `nodes` on the index.
# NOTE(review): `nodes` is not defined in any visible cell, and it would need to share
# its index with the layout's node keys - confirm what it should be.
to_plot = pd.DataFrame(pos)
to_plot.T.merge(nodes, left_index=True, right_index=True)