# Task 3 - Social Network Analysis

In [5]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

g = nx.Graph()

## Load data

In [None]:
films_dictionary = {}
actors_dictionary = {}

df = pd.read_csv('data/casts.csv',header=0,names=["film_id", "title", "actor", "type_role"], delimiter=";", index_col=False)

## Create nodes

In [44]:
g = nx.Graph()
# actors = df['actor'].unique()
# for actor in actors:
#     g.add_node(actor)


In [45]:
df_grouped = df.groupby('film_id')['film_id','title','actor']
for name, group in df_grouped:
    actors = group['actor']
    # Adding reduntant nodes and edges is not possible by default -> uniqueness satisfied
    if actors.shape[0] > 2: continue # movies with less or equal number of five actors
    for actor1 in actors:
        g.add_node(actor1)
        for actor2 in actors:
            g.add_node(actor2)
            if actor1 != actor2:
                g.add_edge(actor1, actor2, film_id=group['film_id'].agg('unique')[0], title=group['title'].agg('unique')[0])

In [58]:
backup_g = g.copy()

In [56]:
# g = backup_g.copy()

## General statistics

In [59]:
nodes = g.number_of_nodes()
edges = g.number_of_edges()
density = nx.density(g)
components = nx.number_connected_components(g)
print("""
GENERAL STATISTICS:
Number of nodes: {}
Number of edges: {}
Density: {}
Number of components: {}
""".format(nodes, edges, density, components))


GENERAL STATISTICS:
Number of nodes: 7941
Number of edges: 19409
Density: 0.0006156550656811872
Number of components: 804



## Kevin Bacon Number

Simple explanation: Who has less than 6 degrees of seperation from Kevin Bacon.

https://simple.wikipedia.org/wiki/Bacon_number

In this example, Kevin Bacon Number is calculator towards an actor Ray Milland.

In [60]:
for actor in g.nodes:
    g.nodes[actor]["kevin_bacon"] = float("inf")

kevin_bacon = nx.single_source_shortest_path_length(g, "Ray Milland")
for actor in kevin_bacon:
    g.nodes[actor]['kevin_bacon'] = kevin_bacon[actor]

avg_kevin_bacon = sum(kevin_bacon[actor] for actor in kevin_bacon)/len(kevin_bacon)
lowest = list(kevin_bacon.items())[:5]
highest = list(kevin_bacon.items())[-5:]

print("""
Avg. Kevin Bacon: {}
Lowest 5: {}
Highest 5: {}
""".format(avg_kevin_bacon, lowest, highest))


Avg. Kevin Bacon: 3.8615435574693606
Lowest 5: [('Ray Milland', 0), ('Ellen Drew', 1), ('Guy Middleton', 1), ('Ronald Culver', 1), ('Loretta Young', 1)]
Highest 5: [('Jeff Glodblum', 7), ('Cecile Aubry', 7), ('Earl Schevk', 8), ('Charles Bryant', 8), ('Ernest Thesiger', 8)]



## Centralities (Key Actors/Players)

In [61]:
from collections import Counter
centralities_funcs = {
    'Degree Centrality': nx.degree_centrality,
    # 'Closeness Centrality': nx.closeness_centrality, # took too much time
    # 'Betweenness Centrality': nx.betweenness_centrality, # took too much time
    'Eigenvector Centrality': nx.eigenvector_centrality
}
for centrality_func in centralities_funcs:
    centrality = centralities_funcs[centrality_func](g)
    for actor in g.nodes:
        g.nodes[actor][centrality_func] = centrality[actor]
    
    highest = Counter(centrality).most_common(5)

    print("""
    Top 5 actors with highest {}:
    {}
    """.format(centrality_func, highest))


    Top 5 actors with highest Degree Centrality:
    [('s a', 0.09659949622166246), ('Clint Eastwood', 0.00654911838790932), ('Robert Duvall', 0.006297229219143576), ('Henry Fonda', 0.005793450881612091), ('Robert Mitchum', 0.005667506297229219)]
    

    Top 5 actors with highest Eigenvector Centrality:
    [('s a', 0.6808058389463773), ('Robert Duvall', 0.0399289901783157), ('Michael Douglas', 0.03929066774242478), ('Anthony Hopkins', 0.03809205107080914), ('Brad Pitt', 0.03661210055781485)]
    


## Finding communities using K Clique

In [92]:
communities = {node:cid+1 for cid,community in enumerate(nx.algorithms.community.k_clique_communities(g,3)) for node in community}

for actor in communities:
    g.nodes[actor]['community'] = communities[actor]

### Each actor has these attributes now

In [95]:
g.nodes['Kristin ScottThomas']

{'kevin_bacon': 5,
 'Degree Centrality': 0.0005037783375314861,
 'Eigenvector Centrality': 0.00022470149080921193,
 'community': 110}

## Export into GEFX file

In [96]:
networkx.write_gexf(graph, 'casts_sna.gexf')

NameError: name 'networkx' is not defined