In [1]:
import ast
import heapq
import os
import random
from itertools import combinations

import community
import community as community_louvain
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from networkx.algorithms.community import k_clique_communities

# The star-import stays after every import that preceded it originally, so
# any names it overrides are the same ones as before.
from igraph import *
import igraph as ig

from IPython.display import display

# Fix the seeds of both global random number generators used in this notebook:
# NumPy's legacy global generator and Python's built-in `random` module.
np.random.seed(42)
random.seed(42)

In [4]:
# Location of the sampled citation data. Set the CITATION_SAMPLE_CSV
# environment variable to run the notebook on another machine; the fallback is
# the path the notebook was originally written against.
DATA_PATH = os.environ.get(
    "CITATION_SAMPLE_CSV",
    "/Users/zy/Documents/GitHub/Social-Media-Analysis-Project/Group Project/Sampling Data/citation_sample.csv",
)
df = pd.read_csv(DATA_PATH, index_col=0)

### Visualization Function

In [2]:
def visualize_network_with_plotly(G, layout_type='spring', color_dict=None, node_size=300, title="Network", pos=None):
    """Draw a NetworkX graph as an interactive Plotly figure and show it.

    Every node is coloured by the number of entries in its adjacency mapping
    (as reported by ``G.adjacency()``), and the same count is shown on hover.

    Parameters
    ----------
    G : networkx graph
        Graph to draw.
    layout_type : str, default 'spring'
        Name of the layout used when ``pos`` is not supplied. One of
        'spring', 'random', 'circular', 'kamada_kawai' or 'shell'.
        Ignored when ``pos`` is given.
    color_dict : optional
        Not used by this function; kept so existing calls keep working.
    node_size : number, default 300
        Marker size passed to Plotly.
    title : str, default "Network"
        Figure title.
    pos : dict, optional
        Precomputed mapping ``node -> (x, y)``. Takes precedence over
        ``layout_type``.

    Raises
    ------
    ValueError
        If ``pos`` is None and ``layout_type`` is not a supported layout name.
    """
    # Compute node positions only when the caller did not provide them
    if pos is None:
        layouts = {
            'spring': nx.spring_layout,
            'random': nx.random_layout,
            'circular': nx.circular_layout,
            'kamada_kawai': nx.kamada_kawai_layout,
            'shell': nx.shell_layout,
        }
        # isinstance check first: an unhashable value would make the dict lookup fail
        if not isinstance(layout_type, str) or layout_type not in layouts:
            raise ValueError(
                f"Unknown layout_type {layout_type!r}; expected one of {sorted(layouts)}"
            )
        pos = layouts[layout_type](G)

    # Edge trace: one poly-line, with None separating consecutive segments
    edge_x = []
    edge_y = []
    for u, v in G.edges():
        x0, y0 = pos[u]
        x1, y1 = pos[v]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines')

    # Node trace: collect position, colour value and hover text in a single
    # pass so the three lists are guaranteed to line up
    node_x = []
    node_y = []
    node_adjacencies = []
    node_text = []
    for node, adjacencies in G.adjacency():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
        node_adjacencies.append(len(adjacencies))
        node_text.append(f'Node {node}<br># of connections: {len(adjacencies)}')

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers',
        hoverinfo='text',
        text=node_text,
        marker=dict(
            showscale=True,
            colorscale='viridis',
            reversescale=True,
            color=node_adjacencies,
            size=node_size,
            colorbar=dict(
                thickness=15,
                # nested title dict replaces the deprecated `titleside` shorthand
                title=dict(text='Node Connections', side='right'),
                xanchor='left',
            ),
            line_width=2))

    # Create figure
    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(
                        # nested title dict replaces the deprecated `titlefont_size` shorthand
                        title=dict(text=title, font=dict(size=16)),
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20, l=5, r=5, t=40),
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                    )

    # Show plot
    fig.show()

### Community Library

In [3]:
def community_layout(g, partition, seed=None):
    """
    Compute the layout for a modular graph.

    Each node's position is the sum of two offsets: the position of its
    community (laid out with ``scale=3.``) and the node's position inside its
    community (laid out with ``scale=1.``).

    Parameters
    ----------
    g : networkx graph
        Graph to lay out.
    partition : dict
        Mapping ``node -> community id``. Must contain every node of ``g``.
    seed : optional
        Forwarded as the ``seed`` keyword to both helper layouts, which pass
        it on to ``nx.spring_layout``. The default ``None`` keeps the previous
        unseeded behaviour; pass an int to get a repeatable layout.

    Returns
    -------
    dict
        Mapping ``node -> position`` for every node of ``g``.
    """
    pos_communities = _position_communities(g, partition, scale=3., seed=seed)
    pos_nodes = _position_nodes(g, partition, scale=1., seed=seed)

    # combine positions: community offset + offset within the community
    return {node: pos_communities[node] + pos_nodes[node] for node in g.nodes()}

def _position_communities(g, partition, **kwargs):
    """
    Give every node the position of the community it belongs to.

    The communities themselves are laid out with ``nx.spring_layout`` on a
    directed graph that has one node per community id and one edge per ordered
    pair of communities, weighted by the number of edges of ``g`` that run
    between them. ``kwargs`` are forwarded to ``nx.spring_layout``.
    """
    # Build the community-level graph
    hypergraph = nx.DiGraph()
    hypergraph.add_nodes_from(set(partition.values()))
    for (source, target), crossing in _find_between_community_edges(g, partition).items():
        hypergraph.add_edge(source, target, weight=len(crossing))

    # Lay out the communities, then hand each node its community's position
    community_positions = nx.spring_layout(hypergraph, **kwargs)
    return {node: community_positions[label] for node, label in partition.items()}

def _find_between_community_edges(g, partition):
    """
    Collect the edges of ``g`` whose endpoints lie in different communities.

    Returns a dict keyed by ``(community of first endpoint, community of
    second endpoint)``; each value is the list of ``(ni, nj)`` edges with that
    ordered pair of communities, in the order ``g.edges()`` yields them.
    """
    crossing = {}
    for source, target in g.edges():
        key = (partition[source], partition[target])
        if key[0] == key[1]:
            # both endpoints in the same community: not a between-community edge
            continue
        crossing.setdefault(key, []).append((source, target))
    return crossing

def _position_nodes(g, partition, **kwargs):
    """
    Positions nodes within communities.

    Nodes are grouped by community id, and the subgraph induced by each group
    is laid out on its own with ``nx.spring_layout`` (``kwargs`` are forwarded
    to it). The per-community results are merged into one dict.
    """
    members = {}
    for node, label in partition.items():
        members.setdefault(label, []).append(node)

    pos = {}
    for nodes in members.values():
        pos.update(nx.spring_layout(g.subgraph(nodes), **kwargs))
    return pos

## 1. Author Network

### Network Graph

In [5]:
def construct_coauthorship_network(data):
    """Build the undirected co-authorship graph for a table of papers.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have an ``'authors'`` column whose cells are the string form of a
        Python list of author names (e.g. ``"['A', 'B']"``).

    Returns
    -------
    networkx.Graph
        One node per author and one edge per pair of authors that appear
        together on at least one paper. Authors of single-author papers are
        kept as isolated nodes.

    Raises
    ------
    ValueError, SyntaxError
        From ``ast.literal_eval`` when a cell is not a valid Python literal.
    """
    G = nx.Graph()

    for raw_authors in data['authors']:
        # literal_eval parses literals only, so a crafted cell cannot execute
        # code the way eval() would.
        authors = ast.literal_eval(raw_authors)

        # Add nodes in order of first appearance. Inserting them from a set
        # made the node order depend on string hashing, which varies between
        # interpreter runs and changes seeded layouts and community detection.
        G.add_nodes_from(authors)

        # add_edges_from is a no-op for edges that already exist
        G.add_edges_from(combinations(authors, 2))

    return G

# Build the co-authorship graph and draw it with a seeded spring layout
network_author = construct_coauthorship_network(df)
visualize_network_with_plotly(
    network_author,
    node_size=10,
    title="Co-authorship Network",
    pos=nx.spring_layout(network_author, k=0.1, seed=42),
)

#### 1.1 Influencer Analysis

In [None]:
def calculate_degree_centrality(G):
    """Return ``(node, degree centrality)`` pairs for ``G``, highest score first."""
    scores = nx.degree_centrality(G)
    return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)

def calculate_closeness_centrality(G):
    """Return ``(node, closeness centrality)`` pairs for ``G``, highest score first."""
    scores = nx.closeness_centrality(G)
    return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)

def calculate_betweenness_centrality(G):
    """Return ``(node, betweenness centrality)`` pairs for ``G``, highest score first."""
    scores = nx.betweenness_centrality(G)
    return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)

def calculate_eigenvector_centrality(G):
    """Return ``(node, eigenvector centrality)`` pairs for ``G``, highest score first.

    Scores come from ``nx.eigenvector_centrality(G)`` with its default
    settings; any exception it raises is propagated unchanged.
    """
    scores = nx.eigenvector_centrality(G)
    return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)


# Backward-compatible alias: the function was first defined under this
# misspelled name, and existing cells still call it that way.
calculate_igenvector_entrality = calculate_eigenvector_centrality

In [None]:
# Rank all authors under each measure and keep the five highest-scoring ones
degree_centrality, closeness_centrality, betweenness_centrality, eigenvector_centrality = (
    ranking(network_author)[:5]
    for ranking in (
        calculate_degree_centrality,
        calculate_closeness_centrality,
        calculate_betweenness_centrality,
        calculate_igenvector_entrality,
    )
)

# One two-column table per measure: author name and score
df_degree_centrality = pd.DataFrame(data=degree_centrality, columns=['Author', 'Degree Centrality'])
df_closeness_centrality = pd.DataFrame(data=closeness_centrality, columns=['Author', 'Closeness Centrality'])
df_betweenness_centrality = pd.DataFrame(data=betweenness_centrality, columns=['Author', 'Betweenness Centrality'])
df_eigenvector_centrality = pd.DataFrame(data=eigenvector_centrality, columns=['Author', 'Eigenvector Centrality'])

In [None]:
# Show the four top-5 tables one after another
display(
    df_degree_centrality,
    df_closeness_centrality,
    df_betweenness_centrality,
    df_eigenvector_centrality,
)

Unnamed: 0,Author,Degree Centrality
0,Raymond A. Lorie,0.035326
1,Mike W. Blasgen,0.035326
2,W. Frank King III,0.035326
3,Vera Watson,0.035326
4,Jim Gray,0.035326


#### 1.2 Community Detection

In [None]:
def compute_community_statistics(G, partition):
    """Summarise every community of ``partition`` on its induced subgraph of ``G``.

    Parameters
    ----------
    G : networkx graph
    partition : dict
        Mapping ``node -> community id``.

    Returns
    -------
    dict
        ``community id -> stats`` where ``stats`` holds the mean degree,
        closeness and betweenness centrality of the community's subgraph, its
        number of edges ('Collaboration Count') and its number of nodes
        ('Community Size').
    """
    # Group nodes by the community they were assigned to
    members_by_community = {}
    for node, community_id in partition.items():
        members_by_community.setdefault(community_id, []).append(node)

    def mean_score(scores):
        # average of a node -> score mapping
        return np.mean(list(scores.values()))

    community_stats = {}
    for community_id, members in members_by_community.items():
        induced = G.subgraph(members)
        community_stats[community_id] = {
            'Average Degree Centrality': mean_score(nx.degree_centrality(induced)),
            'Collaboration Count': induced.number_of_edges(),
            'Average Closeness Centrality': mean_score(nx.closeness_centrality(induced)),
            'Average Betweenness Centrality': mean_score(nx.betweenness_centrality(induced)),
            'Community Size': len(members),
        }
    return community_stats

# Detect communities with python-louvain and compute statistics for each one.
# - `community_louvain` is the unambiguous alias of the module; the bare name
#   `community` is easy to overwrite with a loop variable.
# - A fixed `random_state` makes the partition repeatable regardless of how
#   many random numbers earlier cells consumed.
partition = community_louvain.best_partition(network_author.to_undirected(), random_state=42)
community_stats = compute_community_statistics(network_author, partition)

# Print the statistics of each community
for community_id, stats in community_stats.items():
    print(f"Community {community_id} Statistics:")
    for metric, value in stats.items():
        print(f"- {metric}: {value}")
    print()

Community 0 Statistics:
- Average Degree Centrality: 1.0
- Collaboration Count: 3
- Average Closeness Centrality: 1.0
- Average Betweenness Centrality: 0.0
- Community Size: 3

Community 1 Statistics:
- Average Degree Centrality: 1.0
- Collaboration Count: 3
- Average Closeness Centrality: 1.0
- Average Betweenness Centrality: 0.0
- Community Size: 3

Community 2 Statistics:
- Average Degree Centrality: 1.0
- Collaboration Count: 3
- Average Closeness Centrality: 1.0
- Average Betweenness Centrality: 0.0
- Community Size: 3

Community 3 Statistics:
- Average Degree Centrality: 1.0
- Collaboration Count: 6
- Average Closeness Centrality: 1.0
- Average Betweenness Centrality: 0.0
- Community Size: 4

Community 4 Statistics:
- Average Degree Centrality: 1.0
- Collaboration Count: 3
- Average Closeness Centrality: 1.0
- Average Betweenness Centrality: 0.0
- Community Size: 3

Community 5 Statistics:
- Average Degree Centrality: 1.0
- Collaboration Count: 6
- Average Closeness Centrality: 1

#### 1.3 Collaboration Pattern Over Time

In [None]:
def analyze_collaboration_over_time(data):
    """Record, for every pair of co-authors, the years of their joint papers.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a ``'year'`` column and an ``'authors'`` column whose cells are
        the string form of a Python list of author names.

    Returns
    -------
    dict
        ``(author_a, author_b) -> [year, ...]`` with the two names in sorted
        order. A year is appended once per paper the pair shares, so it can
        appear several times.

    Raises
    ------
    ValueError, SyntaxError
        From ``ast.literal_eval`` when a cell is not a valid Python literal.
    """
    collaboration_over_time = {}
    for _, row in data.iterrows():
        year = row['year']
        # literal_eval parses literals only, so a crafted cell cannot execute
        # code the way eval() would.
        authors_list = ast.literal_eval(row['authors'])
        for pair in combinations(authors_list, 2):
            # sort so (A, B) and (B, A) share one key
            collaboration = tuple(sorted(pair))
            collaboration_over_time.setdefault(collaboration, []).append(year)
    return collaboration_over_time

# Use the function
collaboration_over_time = analyze_collaboration_over_time(df)

# Convert to DataFrame: one record per author pair, built in a single call.
# Passing `columns` keeps the three headers even when there are no pairs, so
# no concatenation with an empty placeholder frame is needed.
rows = [
    {'Author 1': author1, 'Author 2': author2, 'Collaboration Years': years}
    for (author1, author2), years in collaboration_over_time.items()
]
collaboration_df = pd.DataFrame(rows, columns=['Author 1', 'Author 2', 'Collaboration Years'])

# Print the collaboration patterns over time
print(collaboration_df)

                  Author 1              Author 2 Collaboration Years
0    Joseph M. Hellerstein   Michael Stonebraker  [1993, 1993, 1993]
1       Richard H. Lathrop  Thomas G. Dietterich              [1997]
2     Thomas G. Dietterich    Tomás Lozano-Pérez              [1997]
3       Richard H. Lathrop    Tomás Lozano-Pérez              [1997]
4          David J. DeWitt          Goetz Graefe  [1987, 1987, 1987]
..                     ...                   ...                 ...
517       Geneviève Jomier         Khaled Jouini              [2007]
518        George Beskales    Mohamed A. Soliman              [2008]
519        George Beskales         Ihab F. Ilyas              [2008]
520          Ihab F. Ilyas    Mohamed A. Soliman              [2008]
521        Christian Kalus           Peter Dadam              [1995]

[522 rows x 3 columns]


#### 1.4 Potential Collaborators

In [None]:
def identify_potential_collaborators(G, author):
    """List authors who share neighbours with ``author`` but are not linked to them.

    Returns ``(other_author, number_of_common_neighbors)`` tuples for every
    node of ``G`` that is not ``author``, has no edge to ``author`` and has at
    least one common neighbour, ordered by that count from high to low.
    """
    ranked = []
    for candidate in G.nodes():
        if candidate == author or G.has_edge(author, candidate):
            # skip the author themselves and existing collaborators
            continue
        shared = sum(1 for _ in nx.common_neighbors(G, author, candidate))
        if shared > 0:
            ranked.append((candidate, shared))
    return sorted(ranked, key=lambda pair: pair[1], reverse=True)

# Example: rank the not-yet-collaborators of one author in the co-authorship graph
author = 'Joseph M. Hellerstein'
potential_collaborators = identify_potential_collaborators(G=network_author, author=author)
print(potential_collaborators)

[('Ramesh Govindan', 1), ('Alec Woo', 1)]


## 2. Citation Network

### Network Graph

In [None]:
# Initialize a directed graph
G = nx.DiGraph()


def _parse_references(raw):
    """Turn one 'references' cell into a list of referenced paper ids.

    Non-string cells (presumably NaN for a missing value - confirm against the
    CSV) and blank strings are treated as "no references". Other cells are
    parsed with ast.literal_eval, which accepts literals only and therefore
    cannot execute code the way eval() would.
    """
    if not isinstance(raw, str) or not raw.strip():
        return []
    return ast.literal_eval(raw)


# Parse every row once: (paper id, paper title, ids of the papers it references)
papers = [
    (row["id"], row["title"], _parse_references(row["references"]))
    for _, row in df.iterrows()
]

# Add a node for every referenced paper. No title is attached here: the row's
# title belongs to the citing paper, not to the papers it references.
for _, _, references in papers:
    G.add_nodes_from(references)

# Add edges pointing from each reference to the paper that cites it
for paper_id, _, references in papers:
    for ref_id in references:
        G.add_edge(ref_id, paper_id)

# Attach each title to its own paper, for papers that are part of the graph
for paper_id, paper_title, _ in papers:
    if paper_id in G:
        G.nodes[paper_id]["title"] = paper_title

# Seeded layout so the picture is the same on every run
visualize_network_with_plotly(G, node_size=3, title="Citation Network", pos=nx.spring_layout(G, seed=42))

#### 2.1 Centrality Measures

In [None]:
# Score every node of the citation graph under three centrality measures
degree_centrality = nx.degree_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)
closeness_centrality = nx.closeness_centrality(G)

# Pick the top-scoring paper for each measure
most_influential_degree, most_influential_betweenness, most_influential_closeness = (
    max(scores, key=scores.get)
    for scores in (degree_centrality, betweenness_centrality, closeness_centrality)
)

# One row per paper, plus one boolean column per measure that is True only
# for that measure's top-scoring paper
centrality_df = pd.DataFrame({'Paper ID': list(G.nodes())})
paper_ids = centrality_df['Paper ID']
centrality_df['Most Influential (Degree)'] = paper_ids == most_influential_degree
centrality_df['Most Influential (Betweenness)'] = paper_ids == most_influential_betweenness
centrality_df['Most Influential (Closeness)'] = paper_ids == most_influential_closeness

In [None]:
# Look up, for each measure, the id of the paper whose flag column is True
most_influential_papers = {
    measure: centrality_df.loc[centrality_df[flag_column], 'Paper ID'].iloc[0]
    for measure, flag_column in (
        ('Degree Centrality', 'Most Influential (Degree)'),
        ('Betweenness Centrality', 'Most Influential (Betweenness)'),
        ('Closeness Centrality', 'Most Influential (Closeness)'),
    )
}

# Show the result
print("Most Influential Papers:")
print(most_influential_papers)

Unnamed: 0,Author,Degree Centrality
0,Miodrag Potkonjak,0.005042
1,Alberto L. Sangiovanni-Vincentelli,0.00381
2,Chung-Kuan Cheng,0.003025
3,Gerhard J. Woeginger,0.002913
4,Rance Cleaveland,0.002465


Unnamed: 0,Author,Closeness Centrality
0,Christos H. Papadimitriou,0.028636
1,Esther M. Arkin,0.028314
2,Sándor P. Fekete,0.027934
3,Mihalis Yannakakis,0.027591
4,Sudipto Guha,0.027588


Unnamed: 0,Author,Betweenness Centrality
0,Christos H. Papadimitriou,0.016319
1,Sándor P. Fekete,0.014304
2,Esther M. Arkin,0.012581
3,Mihalis Yannakakis,0.012539
4,Alberto L. Sangiovanni-Vincentelli,0.012178


Unnamed: 0,Author,Eigenvector Centrality
0,John Derrick,0.265659
1,Rance Cleaveland,0.265571
2,Gerald Lüttgen,0.257847
3,Marian Gheorghe,0.256773
4,Hussein Zedan,0.256773


#### 2.2 Network Analysis

In [None]:
# Size of the citation graph
num_nodes, num_edges = G.number_of_nodes(), G.number_of_edges()

# Global structure measures
# NOTE(review): G is created as nx.DiGraph in this notebook - confirm that
# nx.transitivity returns the intended value for a directed graph.
density = nx.density(G)
transitivity = nx.transitivity(G)

# Report the four figures, one per line
for label, value in (
    ("Number of Nodes:", num_nodes),
    ("Number of Edges:", num_edges),
    ("Network Density:", density),
    ("Transitivity:", transitivity),
):
    print(label, value)

Community 0 Statistics:
- Average Degree Centrality: 0.07738095238095238
- Collaboration Count: 156
- Average Closeness Centrality: 0.2915511952911724
- Average Betweenness Centrality: 0.04111463133640553
- Community Size: 64

Community 1 Statistics:
- Average Degree Centrality: 0.14285714285714285
- Collaboration Count: 58
- Average Closeness Centrality: 0.3582276559645309
- Average Betweenness Centrality: 0.06914796569968983
- Community Size: 29

Community 2 Statistics:
- Average Degree Centrality: 0.02866814767798582
- Collaboration Count: 271
- Average Closeness Centrality: 0.19538486536688732
- Average Betweenness Centrality: 0.03131669995830766
- Community Size: 138

Community 10 Statistics:
- Average Degree Centrality: 0.04847963281698223
- Collaboration Count: 169
- Average Closeness Centrality: 0.2226759229503607
- Average Betweenness Centrality: 0.04437960902844829
- Community Size: 84

Community 4 Statistics:
- Average Degree Centrality: 1.0
- Collaboration Count: 3
- Averag

#### 2.3 Community Analysis

In [None]:
# Compute the best partition using Louvain community detection on the
# undirected view of the citation graph.
# - `community_louvain` is the unambiguous alias of the module; the bare name
#   `community` is easy to overwrite with a loop variable.
# - A fixed `random_state` makes the partition repeatable.
partition = community_louvain.best_partition(G.to_undirected(), random_state=42)

# Create a new layout based on the community structure
pos = community_layout(G, partition)

# Visualize the graph with the new layout. Positions go in through `pos`
# only; `layout_type` expects a layout name, not a position dict.
visualize_network_with_plotly(G, node_size=5, title="Citation Network (Community)", pos=pos)

                   Author 1          Author 2 Collaboration Years
0          David S. Johnson       M. R. Garey        [1979, 1983]
1      Charles E. Leiserson  Thomas H. Cormen              [1989]
2          Ronald L. Rivest  Thomas H. Cormen              [1989]
3      Charles E. Leiserson  Ronald L. Rivest              [1989]
4              Richard Hull   Serge Abiteboul        [1995, 1988]
...                     ...               ...                 ...
12729       Daniel Hausmann     Dennis Walter              [2009]
12730        Christoph Lüth     Holger Täubig              [2009]
12731        Christoph Lüth     Dennis Walter              [2009]
12732         Dennis Walter     Holger Täubig              [2009]
12733    Isabella Mastroeni     Musard Balliu              [2009]

[12734 rows x 3 columns]


In [None]:
# Group the nodes by the community id assigned to them in `partition`
partition_lists = {}
for node, community_id in partition.items():
    partition_lists.setdefault(community_id, []).append(node)

# Analyze community size, density, and centrality
community_data = {}
for community_id, nodes in partition_lists.items():
    community_graph = G.subgraph(nodes)
    community_size = len(nodes)
    community_density = nx.density(community_graph)
    community_centrality = nx.degree_centrality(community_graph)
    community_data[community_id] = {
        'Size': community_size,
        'Density': community_density,
        'Centrality': community_centrality
    }

# Determine key nodes within each community: the node with the highest degree
# centrality inside the community's subgraph
key_nodes = {}
for community_id, data in community_data.items():
    centrality = data['Centrality']
    if centrality:  # Check if there are nodes in the community
        key_nodes[community_id] = max(centrality, key=centrality.get)
    else:
        key_nodes[community_id] = None

# Run the CPM algorithm to detect overlapping communities
k = 3  # Define the size of the cliques
overlapping_communities = list(k_clique_communities(G.to_undirected(), k))

# Print results
print("Community Analysis Results:")
print("Community Data:", community_data)
print("Key Nodes within Each Community:", key_nodes)

# Print the overlapping communities.
# The loop variable must not be called `community`: that name is the imported
# python-louvain module, and rebinding it makes every later
# `community.best_partition(...)` call fail.
print("Overlapping Communities (Cliques of size", k, "):")
for i, clique_community in enumerate(overlapping_communities, start=1):
    print("Community", i, ":", clique_community)

Potential Collaborators for David S. Johnson
   Potential Collaborator  Common Neighbors
0         Joan Feigenbaum                 1
1             Anne Condon                 1
2           Bonnie Berger                 1
3              Nir Halman                 1
4              Moshe Dror                 1
5            Carsten Lund                 1
6       David Simchi-Levi                 1
7        Mohamed Mostagir                 1
8           Diego Klabjan                 1
9         Edmund K. Burke                 1
10           Carol Meyers                 1
