In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from itertools import combinations
import community
import community as community_louvain
from networkx.algorithms.community import k_clique_communities

import random
import plotly.graph_objects as go
import heapq
import numpy as np

from igraph import *
import igraph as ig

from IPython.display import display

# Seed Python's built-in `random` module so any code drawing from it repeats across runs
random.seed(42)

# Seed NumPy's global random state as well.
# NOTE(review): library calls made later without an explicit seed presumably
# fall back to this global state - confirm for each library involved.
np.random.seed(42)

### Visualization Function

In [2]:
def visualize_network_with_plotly(G, layout_type='spring', color_dict=None, node_size=300, title="Network", pos=None):
    """Draw a NetworkX graph as an interactive Plotly figure and show it.

    Each node is coloured by its number of neighbours and reports that number
    on hover. Edges are drawn as thin grey line segments.

    Parameters
    ----------
    G : networkx graph
        Graph to draw.
    layout_type : str, default 'spring'
        Layout used when ``pos`` is None. One of 'spring', 'random',
        'circular', 'kamada_kawai' or 'shell'.
    color_dict : optional
        Currently ignored; kept in the signature so existing calls keep working.
    node_size : int, default 300
        Marker size handed to Plotly for every node.
    title : str, default "Network"
        Figure title.
    pos : dict, optional
        Precomputed mapping ``node -> (x, y)``. When given, ``layout_type`` is
        not consulted.

    Raises
    ------
    ValueError
        If ``pos`` is None and ``layout_type`` is not a supported layout name.
    """
    # Compute node positions using the specified layout algorithm
    if pos is None:
        layout_functions = {
            'spring': nx.spring_layout,
            'random': nx.random_layout,
            'circular': nx.circular_layout,
            'kamada_kawai': nx.kamada_kawai_layout,
            'shell': nx.shell_layout,
        }
        if layout_type not in layout_functions:
            # Fail here with a clear message instead of a later
            # "'NoneType' object is not subscriptable" on pos[...].
            raise ValueError(
                f"Unknown layout_type {layout_type!r}; "
                f"expected one of {sorted(layout_functions)}"
            )
        pos = layout_functions[layout_type](G)

    # Edge trace: every edge contributes (x0, x1, None) / (y0, y1, None); the
    # None entries break the line so separate edges are not joined together.
    edge_x = []
    edge_y = []
    for source, target in G.edges():
        x0, y0 = pos[source]
        x1, y1 = pos[target]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines')

    # Node coordinates, neighbour counts and hover labels, all in G's node order
    node_x = []
    node_y = []
    node_adjacencies = []
    node_text = []
    for node, adjacencies in G.adjacency():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
        node_adjacencies.append(len(adjacencies))
        node_text.append(f'Node {node}<br># of connections: {len(adjacencies)}')

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers',
        hoverinfo='text',
        text=node_text,
        marker=dict(
            showscale=True,
            colorscale='viridis',
            reversescale=True,
            color=node_adjacencies,
            size=node_size,
            colorbar=dict(
                thickness=15,
                # Nested title object replaces the deprecated `titleside` shortcut
                title=dict(text='Node Connections', side='right'),
                xanchor='left'
            ),
            line=dict(width=2)))

    # Create figure. The title font is set through the nested `title.font`
    # object rather than the deprecated `titlefont_size` shortcut. The
    # text-less annotation of the earlier version is intentionally not kept.
    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(
                        title=dict(text=title, font=dict(size=16)),
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20, l=5, r=5, t=40),
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                    )

    # Show plot
    fig.show()

### Community Library

In [3]:
def community_layout(g, partition):
    """
    Build node positions for a graph whose nodes are grouped into communities.

    Every node's position is the `+` of two parts: the position of its
    community (computed with scale=3.) and the node's own position inside that
    community (computed with scale=1.). Returns a dict keyed by node.
    """
    community_offsets = _position_communities(g, partition, scale=3.)
    local_offsets = _position_nodes(g, partition, scale=1.)

    # one entry per node of g: community anchor plus within-community offset
    return {
        node: community_offsets[node] + local_offsets[node]
        for node in g.nodes()
    }

def _position_communities(g, partition, **kwargs):
    """
    Give every node in `partition` the layout position of its community.

    A directed "hypergraph" is built with one node per community and one
    weighted edge per ordered community pair that has crossing edges in `g`
    (weight = number of such edges). It is laid out with `nx.spring_layout`,
    to which `kwargs` are forwarded.
    """
    hypergraph = nx.DiGraph()
    hypergraph.add_nodes_from(set(partition.values()))

    # edge weight = how many edges of g run between the two communities
    crossing_edges = _find_between_community_edges(g, partition)
    for (source_community, target_community), crossing in crossing_edges.items():
        hypergraph.add_edge(source_community, target_community, weight=len(crossing))

    # lay out the communities, then hand each node its community's position
    community_positions = nx.spring_layout(hypergraph, **kwargs)
    return {
        node: community_positions[community_id]
        for node, community_id in partition.items()
    }

def _find_between_community_edges(g, partition):

    edges = dict()

    for (ni, nj) in g.edges():
        ci = partition[ni]
        cj = partition[nj]

        if ci != cj:
            try:
                edges[(ci, cj)] += [(ni, nj)]
            except KeyError:
                edges[(ci, cj)] = [(ni, nj)]

    return edges

def _position_nodes(g, partition, **kwargs):
    """
    Lay out the nodes of each community independently.

    Nodes are grouped by community; the subgraph of `g` on each group is laid
    out with `nx.spring_layout` (`kwargs` forwarded) and the per-community
    results are merged into one dict keyed by node.
    """
    members_by_community = {}
    for node, community_id in partition.items():
        members_by_community.setdefault(community_id, []).append(node)

    positions = {}
    for members in members_by_community.values():
        positions.update(nx.spring_layout(g.subgraph(members), **kwargs))

    return positions

## 1. Sample Top 20 Paper Author Network

### Network Graph

In [4]:
# Read the CSV file into a pandas DataFrame
# NOTE(review): this is an absolute path on one user's machine; the notebook will not run elsewhere until it is made relative/configurable.
df_sample = pd.read_csv('/Users/zy/Documents/GitHub/Social-Media-Analysis-Project/Group Project/Sample Top 20 & Full Top3/Citation Network/top_20_sample.csv', index_col=0)  # index_col=0: the first CSV column becomes the row index (NOTE(review): its contents are not visible here - confirm what it holds; later code reads the 'authors' and 'year' columns)

# Function to construct co-authorship network
def construct_coauthorship_network(data):
    """Build an undirected co-authorship graph from a table of papers.

    Parameters
    ----------
    data : mapping with an 'authors' entry (e.g. a pandas DataFrame)
        Iterating ``data['authors']`` must yield, per paper, the string form of
        a Python list of author names.

    Returns
    -------
    networkx.Graph
        One node per author and one edge per pair of authors that appear
        together on at least one paper. Nodes are inserted in order of first
        appearance, so node order (and anything seeded that depends on it) is
        the same on every run instead of following set iteration order.
    """
    G = nx.Graph()

    for raw_authors in data['authors']:
        # NOTE(review): eval() executes whatever expression the cell contains.
        # Only use this on trusted files; ast.literal_eval is the safer choice
        # if the cells are always plain list literals.
        paper_authors = eval(raw_authors)

        # Add nodes (authors); re-adding an existing node is a no-op
        G.add_nodes_from(paper_authors)

        # Add edges (co-authorships): every unordered pair on this paper.
        # Graph edges are unique, so repeated pairs do not create duplicates.
        G.add_edges_from(combinations(paper_authors, 2))

    return G

# Build the sample co-authorship graph, then draw it with a spring layout whose
# seed is fixed (seed=42) so the picture is the same for the same graph.
network_author = construct_coauthorship_network(df_sample)
visualize_network_with_plotly(network_author, node_size=10, title="Sample Co-authorship Network", pos=nx.spring_layout(network_author, k=0.2, seed=42))

#### 1.1 Influencer Analysis

In [5]:
def calculate_degree_centrality(G):
    """Return (node, degree centrality) pairs for G, highest score first."""
    ranking = list(nx.degree_centrality(G).items())
    ranking.sort(key=lambda entry: entry[1], reverse=True)
    return ranking

def calculate_closeness_centrality(G):
    """Return (node, closeness centrality) pairs for G, highest score first."""
    ranking = list(nx.closeness_centrality(G).items())
    ranking.sort(key=lambda entry: entry[1], reverse=True)
    return ranking

def calculate_betweenness_centrality(G):
    """Return (node, betweenness centrality) pairs for G, highest score first."""
    ranking = list(nx.betweenness_centrality(G).items())
    ranking.sort(key=lambda entry: entry[1], reverse=True)
    return ranking

def calculate_eigenvector_centrality(G):
    """Return (node, eigenvector centrality) pairs for G, highest score first.

    Scores come from ``nx.eigenvector_centrality`` with its default settings;
    any exception it raises propagates to the caller.
    """
    eigenvector_centrality = nx.eigenvector_centrality(G)
    sorted_eigenvector_centrality = sorted(eigenvector_centrality.items(), key=lambda x: x[1], reverse=True)
    return sorted_eigenvector_centrality


# Backward-compatible alias: the function was originally published under this
# misspelled name, which existing cells still call.
calculate_igenvector_entrality = calculate_eigenvector_centrality

In [6]:
# Rank the sample network's authors under each centrality measure and keep
# the five highest-scoring entries of every ranking.
(
    degree_centrality,
    closeness_centrality,
    betweenness_centrality,
    eigenvector_centrality,
) = (
    rank_authors(network_author)[:5]
    for rank_authors in (
        calculate_degree_centrality,
        calculate_closeness_centrality,
        calculate_betweenness_centrality,
        calculate_igenvector_entrality,
    )
)

# Create DataFrames (one two-column table per measure)
df_degree_centrality = pd.DataFrame(
    data=degree_centrality, columns=['Author', 'Degree Centrality'])
df_closeness_centrality = pd.DataFrame(
    data=closeness_centrality, columns=['Author', 'Closeness Centrality'])
df_betweenness_centrality = pd.DataFrame(
    data=betweenness_centrality, columns=['Author', 'Betweenness Centrality'])
df_eigenvector_centrality = pd.DataFrame(
    data=eigenvector_centrality, columns=['Author', 'Eigenvector Centrality'])

In [7]:
# Five highest degree-centrality authors in the sample network
df_degree_centrality

Unnamed: 0,Author,Degree Centrality
0,W. Frank King III,0.035326
1,Gianfranco R. Putzolu,0.035326
2,Jim Gray,0.035326
3,Paul R. McJones,0.035326
4,Morton M. Astrahan,0.035326


In [8]:
# Five highest closeness-centrality authors in the sample network
df_closeness_centrality

Unnamed: 0,Author,Closeness Centrality
0,W. Frank King III,0.035326
1,Gianfranco R. Putzolu,0.035326
2,Jim Gray,0.035326
3,Paul R. McJones,0.035326
4,Morton M. Astrahan,0.035326


In [9]:
# Five highest betweenness-centrality authors in the sample network
df_betweenness_centrality

Unnamed: 0,Author,Betweenness Centrality
0,Narciso Martí-Oliet,0.000267
1,Samuel Madden,0.000148
2,Joseph M. Hellerstein,8.9e-05
3,John D. Lafferty,3e-05
4,David J. DeWitt,3e-05


In [10]:
# Five highest eigenvector-centrality authors in the sample network
df_eigenvector_centrality

Unnamed: 0,Author,Eigenvector Centrality
0,W. Frank King III,0.267261
1,Gianfranco R. Putzolu,0.267261
2,Jim Gray,0.267261
3,Paul R. McJones,0.267261
4,Morton M. Astrahan,0.267261


1. **Degree Centrality:**
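   - All of these authors have the same degree centrality score, which means each of them has the same number of distinct co-authors in the network (degree centrality counts co-author links, not papers). With 369 authors in the network, a score of 0.035326 corresponds to 13 co-authors each, which is what one would expect if they all appear together on the same large multi-author paper.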
   - This suggests that these authors are equally influential in terms of the number of collaborations they have.

2. **Closeness Centrality:**
   - Again, all of these authors have the same closeness centrality score, indicating that they are equally central in terms of their proximity to other authors in the network.
   - This suggests that these authors are equally well-connected to other authors in the network.

3. **Betweenness Centrality:**
   - These authors have higher betweenness centrality scores compared to others, indicating that they act as bridges or intermediaries between different clusters of authors in the network.
   - This suggests that these authors play a significant role in connecting different parts of the collaboration network and facilitating the flow of information between them.

4. **Eigenvector Centrality:**
   - All of these authors have the same eigenvector centrality score, indicating that they are equally influential based on their connections to other influential authors in the network.
   - This suggests that these authors are not only well-connected but also connected to other influential authors, which enhances their overall influence in the network.

#### 1.2 Community Detection

In [11]:
def detect_communities(G):
    """Return the result of `community_louvain.best_partition(G)`.

    Callers in this notebook treat the result as a mapping node -> community id.
    """
    return community_louvain.best_partition(G)

# Detect communities in the sample co-authorship network
communities = detect_communities(network_author)

# Tabulate the node -> community mapping, one row per author
community_df = pd.DataFrame(
    list(communities.items()),
    columns=['Author', 'Community'],
)

# Display the result DataFrame
print(community_df)

                    Author  Community
0    Heiner Stuckenschmidt          0
1          Petros Maniatis          1
2         Olivier Chapelle          2
3             André Seznec          3
4       Raghu Ramakrishnan          4
..                     ...        ...
364    Ralf Hartmut Güting         46
365       Raymond A. Lorie         32
366            Sally Floyd        120
367   Hector Garcia-Molina         57
368        Fabrizio Falchi         36

[369 rows x 2 columns]


In [12]:
def compute_community_statistics(G, partition):
    """Summarise every community of `partition` on its own subgraph of `G`.

    `partition` maps node -> community id. For each community the subgraph of
    `G` on its members is taken, and the returned dict maps the community id to
    a dict with the mean degree, closeness and betweenness centrality inside
    that subgraph, its number of edges ('Collaboration Count') and its number
    of members ('Community Size').
    """
    def _mean_score(scores):
        # average of a node -> score mapping
        return np.mean(list(scores.values()))

    # community id -> list of member nodes
    members_by_community = {}
    for node, community_id in partition.items():
        members_by_community.setdefault(community_id, []).append(node)

    community_stats = {}
    for community_id, members in members_by_community.items():
        induced = G.subgraph(members)
        community_stats[community_id] = {
            'Average Degree Centrality': _mean_score(nx.degree_centrality(induced)),
            'Collaboration Count': induced.number_of_edges(),
            'Average Closeness Centrality': _mean_score(nx.closeness_centrality(induced)),
            'Average Betweenness Centrality': _mean_score(nx.betweenness_centrality(induced)),
            'Community Size': len(members),
        }
    return community_stats

# Compute community statistics
# Reuse the partition already stored in `communities` instead of running
# Louvain a second time: a fresh run may label or split communities
# differently, so the statistics would no longer describe the communities
# listed in `community_df`. (`network_author` is built as an undirected
# nx.Graph, so no `to_undirected()` copy is needed either.)
partition = communities
community_stats = compute_community_statistics(network_author, partition)

# Print insightful information for each community
for community_id, stats in community_stats.items():
    print(f"Community {community_id} Statistics:")
    for metric, value in stats.items():
        print(f"- {metric}: {value}")
    print()

Community 0 Statistics:
- Average Degree Centrality: 1.0
- Collaboration Count: 6
- Average Closeness Centrality: 1.0
- Average Betweenness Centrality: 0.0
- Community Size: 4

Community 1 Statistics:
- Average Degree Centrality: 1.0
- Collaboration Count: 15
- Average Closeness Centrality: 1.0
- Average Betweenness Centrality: 0.0
- Community Size: 6

Community 2 Statistics:
- Average Degree Centrality: 1.0
- Collaboration Count: 28
- Average Closeness Centrality: 1.0
- Average Betweenness Centrality: 0.0
- Community Size: 8

Community 3 Statistics:
- Average Degree Centrality: 1.0
- Collaboration Count: 1
- Average Closeness Centrality: 1.0
- Average Betweenness Centrality: 0.0
- Community Size: 2

Community 4 Statistics:
- Average Degree Centrality: 1.0
- Collaboration Count: 10
- Average Closeness Centrality: 1.0
- Average Betweenness Centrality: 0.0
- Community Size: 5

Community 5 Statistics:
- Average Degree Centrality: 1.0
- Collaboration Count: 1
- Average Closeness Centrality

Community 0 consists of 4 authors. Here's what each statistic means:

1. **Average Degree Centrality**: The average degree centrality of 1.0 indicates that each author in this community is directly connected to all other authors within the community. This suggests a tightly interconnected group where each author collaborates with the other three authors.

2. **Collaboration Count**: The collaboration count of 6 indicates that there are a total of 6 collaboration links within this community. Each link represents a distinct pair of authors who have co-authored at least one paper together (4 authors form 4 × 3 / 2 = 6 pairs), not an individual paper or project.

3. **Average Closeness Centrality**: The average closeness centrality of 1.0 suggests that, on average, each author in this community is close to all other authors in terms of the shortest path length. This indicates a high level of closeness and efficient communication among the authors within the community.

4. **Average Betweenness Centrality**: The average betweenness centrality of 0.0 indicates that none of the authors in this community act as intermediaries or bridges between other authors within the community. Each author's position within the community does not significantly influence the flow of information between other authors.

5. **Community Size**: The community size of 4 indicates that there are a total of 4 authors within this community.

In summary, Community 0 represents a tightly-knit group of 4 authors, every one of whom has co-authored with every other member, with efficient communication and no clear intermediaries.

The insights from Community 0 can be valuable for several business considerations:

1. **Efficient Collaboration**: The high degree centrality and collaboration count indicate that the authors within this community work closely together on multiple projects. This suggests a high level of efficiency in collaboration, which can lead to faster project completion, innovative ideas, and potentially higher productivity.

2. **Strong Communication**: The high average closeness centrality suggests that communication among authors within the community is efficient and direct. This can lead to better coordination, reduced miscommunication, and faster decision-making processes.

3. **Team Dynamics**: Understanding the dynamics within Community 0 can provide insights into effective team structures and collaboration strategies. Businesses can learn from these dynamics to enhance teamwork and foster stronger relationships among team members.

4. **Identifying Key Contributors**: While there is no clear intermediary (as indicated by the low average betweenness centrality), businesses can identify key contributors within this community who play essential roles in collaboration and knowledge sharing. Recognizing these key contributors can help in promoting leadership and facilitating knowledge transfer within the organization.

5. **Potential for Innovation**: The tightly-knit nature of this community suggests a conducive environment for innovation and idea generation. Businesses can leverage the synergies within this community to drive innovation initiatives, develop new products or services, and stay competitive in the market.

#### 1.3 Collaboration Pattern Over Time

In [13]:
def analyze_collaboration_over_time(data):
    """Collect, for every pair of co-authors, the years of the rows they share.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a 'year' column and an 'authors' column; every 'authors' cell is
        the string form of a Python list of author names.

    Returns
    -------
    dict
        Maps an alphabetically sorted ``(author_a, author_b)`` tuple to a list
        of years, one entry per row in which both names appear, in row order.
        Years are not de-duplicated, so the same year can occur several times.
    """
    collaboration_over_time = {}
    for year, raw_authors in zip(data['year'], data['authors']):
        # NOTE(review): eval() executes whatever expression the cell contains.
        # Only use this on trusted files; ast.literal_eval is the safer choice
        # if the cells are always plain list literals.
        authors_list = eval(raw_authors)
        for first_author, second_author in combinations(authors_list, 2):
            # sort so (A, B) and (B, A) share a single key
            collaboration = tuple(sorted((first_author, second_author)))
            collaboration_over_time.setdefault(collaboration, []).append(year)
    return collaboration_over_time

# Use the function
collaboration_over_time = analyze_collaboration_over_time(df_sample)

# Convert to DataFrame: one row per author pair. The frame is built with a
# single constructor call rather than by concatenating onto an empty
# placeholder frame; passing `columns` keeps the three headers even when
# there are no rows.
rows = [
    {'Author 1': author1, 'Author 2': author2, 'Collaboration Years': years}
    for (author1, author2), years in collaboration_over_time.items()
]
collaboration_df = pd.DataFrame(rows, columns=['Author 1', 'Author 2', 'Collaboration Years'])

# Print the collaboration patterns over time
print(collaboration_df)

                  Author 1              Author 2 Collaboration Years
0    Joseph M. Hellerstein   Michael Stonebraker  [1993, 1993, 1993]
1       Richard H. Lathrop  Thomas G. Dietterich              [1997]
2     Thomas G. Dietterich    Tomás Lozano-Pérez              [1997]
3       Richard H. Lathrop    Tomás Lozano-Pérez              [1997]
4          David J. DeWitt          Goetz Graefe  [1987, 1987, 1987]
..                     ...                   ...                 ...
517       Geneviève Jomier         Khaled Jouini              [2007]
518        George Beskales    Mohamed A. Soliman              [2008]
519        George Beskales         Ihab F. Ilyas              [2008]
520          Ihab F. Ilyas    Mohamed A. Soliman              [2008]
521        Christian Kalus           Peter Dadam              [1995]

[522 rows x 3 columns]


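1. **Identifying Strong Collaborative Relationships**: Author pairs whose collaboration years column lists several distinct years collaborate repeatedly over time and likely have strong collaborative relationships. Note that repeated entries of the same year (e.g. `[1993, 1993, 1993]`) mean the pair appears in several records from that one year — possibly several papers, or duplicate rows in the data — rather than collaboration spanning multiple years. Leveraging these relationships can lead to more successful joint projects and research endeavors.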

2. **Assessing Research Impact**: Authors who collaborate across multiple years and with a diverse range of collaborators may have a broader impact on their field. Collaborations that span several years and involve different partners indicate sustained research efforts and potentially higher-quality research output.

3. **Strategic Partnership Opportunities**: Organizations seeking to establish strategic partnerships or collaborations can use this data to identify potential partners with a history of productive collaboration and aligned research interests. Building partnerships with these collaborators can lead to mutually beneficial outcomes and facilitate knowledge exchange.

#### 1.4 Potential Collaborators

In [14]:
def identify_potential_collaborators(G, author):
    """Rank nodes that are not linked to `author` but share neighbours with them.

    Returns a list of (node, number of common neighbours) tuples covering every
    node other than `author` that has no edge to `author` and at least one
    common neighbour, ordered by that count from highest to lowest.
    """
    candidates = []
    for candidate in G.nodes():
        # skip the author themself and anyone they are already linked to
        if candidate == author or G.has_edge(author, candidate):
            continue
        shared = len(list(nx.common_neighbors(G, author, candidate)))
        if shared > 0:
            candidates.append((candidate, shared))

    # most shared neighbours first
    return sorted(candidates, key=lambda pair: pair[1], reverse=True)

# Use the function: list authors who have no edge to this author in the sample
# network but share at least one neighbour with them
author = 'Joseph M. Hellerstein'
potential_collaborators = identify_potential_collaborators(network_author, author)
print(potential_collaborators)  # Print the (author, common-neighbour count) pairs

[('Ramesh Govindan', 1), ('Alec Woo', 1)]


In the output [('Ramesh Govindan', 1), ('Alec Woo', 1)], the numbers represent the number of common neighbors that each potential collaborator shares with the given author ('Joseph M. Hellerstein').

1. 'Ramesh Govindan' shares 1 common neighbor with 'Joseph M. Hellerstein'.
2. 'Alec Woo' also shares 1 common neighbor with 'Joseph M. Hellerstein'.

These numbers indicate the degree of connection or overlap between the given author and each potential collaborator within the collaboration network. In this context, a higher number of common neighbors suggests a stronger potential for collaboration and mutual interests between the author and the potential collaborator.

1. **Facilitating Collaboration**: By identifying potential collaborators based on common neighbors in the collaboration network, the function helps authors or organizations identify individuals with whom they share connections or mutual interests. This facilitates the initiation of collaboration efforts, which can lead to joint research projects, publications, grant proposals, and other collaborative endeavors.

2. **Expanding Networks**: For researchers or organizations looking to expand their professional networks, the function offers a systematic approach to discovering new potential collaborators beyond their immediate circle. Engaging with these potential collaborators can lead to the exchange of ideas, access to new resources, and opportunities for interdisciplinary collaboration.

3. **Strategic Partnership Development**: For organizations or research institutions seeking to strategically develop partnerships, the function provides valuable insights into potential collaborators who may align with specific research priorities or strategic objectives. This information can inform partnership development strategies and help identify key stakeholders for collaboration initiatives.

4. **Efficient Resource Allocation**: By focusing on potential collaborators who share common neighbors with the given author, the function offers a targeted approach to resource allocation for collaboration outreach and engagement. Authors or organizations can prioritize efforts to establish connections with individuals who are more likely to have synergistic interests and collaborative potential.

## 2. Full Data Top 3 Paper Author Network

### Network Graph

In [15]:
# Read the CSV file into a pandas DataFrame
# NOTE(review): this is an absolute path on one user's machine; the notebook will not run elsewhere until it is made relative/configurable.
df_full = pd.read_csv('/Users/zy/Documents/GitHub/Social-Media-Analysis-Project/Group Project/Sample Top 20 & Full Top3/Citation Network/top_3_paper.csv', index_col=0)  # index_col=0: the first CSV column becomes the row index (NOTE(review): its contents are not visible here - confirm what it holds; later code reads the 'authors' and 'year' columns)

# Build the co-authorship graph for the full data set and draw it with a
# spring layout whose seed is fixed (seed=42)
network_author_top = construct_coauthorship_network(df_full)
visualize_network_with_plotly(network_author_top, node_size=5, title="Top 3 Paper Co-authorship Network", pos=nx.spring_layout(network_author_top, k=0.2, seed=42))

#### 2.1 Influencer Analysis

In [16]:
# Rank the full network's authors under each centrality measure and keep the
# five highest-scoring entries of every ranking. These names are the same ones
# used for the sample network, so they are overwritten here.
(
    degree_centrality,
    closeness_centrality,
    betweenness_centrality,
    eigenvector_centrality,
) = (
    rank_authors(network_author_top)[:5]
    for rank_authors in (
        calculate_degree_centrality,
        calculate_closeness_centrality,
        calculate_betweenness_centrality,
        calculate_igenvector_entrality,
    )
)

# Create DataFrames (one two-column table per measure)
df_degree_centrality = pd.DataFrame(
    data=degree_centrality, columns=['Author', 'Degree Centrality'])
df_closeness_centrality = pd.DataFrame(
    data=closeness_centrality, columns=['Author', 'Closeness Centrality'])
df_betweenness_centrality = pd.DataFrame(
    data=betweenness_centrality, columns=['Author', 'Betweenness Centrality'])
df_eigenvector_centrality = pd.DataFrame(
    data=eigenvector_centrality, columns=['Author', 'Eigenvector Centrality'])

In [17]:
# Render the four top-5 centrality tables one after another with rich display
for i in (df_degree_centrality, df_closeness_centrality, df_betweenness_centrality, df_eigenvector_centrality):
    display(i)

Unnamed: 0,Author,Degree Centrality
0,Miodrag Potkonjak,0.005042
1,Alberto L. Sangiovanni-Vincentelli,0.00381
2,Chung-Kuan Cheng,0.003025
3,Gerhard J. Woeginger,0.002913
4,Rajeev Rastogi,0.002465


Unnamed: 0,Author,Closeness Centrality
0,Christos H. Papadimitriou,0.028636
1,Esther M. Arkin,0.028314
2,Sándor P. Fekete,0.027934
3,Mihalis Yannakakis,0.027591
4,Sudipto Guha,0.027588


Unnamed: 0,Author,Betweenness Centrality
0,Christos H. Papadimitriou,0.016319
1,Sándor P. Fekete,0.014304
2,Esther M. Arkin,0.012581
3,Mihalis Yannakakis,0.012539
4,Alberto L. Sangiovanni-Vincentelli,0.012178


Unnamed: 0,Author,Eigenvector Centrality
0,John Derrick,0.265659
1,Rance Cleaveland,0.265571
2,Gerald Lüttgen,0.257847
3,Hussein Zedan,0.256773
4,Marian Gheorghe,0.256773


1. **Miodrag Potkonjak** has the highest degree centrality, indicating that they are connected to a relatively large number of other authors in the network. This suggests that Miodrag Potkonjak may be actively collaborating with many other researchers.

2. **Christos H. Papadimitriou** has the highest closeness centrality, implying that they are relatively close to other authors in terms of network distance. This could indicate that Christos H. Papadimitriou has efficient access to information and resources within the network.

3. **Christos H. Papadimitriou** also has the highest betweenness centrality, indicating that they act as a bridge between different groups of authors in the network. This suggests that Christos H. Papadimitriou may play a critical role in facilitating communication and collaboration between different research communities.

4. **John Derrick** has the highest eigenvector centrality, suggesting that they are connected to other influential authors in the network. This implies that John Derrick may have a significant influence within the research community represented by the network.

#### 2.2 Community Detection

In [18]:
# Compute community statistics
# Use the notebook's own `detect_communities` helper for the Louvain partition.
# `network_author_top` is built as an undirected nx.Graph, so it is passed
# directly rather than through a `to_undirected()` copy.
partition = detect_communities(network_author_top)
community_stats = compute_community_statistics(network_author_top, partition)

# Print insightful information for each community
for community_id, stats in community_stats.items():
    print(f"Community {community_id} Statistics:")
    for metric, value in stats.items():
        print(f"- {metric}: {value}")
    print()

Community 0 Statistics:
- Average Degree Centrality: 1.0
- Collaboration Count: 10
- Average Closeness Centrality: 1.0
- Average Betweenness Centrality: 0.0
- Community Size: 5

Community 1 Statistics:
- Average Degree Centrality: 0.6666666666666666
- Collaboration Count: 4
- Average Closeness Centrality: 0.775
- Average Betweenness Centrality: 0.16666666666666666
- Community Size: 4

Community 2 Statistics:
- Average Degree Centrality: 1.0
- Collaboration Count: 10
- Average Closeness Centrality: 1.0
- Average Betweenness Centrality: 0.0
- Community Size: 5

Community 3 Statistics:
- Average Degree Centrality: 0.027797576621525304
- Collaboration Count: 468
- Average Closeness Centrality: 0.2196183574060138
- Average Betweenness Centrality: 0.020753865996203845
- Community Size: 184

Community 4 Statistics:
- Average Degree Centrality: 1.0
- Collaboration Count: 0
- Average Closeness Centrality: 0.0
- Average Betweenness Centrality: 0.0
- Community Size: 1

Community 5 Statistics:
- A

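Community 0 appears to be a large, well-connected community within the network, with the following statistics. (Note: these figures do not match the output shown above, where the label 0 is assigned to a 5-author clique; they appear to come from a different run, and community labels can change between runs.)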

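1. **Average Degree Centrality**: The average degree centrality of 0.077 means that, on average, an author is directly connected to about 7.7% of the other authors in the community. In a community of 64 authors this is roughly 0.077 × 63 ≈ 4.9 co-authors per author, which indicates a relatively high level of collaboration within the community.

2. **Collaboration Count**: The community contains a total of 156 collaboration links, i.e. 156 distinct pairs of authors who have co-authored together (2 × 156 / 64 ≈ 4.9 links per author, consistent with the degree centrality above). This indicates a high level of collaborative activity within the community.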

3. **Average Closeness Centrality**: The average closeness centrality of 0.291 suggests that, on average, authors within this community are relatively close to each other in terms of network distance. This implies efficient communication and information flow within the community.

4. **Average Betweenness Centrality**: The average betweenness centrality of 0.041 suggests that, on average, authors within this community act as intermediaries or bridges between other authors in the network. This indicates that the community plays a significant role in facilitating communication and collaboration between different groups of authors.

5. **Community Size**: The community consists of 64 authors. This sizeable community suggests a substantial presence within the overall network.

Overall, Community 0 appears to be a cohesive and influential group within the network, characterized by high levels of collaboration, efficient communication, and significant intermediary roles.

#### 2.3 Collaboration Pattern

In [19]:
# Use the function
collaboration_over_time = analyze_collaboration_over_time(df_full)

# Convert to DataFrame: one row per author pair. The frame is built with a
# single constructor call rather than by concatenating onto an empty
# placeholder frame; passing `columns` keeps the three headers even when
# there are no rows.
rows = [
    {'Author 1': author1, 'Author 2': author2, 'Collaboration Years': years}
    for (author1, author2), years in collaboration_over_time.items()
]
collaboration_df = pd.DataFrame(rows, columns=['Author 1', 'Author 2', 'Collaboration Years'])

# Print the collaboration patterns over time
print(collaboration_df)

                   Author 1          Author 2 Collaboration Years
0          David S. Johnson       M. R. Garey        [1979, 1983]
1      Charles E. Leiserson  Thomas H. Cormen              [1989]
2          Ronald L. Rivest  Thomas H. Cormen              [1989]
3      Charles E. Leiserson  Ronald L. Rivest              [1989]
4              Richard Hull   Serge Abiteboul        [1995, 1988]
...                     ...               ...                 ...
12729       Daniel Hausmann     Dennis Walter              [2009]
12730        Christoph Lüth     Holger Täubig              [2009]
12731        Christoph Lüth     Dennis Walter              [2009]
12732         Dennis Walter     Holger Täubig              [2009]
12733    Isabella Mastroeni     Musard Balliu              [2009]

[12734 rows x 3 columns]


#### 2.4 Potential Collaboration

In [20]:
# Use the function: list authors who have no edge to this author in the full
# network but share at least one neighbour with them
author = 'David S. Johnson'
potential_collaborators = identify_potential_collaborators(network_author_top, author)

# Convert the (author, common-neighbour count) pairs to a two-column DataFrame
potential_collaborators_df = pd.DataFrame(potential_collaborators, columns=['Potential Collaborator', 'Common Neighbors'])

# Print a heading followed by the table
print("Potential Collaborators for", author)
print(potential_collaborators_df)

Potential Collaborators for David S. Johnson
   Potential Collaborator  Common Neighbors
0         Edmund K. Burke                 1
1            Carol Meyers                 1
2             Anne Condon                 1
3         Joan Feigenbaum                 1
4           Bonnie Berger                 1
5        Mohamed Mostagir                 1
6           Diego Klabjan                 1
7       David Simchi-Levi                 1
8              Moshe Dror                 1
9              Nir Halman                 1
10           Carsten Lund                 1
