In [8]:
import numpy as np
import pandas as pd
import networkx as nx
import community as community_louvain
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import chart_studio
from chart_studio.plotly import plot
from tqdm import tqdm

chart_studio.tools.set_credentials_file(username='Injokim', api_key='ZAsiIjcDIzXdYHa8CXbw')

# 1. Coword network analysis

Analyze the coword network.

The network imported in this study has already undergone edge normalization.

## 1-1. Load network

In [2]:
# read network using pandas 
network = pd.read_csv('data/network/0.3_filtered_normalized_coword.csv', index_col=0)

# switch network dataframe to networkx
G = nx.from_pandas_adjacency(network)

In [3]:
# analize network features 
print(nx.info(G))
print('network density : {:.5f}'.format(nx.density(G)))

Graph with 6456 nodes and 13921 edges
network density : 0.00067


## 1-2. Filtering node based on degree

As a result of visualization through gephi, it was most appropriately visualized when the node degree was 3.     

So, in this code as well, the analysis is performed after network filtering by setting node degree 3 or higher as the standard.

In [4]:
# check node degree and extract nodes which its degree is lower than 3
filtered_nodes = []

for (node, degree) in G.degree :
    if degree <= 3 :
        filtered_nodes.append((node, degree))


# remove nodes from G 
for node, _ in filtered_nodes : 
    G.remove_node(node)
    

# network features after filtering 
print(nx.info(G))
print('network density : {:.5f}'.format(nx.density(G)))

Graph with 3206 nodes and 11291 edges
network density : 0.00220


## 1-3. Community detection using Louvain method

Apply community detection technique to the filtered network.   

At this time, it is expected that each cluster (technically, a community, but hereinafter referred to as a cluster) will be divided into sub-technology groups, applications, libraries, etc. of machine learning.

In [7]:
# conduct community detection using louvain method
clusters_dict = community_louvain.best_partition(G)

# arrange by clusters 
# key : clsuter id
# values : included node name 
clusters = {}

for node, cluster_id in clusters_dict.items() :  
    if cluster_id not in clusters.keys() :
        clusters[cluster_id] = []
    clusters[cluster_id].append(node)

In [48]:
# A cluster with less than n nodes is judged as meaningless and removed.
# Count the number of clusters with less than n nodes
clusters_freq = {k : len(v) for k, v in clusters.items()}
cluster_count_threshold = 7

print('number of clusters : {}'.format(len(clusters_freq)))
print('number of clusters which has more than {} nodes : {}'.format(cluster_count_threshold, 
                                                            len([k for k, v in clusters_freq.items() if v >= cluster_count_threshold])))


# remove non-meaning clusters 
filtered_clusters = {k : v for k, v in clusters.items() if clusters_freq[k] >= cluster_count_threshold}


# remove non-meaning nodes in network 
# remain original network G 
# create new network filtered_G
filtered_G = G.copy()
non_meaning_nodes = []

for _, v in clusters.items() :
    if len(v) < cluster_count_threshold :
        non_meaning_nodes += v 

for node in G.nodes :
    if node in non_meaning_nodes :
        filtered_G.remove_node(node)

number of clusters : 263
number of clusters which has more than 7 nodes : 121


In [87]:
# Visualize the number of nodes per cluster
fig = make_subplots(
    rows=1, cols=1
)

fig.add_trace(
    go.Bar(x=list(clusters_freq.keys()), y=list(clusters_freq.values()))
)

plot(fig, filename='cluster_count', auto_open=True)

'https://plotly.com/~injokim/17/'

Clusters 2, 59, 73, 111, and 117 are the top 5 clusters with a large number of clusters, so they are separately collected and visualized.

In [80]:
meaning_clusters = [2, 59, 73, 111, 117]

def create_subnetwork(id) :
    new_G = G.copy()

    nodes = clusters[id]
    remove_nodes = [node for node in new_G.nodes if node not in nodes]

    for node in remove_nodes :
        new_G.remove_node(node)

    return nx.to_pandas_edgelist(new_G)

# make subnetwork 
for id in meaning_clusters :
    vars()[f'subnetwork_{id}'] = create_subnetwork(id)

## 1-4. Store network as pandas dataframe and csv

In [69]:
# change network as dataframe
new_network = nx.to_pandas_edgelist(filtered_G)

# change community as dataframe
community_df = {}
for k, v in filtered_clusters.items() :
    for node in v : 
        community_df[node] = k

community_df = pd.DataFrame.from_dict(community_df, orient='index').reset_index()
community_df.columns = ['node', 'cluster']

In [71]:
# store network info
new_network.to_csv('data/network/0.3_clustered_normalized_coword.csv')
community_df.to_csv('data/network/0.3_cluster.csv')

In [84]:
# store subnetwork 
for id in meaning_clusters :
    vars()[f'subnetwork_{id}'].to_csv('data/network/subnetwork_' + str(id) + '.csv', index=False)

# 2. Contributor-coupling network analysis

## 2-1. Network load

In [2]:
network = pd.read_csv('data/network/contributor_coupling.csv', index_col=0)

## 2-2. Edge normalization using association strength

Unlike the coword network, this network is not normalized, so normalization is performed first using the association strength.

In [6]:
# extract diagonal term for association strength
diagonal = {node : network.loc[node, node] for node in network.columns}

# conduct association strength
normalized_network = network.copy()

for node1 in tqdm(network.columns) :
    for node2 in network.columns : 
        normalized_network.loc[node1, node2]= network.loc[node1, node2]/(diagonal[node1] * diagonal[node2])

100%|██████████| 3367/3367 [06:31<00:00,  8.60it/s]


## 2-3. Network analysis

As a result of putting it as an input to gephi, empirically, the best looking network was generated when visualizing an edge weight of 0.03 or more.   

Therefore, in this code, the same edge weight is set and network analysis is performed.


The analysis procedure is

1. Network dichotomize
2. Remove nodes below a certain degree
3. Community Detection
4. Remove communities below a certain number of nodes
5. Node Centrality Analysis

In [34]:
# network construct
# remove self-loop 
for i in range(len(normalized_network)) :
    normalized_network.iloc[i, i] = 0

# dichotomize 
# initialize all edge weight to 0
edge_threshold = 0.03

columns = normalized_network.columns
dichotomized_network = normalized_network.values
dichotomized_network = np.where(dichotomized_network >= edge_threshold , 1, 0)
dichotomized_network = pd.DataFrame(dichotomized_network, index=columns, columns=columns)


# change data type df to networkx
G = nx.from_pandas_adjacency(dichotomized_network)

In [35]:
# record initial network statement 
nx.info(G)

'Graph with 3367 nodes and 5043 edges'

In [36]:
# Remove isolated nodes
isolated = []

for node, degree in G.degree : 
    if degree == 0 :
        isolated.append(node)

for node in isolated :
    G.remove_node(node)

nx.info(G)


'Graph with 1091 nodes and 5043 edges'

In [38]:
# community detection using louvain method 
cluster = community_louvain.best_partition(G)

# make new dictionary about community and nodes
# keys : cluster id 
# values : list of nodes
cluster_node_dict = {}
for k, v in cluster.items() :
    if v not in list(cluster_node_dict.keys()) :
        cluster_node_dict[v] = []
    cluster_node_dict[v].append(k)

The number of created communities is 245

In [42]:
# Communities with 5 or fewer nodes are considered unimportant and all are removed
# extract nodes in unimportant cluster 
unimportant = []
cluster_threshold = 5

for k, v in cluster_node_dict.items() : 
    if len(v) <= 5 :
        unimportant += v

# remove nodes in network G
for node in unimportant : 
    G.remove_node(node)

nx.info(G)

'Graph with 528 nodes and 4595 edges'

In [47]:
# node centrality analysis 
eigen = nx.eigenvector_centrality(G)

# ordering by eigenvector centrality 
eigen = pd.DataFrame.from_dict(eigen, orient='index').reset_index()
eigen.columns = ['node', 'eigenvector_centrality']
eigen.sort_values(by=['eigenvector_centrality'], ascending=False, inplace=True)

# Visualizing the top 20 nodes in an eigenvector
fig = make_subplots(
    rows=1, cols=1
)

fig.add_trace(
    go.Bar(x=eigen.node[:20], y=eigen.eigenvector_centrality[:20])
)

## 2-4. Extract subnetwork

As a result of network visualization, it turned out that clusters 1 and 9 were not nodes of a specific company.   

Extract their subnetworks and examine the components that make up them.   

Since the size of the subnetwork is small, networks 1 and 9 are combined and expressed as one subnetwork.

In [59]:
meaning_cluster = [1, 9]

# find nodes of cluster 1 and 9 
# combine two cluster's nodes
meaning_nodes = [] + cluster_node_dict[1] + cluster_node_dict[9]

# create new network and remove nodes which not in meaning nodes(nodes of cluster 1 and cluster 9)
G_sub = G.copy()
all_nodes = G.nodes

for node in all_nodes : 
    if node not in meaning_nodes : 
        G_sub.remove_node(node)

## 2-5. Store network and community 

Change networkx back to pandas format and save it as a csv file.

In [49]:
# change network networkx to pandas edgelist
final_network = nx.to_pandas_edgelist(G)

# change subnetwork networkx to pandas edgelist
subnetwork = nx.to_pandas_edgelist(G_sub)

# change cluster dictionary to pandas nodelist
cluster = pd.DataFrame.from_dict(cluster, orient='index').reset_index()
cluster.columns = ['Id', 'cluster']

In [52]:
# save network 
final_network.to_csv('data/network/normalized_contributor_coupling/normalized_cc.csv', index=False)

# save subnetwork
subnetwork.to_csv('data/network/normalized_contributor_coupling/normalized_cc_sub.csv', index=False)

# save cluster info
cluster.to_csv('data/network/normalized_contributor_coupling_node/cluster.csv', index=False)