In [1]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import chart_studio
from chart_studio.plotly import plot


# 1. Coword 네트워크 분석

머신러닝 관련 저장소의 토픽을 이용하여 토픽네트워크를 구축한 후, 커뮤니티 탐지 기법을 적용한 결과 머신러닝 주요 기술로 분류가 되었다. 

대표적으로 'machine learning and application', 'deep learning and application', 'natural language processing'과 같은 기술군으로 분류가 되었다. 이는 유의미하나 너무 포괄적인 정보를 나타내는 경향이 있다. 

본 코드에서는 각 키워드의 중앙성을 분석하여, 특정 기술군에서 특히 중요한 기술들을 분류하고, 규모가 큰 기술군에 대해 계층적(hierarchical) 분류를 진행한다.

In [2]:
# Load the node table of the co-word network from CSV.
data = pd.read_csv('data/network/node_table.csv')

In [3]:
# Display the loaded node table as the cell output.
data

Unnamed: 0,Id,Label,timeset,modularity_class,Eccentricity,closnesscentrality,harmonicclosnesscentrality,betweenesscentrality,Degree,Weighted Degree,clustering,triangles,eigencentrality
0,git,git,,0,0,0.000000,0.000000,0.000000,0,0,0.000000,0,0.000000
1,machine-learning,machine-learning,,0,2,0.950980,0.974227,0.504970,276,10528,0.030303,1150,1.000000
2,data-analysis,data-analysis,,1,3,0.500861,0.515464,0.000091,13,224,0.730769,57,0.158675
3,statistics,statistics,,1,3,0.503460,0.520619,0.000148,16,242,0.691667,83,0.189111
4,bigdata,bigdata,,0,3,0.498288,0.510309,0.000034,10,158,0.800000,36,0.138696
...,...,...,...,...,...,...,...,...,...,...,...,...,...
752,encoder-decoder,encoder-decoder,,0,0,0.000000,0.000000,0.000000,0,0,0.000000,0,0.000000
753,deepface,deepface,,0,0,0.000000,0.000000,0.000000,0,0,0.000000,0,0.000000
754,neural-search,neural-search,,0,0,0.000000,0.000000,0.000000,0,0,0.000000,0,0.000000
755,max78000,max78000,,0,0,0.000000,0.000000,0.000000,0,0,0.000000,0,0.000000


## 1-1. 노드 분석 

전체 데이터를 클러스터별 데이터로 분리한다. 이후 각 sub-network의 노드 중앙성을 파악하여 각 기술군에서 가장 중요한 기술을 파악하고자 한다.

In [4]:
# Split the node table into one DataFrame per modularity class (cluster).
# The class ids are sorted explicitly: the iteration order of a plain set is an
# implementation detail, and a stable ascending order keeps everything derived
# from this list reproducible.
unique_modularity = sorted(set(data.modularity_class))

# Map each modularity class id to the rows that belong to that cluster.
data_per_cluster = {
    mod: data[data.modularity_class == mod] for mod in unique_modularity
}

0번부터 19번까지 20개의 modularity 존재 

In [5]:
# Visualize the number of keywords in each cluster.
# Cluster 0 is left out by id rather than by list position, so the exclusion
# does not depend on how `unique_modularity` happens to be ordered.
plotted_clusters = [mod for mod in unique_modularity if mod != 0]

fig = make_subplots(
    rows=1, cols=1
)

fig.add_trace(
    go.Bar(
        x=plotted_clusters,
        y=[len(data_per_cluster[mod]) for mod in plotted_clusters],
    ),
    row=1, col=1
)

# Label the figure so it can be understood without reading the code.
fig.update_layout(
    title_text='Number of keywords per modularity class (class 0 excluded)',
    xaxis_title='modularity class',
    yaxis_title='number of keywords',
)

fig.show()

modularity 0은 machine-learning 클러스터 + 미분류 단어 전체 포함     
혼자 너무 값이 크므로 여기선 제외하고 시각화

In [18]:
def print_high_centrality_keyword(cluster_num=None, filtering_feature='betweenesscentrality', top_n=None):
    """Return node-table rows ranked by a node feature, highest value first.

    Despite the name, nothing is printed: the sorted DataFrame is returned so
    the notebook renders it as the cell output.

    Parameters
    ----------
    cluster_num : hashable, optional
        Key of the global `data_per_cluster` (a modularity class) to restrict
        the ranking to. When None, every row of the global `data` is ranked.
    filtering_feature : str, default 'betweenesscentrality'
        Column to sort by, e.g. 'betweenesscentrality', 'Degree',
        'Weighted Degree', 'closnesscentrality' or 'eigencentrality'.
    top_n : int, optional
        Maximum number of rows to return. When None, the whole-network ranking
        is capped at 20 rows and a single-cluster ranking is returned in full.

    Returns
    -------
    pandas.DataFrame
        The selected rows sorted by `filtering_feature` in descending order.

    Raises
    ------
    KeyError
        If `cluster_num` is not a key of `data_per_cluster`, or
        `filtering_feature` is not a column of the table.
    """
    # Identity test: `== None` would defer to the argument's own __eq__.
    if cluster_num is None:
        ranked = data.sort_values(by=[filtering_feature], ascending=False)
        limit = 20 if top_n is None else top_n
    else:
        ranked = data_per_cluster[cluster_num].sort_values(by=[filtering_feature], ascending=False)
        limit = top_n

    return ranked if limit is None else ranked.iloc[:limit, :]

In [45]:
# Rank the nodes of cluster 19 by their 'eigencentrality' column.
print_high_centrality_keyword(19, 'eigencentrality')

Unnamed: 0,Id,Label,timeset,modularity_class,Eccentricity,closnesscentrality,harmonicclosnesscentrality,betweenesscentrality,Degree,Weighted Degree,clustering,triangles,eigencentrality
66,speech-recognition,speech-recognition,,19,3,0.497436,0.508591,0.000248,9,70,0.472222,17,0.100897
717,speech-to-text,speech-to-text,,19,3,0.492386,0.498282,0.0,3,26,1.0,3,0.064637
64,text-to-speech,text-to-speech,,19,3,0.489899,0.496564,0.0,3,19,1.0,3,0.040384
598,voice,voice,,19,3,0.489899,0.496564,0.0,3,16,1.0,3,0.037061
599,voice-commands,voice-commands,,19,3,0.489899,0.496564,0.0,3,15,1.0,3,0.037061


---

## 1-2. Sub-network 구축

규모가 큰 네트워크를 대상으로 추가적인 커뮤니티 탐지를 진행하여 더 세부적인 기술군으로 분류하고자 한다.  
대상 클러스터는 
1. machine learning and application (0)
2. deep learning and application (3)
3. natural language processing (10)
4. python and application (2)
   
총 4개의 클러스터이다

In [46]:
# Load the co-word network table; the first CSV column is used as the row index,
# so rows and columns can both be selected by label.
network = pd.read_csv('data/network/4_filtered_coword.csv', index_col=0)

In [48]:
# extract subnetwork
def extract_subnetwork(cluster_num):
    """Return the slice of the global `network` restricted to one cluster.

    The node ids of the requested cluster are taken from the global
    `data_per_cluster` and used as both row and column labels. Only the
    clusters 0, 2, 3 and 10 are accepted; any other value fails the assertion.
    """
    assert cluster_num in [0, 2, 3, 10]

    member_ids = data_per_cluster[cluster_num].Id.tolist()
    return network.loc[member_ids, member_ids]

In [56]:
# Build the four sub-networks in one statement; the cluster ids are paired
# positionally with the names on the left (ml=0, dl=3, nlp=10, python=2).
ml_subnetwork, dl_subnetwork, nlp_subnetwork, python_subnetwork = (
    extract_subnetwork(cluster_id) for cluster_id in (0, 3, 10, 2)
)

In [58]:
# Save every sub-network as a CSV file, in the same order as they were built.
subnetwork_outputs = (
    (ml_subnetwork, 'data/network/ml_subnetwork.csv'),
    (dl_subnetwork, 'data/network/dl_subnetwork.csv'),
    (nlp_subnetwork, 'data/network/nlp_subnetwork.csv'),
    (python_subnetwork, 'data/network/python_subnetwork.csv'),
)

for subnetwork_frame, csv_path in subnetwork_outputs:
    subnetwork_frame.to_csv(csv_path)

# 2. Contributor-coupling network 분석

contributor-coupling 네트워크를 분석한 결과, 특정 하나의 클러스터가 기업의 생태계 특성을 띠지 않을 뿐만 아니라 모든 기업들을 연결하여, 매우 높은 중앙성을 띠는 것을 확인하였다.  
본 코드에서는 해당 클러스터의 특성을 파악하고 그 서브네트워크를 구축하고자 한다.

In [5]:
# Load the contributor-coupling node table and network table.
# NOTE: this rebinds the names `data` and `network` that section 1 used for the
# co-word tables. Functions that read those globals
# (print_high_centrality_keyword, extract_subnetwork) see these frames from
# here on, while `data_per_cluster` is not rebuilt and still holds co-word rows.
data = pd.read_csv('data/network/contributor_coupling_node_table.csv')
network = pd.read_csv('data/network/contributor_4_filtered_network.csv', index_col=0)

## 2-1. 서브 네트워크 추출 

분석하고자 하는 클러스터는 9번 클러스터이므로, modularity가 9번인 네트워크를 추출하여 새로운 서브네트워크를 구축한다

In [9]:
# Node-table rows that belong to modularity class 9.
target_data = data[data.modularity_class == 9]
# Take the node ids from the rows already selected, then use them as both the
# row and the column labels of the sub-network.
new_node = target_data.Id.tolist()
subnetwork = network.loc[new_node, new_node]

In [None]:
# Save the cluster-9 sub-network as a CSV file.
subnetwork.to_csv('data/network/contributor_coupling_subnetwork.csv')

## 2-2. 노드 특성 분석 

서브네트워크를 구성하는 노드들의 특징(디그리, 중앙성 등)을 파악한다. 단, 여기서 나오는 값들은 서브네트워크에서의 지표가 아닌 전체 네트워크에서의 지표이다.  
발표할 때는 혼동할 수 있으므로 이 내용을 먼저 발표할 것 

In [12]:
# Cluster-9 nodes ordered by eigenvector centrality, largest first.
target_data.sort_values('eigencentrality', ascending=False)

Unnamed: 0,Id,Label,timeset,modularity_class,Degree,Weighted Degree,Eccentricity,closnesscentrality,harmonicclosnesscentrality,betweenesscentrality,eigencentrality
128,apache/tvm,apache/tvm,,9,27,261,6,0.336722,0.409811,0.019745,0.019703
42,dmlc/xgboost,dmlc/xgboost,,9,29,240,6,0.345953,0.424906,0.107713,0.017157
279,horovod/horovod,horovod/horovod,,9,23,129,6,0.293142,0.367736,0.00542,0.015977
234,scikit-learn/scikit-learn,scikit-learn/scikit-learn,,9,21,182,6,0.335019,0.399119,0.020893,0.015222
620,OAID/MXNet-HRT,OAID/MXNet-HRT,,9,23,276,6,0.295429,0.368365,0.004401,0.014622
284,dmlc/dmlc-core,dmlc/dmlc-core,,9,19,220,6,0.332915,0.393459,0.016335,0.013581
285,pymc-devs/pymc,pymc-devs/pymc,,9,15,96,7,0.309942,0.378455,0.005816,0.010961
82,ray-project/ray,ray-project/ray,,9,15,93,5,0.355705,0.401069,0.069315,0.00924
31,kubeflow/kubeflow,kubeflow/kubeflow,,9,14,154,5,0.306005,0.359623,0.012982,0.009026
235,nltk/nltk,nltk/nltk,,9,13,89,7,0.306358,0.36965,0.005042,0.008915


In [13]:
# Cluster-9 nodes ordered by degree, largest first.
target_data.sort_values('Degree', ascending=False)

Unnamed: 0,Id,Label,timeset,modularity_class,Degree,Weighted Degree,Eccentricity,closnesscentrality,harmonicclosnesscentrality,betweenesscentrality,eigencentrality
42,dmlc/xgboost,dmlc/xgboost,,9,29,240,6,0.345953,0.424906,0.107713,0.017157
128,apache/tvm,apache/tvm,,9,27,261,6,0.336722,0.409811,0.019745,0.019703
620,OAID/MXNet-HRT,OAID/MXNet-HRT,,9,23,276,6,0.295429,0.368365,0.004401,0.014622
279,horovod/horovod,horovod/horovod,,9,23,129,6,0.293142,0.367736,0.00542,0.015977
234,scikit-learn/scikit-learn,scikit-learn/scikit-learn,,9,21,182,6,0.335019,0.399119,0.020893,0.015222
284,dmlc/dmlc-core,dmlc/dmlc-core,,9,19,220,6,0.332915,0.393459,0.016335,0.013581
30,kubeflow/pipelines,kubeflow/pipelines,,9,16,189,5,0.311398,0.364528,0.261014,0.008184
82,ray-project/ray,ray-project/ray,,9,15,93,5,0.355705,0.401069,0.069315,0.00924
285,pymc-devs/pymc,pymc-devs/pymc,,9,15,96,7,0.309942,0.378455,0.005816,0.010961
31,kubeflow/kubeflow,kubeflow/kubeflow,,9,14,154,5,0.306005,0.359623,0.012982,0.009026
