In [2]:
import numpy as np
import pandas as pd
import math
import pickle
from tqdm import tqdm

# 1. 데이터 불러오기

In [3]:
raw_data = pd.read_csv('../data/data/filtered_data.csv')

### contributor 4이상으로 필터링
- 우리 네트워크의 엣지 기준은 co-contributor가 4 이상 
- 이를 위해서는 반드시 한 repository에 최소 4명 이상의 contributor가 존재해야함 <br></br>

---

In [4]:
# contributor가 4명 이상인 repository만 고려 

data = raw_data[raw_data.contributors_count>=4]
data = data.reset_index(drop=True)

print('contributor가 4이상인 데이터 수 : {}'.format(len(data)))

contributor가 4이상인 데이터 수 : 3367


---

# 2. Repository-contributor 사전 구축

각 저장소 별, contributor를 담은 사전 자료구조를 생성 <br></br>

- 의문점 : owner도 contributor에 포함시켜야할까?

In [5]:
# repo-contributor dictionary 생성
repo_contributor_dict = {data.loc[row_idx, 'full_name'] : data.loc[row_idx, 'contributors'].split('#') for row_idx in range(len(data.contributors))}

# repo에 owner 추가 
for repo in list(repo_contributor_dict) :
    repo_contributor_dict[repo].append(data[data.full_name==repo].owner.values[0])

---

# 3. Heterogeneous 네트워크 구축

In [6]:
# repository 수, contributor 수 구하기 
# 중복 제거된 유니크한 contributor 리스트 
unique_contributor = []
for contributors in repo_contributor_dict.values() :
    unique_contributor += contributors
unique_contributor = np.unique(unique_contributor)

len_repo = len(data); len_contributor = len(unique_contributor)


In [7]:
# 네트워크 구축 
hetero_network = pd.DataFrame(np.zeros((len_repo, len_contributor)), index=list(data.full_name), columns=unique_contributor)

for k, v in tqdm(repo_contributor_dict.items()) :
    for contributor in v :
        hetero_network.loc[k, contributor] +=1 


100%|██████████| 3367/3367 [00:09<00:00, 364.39it/s]


---

# 4. repository 네트워크 구축 

In [8]:
repository_network = hetero_network @ hetero_network.T

In [9]:
# Save network
repository_network.to_csv('data/network/contributor_coupling.csv')

In [8]:
def find_edge(network, repo) :
    edge = []
    
    for i, element in enumerate(network.loc[repo, :]) :
        if i == list(network).index(repo) :
            diagonal = element
        elif element != 0 :
            edge.append((i, element))
    
    return edge, diagonal 

## repository 네트워크 의미 


- diagonal term : 해당 노드가 가진 contributor의 수 
- edge weight : 두 노드(저장소)의 공동 개발자의 수 

- self loop를 제외하고는 weight 2인 링크 단 1개만 생성됨
    - MachineLearningJournalClub/MLJC-UniTo-ProjectX-2020-public
    - MachineLearningJournalClub/HowToTackleAMLCompetition
    - edge weight 2
- 실제 확인 결과 'Valerio Pagliarino', 'sazio' 두 개발자가 공동으로 개발에 참여
    - 이 중 'sazio'는 'HowToTackleAMLCompetition'의 owner

In [46]:
# 엣지 기반 필터링 
edge_threshold = 10
repository_network = repository_network.where(repository_network>=edge_threshold, 0)

# self loop 제거
for i in range(len(repository_network)) :
    repository_network.iloc[i, i] = 0

# isolated 노드 확인
nonisolated_node = []
for node, edge in repository_network.iterrows() : 
     if sum(edge) != 0 :
         nonisolated_node.append(node)

# isolated 노드 제거 
final_network = repository_network.loc[nonisolated_node, nonisolated_node]

In [47]:
# 필터링된 네트워크 저장
final_network.to_csv('data/contributor_' + str(edge_threshold) + '_filtered_network.csv')

---

# 5. Coword 네트워크 구축 
이를 기반으로 추후 co-word 및 word-coupling network 구축 

In [11]:
with open('../data/data/repo_topic_dict.pickle', 'rb') as f :
    repo_topic_dict = pickle.load(f)

In [12]:
# 중복 의미를 가지는 키워드 통합
'''''''''''
machinelearning, machine-learning -> machine-learning
deep-learning, deeplearning -> deep-learning
'''''''''''

change_word_dict = {'ml' : 'machine-learning', 'machine-learning-algorithms' : 'machine-learning', 'machinelearning' : 'machine-learning', 'machine-learning-models' : 'machine-learning', 
                    'machine' : 'machine-learning', 'learning' : 'machine-learning',
                    'nlp' : 'natural-language-processing',
                    'nlu' : 'natural-language-understanding',
                    'deep-neural-networks' : 'deep-learning', 'deeplearning' : 'deep-learning', 'neural-network' : 'deep-learning', 
                    'edge' : 'edge-computing', 'edge-ai' : 'edge-computing', 
                    'python3' : 'python', 
                    'deep-reinforcement-learning' : 'reinforcement-learning', 'rl' : 'reinforcement-learning',
                    'visualization' : 'data-visualization', 
                    'ai' : 'artificial-intelligence', 
                    'bot' : 'chatbot', 
                    'notebook' : 'jupyter-notebook', 'jupyter' : 'jupyter-notebook',
                    'cnn' : 'convolutional-neural-networks',
                    'automated-machine-learning' : 'automl', 'auto-ml' : 'automl',
                    'explainable-ml' : 'explainable-ai', 'xai' : 'explainable-ai', 
                    'optimization' : 'hyperparameter-optimization', 
                    'datascience' : 'data-science', 
                    'big-data' : 'bigdata', 
                    'sklearn' : 'scikit-learn',
                    'distributed' : 'distributed-computing', 
                    'sciml' : 'scientific-machine-learning',
                    'differentialequations' : 'differential-equations', 
                    'segmentation' : 'image-segmentation', 
                    'gan' : 'generative-adversarial-network', 
                    'c-plus-plus' : 'cpp', 
                    'rnn' : 'recurrent-neural-networks',
                    'tensorflow2' : 'tensorflow',
                    'recommender-system' : 'recommendation-system'}

stopwords = ['hacktoberfest2021', 'hacktoberfest', 'awesome', 'awesome-list']
duplicated_words = list(change_word_dict.keys())


In [13]:
# change words in 'repo_topic_dict'

for k, v in repo_topic_dict.items() : 
    for i, word in enumerate(v) : 
        if word in duplicated_words : 
            repo_topic_dict[k][i] = change_word_dict[word]

        if word in stopwords : 
            del repo_topic_dict[k][i]

In [14]:
# 토픽 유니크 리스트 만들기 
topics = []

for topic_corpus in repo_topic_dict.values() :
    topics += topic_corpus 

unique_topics = list(set(topics))


print('유니크한 토픽의 갯수 : {}'.format(len(unique_topics)))

유니크한 토픽의 갯수 : 6456


In [15]:
# heterogeneous 네트워크 구축 
repo_topic_network = pd.DataFrame(np.zeros((len(repo_topic_dict.keys()), len(unique_topics))), index=repo_topic_dict.keys(), columns=unique_topics)

for k, v in tqdm(repo_topic_dict.items()) : 
    for topic_name in v :
        repo_topic_network.loc[k, topic_name] += 1  

100%|██████████| 3367/3367 [00:01<00:00, 1841.86it/s]


## 5-1. co-word 네트워크 구축 
노드가 word인 네트워크 

In [25]:
coword = repo_topic_network.T @ repo_topic_network

# diagonal term 제거 
for topic in unique_topics : 
    coword.loc[topic, topic] = 0

In [65]:
# 네트워크 저장하기 
coword.to_csv('data/network/coword.csv')

- 경험적으로 edge weight를 8로 했을때 네트워크가 가장 깔끔하게 그려짐 

In [26]:
# edge weight n이상인 네트워크만 남기기 
edge_weight = 4
filtered_coword = coword.where(coword>edge_weight, 0)

# 네트워크 저장하기 
filtered_coword.to_csv('data/network/' + str(edge_weight) +'_filtered_coword.csv')

### Network normalization using association strength

![](../ipynb_img/association_strength.png)

Above image is equation of association strength. Here, $c_{ij}$ means value of each element. Also, $s_i$ denotes the i-th diagonal term of the co-occurrence matrix.

In [9]:
network =repo_topic_network.T @ repo_topic_network

In [13]:
# Create a dictionary that stores the node name as a key and the value of the corresponding diagonal term as a value
repo_diag_dict = {node : network.loc[node, node] for node in network.columns}

# Association strength practice
normalized_network = network.copy()

for node1 in tqdm(network.columns) :
    for node2 in network.columns :
        normalized_network.loc[node1, node2] = network.loc[node1, node2] / (repo_diag_dict[node1] * repo_diag_dict[node2])

100%|██████████| 6456/6456 [23:38<00:00,  4.55it/s]


In [15]:
normalized_network.to_csv('data/network/normalized_coword.csv')

As a result of drawing the network by directly inputting it into gephi, it was confirmed that the most significant network was drawn when the threshold was 0.4.

In [9]:
normalized_network = pd.read_csv('data/network/normalized_coword.csv', index_col=0)

# edge weight n이상인 네트워크만 남기기 
edge_weight = 0.5
columns = normalized_network.columns
normalized_network = normalized_network.values

filtered_normalized_coword = np.where(normalized_network>edge_weight, 1, 0)
filtered_normalized_coword = pd.DataFrame(filtered_normalized_coword, index=columns, columns=columns)

# self-loop 제거
for i in range(len(filtered_normalized_coword)) : 
    filtered_normalized_coword.iloc[i, i] = 0

# 네트워크 저장하기 
filtered_normalized_coword.to_csv('data/network/' + str(edge_weight) +'_filtered_normalized_coword.csv')

## 5-2. word-coupling 네트워크 구축 

In [28]:
word_coupling = repo_topic_network @ repo_topic_network.T

# diagonal term 제거 
for repo in word_coupling.columns : 
    word_coupling.loc[repo, repo] = 0

In [30]:
# 네트워크 저장하기 
word_coupling.to_csv('data/network/word_coupling.csv')

- 엣지의 수가 너무 많음;;; 
- 엣지 weight 8정도로 컷팅하고 이후 weight 조절하며 그림 확인하기 

In [35]:
# edge weight n이상인 네트워크만 남기기 
edge_weight = 4
filtered_word_coupling= word_coupling.where(word_coupling>edge_weight, 0)

# 네트워크 저장하기 
filtered_word_coupling.to_csv('data/network/' + str(edge_weight) +'_filtered_word_coupling.csv')

---

# 6. Construct topic-repository network 

Topic quasi-networks have too many links to a particular topic, and are distinctly different from the original network.   

This does not conform to the hypothesis that the predicted network will be a network at a future point in the original network.   

Therefore, instead of applying the link prediction technique to the current quasi-network, we want to build the quasi-network again after applying the link prediction to the original network.

In this code, we try to follow the network format proposed by TextGCN.   

Nodes are composed of two types: topics and storage. 

An edge is created between the topic and the repository based on the existence of the topic.   

The reason why tf-idf weights are not used here is that the number of appearances of topics per storage is fixed at 1.   

That is, the only factor that affects the weight is idf, which gives a high weight to a topic that is not used often (such as a non-mainstream topic or a typo).    

Therefore, in this network, binary edges are allocated based on the appearance of topics without using tf-idf weights.

Also, there are edges between each topic. An edge is created between them based on mutual information.   

The established network is as follows.

![textgcn](../ipynb_img/textgcn_network.png)



The approximate network structure is as follows.

![network_form](../ipynb_img/textgcn.jpeg)

In the above figure, zones 2 and 3 are repo-topic networks, so there is no need to implement them separately.

ㅑ will call the top 1 and 2 zones as the upper network, and the bottom 3 and 4 zones as the lower network.

In [16]:
# get list data and return combination of each data 
# input : list
# output : list of tuples 
def combination(data) :
    output = []
    for w1 in data :
        for w2 in data :
            if (w1 == w2) or ((w2, w1) in output) : 
                continue 
            output.append((w1, w2))

    return output

In [17]:
# There are already constructred repo-topic network named 'repo_topic_network'
# So, we should cacluate PPMI between topics 
# This is zone 1 of the above network.

# Calculate probability of occurence per each topics 
n_repo, n_topic = repo_topic_network.shape

total_words = []
for v in repo_topic_dict.values() :
    total_words += v 

occur_prob = {topic : total_words.count(topic)/n_repo for topic in list(repo_topic_network.columns)}

# Calculate probabilty of co-occurence per topic pairs
co_occur_prob = {}
for corpus in repo_topic_dict.values() : 
    comb_of_words = combination(corpus)
    for c in comb_of_words : 
        if c not in co_occur_prob.keys() :
            co_occur_prob[c] = 1
        else : 
            co_occur_prob[c] += 1 

for k, v in co_occur_prob.items() : 
    co_occur_prob[k] = v/n_repo

# Caluclate PPMI 
for k, v in co_occur_prob.items() :
    co_occur_prob[k] = math.log10(v/(occur_prob[k[0]] * occur_prob[k[1]])) 
    if co_occur_prob[k] < 0 :
        co_occur_prob[k] = 0

In [18]:
# Constuct topic-topic network 
ppmi_network = pd.DataFrame(np.zeros((n_topic, n_topic)), columns=repo_topic_network.columns, index=repo_topic_network.columns)

for k, v in tqdm(co_occur_prob.items()) :
    ppmi_network.loc[k[0], k[1]] = v
    ppmi_network.loc[k[1], k[0]] = v

100%|██████████| 72860/72860 [00:04<00:00, 17091.73it/s]


In [45]:
# Create zone 4 in above network drawing
repo_repo_network = pd.DataFrame(np.zeros((len(repo_topic_network.index), len(repo_topic_network.index))),
                                    columns=repo_topic_network.index, index=repo_topic_network.index)


In [50]:
# Construct upper network 
upper = pd.concat([ppmi_network, repo_topic_network.T], axis=1)

# Construct lower network
lower = pd.concat([repo_topic_network, repo_repo_network], axis=1)

# Concat final textgcn network
output_network = pd.concat([upper, lower]) 

In [54]:
# Save network 
repo_topic_network.to_csv('../data/network/repo_topic/repo_topic_network.csv')
output_network.to_csv('../data/network/repo_topic/textgcn_network.csv')

In [55]:
repo_topic_network

Unnamed: 0,manifold-learning,odroid,driving-cars,alphago-zero,ordinary-differential-equations,cognitive-search-engine,amazon,scala-library,network-embedding,fast,...,customer-journey-map,coco-dataset,watson-machine-learning,beam-search,medical-physics,gemm,early-warning-systems,textgraphs,swiftui,quality-assessment
PeterL1n/RobustVideoMatting,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ml-tooling/opyrator,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4paradigm/OpenMLDB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hora-search/hora,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
salesforce/Merlion,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BuggleInc/PLM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rieck/sally,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
jeremybarnes/jml,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
shogun-toolbox/shogun-data,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
