In [1]:
# Basic data analysis and data IO
import numpy as np
import pandas as pd
import pickle

# Network preprocssing 
import scipy.sparse as sp
import torch

# GNN 
from torch_geometric.data import InMemoryDataset, Data
from torch_geometric.nn import GCNConv
import torch_geometric.transforms as T 
from torch_geometric.utils import negative_sampling

# Evaluation 
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score

# Clustering
from sklearn.cluster import KMeans
from collections import Counter

# Visualization
import plotly.express as px
import chart_studio
from chart_studio.plotly import plot

# ETC
from tqdm import tqdm
import warnings 

import os

# Silence library warnings for the whole notebook.
warnings.filterwarnings(action='ignore')

# Chart Studio credentials must never be committed with the notebook. They are
# read from the environment; when the variables are absent the credentials file
# is left untouched so a previously stored configuration keeps being used.
# NOTE(review): the API key that used to be hard-coded here should be treated
# as leaked and regenerated in the Chart Studio account settings.
_chart_studio_username = os.environ.get('CHART_STUDIO_USERNAME')
_chart_studio_api_key = os.environ.get('CHART_STUDIO_API_KEY')
if _chart_studio_username and _chart_studio_api_key:
    chart_studio.tools.set_credentials_file(username=_chart_studio_username,
                                            api_key=_chart_studio_api_key)

# 1. Construct Data loader 

In [2]:
"""
- Select feature information to be input to the GNN algorithm
- Mainly divided into two types: None and basic
     - None: contains no information, identity matrix as input, total length 39
     - basic : readme information 
"""

# Node names are the column labels of the stored adjacency matrix.
node_data = pd.read_csv('data/network/repo_topic/textgcn_network.csv', index_col=0).columns

# Hard-coded node counts; n_topic is the position at which the labels are split.
n_repo, n_topic = 3367, 6456

# The first n_topic labels are treated as topic nodes, the rest as repo nodes.
topic_node = node_data[:n_topic]
repo_node = node_data[n_topic:]

def feature_selector(network, feature=None) :
    """Build the node feature matrix ``x`` that is fed to the GNN.

    Parameters
    ----------
    network : array-like
        Adjacency matrix. Only ``network.shape[0]`` is read, and only when
        ``feature`` is None.
    feature : {None, 'basic'}
        - None    : no node information; an identity matrix with one row per
                    node is returned.
        - 'basic' : README document embeddings loaded from
                    'data/readme/16_length_readme.pickle' for the repo nodes,
                    preceded by ``n_topic`` all-zero rows for the topic nodes.

    Returns
    -------
    torch.Tensor
        float32 feature matrix.

    Raises
    ------
    ValueError
        If ``feature`` is neither None nor 'basic'.
    """
    if feature is None :
        return torch.eye(network.shape[0], dtype=torch.float)

    if feature == 'basic' :
        # call document embedding vector
        # NOTE: pickle.load can execute arbitrary code; only load files that
        # were produced by this project.
        with open('data/readme/16_length_readme.pickle', 'rb') as f :
            doc2vec = pickle.load(f)
        repo_feature = torch.tensor(doc2vec).to(torch.float)

        # Topic nodes carry no README, so they get zero vectors of the same
        # width; topics come first to match the node order of the network.
        topic_feature = torch.zeros((n_topic, repo_feature.size(1))).to(torch.float)

        # Social metrics (contributors / stargazers / forks / watchers counts
        # from 'data/data/filtered_data.csv') are not used in this project, so
        # that file is no longer read here.
        return torch.cat((topic_feature, repo_feature), 0)

    raise ValueError(f"unknown feature type: {feature!r} (expected None or 'basic')")

In [3]:
class MyData(InMemoryDataset) :
    """Single-graph dataset built from the stored repo/topic adjacency matrix.

    The adjacency matrix is read from
    'data/network/repo_topic/textgcn_network.csv', converted to COO form and
    wrapped in a ``Data`` object together with the 'basic' node features.

    Parameters
    ----------
    transform : callable or None
        Applied once to the ``Data`` object at construction time; a falsy value
        skips the transformation.
    """
    def __init__(self, transform) :
        super(MyData, self).__init__()

        network = pd.read_csv('data/network/repo_topic/textgcn_network.csv', index_col=0)
        self.node_name = network.columns
        network = network.values

        # Non-zero entries of the adjacency matrix become the edges.
        adj = sp.csr_matrix(network).tocoo()

        # Convert the integer indices straight to long. Going through float32
        # first is unnecessary and loses precision for indices above 2**24.
        row = torch.from_numpy(adj.row).to(torch.long)
        col = torch.from_numpy(adj.col).to(torch.long)
        edge_index = torch.stack([row, col], dim=0)

        x = feature_selector(network, feature='basic')

        # NOTE(review): casting to long truncates fractional edge weights;
        # confirm the stored weights are integers or switch to a float dtype.
        edge_weight = torch.tensor(adj.data).to(torch.long)

        data = Data(x=x, edge_index=edge_index, edge_attr=edge_weight)
        if transform :
            data = transform(data)

        self.data, self.slices = self.collate([data])

- slices에는 데이터의 shape 정보가 포함되어 있으나 이를 쓸 일은 없음
- data 내에 모든 정보가 포함되어 있으니, 그냥 얘만 쓰면 됨 

In [4]:
# Use the GPU when one is available, otherwise fall back to the CPU.
# NOTE(review): a later cell reassigns `device = 'cpu'` before creating the
# model, while the data below is moved to the device chosen here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Pipeline for training: normalise node features, move to `device`, then split
# the edges into train / validation / test sets.
transform_train = T.Compose([
    T.NormalizeFeatures(),
    T.ToDevice(device),
    # NOTE(review): add_negative_train_samples=True is combined with a further
    # round of negative sampling inside train(); confirm this is intended.
    T.RandomLinkSplit(num_val=0.05, num_test=0.1, is_undirected=False,
                      add_negative_train_samples=True),
])

# Pipeline for the final embedding: same preprocessing, but no edge split.
transform_embedding = T.Compose([
    T.NormalizeFeatures(),
    T.ToDevice(device),
])

# Split data used for training and evaluation
data_load = MyData(transform=transform_train)
train_data, val_data, test_data = data_load[0]

# Full (unsplit) data used to embed every node after training
full_data = MyData(transform=transform_embedding)
data = full_data.data

- 데이터 로딩 결과, 4가지 기본 정보가 포함되어 있음 
    - x : 노드 feature 행렬 (repo 노드는 readme 임베딩, topic 노드는 0 벡터)
    - edge_index : sparse 네트워크에서 변형한 형태로, 값이 존재하는 인덱스의 쌍만을 포함 
    - edge_attribute : edge weight
    - train, val, test mask : 각 edge의 용도를 나타냄 <br></br>

- train, val, test split된 edge index, edge weight 정보 또한 포함 

---

# 2. Model construction

In [5]:
class GCNmodel(torch.nn.Module):
    """Two-layer GCN encoder with an inner-product decoder for link prediction."""

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        # in_channels -> hidden_channels -> out_channels
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def encode(self, x, edge_index):
        """Return node embeddings: conv1, ReLU, then conv2 (no final activation)."""
        hidden = torch.relu(self.conv1(x, edge_index))
        embeddings = self.conv2(hidden, edge_index)
        return embeddings

    def decode(self, z, edge_label_index):
        """Score each candidate edge by the dot product of its two endpoint embeddings."""
        source = z[edge_label_index[0]]
        target = z[edge_label_index[1]]
        return torch.sum(source * target, dim=-1)

    def decode_all(self, z):
        """Return the dense matrix of dot-product scores for every node pair."""
        return torch.matmul(z, z.t())

---

# 3. GCN based link prediction

In [6]:
# Keep the model on the device that already holds the training data. Forcing
# 'cpu' here while the transforms moved the data to CUDA makes the first
# forward pass fail with a device mismatch.
device = train_data.x.device

# Hyper-parameters
hidden_size = 128       # width of the first GCN layer
embedding_size = 64     # width of the final node embedding
epochs = 100            # number of training epochs
iteration = 1000        # NOTE(review): not referenced by the visible cells

model = GCNmodel(data_load.num_features, hidden_size, embedding_size).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)
# The decoder returns raw dot products, so the loss applies the sigmoid itself.
criterion = torch.nn.BCEWithLogitsLoss()

In [7]:
def train():
    """Run one optimisation step on the training split and return the loss tensor."""
    model.train()
    optimizer.zero_grad()

    embeddings = model.encode(train_data.x, train_data.edge_index)

    # Fresh negative edges are drawn on every call, as many as there are
    # labelled training edges.
    # NOTE(review): train_data.edge_label is used as-is below, so any negatives
    # it already contains are kept in addition to the ones sampled here.
    sampled_negatives = negative_sampling(
        edge_index=train_data.edge_index,
        num_nodes=train_data.num_nodes,
        num_neg_samples=train_data.edge_label_index.size(1),
        method='sparse',
    )
    negative_targets = train_data.edge_label.new_zeros(sampled_negatives.size(1))

    # Labelled edges first, sampled negatives (target 0) appended after them.
    candidate_edges = torch.cat((train_data.edge_label_index, sampled_negatives), dim=-1)
    targets = torch.cat((train_data.edge_label, negative_targets), dim=0)

    logits = model.decode(embeddings, candidate_edges).view(-1)
    loss = criterion(logits, targets)
    loss.backward()
    optimizer.step()
    return loss

@torch.no_grad()
def test(data, threshold=0.95):
    """Evaluate the global ``model`` on one data split.

    Parameters
    ----------
    data : split object providing ``x``, ``edge_index``, ``edge_label_index``
        and ``edge_label``.
    threshold : float, default 0.95
        Predicted probabilities greater than or equal to this value count as
        positive links for F1, precision and recall.

    Returns
    -------
    tuple
        ``(roc_auc, f1, precision, recall)``. ROC AUC is computed on the raw
        probabilities, the other three on the thresholded predictions.
    """
    model.eval()
    z = model.encode(data.x, data.edge_index)

    # Convert once and reuse the array for every metric.
    prob = model.decode(z, data.edge_label_index).view(-1).sigmoid().cpu().numpy()
    pred = (prob >= threshold).astype(int)
    label = data.edge_label.cpu().numpy()

    return (roc_auc_score(label, prob), f1_score(label, pred),
            precision_score(label, pred), recall_score(label, pred))

In [16]:
best_val_auc = final_test_auc = 0
final_test_f1 = final_test_precision = final_test_recall = 0

# visualize training process
# add all validation and test metrics into list to draw graph
val_auc_list, val_f1_list, val_precision_list, val_recall_list = [],[],[],[]
test_auc_list, test_f1_list, test_precision_list, test_recall_list = [],[],[],[]

# training parts
for epoch in range(1, epochs+1):
    loss = train()
    val_auc, val_f1, val_precision, val_recall = test(val_data)
    test_auc, test_f1, test_precision, test_recall = test(test_data)

    # Record the per-epoch metrics so the training curve can be drawn later.
    val_auc_list.append(val_auc)
    val_f1_list.append(val_f1)
    val_precision_list.append(val_precision)
    val_recall_list.append(val_recall)
    test_auc_list.append(test_auc)
    test_f1_list.append(test_f1)
    test_precision_list.append(test_precision)
    test_recall_list.append(test_recall)

    # Model selection on validation AUC: the reported test metrics are those of
    # the epoch with the best validation AUC. The running best must be stored
    # in best_val_auc itself, otherwise every epoch overwrites the result.
    if val_auc > best_val_auc:
        best_val_auc = val_auc
        final_test_auc = test_auc
        final_test_f1 = test_f1
        final_test_precision = test_precision
        final_test_recall = test_recall

    if epoch % 10 == 0 :
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val: {val_auc:.4f}, '
            f'Test: {test_auc:.4f}')

print('\n')
print('Final Result')
print(f'Final Test AUC: {final_test_auc:.4f}')
print(f'Final Test F1 score: {final_test_f1:.4f}')
print(f'Final Test Precision: {final_test_precision:.4f}')
print(f'Final Test Recall: {final_test_recall:.4f}')

# Embed every node of the full graph and score all node pairs. This is pure
# inference, so autograd is switched off to avoid keeping a graph for the
# dense N x N score matrix.
# NOTE(review): this uses the weights of the last epoch, not those of the
# best-validation epoch.
model.eval()
with torch.no_grad():
    z = model.encode(data.x, data.edge_index)
    final_hetero = model.decode_all(z)

Epoch: 010, Loss: 0.6623, Val: 0.8226, Test: 0.8200
Epoch: 020, Loss: 0.6611, Val: 0.8489, Test: 0.8446
Epoch: 030, Loss: 0.6592, Val: 0.8623, Test: 0.8595
Epoch: 040, Loss: 0.6583, Val: 0.8695, Test: 0.8669
Epoch: 050, Loss: 0.6561, Val: 0.8747, Test: 0.8719
Epoch: 060, Loss: 0.6561, Val: 0.8601, Test: 0.8582
Epoch: 070, Loss: 0.6528, Val: 0.8742, Test: 0.8736
Epoch: 080, Loss: 0.6510, Val: 0.8724, Test: 0.8734
Epoch: 090, Loss: 0.6489, Val: 0.8638, Test: 0.8649
Epoch: 100, Loss: 0.6492, Val: 0.8859, Test: 0.8873


Final Result
Final Test AUC: 0.8873
Final Test F1 score: 0.2977
Final Test Precision: 0.9942
Final Test Recall: 0.1751


In [17]:
# Take the block of scores whose rows are the first n_topic nodes and whose
# columns are the remaining nodes, and squash the scores into (0, 1).
pred_repo_topic = torch.sigmoid(final_hetero[:n_topic, n_topic:])

# Dichotomize: probabilities above 0.9 become 1, everything else 0 (int64).
pred_repo_topic = (pred_repo_topic > 0.9).long()

# 4. Predicted Quasi-topic network

In [18]:
# Create new quasi network: two topics are linked by the number of columns of
# the binary prediction matrix in which both of them are 1.
# Work on a detached CPU copy so the conversion to NumPy also succeeds when the
# predictions live on another device.
_topic_links = pred_repo_topic.detach().cpu()
_cooccurrence = (_topic_links @ _topic_links.T).numpy()

# remove diagonal term (self co-occurrence) in one vectorised step
np.fill_diagonal(_cooccurrence, 0)

final_word = pd.DataFrame(_cooccurrence, columns=topic_node, index=topic_node)

In [19]:
# Persist the predicted topic-topic co-occurrence network (row/column labels included).
final_word.to_csv('data/network/predicted_network/4_pred_word_network.csv')

wandb: Network error (ConnectionError), entering retry loop.
wandb: Network error (ConnectTimeout), entering retry loop.


In [None]:
# edge normalization using association strength:
#     norm[i, j] = cooccurrence[i, j] / (occurrence[i] * occurrence[j])
#
# The diagonal of final_word has already been set to 0 when the network was
# built, so it cannot serve as the occurrence count (that would divide by
# zero everywhere). The count is taken from the binary prediction matrix
# instead: the row sum equals the self co-occurrence of that topic.
_occurrence = pred_repo_topic.detach().sum(dim=1).cpu().numpy().astype(np.float64)
node_diag = dict(zip(final_word.columns, _occurrence))

# copy=True keeps final_word itself untouched by the in-place operations below.
_weights = final_word.to_numpy(dtype=np.float64, copy=True)
_denominator = np.outer(_occurrence, _occurrence)
_valid = _denominator > 0

# Divide only where the denominator is non-zero; pairs involving a topic that
# never occurs get weight 0 instead of NaN/inf.
np.divide(_weights, _denominator, out=_weights, where=_valid)
_weights[~_valid] = 0.0

# No self loops in the normalised network.
np.fill_diagonal(_weights, 0.0)

norm_final_word = pd.DataFrame(_weights, index=final_word.index, columns=final_word.columns)

In [None]:
# Persist the association-strength normalised topic network.
norm_final_word.to_csv('data/network/predicted_network/1_norm_final_network.csv')

# 5. Predicted network analysis 

## 5-1. Network filtering

In [None]:
# networkx is only needed from this point on.
import networkx as nx
# Build a graph whose nodes are the row/column labels of the normalised
# adjacency matrix. No filtering is applied in this cell yet.
G = nx.from_pandas_adjacency(norm_final_word)

##  5-2. Network clustering

In [None]:
# Node embeddings as a NumPy array (moved to the CPU first so the conversion
# also works when the model ran on another device).
embedding_vector = z.detach().cpu().numpy()

# Clustering: k-means over the embeddings of all nodes
kmeans = KMeans(n_clusters=20, random_state=11).fit(embedding_vector)
clusters = kmeans.labels_

# Assign clusters to the repo nodes. The embedding rows follow the node order
# of the network: the first n_topic rows are topic nodes, the rest are repo
# nodes, so the repo labels are clusters[n_topic:] aligned with repo_node.
# NOTE(review): repo_node holds the network column labels; presumably these
# are the repository full names - confirm before joining with other tables.
repo_cluster = pd.DataFrame({'Id': list(repo_node), 'cluster': clusters[n_topic:]})

In [None]:
# Persist the node-to-cluster assignment (columns 'Id' and 'cluster', no index column).
repo_cluster.to_csv('data/network/cluster.csv', index=False)

## 5-3. Check data distribution and topic distribution 

In [None]:
# Reload the stored cluster assignment.
repo_cluster = pd.read_csv('data/network/cluster.csv')

# Check the cluster distribution: one row per cluster with its member count
cluster_counts = Counter(repo_cluster.cluster)
cluster_freq = pd.DataFrame.from_dict(cluster_counts, orient='index')
cluster_freq = cluster_freq.reset_index()
cluster_freq.columns = ['cluster', 'freq']

In [None]:
# Bar chart of the number of members per cluster.
fig = px.bar(cluster_freq, x='cluster', y='freq')
# Send the figure to Chart Studio under the name 'gcn_kmeans';
# auto_open=True asks for it to be opened afterwards.
plot(fig, filename='gcn_kmeans', auto_open=True)

In [None]:
# 클러스터별 리드미 분류 
for cluster in np.unique(repo_cluster.cluster) :
    list(repo_cluster[repo_cluster.cluster==cluster].Id)


# 6. 테스트용 

In [11]:
# Scratch section: matplotlib is only used by the test cell below.
import matplotlib.pyplot as plt
# IPython magic selecting the Qt backend for matplotlib.
%matplotlib qt

In [15]:
# Scratch test of a live-updating plot: the diagonal grows by one point per step.
a, b  = [], []
plt.axis([0, 100000, 0, 100000])

# Create the line once and only update its data inside the loop. Calling
# plt.plot on every step would add a new, ever longer line artist each time.
line, = plt.plot(a, b)

for i in range(100000) :
    a.append(i)
    b.append(i)
    line.set_data(a, b)
    # Redraw and give the GUI event loop time to process events.
    plt.pause(0.05)


plt.show()

KeyboardInterrupt: 

KeyboardInterrupt: 