In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv -f https://data.pyg.org/whl/torch-2.0.0+cu117.html
!pip install torch-geometric

Looking in links: https://data.pyg.org/whl/torch-2.0.0+cu117.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcu117/torch_scatter-2.1.2%2Bpt20cu117-cp311-cp311-linux_x86_64.whl (10.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcu117/torch_sparse-0.6.18%2Bpt20cu117-cp311-cp311-linux_x86_64.whl (4.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-cluster
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcu117/torch_cluster-1.6.3%2Bpt20cu117-cp311-cp311-linux_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-spline-conv
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcu117/torch_spli

#데이터 로드 및 전처리

In [None]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/sampled_5000_matches.csv')

In [None]:
df.head()

Unnamed: 0,date,game_size,match_id,match_mode,party_size,player_assists,player_dbno,player_dist_ride,player_dist_walk,player_dmg,player_kills,player_name,player_survive_time,team_id,team_placement
0,2018-01-03T23:23:03+0000,27,2U4GBNA0YmlmN9OV04IhlvWAEi5_umKbgD3OYbUyA1eaSh...,tpp,4,0,0,0.0,2115.04761,19,0,ssdaybreak,1084.986,1,12
1,2018-01-03T23:23:03+0000,27,2U4GBNA0YmlmN9OV04IhlvWAEi5_umKbgD3OYbUyA1eaSh...,tpp,4,0,0,0.0,2663.02563,0,0,Hung0,1185.968,1,12
2,2018-01-03T23:23:03+0000,27,2U4GBNA0YmlmN9OV04IhlvWAEi5_umKbgD3OYbUyA1eaSh...,tpp,4,0,1,0.0,75.27817,138,1,Red_Acher,158.299,6,26
3,2018-01-03T23:23:03+0000,27,2U4GBNA0YmlmN9OV04IhlvWAEi5_umKbgD3OYbUyA1eaSh...,tpp,4,0,0,0.0,26.331602,0,0,2pacTupac,130.263,6,26
4,2018-01-03T23:23:03+0000,27,2U4GBNA0YmlmN9OV04IhlvWAEi5_umKbgD3OYbUyA1eaSh...,tpp,4,0,1,0.0,77.08412,175,1,qweaaaa2,157.039,6,26


#그래프 생성

In [None]:
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data, DataLoader
from tqdm import tqdm

# df: 주어진 데이터프레임 (컬럼: match_id, team_id, player_dmg, player_kills, player_assists,
# player_dist_walk, player_survive_time, player_dbno, player_dist_ride, player_name, ...)

def build_team_graph(group):
    """
    group: 한 팀에 속한 플레이어들의 DataFrame (같은 match_id, team_id)

    노드 피처: ['player_dmg', 'player_kills', 'player_assists',
                'player_dist_walk', 'player_survive_time', 'player_dbno', 'player_dist_ride']
    플레이어 이름 리스트도 저장합니다.
    """
    # 선택한 피처들 사용하여 노드 피처 행렬 생성
    features = group[['player_dmg', 'player_kills', 'player_assists',
                      'player_dist_walk', 'player_survive_time',
                      'player_dbno', 'player_dist_ride']].values
    x = torch.tensor(features, dtype=torch.float)

    num_nodes = x.size(0)
    edge_index = []
    # 완전 그래프: 모든 노드 쌍(자기 자신 제외)에 대해 엣지 생성
    for i in range(num_nodes):
        for j in range(num_nodes):
            if i != j:
                edge_index.append([i, j])
    if len(edge_index) > 0:
        edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    else:
        edge_index = torch.empty((2, 0), dtype=torch.long)

    # Data 객체 생성 (y는 라벨 없이 생성, 나중에 자가지도 학습이나 클러스터링 적용 예정)
    data = Data(x=x, edge_index=edge_index)
    # 메타 정보 저장: match_id, team_id, 그리고 팀원들의 player_name 리스트
    data.match_id = group['match_id'].iloc[0]
    data.team_id = group['team_id'].iloc[0]
    data.player_names = group['player_name'].tolist()
    return data

# 각 팀별로 그룹화 (match_id, team_id 기준)
grouped = df.groupby(['match_id', 'team_id'])

# 모든 팀에 대해 그래프 생성 (팀 구성원이 2명 이상인 경우)
team_graphs = []
for (match_id, team_id), group in tqdm(grouped, total=len(grouped), desc="Building team graphs"):
    if len(group) >= 2:
        graph = build_team_graph(group)
        team_graphs.append(graph)

# DataLoader 생성 (배치 학습용)
loader = DataLoader(team_graphs, batch_size=32, shuffle=True)

print(f"Total team graphs: {len(team_graphs)}")

Building team graphs: 100%|██████████| 133333/133333 [01:29<00:00, 1492.78it/s]

Total team graphs: 127600





#GAE 모델 정의

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn.models import GAE
from tqdm import tqdm

# 1. GCN 기반 인코더 정의
class GCNEncoder(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, dropout=0.5):
        super(GCNEncoder, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)
        self.dropout = dropout

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return x

# 2. 하이퍼파라미터 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
in_channels = 7  # 사용한 피처 수
hidden_channels = 64
embedding_dim = 32

# 모델 초기화
encoder = GCNEncoder(in_channels, hidden_channels, embedding_dim)
model_gae = GAE(encoder).to(device)
optimizer = torch.optim.Adam(model_gae.parameters(), lr=0.005)

# 3. GAE 학습 함수
def train_gae(loader, model, optimizer, device):
    model.train()
    total_loss = 0
    pbar = tqdm(loader, desc="Training GAE", leave=False)
    for data in pbar:
        data = data.to(device)
        optimizer.zero_grad()
        z = model.encode(data.x, data.edge_index)
        loss = model.recon_loss(z, data.edge_index)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * data.num_graphs
        pbar.set_postfix(loss=f"{loss.item():.4f}")
    return total_loss / len(loader.dataset)

# 4. 학습 실행
num_epochs = 30
for epoch in range(1, num_epochs + 1):
    loss = train_gae(loader, model_gae, optimizer, device)
    print(f"Epoch {epoch:02d} | Loss: {loss:.4f}")



Epoch 01 | Loss: 34.4906




Epoch 02 | Loss: 34.4871




Epoch 03 | Loss: 34.5137




Epoch 04 | Loss: 34.5351




Epoch 05 | Loss: 34.5354




Epoch 06 | Loss: 34.5356




Epoch 07 | Loss: 34.5337




Epoch 08 | Loss: 34.4831




Epoch 09 | Loss: 34.4616




Epoch 10 | Loss: 34.4574




Epoch 11 | Loss: 34.4593




Epoch 12 | Loss: 34.4614




Epoch 13 | Loss: 34.4612




Epoch 14 | Loss: 34.4566




Epoch 15 | Loss: 34.4550




Epoch 16 | Loss: 34.4615




Epoch 17 | Loss: 34.4628




Epoch 18 | Loss: 34.4558




Epoch 19 | Loss: 34.4619




Epoch 20 | Loss: 34.4636




Epoch 21 | Loss: 34.4598




Epoch 22 | Loss: 34.4570




Epoch 23 | Loss: 34.4577




Epoch 24 | Loss: 20.9646




Epoch 25 | Loss: 19.3623




Epoch 26 | Loss: 19.3792




Epoch 27 | Loss: 19.4001




Epoch 28 | Loss: 33.2876




Epoch 29 | Loss: 33.8323


                                                                                

Epoch 30 | Loss: 33.7573




In [None]:
import torch
from torch_geometric.nn import global_mean_pool
from tqdm import tqdm
import pandas as pd

model_gae.eval()  # 평가 모드로 전환

all_node_anomalies = []  # 각 플레이어의 anomaly score 저장

for data in tqdm(team_graphs, desc="Extracting node embeddings"):
    data = data.to(device)
    z = model_gae.encoder(data.x, data.edge_index)  # 노드 임베딩 추출
    team_mean = z.mean(dim=0, keepdim=True)  # 팀 평균 임베딩 계산
    distances = torch.norm(z - team_mean, p=2, dim=1)  # 각 노드와 팀 평균 사이의 유클리드 거리
    distances = distances.cpu().detach().numpy()

    for i, name in enumerate(data.player_names):
        all_node_anomalies.append({
            'match_id': data.match_id,
            'team_id': data.team_id,
            'player_name': name,
            'anomaly_score': distances[i]
        })

node_anomaly_df = pd.DataFrame(all_node_anomalies)
print(node_anomaly_df.head())


Extracting node embeddings: 100%|██████████| 127600/127600 [03:25<00:00, 621.80it/s]


                                            match_id  team_id  player_name  \
0  2U4GBNA0Ymk-33gkSbXwzd9OkoC4wAW7jqjQHbM4wLSXbM...        2  cassiebelle   
1  2U4GBNA0Ymk-33gkSbXwzd9OkoC4wAW7jqjQHbM4wLSXbM...        2   LAOWAI-CNM   
2  2U4GBNA0Ymk-33gkSbXwzd9OkoC4wAW7jqjQHbM4wLSXbM...        2      NickHKS   
3  2U4GBNA0Ymk-33gkSbXwzd9OkoC4wAW7jqjQHbM4wLSXbM...        2         YHwd   
4  2U4GBNA0Ymk-33gkSbXwzd9OkoC4wAW7jqjQHbM4wLSXbM...        3       xgCola   

   anomaly_score  
0       0.000082  
1       0.000035  
2       0.000035  
3       0.000034  
4       0.000068  


In [None]:
# Step 2에서 생성한 node_anomaly_df에는 각 플레이어의 match_id, team_id, player_name, anomaly_score가 들어 있습니다.
# 예시:
#    match_id       team_id    player_name  anomaly_score
# 0  2U4GB...      4          ILovefantasy  0.5432
# 1  2U4GB...      9          jenn1769       0.3210
# ...

# 1. 플레이어별 anomaly score 통계 계산 (여러 매치에서의 평균, 최대, 표준편차 등)
player_anomaly_stats = node_anomaly_df.groupby('player_name')['anomaly_score'].agg(['mean', 'max', 'std']).reset_index()
print("플레이어별 anomaly score 통계:")
print(player_anomaly_stats.head())

# 2. 기준 설정: 예를 들어, 플레이어별 평균 anomaly score의 상위 10%를 이상치 후보로 선정
threshold = player_anomaly_stats['mean'].quantile(0.99)
print(f"이상치 후보 선정 기준 (평균 anomaly score 상위 1%): {threshold:.4f}")

# 3. 후보 선별: 평균 anomaly score가 기준 이상인 플레이어
candidates = player_anomaly_stats[player_anomaly_stats['mean'] >= threshold]
print("급격히 실력이 저하되는 후보 플레이어 수:", candidates.shape[0])
print(candidates)

# 추가: 후보 플레이어의 상세 anomaly score 분포를 시각화하거나, 다른 통계치를 함께 고려할 수 있습니다.

플레이어별 anomaly score 통계:
      player_name      mean       max       std
0     0---Wan---0  0.000245  0.000245       NaN
1  0-0-0-0-0-0-0-  0.000086  0.000086       NaN
2         0-0-0-7  0.000977  0.000977       NaN
3       0-ALieZ-9  0.001177  0.001228  0.000073
4       0-ASTAR-0  0.000065  0.000065       NaN
이상치 후보 선정 기준 (평균 anomaly score 상위 10%): 0.0036
급격히 실력이 저하되는 후보 플레이어 수: 3896
         player_name      mean       max  std
8       0-Exciting-0  0.005213  0.005213  NaN
165             00GB  0.003845  0.003845  NaN
242       0123456666  0.003735  0.003735  NaN
456            0951_  0.005051  0.005051  NaN
462             0980  0.004312  0.004312  NaN
...              ...       ...       ...  ...
388028      zxjinjin  0.005152  0.005152  NaN
388198          zy55  0.003678  0.003678  NaN
388304  zylihailihai  0.003891  0.003891  NaN
388480        zz963z  0.003758  0.003758  NaN
388741     zzxiaohai  0.005098  0.005098  NaN

[3896 rows x 4 columns]
