In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv -f https://data.pyg.org/whl/torch-2.0.0+cu117.html
!pip install torch-geometric

Looking in links: https://data.pyg.org/whl/torch-2.0.0+cu117.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcu117/torch_scatter-2.1.2%2Bpt20cu117-cp311-cp311-linux_x86_64.whl (10.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m57.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcu117/torch_sparse-0.6.18%2Bpt20cu117-cp311-cp311-linux_x86_64.whl (4.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m107.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-cluster
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcu117/torch_cluster-1.6.3%2Bpt20cu117-cp311-cp311-linux_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m83.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-spline-conv
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcu117/torch_

# 데이터 로드 및 전처리

In [None]:
import pandas as pd

# Load the sampled match log from Google Drive.
# The first CSV column is an unnamed, previously saved row index; without index_col=0
# it is re-imported as a spurious "Unnamed: 0" data column.
df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/sampled_5000_matches.csv',
    index_col=0,
)

In [None]:
# Preview the first rows to check the loaded columns and value formats.
df.head()

Unnamed: 0,date,game_size,match_id,match_mode,party_size,player_assists,player_dbno,player_dist_ride,player_dist_walk,player_dmg,player_kills,player_name,player_survive_time,team_id,team_placement
0,2018-01-03T23:23:03+0000,27,2U4GBNA0YmlmN9OV04IhlvWAEi5_umKbgD3OYbUyA1eaSh...,tpp,4,0,0,0.0,2115.04761,19,0,ssdaybreak,1084.986,1,12
1,2018-01-03T23:23:03+0000,27,2U4GBNA0YmlmN9OV04IhlvWAEi5_umKbgD3OYbUyA1eaSh...,tpp,4,0,0,0.0,2663.02563,0,0,Hung0,1185.968,1,12
2,2018-01-03T23:23:03+0000,27,2U4GBNA0YmlmN9OV04IhlvWAEi5_umKbgD3OYbUyA1eaSh...,tpp,4,0,1,0.0,75.27817,138,1,Red_Acher,158.299,6,26
3,2018-01-03T23:23:03+0000,27,2U4GBNA0YmlmN9OV04IhlvWAEi5_umKbgD3OYbUyA1eaSh...,tpp,4,0,0,0.0,26.331602,0,0,2pacTupac,130.263,6,26
4,2018-01-03T23:23:03+0000,27,2U4GBNA0YmlmN9OV04IhlvWAEi5_umKbgD3OYbUyA1eaSh...,tpp,4,0,1,0.0,77.08412,175,1,qweaaaa2,157.039,6,26


## Autoencoder 기반

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm  # tqdm 라이브러리 임포트

# ------------------------------
# 1. Data preparation and preprocessing
# ------------------------------
# Expects the match-level DataFrame `df` to be loaded already.
# Model inputs: kills, assists, DBNOs, total distance travelled, survival time.

# Total distance travelled = distance by vehicle + distance on foot.
df["total_distance"] = df["player_dist_ride"].add(df["player_dist_walk"])

# Columns fed to the autoencoder, in input order.
feature_cols = [
    "player_kills",
    "player_assists",
    "player_dbno",
    "total_distance",
    "player_survive_time",
]
features = df[feature_cols].values.astype(np.float32)

# Standardise every feature column before training.
scaler = StandardScaler()
scaler.fit(features)
features_scaled = scaler.transform(features)

# Wrap the scaled matrix so it can be iterated in shuffled mini-batches of 32 rows.
dataset = TensorDataset(torch.tensor(features_scaled))
dataloader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)

# ------------------------------
# 2. 자동 인코더(Autoencoder) 모델 정의
# ------------------------------
class AutoEncoder(nn.Module):
    """Fully connected autoencoder with one hidden layer on each side of the bottleneck.

    Args:
        input_dim: Number of input features; also the size of the reconstruction.
        embedding_dim: Size of the bottleneck embedding. With 1, the embedding is a
            single scalar per sample.
        hidden_dim: Width of the hidden layer in both the encoder and the decoder.
            Defaults to 16, the value that used to be hard-coded.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim=16):
        super().__init__()
        # Encoder: compress the input into the low-dimensional embedding.
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, embedding_dim),
        )
        # Decoder: reconstruct the original features from the embedding.
        self.decoder = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
        )

    def forward(self, x):
        """Encode then decode ``x`` (last dimension ``input_dim``).

        Returns:
            A tuple ``(z, x_recon)``: the embedding and the reconstruction of ``x``.
        """
        z = self.encoder(x)
        x_recon = self.decoder(z)
        return z, x_recon

# Seed torch's global RNG before the model is built: weight initialisation and the
# DataLoader's shuffling both draw from it, so without a seed every run yields a
# different set of composite scores.
SEED = 42
torch.manual_seed(SEED)

input_dim = features_scaled.shape[1]   # number of feature columns (5)
embedding_dim = 1                      # compress each row to a single scalar score
model = AutoEncoder(input_dim, embedding_dim)

# ------------------------------
# 3. Training setup
# ------------------------------
criterion = nn.MSELoss()       # reconstruction error between input and output
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 100

# ------------------------------
# 4. Training loop (with tqdm progress bars)
# ------------------------------
for epoch in range(num_epochs):
    total_loss = 0
    # One transient progress bar per epoch, iterating over the mini-batches.
    for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False):
        x_batch = batch[0]
        optimizer.zero_grad()
        z, x_recon = model(x_batch)
        loss = criterion(x_recon, x_batch)
        loss.backward()
        optimizer.step()
        # The loss is a batch mean; weight it by the batch size so the epoch average
        # stays correct when the last batch is smaller than the others.
        total_loss += loss.item() * x_batch.size(0)
    avg_loss = total_loss / len(dataset)
    if (epoch+1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

# ------------------------------
# 5. Extract the composite performance score (the embedding)
# ------------------------------
model.eval()
with torch.no_grad():
    embeddings, _ = model(torch.tensor(features_scaled))
    composite_scores = embeddings.numpy().flatten()

# Store the score on the DataFrame, one value per row.
# NOTE(review): nothing in the training objective ties the sign or scale of this
# embedding to "better" play -- confirm that before thresholding on it downstream.
df["composite_performance_score"] = composite_scores



Epoch [10/100], Loss: 0.2472




Epoch [20/100], Loss: 0.1818




Epoch [30/100], Loss: 0.1655




Epoch [40/100], Loss: 0.1598




Epoch [50/100], Loss: 0.1522




Epoch [60/100], Loss: 0.1513




Epoch [70/100], Loss: 0.1444




Epoch [80/100], Loss: 0.1440




Epoch [90/100], Loss: 0.1398


                                                                     

Epoch [100/100], Loss: 0.1428




In [None]:
# Preview the frame again, now with the added total_distance and
# composite_performance_score columns.
df.head()

Unnamed: 0,date,game_size,match_id,match_mode,party_size,player_assists,player_dbno,player_dist_ride,player_dist_walk,player_dmg,player_kills,player_name,player_survive_time,team_id,team_placement,total_distance,composite_performance_score
0,2018-01-03T23:23:03+0000,27,2U4GBNA0YmlmN9OV04IhlvWAEi5_umKbgD3OYbUyA1eaSh...,tpp,4,0,0,0.0,2115.04761,19,0,ssdaybreak,1084.986,1,12,2115.04761,-2.514812
1,2018-01-03T23:23:03+0000,27,2U4GBNA0YmlmN9OV04IhlvWAEi5_umKbgD3OYbUyA1eaSh...,tpp,4,0,0,0.0,2663.02563,0,0,Hung0,1185.968,1,12,2663.02563,-2.458499
2,2018-01-03T23:23:03+0000,27,2U4GBNA0YmlmN9OV04IhlvWAEi5_umKbgD3OYbUyA1eaSh...,tpp,4,0,1,0.0,75.27817,138,1,Red_Acher,158.299,6,26,75.27817,-3.027798
3,2018-01-03T23:23:03+0000,27,2U4GBNA0YmlmN9OV04IhlvWAEi5_umKbgD3OYbUyA1eaSh...,tpp,4,0,0,0.0,26.331602,0,0,2pacTupac,130.263,6,26,26.331602,-23.21347
4,2018-01-03T23:23:03+0000,27,2U4GBNA0YmlmN9OV04IhlvWAEi5_umKbgD3OYbUyA1eaSh...,tpp,4,0,1,0.0,77.08412,175,1,qweaaaa2,157.039,6,26,77.08412,-3.028286


In [None]:
# Number of distinct player names in the dataset.
len(df['player_name'].unique())

393286

## 라벨링 작업

In [None]:
import pandas as pd
import numpy as np

# Expects the DataFrame `df` to exist already, with the columns:
# date, game_size, match_id, match_mode, party_size,
# player_assists, player_dbno, player_dist_ride, player_dist_walk,
# player_dmg, player_kills, player_name, player_survive_time, team_id, team_placement
# plus composite_performance_score from the autoencoder step.

# =============================================================================
# 1. (Disabled) hand-weighted per-match performance score, kept for reference only.
#    The autoencoder-based composite_performance_score is used instead.
# =============================================================================
# # Total distance travelled (ride + walk)
# df["total_distance"] = df["player_dist_ride"] + df["player_dist_walk"]
#
# # Per-component weights (need domain-specific tuning)
# weight_kills        = 100   # kills get the highest weight
# weight_dbno         = 50    # DBNO (Down But Not Out) count
# weight_assists      = 25    # assists
# weight_distance     = 0.05  # total distance (metres)
# weight_survive_time = 0.1   # survival time (seconds)
#
# # Weighted composite score
# df["performance_score"] = (
#     weight_kills * df["player_kills"] +
#     weight_dbno * df["player_dbno"] +
#     weight_assists * df["player_assists"] +
#     weight_distance * df["total_distance"] +
#     weight_survive_time * df["player_survive_time"]
# )

# =============================================================================
# 2. Per-player baseline: mean score over all of the player's matches
# =============================================================================
player_baseline = (
    df.groupby("player_name")["composite_performance_score"]
      .mean()
      .reset_index()
      .rename(columns={"composite_performance_score": "baseline_score"})
)

# =============================================================================
# 3. Attach each player's baseline to every one of their match rows
#    A groupby-transform is used instead of merging player_baseline back into df:
#    re-running the merge produced baseline_score_x / baseline_score_y columns and
#    the "baseline_score" lookups below then raised KeyError.
# =============================================================================
df["baseline_score"] = (
    df.groupby("player_name")["composite_performance_score"].transform("mean")
)

# =============================================================================
# 4. Per-match deviation from the baseline
#    deviation = score in this match - player's mean score
# =============================================================================
df["performance_deviation"] = df["composite_performance_score"] - df["baseline_score"]

# Relative performance: how many times the baseline the player scored in this match.
# Scores are signed, so `baseline + 1e-6` could itself be zero for a slightly negative
# baseline. Baselines within BASELINE_EPS of zero are masked instead, which leaves the
# ratio NaN (undefined) rather than infinite.
BASELINE_EPS = 1e-6
safe_baseline = df["baseline_score"].where(df["baseline_score"].abs() > BASELINE_EPS)
df["performance_ratio"] = df["composite_performance_score"] / safe_baseline

# =============================================================================
# 5. 같은 매치(match_id) 내 같은 팀(team_id)끼리 팀원들의 성과를 이용해 팀 강약(팀원 평균 성과)을 계산
#    - 각 그룹 내에서, 자신을 제외한 나머지 플레이어의 평균 성과를 구합니다.
# =============================================================================
def compute_team_strength(series):
    """Return, for each member of a team, the mean score of the *other* members.

    Args:
        series: Scores of the players of one team in one match (one group of a
            groupby). Missing values are ignored when summing and counting.

    Returns:
        A Series on the same index as ``series``. Every entry is NaN when the team
        has at most one non-missing score, because no teammate mean exists then.
    """
    count = series.count()
    if count <= 1:
        return pd.Series(np.nan, index=series.index, dtype="float64")
    # Leave-one-out mean, computed for the whole team in one vectorized step
    # instead of calling a Python lambda per row.
    return (series.sum() - series) / (count - 1)

# Team strength of a row = mean score of the player's teammates (self excluded)
# within the same match and team.
df["team_strength"] = (
    df.groupby(["match_id", "team_id"])["composite_performance_score"]
      .transform(compute_team_strength)
)

# =============================================================================
# 6. Flag weak teams
#    - rows whose team strength lies below the 30th percentile of all rows
#      (the cut-off is adjustable)
# =============================================================================
weak_threshold = df["team_strength"].quantile(0.3)
df["weak_team"] = df["team_strength"].lt(weak_threshold)

# =============================================================================
# 7. Per-player aggregates over weak-team matches only
# =============================================================================
weak_rows = df.loc[df["weak_team"]]
weak_by_player = weak_rows.groupby("player_name")

# Mean (signed) performance deviation in weak-team matches
player_weak_deviation = weak_by_player["performance_deviation"].mean().reset_index()

# Mean performance ratio in weak-team matches
player_weak_ratio = weak_by_player["performance_ratio"].mean().reset_index()

# =============================================================================
# 8. Combine the overall baseline with the weak-team metrics, one row per player
# =============================================================================
player_analysis = player_baseline.merge(player_weak_deviation, on="player_name", how="left")
player_analysis = player_analysis.merge(player_weak_ratio, on="player_name", how="left")
player_analysis = player_analysis.rename(columns={
    "baseline_score": "avg_score_overall",
    "performance_deviation": "avg_deviation_weak",
    "performance_ratio": "avg_ratio_weak"
})

# =============================================================================
# 9. Select players who look decent overall but drop sharply on weak teams
#    - overall mean score above 0 and weak-team performance ratio below 0.5
# =============================================================================
is_decent_overall = player_analysis["avg_score_overall"] > 0
drops_on_weak_team = player_analysis["avg_ratio_weak"] < 0.5
potential_throwers = (
    player_analysis[is_decent_overall & drops_on_weak_team]
    .sort_values("avg_ratio_weak")
)

# Print the result
print("잠재적 게임 던지는(트롤) 유저:")
print(potential_throwers)

잠재적 게임 던지는(트롤) 유저:
            player_name  avg_score_overall  avg_deviation_weak  avg_ratio_weak
379042  xuanyunmingfeng           0.000093           -2.178993   -23272.476562
202343     SegwayJesus_           0.003831           -2.942895     -767.029236
104942       HoldMePapi           0.003571           -2.425426     -678.049866
198937           SY_YHC           0.004813           -3.046978     -631.884338
5278         2904713218           0.008238           -3.026577     -366.360291
...                 ...                ...                 ...             ...
13781         APPLEPEET           1.121171           -0.798143        0.288116
379088            xucoo           9.073892           -5.789828        0.361924
145410            Lyenn           1.139013           -0.636236        0.441414
232883      UchihaObito           2.306042           -1.279980        0.444945
157815           Mlenys           1.466952           -0.764696        0.478717

[801 rows x 4 columns]


In [None]:
# Number of distinct players flagged as potential throwers.
len(potential_throwers['player_name'].unique())
# Roughly 2 out of every 1,000 players are flagged as trolls.

801