In [6]:
# API 키 설정 (!!!본인 키 넣기!!!)
import os

# 상위 폴더의 key.txt에서 API 키 읽어오기
with open(os.path.join('..', 'key.txt'), 'r') as f:
    API_KEY = f.read().strip()

# 직접 입력
# API_KEY = "여기에_본인_API_키_입력"

In [15]:
#START_TIME을 pubg_match_data.csv 파일의 가장 마지막 데이터의 createdAt 값보다 하루 많게 설정
import pandas as pd
existing_df = pd.read_csv('pubg_match_data.csv')
existing_df['createdAt'] = pd.to_datetime(existing_df['createdAt'])
START_TIME = (existing_df['createdAt'].max() + pd.Timedelta(days=1)).strftime('%Y-%m-%dT%H:%M:%SZ')
print(f"Using START_TIME: {START_TIME}")

Using START_TIME: 2025-11-16T23:32:08Z


In [16]:
import requests
import pandas as pd

PLATFORM = 'steam'
#START_TIME = '2025-11-03T23:22:24Z'  # 받아오고 싶은 시간 (UTC ISO8601)

URL = f'https://api.pubg.com/shards/{PLATFORM}/samples'
HEADERS = {
    'Authorization': f'Bearer {API_KEY}',
    'Accept': 'application/vnd.api+json'
}
PARAMS = {
    'filter[createdAt-start]': START_TIME
}

response = requests.get(URL, headers=HEADERS, params=PARAMS)
print('Status Code:', response.status_code)

data = response.json()

Status Code: 400


In [18]:
# 샘플 데이터의 match id 추출
match_ids = []

if 'relationships' in data['data'] and 'matches' in data['data']['relationships']:
    matches_data = data['data']['relationships']['matches'].get('data', [])
    
    if isinstance(matches_data, list):
        for match in matches_data:
            if isinstance(match, dict) and 'id' in match:
                match_ids.append(match['id'])

print(f"Total match IDs: {len(match_ids)}")

KeyError: 'data'

In [10]:
# 각 match id로 상세 정보 요청 (간소화)
headers = {
    'Authorization': f'Bearer {API_KEY}',
    'Accept': 'application/vnd.api+json'
}

match_data_list = []
for match_id in match_ids:
    url = f'https://api.pubg.com/shards/{PLATFORM}/matches/{match_id}'
    response = requests.get(url, headers=headers)
    
    if response.ok:
        data = response.json()
        attrs = data['data']['attributes']
        
        # telemetry URL 찾기
        telemetry_url = next(
            (item['attributes']['URL'] for item in data.get('included', []) 
            if item.get('attributes', {}).get('URL')),
            None
        )
        
        # 매치 정보 저장
        match_data_list.append({
            'id': data['data']['id'],
            'createdAt': attrs.get('createdAt'),
            'gameMode': attrs.get('gameMode'),
            'mapName': attrs.get('mapName'),
            'duration': attrs.get('duration'),
            'telemetryURL': telemetry_url,
            'isCustomGame': attrs.get('isCustomMatch'),
            'shardId': attrs.get('shardId'),
        })
    else:
        print(f"Failed to get match {match_id}: {response.status_code}")

# DataFrame으로 변환
df = pd.DataFrame(match_data_list)
df.head()

Unnamed: 0,id,createdAt,gameMode,mapName,duration,telemetryURL,isCustomGame,shardId
0,03383493-df57-4416-b3cc-ed7387481f48,2025-11-10T23:33:26Z,squad,Tiger_Main,1513,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,False,steam
1,aadafddf-4bbd-4208-a8c4-e8781360559b,2025-11-10T23:47:09Z,duo-fpp,Savage_Main,4474,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,False,steam
2,8bf3f5a8-b9bd-404c-bdf1-575f6f1289e3,2025-11-10T23:42:38Z,duo-fpp,Baltic_Main,1637,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,False,steam
3,31cf4c79-66c0-498d-ae73-8460b8673793,2025-11-10T23:50:36Z,duo,Savage_Main,3753,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,False,steam
4,72381820-4873-4cf3-b8f1-c60929572713,2025-11-11T00:05:06Z,squad,Baltic_Main,1388,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,False,steam


In [11]:
# 전처리
# 맵 이름 매핑
map_name_dict = {
  "Baltic_Main": "Erangel",
  "Chimera_Main": "Paramo",
  "Desert_Main": "Miramar",
  "DihorOtok_Main": "Vikendi",
  "Erangel_Main": "Erangel",
  "Heaven_Main": "Haven",
  "Kiki_Main": "Deston",
  "Range_Main": "Camp Jackal",
  "Savage_Main": "Sanhok",
  "Summerland_Main": "Karakin",
  "Tiger_Main": "Taego",
  "Neon_Main": "Rondo"
}

# 맵 이름 딕셔너리에 있는 값 빼고 제거 & 매핑
df = df[df['mapName'].isin(map_name_dict.keys())]
df['mapName'] = df['mapName'].map(map_name_dict)

# 커스텀 게임 제거
df = df[df['isCustomGame'] == False]
df = df.drop(columns=['isCustomGame'])

# 게임모드 리스트에 없는 값 제거
game_modes = ['duo', 'duo-fpp', 'squad', 'squad-fpp',
              'solo', 'solo-fpp']
df = df[df['gameMode'].isin(game_modes)]
df.reset_index(drop=True, inplace=True)

print(f"Records after preprocessing: {len(df)}")
df.head()

Records after preprocessing: 529


Unnamed: 0,id,createdAt,gameMode,mapName,duration,telemetryURL,shardId
0,03383493-df57-4416-b3cc-ed7387481f48,2025-11-10T23:33:26Z,squad,Taego,1513,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,steam
1,aadafddf-4bbd-4208-a8c4-e8781360559b,2025-11-10T23:47:09Z,duo-fpp,Sanhok,4474,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,steam
2,8bf3f5a8-b9bd-404c-bdf1-575f6f1289e3,2025-11-10T23:42:38Z,duo-fpp,Erangel,1637,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,steam
3,31cf4c79-66c0-498d-ae73-8460b8673793,2025-11-10T23:50:36Z,duo,Sanhok,3753,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,steam
4,72381820-4873-4cf3-b8f1-c60929572713,2025-11-11T00:05:06Z,squad,Erangel,1388,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,steam


In [12]:
#createdAt 컬럼을 datetime 타입으로 변환 후 정렬
df['createdAt'] = pd.to_datetime(df['createdAt'])
df = df.sort_values(by='createdAt').reset_index(drop=True)
df.head()

Unnamed: 0,id,createdAt,gameMode,mapName,duration,telemetryURL,shardId
0,03383493-df57-4416-b3cc-ed7387481f48,2025-11-10 23:33:26+00:00,squad,Taego,1513,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,steam
1,8bf3f5a8-b9bd-404c-bdf1-575f6f1289e3,2025-11-10 23:42:38+00:00,duo-fpp,Erangel,1637,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,steam
2,aadafddf-4bbd-4208-a8c4-e8781360559b,2025-11-10 23:47:09+00:00,duo-fpp,Sanhok,4474,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,steam
3,31cf4c79-66c0-498d-ae73-8460b8673793,2025-11-10 23:50:36+00:00,duo,Sanhok,3753,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,steam
4,72381820-4873-4cf3-b8f1-c60929572713,2025-11-11 00:05:06+00:00,squad,Erangel,1388,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,steam


In [None]:
#optional: CSV로 저장
#df.to_csv('pubg_match_data.csv', index=False)

In [13]:
#pubg_match_data.csv 파일 불러와서 비교 후 병합
existing_df = pd.read_csv('pubg_match_data.csv')
existing_df['createdAt'] = pd.to_datetime(existing_df['createdAt'])
#새로 크롤링한 데이터와 기존 데이터 병합
combined_df = pd.concat([existing_df, df]).drop_duplicates(subset='id').reset_index(drop=True)
print(f"Total records after merging: {len(combined_df)}")

# 병합된 데이터 CSV로 저장
combined_df.to_csv('pubg_match_data.csv', index=False)

Total records after merging: 5889


In [14]:
# 두 번째 셀(인덱스 1)부터 마지막 셀(인덱스 8)까지 5번 반복 실행
for i in range(5):
    print(f"\n{'='*50}")
    print(f"반복 실행 {i+1}회차 시작")
    print(f"{'='*50}\n")
    
    # CELL 1: START_TIME 설정
    print("Step 1: START_TIME 설정 중...")
    existing_df = pd.read_csv('pubg_match_data.csv')
    existing_df['createdAt'] = pd.to_datetime(existing_df['createdAt'])
    START_TIME = (existing_df['createdAt'].max() + pd.Timedelta(days=1)).strftime('%Y-%m-%dT%H:%M:%SZ')
    print(f"Using START_TIME: {START_TIME}")
    
    # CELL 2: API 요청
    print("\nStep 2: API 샘플 데이터 요청 중...")
    PARAMS = {'filter[createdAt-start]': START_TIME}
    response = requests.get(URL, headers=HEADERS, params=PARAMS)
    print(f'Status Code: {response.status_code}')
    data = response.json()
    
    # CELL 3: Match ID 추출
    print("\nStep 3: Match ID 추출 중...")
    match_ids = []
    if 'relationships' in data['data'] and 'matches' in data['data']['relationships']:
        matches_data = data['data']['relationships']['matches'].get('data', [])
        if isinstance(matches_data, list):
            for match in matches_data:
                if isinstance(match, dict) and 'id' in match:
                    match_ids.append(match['id'])
    print(f"Total match IDs: {len(match_ids)}")
    
    # CELL 4: 매치 상세 정보 요청
    print("\nStep 4: 매치 상세 정보 수집 중...")
    match_data_list = []
    for match_id in match_ids:
        url = f'https://api.pubg.com/shards/{PLATFORM}/matches/{match_id}'
        response = requests.get(url, headers=headers)
        
        if response.ok:
            data = response.json()
            attrs = data['data']['attributes']
            
            telemetry_url = next(
                (item['attributes']['URL'] for item in data.get('included', []) 
                if item.get('attributes', {}).get('URL')),
                None
            )
            
            match_data_list.append({
                'id': data['data']['id'],
                'createdAt': attrs.get('createdAt'),
                'gameMode': attrs.get('gameMode'),
                'mapName': attrs.get('mapName'),
                'duration': attrs.get('duration'),
                'telemetryURL': telemetry_url,
                'isCustomGame': attrs.get('isCustomMatch'),
                'shardId': attrs.get('shardId'),
            })
        else:
            print(f"Failed to get match {match_id}: {response.status_code}")
    
    df = pd.DataFrame(match_data_list)
    print(f"수집된 매치 수: {len(df)}")
    
    # CELL 5: 전처리
    print("\nStep 5: 데이터 전처리 중...")
    df = df[df['mapName'].isin(map_name_dict.keys())]
    df['mapName'] = df['mapName'].map(map_name_dict)
    df = df[df['isCustomGame'] == False]
    df = df.drop(columns=['isCustomGame'])
    df = df[df['gameMode'].isin(game_modes)]
    df.reset_index(drop=True, inplace=True)
    print(f"Records after preprocessing: {len(df)}")
    
    # CELL 6: 날짜 변환 및 정렬
    print("\nStep 6: 날짜 변환 및 정렬 중...")
    df['createdAt'] = pd.to_datetime(df['createdAt'])
    df = df.sort_values(by='createdAt').reset_index(drop=True)
    
    # CELL 8: 기존 데이터와 병합 및 저장
    print("\nStep 7: 기존 데이터와 병합 및 저장 중...")
    existing_df = pd.read_csv('pubg_match_data.csv')
    existing_df['createdAt'] = pd.to_datetime(existing_df['createdAt'])
    combined_df = pd.concat([existing_df, df]).drop_duplicates(subset='id').reset_index(drop=True)
    print(f"Total records after merging: {len(combined_df)}")
    combined_df.to_csv('pubg_match_data.csv', index=False)
    
    print(f"\n{'='*50}")
    print(f"반복 실행 {i+1}회차 완료")
    print(f"{'='*50}\n")

print("\n모든 반복 실행 완료!")


반복 실행 1회차 시작

Step 1: START_TIME 설정 중...
Using START_TIME: 2025-11-12T23:23:58Z

Step 2: API 샘플 데이터 요청 중...
Status Code: 200

Step 3: Match ID 추출 중...
Total match IDs: 948

Step 4: 매치 상세 정보 수집 중...
수집된 매치 수: 948

Step 5: 데이터 전처리 중...
Records after preprocessing: 760

Step 6: 날짜 변환 및 정렬 중...

Step 7: 기존 데이터와 병합 및 저장 중...
Total records after merging: 6649

반복 실행 1회차 완료


반복 실행 2회차 시작

Step 1: START_TIME 설정 중...
Using START_TIME: 2025-11-13T23:27:38Z

Step 2: API 샘플 데이터 요청 중...
Status Code: 200

Step 3: Match ID 추출 중...
Total match IDs: 409

Step 4: 매치 상세 정보 수집 중...
수집된 매치 수: 409

Step 5: 데이터 전처리 중...
Records after preprocessing: 331

Step 6: 날짜 변환 및 정렬 중...

Step 7: 기존 데이터와 병합 및 저장 중...
Total records after merging: 6980

반복 실행 2회차 완료


반복 실행 3회차 시작

Step 1: START_TIME 설정 중...
Using START_TIME: 2025-11-14T23:20:44Z

Step 2: API 샘플 데이터 요청 중...
Status Code: 200

Step 3: Match ID 추출 중...
Total match IDs: 855

Step 4: 매치 상세 정보 수집 중...
수집된 매치 수: 855

Step 5: 데이터 전처리 중...
Records after preproc

KeyError: 'data'