In [67]:
# API key setup (!!! supply your own key !!!)
import os

# The key is read from key.txt located one directory above the notebook.
key_path = os.path.join('..', 'key.txt')
with open(key_path, 'r') as key_file:
    API_KEY = key_file.read().strip()

# Alternative: assign the key directly.
# API_KEY = "PUT_YOUR_API_KEY_HERE"

In [77]:
# Set START_TIME to one day after the newest createdAt stored in pubg_match_data.csv.
import pandas as pd

existing_df = pd.read_csv('pubg_match_data.csv')
# utc=True normalises every timestamp to UTC; the format string below appends a
# literal 'Z', which is only correct for UTC values.
existing_df['createdAt'] = pd.to_datetime(existing_df['createdAt'], utc=True)

latest_created_at = existing_df['createdAt'].max()
if pd.isna(latest_created_at):
    # With no rows (or no parsable dates) max() is NaT, which cannot be formatted
    # into a request timestamp; fail with a message that names the cause.
    raise ValueError(
        "pubg_match_data.csv has no valid createdAt values to derive START_TIME from"
    )

START_TIME = (latest_created_at + pd.Timedelta(days=1)).strftime('%Y-%m-%dT%H:%M:%SZ')
print(f"Using START_TIME: {START_TIME}")

Using START_TIME: 2025-11-05T23:28:57Z


In [78]:
import requests
import pandas as pd

PLATFORM = 'steam'
# Manual override for the start of the sample window (UTC, ISO 8601):
#START_TIME = '2025-11-03T23:22:24Z'

# Request the match sample for PLATFORM starting at START_TIME.
URL = f'https://api.pubg.com/shards/{PLATFORM}/samples'
HEADERS = {
    'Authorization': f'Bearer {API_KEY}',
    'Accept': 'application/vnd.api+json'
}
PARAMS = {
    'filter[createdAt-start]': START_TIME
}

# requests applies no timeout by default, so a stalled connection would block the
# kernel indefinitely.
response = requests.get(URL, headers=HEADERS, params=PARAMS, timeout=30)
print('Status Code:', response.status_code)
# Stop here on an HTTP error instead of failing later with a KeyError when the
# next cell indexes an error payload.
response.raise_for_status()

data = response.json()

Status Code: 200


In [79]:
# Extract the match IDs from the sample payload.
sample = data['data']
has_matches = 'relationships' in sample and 'matches' in sample['relationships']
matches_data = sample['relationships']['matches'].get('data', []) if has_matches else []

# Only dict entries that carry an 'id' contribute; anything other than a list yields no IDs.
match_ids = (
    [entry['id'] for entry in matches_data if isinstance(entry, dict) and 'id' in entry]
    if isinstance(matches_data, list)
    else []
)

print(f"Total match IDs: {len(match_ids)}")

Total match IDs: 542


In [80]:
# Request the details of every sampled match and flatten them to one row per match.
headers = {
    'Authorization': f'Bearer {API_KEY}',
    'Accept': 'application/vnd.api+json'
}

match_data_list = []
# A Session reuses the connection across the per-match requests instead of
# opening a new one for each call.
with requests.Session() as session:
    session.headers.update(headers)
    for match_id in match_ids:
        url = f'https://api.pubg.com/shards/{PLATFORM}/matches/{match_id}'
        try:
            # requests has no default timeout; bound each call so one stalled
            # request cannot hang the whole collection.
            match_response = session.get(url, timeout=30)
        except requests.RequestException as exc:
            # A single network failure should not discard the matches collected so far.
            print(f"Failed to get match {match_id}: {exc}")
            continue

        if not match_response.ok:
            print(f"Failed to get match {match_id}: {match_response.status_code}")
            continue

        # Distinct names keep `data`/`response` from the samples cell intact.
        match_payload = match_response.json()
        attrs = match_payload['data']['attributes']

        # Telemetry URL: the first included item whose attributes carry a non-empty 'URL'.
        telemetry_url = next(
            (item['attributes']['URL'] for item in match_payload.get('included', [])
             if item.get('attributes', {}).get('URL')),
            None
        )

        # Store the flattened match record.
        match_data_list.append({
            'id': match_payload['data']['id'],
            'createdAt': attrs.get('createdAt'),
            'gameMode': attrs.get('gameMode'),
            'mapName': attrs.get('mapName'),
            'duration': attrs.get('duration'),
            'telemetryURL': telemetry_url,
            'isCustomGame': attrs.get('isCustomMatch'),
            'shardId': attrs.get('shardId'),
        })

# Convert to a DataFrame.
df = pd.DataFrame(match_data_list)
df.head()

Unnamed: 0,id,createdAt,gameMode,mapName,duration,telemetryURL,isCustomGame,shardId
0,e7c5177a-1912-4b4c-bca6-222419037b41,2025-11-04T23:40:50Z,squad,Baltic_Main,1423,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,False,steam
1,5fe1d9fd-1935-4c06-bffe-82d82984be64,2025-11-04T23:38:12Z,solo,Neon_Main,1658,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,False,steam
2,424d620c-7015-47e4-9062-447a400c49a8,2025-11-04T23:52:19Z,squad,Baltic_Main,2181,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,False,steam
3,f5d84e1d-bb1a-47a0-a52a-a48e1f2f680c,2025-11-04T23:49:54Z,squad,Neon_Main,1814,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,False,steam
4,4e5f8ad1-e843-4e41-8789-088e98cdf4b6,2025-11-05T00:02:31Z,squad,Desert_Main,1335,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,False,steam


In [81]:
# Preprocessing
# Internal map identifier -> display name.
map_name_dict = {
    "Baltic_Main": "Erangel",
    "Chimera_Main": "Paramo",
    "Desert_Main": "Miramar",
    "DihorOtok_Main": "Vikendi",
    "Erangel_Main": "Erangel",
    "Heaven_Main": "Haven",
    "Kiki_Main": "Deston",
    "Range_Main": "Camp Jackal",
    "Savage_Main": "Sanhok",
    "Summerland_Main": "Karakin",
    "Tiger_Main": "Taego",
    "Neon_Main": "Rondo"
}

# Game modes to keep.
game_modes = ['duo', 'duo-fpp', 'squad', 'squad-fpp',
              'solo', 'solo-fpp']

# Keep only rows on a known map, flagged as non-custom, in a listed game mode.
# .eq(False) drops rows whose flag is True or missing, like the `== False` filter it replaces.
keep_mask = (
    df['mapName'].isin(list(map_name_dict))
    & df['isCustomGame'].eq(False)
    & df['gameMode'].isin(game_modes)
)

# .assign on the filtered frame builds a new DataFrame, so the map-name rewrite
# no longer writes into a slice (which raised SettingWithCopyWarning).
# NOTE: this consumes `df` from the previous cell; re-running this cell alone
# fails because 'isCustomGame' has already been dropped.
df = (
    df.loc[keep_mask]
    .assign(mapName=lambda kept: kept['mapName'].map(map_name_dict))
    .drop(columns=['isCustomGame'])
    .reset_index(drop=True)
)

print(f"Records after preprocessing: {len(df)}")
df.head()

Records after preprocessing: 444


Unnamed: 0,id,createdAt,gameMode,mapName,duration,telemetryURL,shardId
0,e7c5177a-1912-4b4c-bca6-222419037b41,2025-11-04T23:40:50Z,squad,Erangel,1423,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,steam
1,5fe1d9fd-1935-4c06-bffe-82d82984be64,2025-11-04T23:38:12Z,solo,Rondo,1658,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,steam
2,424d620c-7015-47e4-9062-447a400c49a8,2025-11-04T23:52:19Z,squad,Erangel,2181,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,steam
3,f5d84e1d-bb1a-47a0-a52a-a48e1f2f680c,2025-11-04T23:49:54Z,squad,Rondo,1814,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,steam
4,4e5f8ad1-e843-4e41-8789-088e98cdf4b6,2025-11-05T00:02:31Z,squad,Miramar,1335,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,steam


In [82]:
# Parse createdAt into datetimes, then order the rows chronologically.
df = (
    df.assign(createdAt=pd.to_datetime(df['createdAt']))
    .sort_values(by='createdAt')
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,id,createdAt,gameMode,mapName,duration,telemetryURL,shardId
0,5fe1d9fd-1935-4c06-bffe-82d82984be64,2025-11-04 23:38:12+00:00,solo,Rondo,1658,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,steam
1,e7c5177a-1912-4b4c-bca6-222419037b41,2025-11-04 23:40:50+00:00,squad,Erangel,1423,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,steam
2,f5d84e1d-bb1a-47a0-a52a-a48e1f2f680c,2025-11-04 23:49:54+00:00,squad,Rondo,1814,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,steam
3,424d620c-7015-47e4-9062-447a400c49a8,2025-11-04 23:52:19+00:00,squad,Erangel,2181,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,steam
4,4e5f8ad1-e843-4e41-8789-088e98cdf4b6,2025-11-05 00:02:31+00:00,squad,Miramar,1335,https://telemetry-cdn.pubg.com/bluehole-pubg/s...,steam


In [None]:
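#optional: save this batch as CSV (first run only — this overwrites pubg_match_data.csv, which the merge cell below reads and extends)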
#df.to_csv('pubg_match_data.csv', index=False)

In [83]:
# Load pubg_match_data.csv and merge the newly collected rows into it.
existing_df = pd.read_csv('pubg_match_data.csv').assign(
    createdAt=lambda stored: pd.to_datetime(stored['createdAt'])
)

# Stack old and new rows; for a repeated id the first occurrence (the stored row) is kept.
combined_df = (
    pd.concat([existing_df, df])
    .drop_duplicates(subset='id')
    .reset_index(drop=True)
)
print(f"Total records after merging: {len(combined_df)}")

# Write the merged data back to the same CSV.
combined_df.to_csv('pubg_match_data.csv', index=False)

Total records after merging: 2163


In [85]:
# Repeat the incremental collection pipeline up to 5 times: derive START_TIME from
# the saved CSV, request a sample, fetch each match, preprocess, then merge and save.
# Uses URL, HEADERS, PLATFORM, map_name_dict and game_modes defined in earlier cells.
REQUEST_TIMEOUT = 30  # seconds; requests applies no timeout by default


def _extract_match_ids(sample_payload):
    """Return the IDs under data.relationships.matches.data, or [] when that path is absent."""
    sample = sample_payload.get('data') if isinstance(sample_payload, dict) else None
    if not isinstance(sample, dict):
        return []
    matches = sample.get('relationships', {}).get('matches', {}).get('data', [])
    if not isinstance(matches, list):
        return []
    return [entry['id'] for entry in matches if isinstance(entry, dict) and 'id' in entry]


def _fetch_match_record(session, match_id):
    """Fetch one match and flatten it to a row dict; log and return None if the request fails."""
    url = f'https://api.pubg.com/shards/{PLATFORM}/matches/{match_id}'
    try:
        match_response = session.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # One network failure should not abort the remaining matches.
        print(f"Failed to get match {match_id}: {exc}")
        return None
    if not match_response.ok:
        print(f"Failed to get match {match_id}: {match_response.status_code}")
        return None

    payload = match_response.json()
    attrs = payload['data']['attributes']
    # Telemetry URL: the first included item whose attributes carry a non-empty 'URL'.
    telemetry_url = next(
        (item['attributes']['URL'] for item in payload.get('included', [])
         if item.get('attributes', {}).get('URL')),
        None
    )
    return {
        'id': payload['data']['id'],
        'createdAt': attrs.get('createdAt'),
        'gameMode': attrs.get('gameMode'),
        'mapName': attrs.get('mapName'),
        'duration': attrs.get('duration'),
        'telemetryURL': telemetry_url,
        'isCustomGame': attrs.get('isCustomMatch'),
        'shardId': attrs.get('shardId'),
    }


def _preprocess_matches(raw_df):
    """Keep non-custom matches on known maps and listed modes; replace map IDs with display names."""
    keep_mask = (
        raw_df['mapName'].isin(list(map_name_dict))
        & raw_df['isCustomGame'].eq(False)
        & raw_df['gameMode'].isin(game_modes)
    )
    # .assign on the filtered frame returns a new DataFrame, avoiding the
    # SettingWithCopyWarning raised by assigning into a slice.
    return (
        raw_df.loc[keep_mask]
        .assign(mapName=lambda kept: kept['mapName'].map(map_name_dict))
        .drop(columns=['isCustomGame'])
        .reset_index(drop=True)
    )


for i in range(5):
    print(f"\n{'='*50}")
    print(f"반복 실행 {i+1}회차 시작")
    print(f"{'='*50}\n")

    # Step 1: derive START_TIME (newest stored createdAt + 1 day).
    print("Step 1: START_TIME 설정 중...")
    existing_df = pd.read_csv('pubg_match_data.csv')
    # utc=True: the format string below appends a literal 'Z', valid only for UTC values.
    existing_df['createdAt'] = pd.to_datetime(existing_df['createdAt'], utc=True)
    latest_created_at = existing_df['createdAt'].max()
    if pd.isna(latest_created_at):
        print("No valid createdAt values in pubg_match_data.csv; stopping.")
        break
    START_TIME = (latest_created_at + pd.Timedelta(days=1)).strftime('%Y-%m-%dT%H:%M:%SZ')
    print(f"Using START_TIME: {START_TIME}")

    # Step 2: request the sample.
    print("\nStep 2: API 샘플 데이터 요청 중...")
    PARAMS = {'filter[createdAt-start]': START_TIME}
    response = requests.get(URL, headers=HEADERS, params=PARAMS, timeout=REQUEST_TIMEOUT)
    print(f'Status Code: {response.status_code}')
    if not response.ok:
        # An error payload would otherwise surface as a KeyError during ID extraction.
        print("Sample request failed; stopping.")
        break
    data = response.json()

    # Step 3: extract match IDs.
    print("\nStep 3: Match ID 추출 중...")
    match_ids = _extract_match_ids(data)
    print(f"Total match IDs: {len(match_ids)}")
    if not match_ids:
        print("Sample contains no matches; stopping.")
        break

    # Step 4: fetch match details over one reused connection.
    print("\nStep 4: 매치 상세 정보 수집 중...")
    with requests.Session() as session:
        session.headers.update(HEADERS)
        fetched = (_fetch_match_record(session, match_id) for match_id in match_ids)
        match_data_list = [record for record in fetched if record is not None]

    df = pd.DataFrame(match_data_list)
    print(f"수집된 매치 수: {len(df)}")
    if df.empty:
        # A DataFrame built from an empty list has no columns; the filters below would raise KeyError.
        print("No match details could be fetched; stopping.")
        break

    # Step 5: preprocessing.
    print("\nStep 5: 데이터 전처리 중...")
    df = _preprocess_matches(df)
    print(f"Records after preprocessing: {len(df)}")
    if df.empty:
        print("No records left after preprocessing; stopping.")
        break

    # Step 6: parse dates and sort chronologically.
    print("\nStep 6: 날짜 변환 및 정렬 중...")
    df = (
        df.assign(createdAt=pd.to_datetime(df['createdAt'], utc=True))
        .sort_values(by='createdAt')
        .reset_index(drop=True)
    )

    # Step 7: merge with the stored data and save. The frame loaded in step 1 is
    # reused because nothing has written to the CSV since then.
    print("\nStep 7: 기존 데이터와 병합 및 저장 중...")
    combined_df = (
        pd.concat([existing_df, df])
        .drop_duplicates(subset='id')
        .reset_index(drop=True)
    )
    print(f"Total records after merging: {len(combined_df)}")
    combined_df.to_csv('pubg_match_data.csv', index=False)

    print(f"\n{'='*50}")
    print(f"반복 실행 {i+1}회차 완료")
    print(f"{'='*50}\n")

    if combined_df['createdAt'].max() <= latest_created_at:
        # START_TIME is derived from the newest createdAt; if that did not move,
        # the next pass would repeat the identical request.
        print("Newest createdAt did not advance; stopping.")
        break

print("\n모든 반복 실행 완료!")


반복 실행 1회차 시작

Step 1: START_TIME 설정 중...
Using START_TIME: 2025-11-06T23:36:16Z

Step 2: API 샘플 데이터 요청 중...
Status Code: 200

Step 3: Match ID 추출 중...
Total match IDs: 414

Step 4: 매치 상세 정보 수집 중...
Status Code: 200

Step 3: Match ID 추출 중...
Total match IDs: 414

Step 4: 매치 상세 정보 수집 중...
수집된 매치 수: 414

Step 5: 데이터 전처리 중...
Records after preprocessing: 340

Step 6: 날짜 변환 및 정렬 중...

Step 7: 기존 데이터와 병합 및 저장 중...
Total records after merging: 2503

반복 실행 1회차 완료


반복 실행 2회차 시작

Step 1: START_TIME 설정 중...
Using START_TIME: 2025-11-07T23:34:41Z

Step 2: API 샘플 데이터 요청 중...
수집된 매치 수: 414

Step 5: 데이터 전처리 중...
Records after preprocessing: 340

Step 6: 날짜 변환 및 정렬 중...

Step 7: 기존 데이터와 병합 및 저장 중...
Total records after merging: 2503

반복 실행 1회차 완료


반복 실행 2회차 시작

Step 1: START_TIME 설정 중...
Using START_TIME: 2025-11-07T23:34:41Z

Step 2: API 샘플 데이터 요청 중...
Status Code: 200

Step 3: Match ID 추출 중...
Total match IDs: 937

Step 4: 매치 상세 정보 수집 중...
Status Code: 200

Step 3: Match ID 추출 중...
Total match ID