In [6]:
# API 키 설정 (!!!본인 키 넣기!!!)
import os

# 상위 폴더의 key.txt에서 API 키 읽어오기
with open(os.path.join('..', 'key.txt'), 'r') as f:
    API_KEY = f.read().strip()

# 직접 입력
# API_KEY = "여기에_본인_API_키_입력"

In [None]:
#START_TIME을 pubg_match_data.csv 파일의 가장 마지막 데이터의 createdAt 값보다 하루 많게 설정
import pandas as pd
#shardId가 kakao인 것 중에서 가장 최근 데이터의 createdAt 값 기준으로 START_TIME 설정
existing_df = pd.read_csv('pubg_match_data.csv')
existing_df['createdAt'] = pd.to_datetime(existing_df['createdAt'])
START_TIME = (existing_df['createdAt'].max() + pd.Timedelta(days=1)).strftime('%Y-%m-%dT%H:%M:%SZ')
print(f"Using START_TIME: {START_TIME}")

Using START_TIME: 2025-11-16T23:32:08Z


In [41]:
import requests
import pandas as pd

PLATFORM = 'kakao'
START_TIME = '2025-11-04T16:56:24Z'  # 받아오고 싶은 시간 (UTC ISO8601)

URL = f'https://api.pubg.com/shards/{PLATFORM}/samples'
HEADERS = {
    'Authorization': f'Bearer {API_KEY}',
    'Accept': 'application/vnd.api+json'
}
PARAMS = {
    'filter[createdAt-start]': START_TIME
}

response = requests.get(URL, headers=HEADERS, params=PARAMS)
print('Status Code:', response.status_code)

data = response.json()

Status Code: 200


In [42]:
# 샘플 데이터의 match id 추출
match_ids = []

if 'relationships' in data['data'] and 'matches' in data['data']['relationships']:
    matches_data = data['data']['relationships']['matches'].get('data', [])
    
    if isinstance(matches_data, list):
        for match in matches_data:
            if isinstance(match, dict) and 'id' in match:
                match_ids.append(match['id'])

print(f"Total match IDs: {len(match_ids)}")

Total match IDs: 291


In [43]:
# 각 match id로 상세 정보 요청 (간소화)
headers = {
    'Authorization': f'Bearer {API_KEY}',
    'Accept': 'application/vnd.api+json'
}

match_data_list = []
for match_id in match_ids:
    url = f'https://api.pubg.com/shards/{PLATFORM}/matches/{match_id}'
    response = requests.get(url, headers=headers)
    
    if response.ok:
        data = response.json()
        attrs = data['data']['attributes']
        
        # telemetry URL 찾기
        telemetry_url = next(
            (item['attributes']['URL'] for item in data.get('included', []) 
            if item.get('attributes', {}).get('URL')),
            None
        )
        
        # 매치 정보 저장
        match_data_list.append({
            'id': data['data']['id'],
            'createdAt': attrs.get('createdAt'),
            'gameMode': attrs.get('gameMode'),
            'mapName': attrs.get('mapName'),
            'duration': attrs.get('duration'),
            'telemetryURL': telemetry_url,
            'isCustomGame': attrs.get('isCustomMatch'),
            'shardId': attrs.get('shardId'),
        })
    else:
        print(f"Failed to get match {match_id}: {response.status_code}")

# DataFrame으로 변환
df = pd.DataFrame(match_data_list)
df.head()

Unnamed: 0,id,createdAt,gameMode,mapName,duration,telemetryURL,isCustomGame,shardId
0,86a1a8a3-84d7-4fa4-8a19-bbb6cc7ba4c7,2025-11-03T23:50:06Z,squad,Tiger_Main,1979,https://telemetry-cdn.pubg.com/bluehole-pubg/k...,False,kakao
1,7ea73df9-738b-4eb2-b7fb-3013d1cf4575,2025-11-04T00:00:02Z,solo,Baltic_Main,1824,https://telemetry-cdn.pubg.com/bluehole-pubg/k...,False,kakao
2,5d131410-9f74-4ee0-a30c-525723a9ca79,2025-11-04T00:06:52Z,duo,DihorOtok_Main,3121,https://telemetry-cdn.pubg.com/bluehole-pubg/k...,False,kakao
3,75f934cf-bfc0-441d-84f1-8ecc43dc0090,2025-11-04T00:17:49Z,squad,Neon_Main,1940,https://telemetry-cdn.pubg.com/bluehole-pubg/k...,False,kakao
4,acbcbb1e-0fdc-44be-ab5b-5f80806112a2,2025-11-03T23:50:59Z,clansolo,Range_Main,0,https://telemetry-cdn.pubg.com/bluehole-pubg/k...,False,kakao


In [37]:
# 전처리
# 맵 이름 매핑
map_name_dict = {
  "Baltic_Main": "Erangel",
  "Chimera_Main": "Paramo",
  "Desert_Main": "Miramar",
  "DihorOtok_Main": "Vikendi",
  "Erangel_Main": "Erangel",
  "Heaven_Main": "Haven",
  "Kiki_Main": "Deston",
  "Range_Main": "Camp Jackal",
  "Savage_Main": "Sanhok",
  "Summerland_Main": "Karakin",
  "Tiger_Main": "Taego",
  "Neon_Main": "Rondo"
}

# 맵 이름 딕셔너리에 있는 값 빼고 제거 & 매핑
df = df[df['mapName'].isin(map_name_dict.keys())]
df['mapName'] = df['mapName'].map(map_name_dict)

# 커스텀 게임 제거
df = df[df['isCustomGame'] == False]
df = df.drop(columns=['isCustomGame'])

# 게임모드 리스트에 없는 값 제거
game_modes = ['duo', 'duo-fpp', 'squad', 'squad-fpp',
              'solo', 'solo-fpp']
df = df[df['gameMode'].isin(game_modes)]
df.reset_index(drop=True, inplace=True)

print(f"Records after preprocessing: {len(df)}")
df.head()

Records after preprocessing: 105


Unnamed: 0,id,createdAt,gameMode,mapName,duration,telemetryURL,shardId
0,32d4c3bb-f2fc-44c4-b48e-d06b0ef1c691,2025-11-02T23:43:37Z,solo,Erangel,1947,https://telemetry-cdn.pubg.com/bluehole-pubg/k...,kakao
1,c39194f1-bb54-4772-b11f-538ed9280967,2025-11-03T00:21:33Z,squad,Taego,1508,https://telemetry-cdn.pubg.com/bluehole-pubg/k...,kakao
2,dd09c727-60c8-44e1-a0da-ac5a6e2ead44,2025-11-03T00:25:40Z,squad,Taego,1620,https://telemetry-cdn.pubg.com/bluehole-pubg/k...,kakao
3,78bd6483-a379-4f3e-931d-de86c434c6dc,2025-11-03T00:35:48Z,solo,Taego,1831,https://telemetry-cdn.pubg.com/bluehole-pubg/k...,kakao
4,47e80581-47c3-4c3a-9191-eb955383216c,2025-11-03T01:45:58Z,squad,Sanhok,1680,https://telemetry-cdn.pubg.com/bluehole-pubg/k...,kakao


In [None]:
#createdAt 컬럼을 datetime 타입으로 변환 후 정렬
df['createdAt'] = pd.to_datetime(df['createdAt'])
df = df.sort_values(by='createdAt').reset_index(drop=True)


Unnamed: 0,id,createdAt,gameMode,mapName,duration,telemetryURL,shardId
0,32d4c3bb-f2fc-44c4-b48e-d06b0ef1c691,2025-11-02 23:43:37+00:00,solo,Erangel,1947,https://telemetry-cdn.pubg.com/bluehole-pubg/k...,kakao
1,c39194f1-bb54-4772-b11f-538ed9280967,2025-11-03 00:21:33+00:00,squad,Taego,1508,https://telemetry-cdn.pubg.com/bluehole-pubg/k...,kakao
2,dd09c727-60c8-44e1-a0da-ac5a6e2ead44,2025-11-03 00:25:40+00:00,squad,Taego,1620,https://telemetry-cdn.pubg.com/bluehole-pubg/k...,kakao
3,78bd6483-a379-4f3e-931d-de86c434c6dc,2025-11-03 00:35:48+00:00,solo,Taego,1831,https://telemetry-cdn.pubg.com/bluehole-pubg/k...,kakao
4,47e80581-47c3-4c3a-9191-eb955383216c,2025-11-03 01:45:58+00:00,squad,Sanhok,1680,https://telemetry-cdn.pubg.com/bluehole-pubg/k...,kakao


In [40]:
df.tail()

Unnamed: 0,id,createdAt,gameMode,mapName,duration,telemetryURL,shardId
100,b47f9113-4a42-4606-b057-eadb25a96c72,2025-11-03 19:03:56+00:00,duo,Erangel,1894,https://telemetry-cdn.pubg.com/bluehole-pubg/k...,kakao
101,3f363088-60c4-449a-901c-d6f2e2e80d97,2025-11-03 19:24:17+00:00,squad,Erangel,1905,https://telemetry-cdn.pubg.com/bluehole-pubg/k...,kakao
102,581f3a93-92aa-41b9-b3ca-4680b86466ad,2025-11-03 20:02:49+00:00,duo,Vikendi,1736,https://telemetry-cdn.pubg.com/bluehole-pubg/k...,kakao
103,1df2300d-52a9-4a29-8ce9-bd7cd19edd76,2025-11-03 21:49:17+00:00,squad,Taego,1934,https://telemetry-cdn.pubg.com/bluehole-pubg/k...,kakao
104,5f80e2aa-0f72-4b80-9933-7e463b7338ba,2025-11-03 22:10:59+00:00,squad,Taego,1849,https://telemetry-cdn.pubg.com/bluehole-pubg/k...,kakao


In [None]:
#optional: CSV로 저장
#df.to_csv('pubg_match_data.csv', index=False)

In [39]:
#pubg_match_data.csv 파일 불러와서 비교 후 병합
existing_df = pd.read_csv('pubg_match_data.csv')
existing_df['createdAt'] = pd.to_datetime(existing_df['createdAt'])
#새로 크롤링한 데이터와 기존 데이터 병합
combined_df = pd.concat([existing_df, df]).drop_duplicates(subset='id').reset_index(drop=True)
print(f"Total records after merging: {len(combined_df)}")

# 병합된 데이터 CSV로 저장
combined_df.to_csv('pubg_match_data.csv', index=False)

Total records after merging: 8721


In [14]:
# 두 번째 셀(인덱스 1)부터 마지막 셀(인덱스 8)까지 5번 반복 실행
for i in range(5):
    print(f"\n{'='*50}")
    print(f"반복 실행 {i+1}회차 시작")
    print(f"{'='*50}\n")
    
    # CELL 1: START_TIME 설정
    print("Step 1: START_TIME 설정 중...")
    existing_df = pd.read_csv('pubg_match_data.csv')
    existing_df['createdAt'] = pd.to_datetime(existing_df['createdAt'])
    START_TIME = (existing_df['createdAt'].max() + pd.Timedelta(days=1)).strftime('%Y-%m-%dT%H:%M:%SZ')
    print(f"Using START_TIME: {START_TIME}")
    
    # CELL 2: API 요청
    print("\nStep 2: API 샘플 데이터 요청 중...")
    PARAMS = {'filter[createdAt-start]': START_TIME}
    response = requests.get(URL, headers=HEADERS, params=PARAMS)
    print(f'Status Code: {response.status_code}')
    data = response.json()
    
    # CELL 3: Match ID 추출
    print("\nStep 3: Match ID 추출 중...")
    match_ids = []
    if 'relationships' in data['data'] and 'matches' in data['data']['relationships']:
        matches_data = data['data']['relationships']['matches'].get('data', [])
        if isinstance(matches_data, list):
            for match in matches_data:
                if isinstance(match, dict) and 'id' in match:
                    match_ids.append(match['id'])
    print(f"Total match IDs: {len(match_ids)}")
    
    # CELL 4: 매치 상세 정보 요청
    print("\nStep 4: 매치 상세 정보 수집 중...")
    match_data_list = []
    for match_id in match_ids:
        url = f'https://api.pubg.com/shards/{PLATFORM}/matches/{match_id}'
        response = requests.get(url, headers=headers)
        
        if response.ok:
            data = response.json()
            attrs = data['data']['attributes']
            
            telemetry_url = next(
                (item['attributes']['URL'] for item in data.get('included', []) 
                if item.get('attributes', {}).get('URL')),
                None
            )
            
            match_data_list.append({
                'id': data['data']['id'],
                'createdAt': attrs.get('createdAt'),
                'gameMode': attrs.get('gameMode'),
                'mapName': attrs.get('mapName'),
                'duration': attrs.get('duration'),
                'telemetryURL': telemetry_url,
                'isCustomGame': attrs.get('isCustomMatch'),
                'shardId': attrs.get('shardId'),
            })
        else:
            print(f"Failed to get match {match_id}: {response.status_code}")
    
    df = pd.DataFrame(match_data_list)
    print(f"수집된 매치 수: {len(df)}")
    
    # CELL 5: 전처리
    print("\nStep 5: 데이터 전처리 중...")
    df = df[df['mapName'].isin(map_name_dict.keys())]
    df['mapName'] = df['mapName'].map(map_name_dict)
    df = df[df['isCustomGame'] == False]
    df = df.drop(columns=['isCustomGame'])
    df = df[df['gameMode'].isin(game_modes)]
    df.reset_index(drop=True, inplace=True)
    print(f"Records after preprocessing: {len(df)}")
    
    # CELL 6: 날짜 변환 및 정렬
    print("\nStep 6: 날짜 변환 및 정렬 중...")
    df['createdAt'] = pd.to_datetime(df['createdAt'])
    df = df.sort_values(by='createdAt').reset_index(drop=True)
    
    # CELL 8: 기존 데이터와 병합 및 저장
    print("\nStep 7: 기존 데이터와 병합 및 저장 중...")
    existing_df = pd.read_csv('pubg_match_data.csv')
    existing_df['createdAt'] = pd.to_datetime(existing_df['createdAt'])
    combined_df = pd.concat([existing_df, df]).drop_duplicates(subset='id').reset_index(drop=True)
    print(f"Total records after merging: {len(combined_df)}")
    combined_df.to_csv('pubg_match_data.csv', index=False)
    
    print(f"\n{'='*50}")
    print(f"반복 실행 {i+1}회차 완료")
    print(f"{'='*50}\n")

print("\n모든 반복 실행 완료!")


반복 실행 1회차 시작

Step 1: START_TIME 설정 중...
Using START_TIME: 2025-11-12T23:23:58Z

Step 2: API 샘플 데이터 요청 중...
Status Code: 200

Step 3: Match ID 추출 중...
Total match IDs: 948

Step 4: 매치 상세 정보 수집 중...
수집된 매치 수: 948

Step 5: 데이터 전처리 중...
Records after preprocessing: 760

Step 6: 날짜 변환 및 정렬 중...

Step 7: 기존 데이터와 병합 및 저장 중...
Total records after merging: 6649

반복 실행 1회차 완료


반복 실행 2회차 시작

Step 1: START_TIME 설정 중...
Using START_TIME: 2025-11-13T23:27:38Z

Step 2: API 샘플 데이터 요청 중...
Status Code: 200

Step 3: Match ID 추출 중...
Total match IDs: 409

Step 4: 매치 상세 정보 수집 중...
수집된 매치 수: 409

Step 5: 데이터 전처리 중...
Records after preprocessing: 331

Step 6: 날짜 변환 및 정렬 중...

Step 7: 기존 데이터와 병합 및 저장 중...
Total records after merging: 6980

반복 실행 2회차 완료


반복 실행 3회차 시작

Step 1: START_TIME 설정 중...
Using START_TIME: 2025-11-14T23:20:44Z

Step 2: API 샘플 데이터 요청 중...
Status Code: 200

Step 3: Match ID 추출 중...
Total match IDs: 855

Step 4: 매치 상세 정보 수집 중...
수집된 매치 수: 855

Step 5: 데이터 전처리 중...
Records after preproc

KeyError: 'data'