1. 기초준비 및 웹페이지 구성파악

In [None]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs


In [None]:
# Root URL of the league pages; a league slug and a season are appended as path parts.
base_url = 'https://understat.com/league'
# League slugs used to build the page URLs.
leagues = ['La_liga', 'EPL', 'Bundesliga', 'Serie_A', 'Ligue_1', 'RFPL']
# Eight consecutive season labels, as strings: '2015' through '2022'.
seasons = [str(2015 + offset) for offset in range(8)]

In [None]:
# Download the page for the first league/season pair and collect its <script> tags.
url = '/'.join([base_url, leagues[0], seasons[0]])
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
res = requests.get(url, timeout=30)
res.raise_for_status()
soup = bs(res.content, 'lxml')

scripts = soup.find_all('script')

2. JSON 핸들링

In [None]:
import json

In [None]:
# Locate the <script> element whose text mentions teamsData. If several match,
# the last one is kept.
string_with_json_obj = ''

for el in scripts:
  if 'teamsData' in str(el):
    string_with_json_obj = str(el).strip()

if not string_with_json_obj:
  raise ValueError("no <script> element containing 'teamsData' was found")

# The payload sits between "('" and "')". Ending the slice at "')" rather than
# at ")" keeps the closing quote out of json_data; json.loads rejects text that
# still carries that quote.
ind_start = string_with_json_obj.index("('") + 2
ind_end = string_with_json_obj.index("')", ind_start)
json_data = string_with_json_obj[ind_start:ind_end]

In [None]:
# Turn backslash escape sequences in the extracted text into the characters
# they stand for: encode to bytes, then decode with the 'unicode_escape' codec.
escaped_bytes = json_data.encode('utf8')
json_data = escaped_bytes.decode('unicode_escape')

3. 파이썬에서 데이터 확인

주의) repr(json_data)로 확인했을 때 json_data 끝에 작은따옴표(')가 남아 있으면 json.loads가 실패한다. 슬라이스의 끝 위치를 ")"로 잡으면 닫는 따옴표가 함께 잘려 들어오므로, "')" 위치를 끝으로 잡아야 한다.

In [None]:
# Print the repr rather than the plain text so that quotes and escape
# characters at the edges of the string are visible.
print('{!r}'.format(json_data))

In [None]:
# A stray single quote can be left at the end of json_data by the slicing step,
# and json.loads rejects it. Strip it from the end only: removing every "'" in
# the text could also alter apostrophes inside the data.
data = json.loads(json_data.rstrip("'"))

In [None]:
# Walk through the parsed payload one level at a time, with a ruler line
# between the printed pieces.
divider = '=' * 150

print(data.keys())  # top-level keys; the original note records 137~156 for this page
print(divider)
sample_team = data['137']
print(sample_team.keys())
print(divider)
print(sample_team['id'])
print(divider)
print(sample_team['title'])
print(divider)
print(sample_team['history'][0])  # first entry of this team's history list

In [None]:
# Map each top-level key of the payload to that entry's 'title'. The loop
# variable is deliberately not named `id`, so the built-in id() stays usable
# in later cells.
teams = {team_id: team_info['title'] for team_id, team_info in data.items()}
teams

In [None]:
# Take the column names and a sample of values from the first history record
# of the first team. Both lists stay empty when the payload has no teams.
columns = []
values = []
first_team = next(iter(data.values()), None)
if first_team is not None:
  first_record = first_team['history'][0]
  columns = list(first_record.keys())
  values = list(first_record.values())

print(columns)
print(values)

In [None]:
# One list of values per history record of team '138'
# (presumably Sevilla, going by the variable name; confirm against `teams`).
sevilla_data = [list(record.values()) for record in data['138']['history']]

# Label the values with the column names collected earlier and preview the result.
df = pd.DataFrame(sevilla_data, columns=columns)
df.head()

4. All Teams 데이터

In [None]:
# Build one DataFrame per team, keyed by team title. The loop variable is not
# named `id`, so the built-in id() is not shadowed.
dataframes = {}
for team_id, team in teams.items():
  # One list of values per history record, in the record's own key order.
  teams_data = [list(record.values()) for record in data[team_id]['history']]

  df = pd.DataFrame(teams_data, columns=columns)
  dataframes[team] = df
  print('Added Data for {}.'.format(team))

In [None]:
# Preview the first rows of the frame stored under the 'Barcelona' key.
barcelona_df = dataframes['Barcelona']
barcelona_df.head()

5. All 시즌 All 팀 All 데이터 스크래핑

In [None]:
# Per-match columns that are added up per team, and those that are averaged.
cols_to_sum = ['xG', 'xGA', 'npxG', 'npxGA', 'deep', 'deep_allowed', 'scored', 'missed', 'xpts', 'wins', 'draws', 'loses', 'pts', 'npxGD']
cols_to_mean = ['ppda_coef', 'oppda_coef']
# Summed columns that are cast to int in the final table.
cols_to_int = ['wins', 'draws', 'loses', 'scored', 'missed', 'pts', 'deep', 'deep_allowed']
# Column layout of the final table; 'position' is moved into the index at the end.
col_order = ['position', 'team', 'matches', 'wins', 'draws', 'loses', 'scored', 'missed', 'pts', 'xG', 'xG_diff', 'npxG', 'xGA', 'xGA_diff', 'npxGA', 'npxGD', 'ppda_coef', 'oppda_coef', 'deep', 'deep_allowed', 'xpts', 'xpts_diff']


def _fetch_teams_data(url, timeout=30):
  """Download one league/season page and return its parsed teamsData payload.

  Args:
    url: Address of the page to download.
    timeout: Seconds to wait for the server before giving up.

  Returns:
    The object produced by json.loads on the payload embedded in the page.

  Raises:
    requests.HTTPError: The server answered with an error status.
    ValueError: No <script> mentions teamsData, or the payload delimiters
      are missing.
  """
  res = requests.get(url, timeout=timeout)
  # Stop on an HTTP error instead of parsing an error page.
  res.raise_for_status()
  soup = bs(res.content, 'lxml')

  # If several scripts mention teamsData, the last one is kept.
  string_with_json_obj = ''
  for el in soup.find_all('script'):
    if 'teamsData' in el.text:
      string_with_json_obj = el.text.strip()
  if not string_with_json_obj:
    raise ValueError('no teamsData script found at {}'.format(url))

  # The payload sits between "('" and "')".
  ind_start = string_with_json_obj.index("('") + 2
  ind_end = string_with_json_obj.index("')", ind_start)
  json_data = string_with_json_obj[ind_start:ind_end]
  # Resolve backslash escape sequences before handing the text to the JSON parser.
  json_data = json_data.encode('utf8').decode('unicode_escape')
  return json.loads(json_data)


def _ppda_ratio(ppda):
  """Return att/def for one ppda record, or 0 when def is 0."""
  return ppda['att'] / ppda['def'] if ppda['def'] != 0 else 0


def _team_frames(teams_json):
  """Return {team title: DataFrame of that team's history records}.

  Column names come from the first record of the first team and are applied
  to every team. Each frame also gets 'ppda_coef' and 'oppda_coef', computed
  from the 'ppda' and 'ppda_allowed' records.
  """
  first_team = next(iter(teams_json.values()))
  columns = list(first_team['history'][0].keys())

  frames = {}
  for team_info in teams_json.values():
    rows = [list(record.values()) for record in team_info['history']]
    df = pd.DataFrame(rows, columns=columns)
    df['ppda_coef'] = df['ppda'].apply(_ppda_ratio)
    df['oppda_coef'] = df['ppda_allowed'].apply(_ppda_ratio)
    frames[team_info['title']] = df
  return frames


def _season_table(team_frames):
  """Collapse per-team frames into one table ranked by points.

  Each team contributes one row: sums of cols_to_sum, means of cols_to_mean,
  the team name and the number of records. Rows are sorted by 'pts' in
  descending order and numbered from 1 in the 'position' index.
  """
  frames = []
  for team, df in team_frames.items():
    sum_data = pd.DataFrame(df[cols_to_sum].sum()).transpose()
    mean_data = pd.DataFrame(df[cols_to_mean].mean()).transpose()
    final_df = sum_data.join(mean_data)
    final_df['team'] = team
    final_df['matches'] = len(df)
    frames.append(final_df)

  full_stat = pd.concat(frames)
  full_stat = full_stat.sort_values('pts', ascending=False).reset_index(drop=True)
  full_stat['position'] = range(1, len(full_stat) + 1)

  # Gap between each expected value and the corresponding actual value.
  full_stat['xG_diff'] = full_stat['xG'] - full_stat['scored']
  full_stat['xGA_diff'] = full_stat['xGA'] - full_stat['missed']
  full_stat['xpts_diff'] = full_stat['xpts'] - full_stat['pts']

  # These columns were summed together with float columns, so they come out of
  # the aggregation as floats; cast them back to int.
  full_stat = full_stat.astype({col: int for col in cols_to_int})

  return full_stat[col_order].set_index('position')


# Scrape every league/season combination and stack the results: first the
# seasons of one league, then all leagues.
full_data = {}
for league in leagues:
  season_data = {}

  for season in seasons:
    url = '/'.join([base_url, league, season])
    teams_json = _fetch_teams_data(url)
    season_data[season] = _season_table(_team_frames(teams_json))

  full_data[league] = pd.concat(season_data)

# Concatenating the dicts puts their keys in front of 'position', giving a
# three-level index.
data = pd.concat(full_data)
data.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,team,matches,wins,draws,loses,scored,missed,pts,xG,xG_diff,...,xGA,xGA_diff,npxGA,npxGD,ppda_coef,oppda_coef,deep,deep_allowed,xpts,xpts_diff
Unnamed: 0_level_1,Unnamed: 1_level_1,position,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
La_liga,2015,1,Barcelona,38,29,4,5,112,29,91,113.59827,1.59827,...,34.02909,5.02909,33.28594,66.19108,6.012152,15.06439,570,163,94.38,3.38
La_liga,2015,2,Real Madrid,38,28,6,4,110,34,90,90.454148,-19.545852,...,45.233782,11.233782,42.260671,41.503967,9.251967,14.571881,404,211,79.0927,-10.9073
La_liga,2015,3,Atletico Madrid,38,28,4,6,63,18,88,54.927365,-8.072635,...,27.797052,9.797052,27.053774,26.387161,8.834884,9.045124,261,170,72.2803,-15.7197
La_liga,2015,4,Villarreal,38,18,10,10,44,35,64,40.488814,-3.511186,...,41.630788,6.630788,39.400958,-1.141985,9.924386,9.034772,188,215,52.1071,-11.8929
La_liga,2015,5,Athletic Club,38,18,8,12,58,45,62,53.896614,-4.103386,...,45.326224,0.326224,40.123397,9.916378,8.11232,9.674713,221,174,58.2381,-3.7619


In [None]:
# Write the combined table to a CSV file in the current working directory.
output_path = 'understat.com.csv'
data.to_csv(output_path)