In [2]:
import datetime
import json
import os
import re
import time
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
# import lxml.html as lh
from sqlalchemy import create_engine
from tqdm import tqdm

from Scraper import get_schedule

# Schedule

In [5]:
def scrape_schedule_by_date(game_date,leagueID=401):
    """
    Scrape the game schedule (including links) for a single date.

    Parameters
    ----------
    game_date : str
        Date sent to the endpoint as ``currentdate``; this notebook passes
        strings such as ``'2020-08-13'``.
    leagueID : int, optional
        League identifier sent as ``leagueID`` (default 401).

    Returns
    -------
    list of dict
        One dictionary per game. Keys are the column titles found on the page
        plus '比赛日', which is ``game_date`` joined with the row's second cell
        (the tip-off time in the example below). Cells that contain a link are
        stored as ``"<text>|<href>"``. Example item:

        {'比赛日': '2020-07-22 11:00',
          '阶段': '常规赛',
          '时间': '11:00',
          '状态': '已完赛',
          '主队': '四川五粮金樽|//sportsdata.misports.cn/beitai/cba/team?leagueid=401&teamid=29127',
          '比分': '101-84|//sportsdata.misports.cn/beitai/cba/live?leagueid=401&scheduleid=100040250',
          '客队': '八一南昌|//sportsdata.misports.cn/beitai/cba/team?leagueid=401&teamid=29116',
          '直播/回放': '技术统计|//sportsdata.misports.cn/beitai/cba/live?leagueid=401&scheduleid=100040250'}

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within the timeout.
    AssertionError
        If the page does not contain exactly one ``div.title``.
    """
    headers={"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36"}
    payload = {"currentdate": game_date,"leagueID": leagueID}

    # Without a timeout a stalled server would hang the notebook forever.
    r = requests.post(r"http://sportsdata.misports.cn/beitai/cba/schedule/getschedulelistForDate",
                  data=payload, headers=headers, timeout=30)
    # Fail on HTTP errors here instead of on a confusing parse error below.
    r.raise_for_status()

    # r.content is bytes, so setting r.encoding has no effect on parsing;
    # the intended encoding has to be handed to the parser directly.
    soup = BeautifulSoup(r.content, "html.parser", from_encoding="utf-8")

    title_divs = soup.find_all('div',{'class':'title'})
    assert len(title_divs)==1,f"expected exactly 1 title div, found {len(title_divs)}"

    current_day_games=[]
    for section in soup.find_all('div',attrs={"class":"section2"}):
        titles = [span.text.strip() for span in section.find('div', attrs={"class":'title'}).find_all('span')]
        # Each table row is one game.
        for row in section.find('table').find_all('tr'):
            cells = []
            for td in row.find_all('td'):
                text = td.text.strip()
                link = td.find('a')
                href = link.get('href') if link else None
                cells.append(text if href is None else text+'|'+href)
            if len(cells) < 2:
                # Rows without at least two <td> cells (e.g. header or spacer
                # rows) carry no time cell and are not games.
                continue
            game_dict = {'比赛日': game_date+' '+ cells[1]}
            game_dict.update(zip(titles,cells))
            current_day_games.append(game_dict)
    return current_day_games

# Game Plays

In [6]:
def get_plays(scheduleid,leagueid,
                    headers={
                        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36"
                    },
                   url = r"http://sportsdata.misports.cn/beitai/cba/live/liveeventdetails",
                   encoding='UTF-8'):
    """
    Fetch the play-by-play events of one game.

    Parameters
    ----------
    scheduleid, leagueid
        Identifiers posted to the endpoint as ``scheduleid`` and ``leagueid``.
    headers : dict, optional
        HTTP headers for the request. The default dict is shared between calls
        and is only read, never modified.
    url : str, optional
        Endpoint that returns the event list as HTML.
    encoding : str, optional
        Character encoding used to decode the response body.

    Returns
    -------
    list of dict
        One dictionary per ``<li>`` element that has at least four ``<span>``
        children. Each holds the element's HTML attributes plus 'scheduleid',
        'event' (text of the third span) and 'score' (text of the fourth span).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within the timeout.
    """
    payload = {"scheduleid": scheduleid,"leagueid": leagueid}
    # Without a timeout a stalled server would hang the scraping loop forever.
    r = requests.post(url, data=payload, headers=headers, timeout=30)
    r.raise_for_status()
    # r.content is bytes, so the encoding must be given to the parser itself;
    # assigning r.encoding would only affect r.text.
    soup = BeautifulSoup(r.content, "html.parser", from_encoding=encoding)
    all_events = []
    for event in soup.find_all("li"):
        spans = event.find_all('span')
        if len(spans) < 4:
            # Not an event row: the event text and score live in spans 3 and 4.
            continue
        # Copy the attributes so the parsed tag itself is not modified.
        event_dic = dict(event.attrs)
        event_dic['scheduleid'] = scheduleid
        event_dic['event']=spans[2].text
        event_dic['score']=spans[3].text
        all_events.append(event_dic)

    return all_events

# Saving data to database

In [7]:
def upload_plays(plays):
    """
    Write ``plays`` to the ``Playbyplay`` table, replacing the table if it exists.

    Parameters
    ----------
    plays : pandas.DataFrame
        Rows to store; the DataFrame index is not written.

    Notes
    -----
    Connection settings can be overridden with the environment variables
    ``CBA_DB_USER``, ``CBA_DB_PASSWORD``, ``CBA_DB_HOST`` and ``CBA_DB_NAME``.
    """
    # NOTE(review): these fallback credentials are committed in source control.
    # They are kept only so existing runs keep working; rotate the password and
    # supply it via CBA_DB_PASSWORD, then delete the literal.
    user_name = os.environ.get('CBA_DB_USER', 'master')
    passcode = os.environ.get('CBA_DB_PASSWORD', 'Pw#cbashuju0131')
    endpoint = os.environ.get('CBA_DB_HOST', 'cbashuju.ctkaehd5rxxe.us-east-1.rds.amazonaws.com')
    database = os.environ.get('CBA_DB_NAME', 'CBA_Data')

    # Characters such as '#' or '@' in the user or password must be URL-escaped
    # before they are placed in a database URL.
    safe_user = quote(user_name, safe='')
    safe_passcode = quote(passcode, safe='')
    engine = create_engine(f'mysql+pymysql://{safe_user}:{safe_passcode}@{endpoint}/{database}')
    try:
        # The context manager closes the connection even if to_sql raises.
        with engine.connect() as connection:
            plays.to_sql(name='Playbyplay',con=connection,index=False,if_exists='replace')
    finally:
        # Release the pooled connections of this one-off engine.
        engine.dispose()

In [8]:
def append_plays(plays):
    """
    Append ``plays`` to the ``Playbyplay`` table, creating the table if needed.

    Parameters
    ----------
    plays : pandas.DataFrame
        Rows to add; the DataFrame index is not written. Calling this twice
        with the same data stores the rows twice.

    Notes
    -----
    Connection settings can be overridden with the environment variables
    ``CBA_DB_USER``, ``CBA_DB_PASSWORD``, ``CBA_DB_HOST`` and ``CBA_DB_NAME``.
    """
    # NOTE(review): these fallback credentials are committed in source control.
    # They are kept only so existing runs keep working; rotate the password and
    # supply it via CBA_DB_PASSWORD, then delete the literal.
    user_name = os.environ.get('CBA_DB_USER', 'master')
    passcode = os.environ.get('CBA_DB_PASSWORD', 'Pw#cbashuju0131')
    endpoint = os.environ.get('CBA_DB_HOST', 'cbashuju.ctkaehd5rxxe.us-east-1.rds.amazonaws.com')
    database = os.environ.get('CBA_DB_NAME', 'CBA_Data')

    # Characters such as '#' or '@' in the user or password must be URL-escaped
    # before they are placed in a database URL.
    safe_user = quote(user_name, safe='')
    safe_passcode = quote(passcode, safe='')
    engine = create_engine(f'mysql+pymysql://{safe_user}:{safe_passcode}@{endpoint}/{database}')
    try:
        # The context manager closes the connection even if to_sql raises.
        with engine.connect() as connection:
            plays.to_sql(name='Playbyplay',con=connection,index=False,if_exists='append')
    finally:
        # Release the pooled connections of this one-off engine.
        engine.dispose()

# Scraping

In [9]:
# Dates whose schedule pages are fetched, one request per date.
dates_with_games = ['2020-08-13']

full_schedule = []
for date in tqdm(dates_with_games):
    day_games = scrape_schedule_by_date(str(date), 401)
    full_schedule += day_games
    # Random pause of up to 3 seconds between requests.
    pause = np.random.rand() * 3
    time.sleep(pause)

100%|██████████| 1/1 [00:04<00:00,  4.30s/it]


In [10]:
# Sanity check: number of games collected across all requested dates.
len(full_schedule)

1

In [11]:
# Turn the scraped records into a DataFrame and pull the ids out of the links.
# NOTE: this cell rebinds `full_schedule` (list of dicts -> DataFrame), so the
# scraping cell has to be re-run before this cell is run a second time.
full_schedule = pd.DataFrame(full_schedule)

# Linked cells look like "<text>|<href>". partition() always yields three parts,
# so cells without a link (presumably possible for games that have not started
# - TODO confirm) give an empty link instead of breaking the column assignment.
for text_col, link_col in [('主队', '主队ID'), ('客队', '客队ID'), ('比分', 'scheduleid')]:
    parts = full_schedule[text_col].str.partition('|')
    full_schedule[text_col] = parts[0]
    full_schedule[link_col] = parts[2]

# "[^&]*" stops at the next query parameter, unlike the greedy ".*" which would
# swallow everything that follows. Rows without a match become NaN.
full_schedule['主队ID'] = full_schedule['主队ID'].str.extract(r'teamid=([^&]*)', expand=False)
full_schedule['客队ID'] = full_schedule['客队ID'].str.extract(r'teamid=([^&]*)', expand=False)
full_schedule['leagueid'] = full_schedule['直播/回放'].str.extract(r'leagueid=([^&]*)', expand=False)
full_schedule['scheduleid'] = full_schedule['scheduleid'].str.extract(r'scheduleid=([^&]*)', expand=False)

# Every game that is not marked as "not started" must have the ids that the
# play-by-play request needs.
started = full_schedule['状态'] != '未开始'
assert full_schedule.loc[started, ['scheduleid', 'leagueid']].notna().all().all(), \
    "a started game is missing its scheduleid/leagueid link"

full_schedule = full_schedule.sort_values(by='比赛日')

In [12]:
# Show the tidied schedule (one row per game, ids extracted from the links).
full_schedule

Unnamed: 0,比赛日,阶段,时间,状态,主队,比分,客队,直播/回放,主队ID,客队ID,scheduleid,leagueid
0,2020-08-13 20:00,季后赛,20:00,已完赛,辽宁本钢,115-113,广东东莞银行,技术统计|//sportsdata.misports.cn/beitai/cba/live?...,29129,29124,100042218,401


In [14]:
# Collect the play-by-play events of every game that has already started.
all_plays = []
schedule_rows = tqdm(full_schedule.iterrows(), total=len(full_schedule))
for _, game in schedule_rows:
    if game['状态'] == '未开始':
        game_date, home_tm, away_tm = game['比赛日'], game['主队'], game['客队']
        print(f'{game_date}, {home_tm} vs. {away_tm} 未开始，跳过')
        continue
    game_plays = get_plays(game['scheduleid'], game['leagueid'])
    all_plays += game_plays
    # Random pause of up to 3 seconds between requests.
    time.sleep(np.random.rand() * 3)
print('Complete!')

100%|██████████| 1/1 [00:04<00:00,  4.13s/it]

Complete!





In [15]:
# One row per scraped event. The HTML 'class' attribute is not stored.
# errors='ignore' keeps this cell working when no events were scraped (e.g.
# every game was skipped), in which case there is no 'class' column to drop.
df_all_events = pd.DataFrame(all_plays).drop(columns=['class'], errors='ignore')
df_all_events

Unnamed: 0,eventid,teamid,shotcoordx,shotcoordy,eventtypeid,quarternum,minutes,seconds,quarter,style,scheduleid,event,score,shotevent
0,,0,0,0,19,4,0,0,4,width: 500px;,100042218,比赛结束,115:113,
1,,0,0,0,15,4,0,0,4,width: 500px;,100042218,第4节结束,115:113,
2,,29124,9.4792,19.2181,4,4,0,1,4,width: 500px;,100042218,威姆斯三分跳投不中，赵继伟获得防守篮板,115:113,
3,,29124,0,0,11,4,0,10,4,width: 500px;,100042218,广东东莞银行队短暂停,115:113,
4,,29129,13.2708,7.6351,4,4,0,16,4,width: 500px;,100042218,赵继伟跳投不中，赵睿获得防守篮板,115:113,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,,29129,-2.3333,3.0019,4,1,11,22,1,width: 500px;,100042218,贺天举跳投不中，易建联获得防守篮板,2:0,
336,,29124,-7.4375,10.4151,4,1,11,40,1,width: 500px;,100042218,威姆斯跳投不中，郭艾伦获得防守篮板,2:0,
337,,29129,4.2292,-1.0135,3,1,11,56,1,width: 500px;,100042218,韩德君接到郭艾伦的助攻，跳投，命中得分,2:0,
338,,29124,0,0,12,1,12,0,1,width: 500px;,100042218,易建联和韩德君跳球，辽宁本钢队得到球权,0:0,


In [16]:
# Append the scraped events to the Playbyplay table.
# Re-running this cell appends the same rows again.
append_plays(df_all_events)

### Ignore below

### Ignore below

### Ignore below

### Ignore below

### Ignore below

### Ignore below

### Ignore below

### Ignore below

### Ignore below

### Ignore below