In [1]:
import requests
from bs4 import BeautifulSoup
# import lxml.html as lh
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import numpy as np
import re
import time
import json
from tqdm import tqdm
from sqlalchemy import create_engine
from Scraper import get_schedule

# Schedule

In [2]:
def scrape_schedule_by_date(game_date,leagueID=401):
    """
    爬取某日期的比赛表，包括link
    返回一个list，每个item是一个dictionary，每个dictionary是一场比赛。如：
    {'比赛日': '2020-07-22 11:00',
      '阶段': '常规赛',
      '时间': '11:00',
      '状态': '已完赛',
      '主队': '四川五粮金樽|//sportsdata.misports.cn/beitai/cba/team?leagueid=401&teamid=29127',
      '比分': '101-84|//sportsdata.misports.cn/beitai/cba/live?leagueid=401&scheduleid=100040250',
      '客队': '八一南昌|//sportsdata.misports.cn/beitai/cba/team?leagueid=401&teamid=29116',
      '直播/回放': '技术统计|//sportsdata.misports.cn/beitai/cba/live?leagueid=401&scheduleid=100040250'}
    
    """
    headers={"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36"}
    data = {"currentdate": game_date,"leagueID": leagueID}

    r = requests.post(r"http://sportsdata.misports.cn/beitai/cba/schedule/getschedulelistForDate",
                  data=data, headers=headers)

    r.encoding = 'utf-8'

    soup = BeautifulSoup(r.content, "html.parser")

    assert len(soup.find_all('div',{'class':'title'}))==1,"more than 1 title div"
    current_day_games=[]
    for section in soup.find_all('div',attrs={"class":"section2"}):
        titles = [span.text.strip() for span in section.find('div', attrs={"class":'title'}).find_all('span')]
        games = []
        # 每一行是一场比赛
        for row in section.find('table').find_all('tr'):
            games.append([td.text.strip()+'|'+td.find('a')['href'] if td.find('a') else td.text.strip() for td in row.find_all('td')])
        for game in games:
            if '未开始' in game:
                # exit function
                pass
        for game in games:
            game_dict = {}
            game_dict['比赛日'] = game_date+' '+ game[1]
            
            for title,data_item in zip(titles,game):
                game_dict[title]= data_item
            current_day_games.append(game_dict)
    return current_day_games

# Game Plays

In [3]:
def get_plays(scheduleid,leagueid,
                    headers={
                        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36"
                    },
                   url = r"http://sportsdata.misports.cn/beitai/cba/live/liveeventdetails",
                   encoding='UTF-8'):
    data = {"scheduleid": scheduleid,"leagueid": leagueid}
    r = requests.post(url, data=data, headers=headers,)
    r.encoding=encoding
    soup = BeautifulSoup(r.content, "html.parser")
    all_events = []
    for event in soup.find_all("li"):
        event_dic = event.attrs
        event_dic['scheduleid'] = scheduleid
        li = list(event.find_all('span'))
        if li:
            event_dic['event']=li[2].text
            event_dic['score']=li[3].text
            all_events.append(event_dic)
    
    return all_events

# Saving data to database

In [4]:
def upload_plays(plays):
    # 写入数据库Staging_Schedule
    user_name = 'master'
    passcode = 'Pw#cbashuju0131'
    endpoint = 'cbashuju.ctkaehd5rxxe.us-east-1.rds.amazonaws.com'
    database = 'CBA_Data'
#     database = 'CBA_Staging'
    engine = create_engine(f'mysql+pymysql://{user_name}:{passcode}@{endpoint}/{database}')
    connection= engine.connect()

    plays.to_sql(name='Playbyplay',con=connection,index=False,if_exists='replace')
    connection.close()

In [5]:
def append_plays(plays):
    # 写入数据库Staging_Schedule
    user_name = 'master'
    passcode = 'Pw#cbashuju0131'
    endpoint = 'cbashuju.ctkaehd5rxxe.us-east-1.rds.amazonaws.com'
    database = 'CBA_Data'
#     database = 'CBA_Staging'
    engine = create_engine(f'mysql+pymysql://{user_name}:{passcode}@{endpoint}/{database}')
    connection= engine.connect()

    plays.to_sql(name='Playbyplay',con=connection,index=False,if_exists='append')
    connection.close()

In [22]:
def get_games_events(teamid=None,scheduleid_list=None):
    """
    从database中，获取比赛的所有events，包括对手的events。
    如果只提供了teamid，则会获取teamid对应队伍参加的比赛的所有events，包括对手的events。
    如果提供了scheduleid_list，则会无视teamid，获取scheduleid对应的比赛。
    如果不提供任何参数，则会直接获取整个数据库的所有play （需要fix->增加一个leagueid，未来可能会有多赛季的数据）
    
    """
#     assert (not (teamid is None)&(scheduleid_list is None)),'至少需要teamid或scheduleid_list一个参数'
    user_name = 'master'
    passcode = 'Pw#cbashuju0131'
    endpoint = 'cbashuju.ctkaehd5rxxe.us-east-1.rds.amazonaws.com'
    database = 'CBA_Data'
    #     database = 'CBA_Staging'
    engine = create_engine(f'mysql+pymysql://{user_name}:{passcode}@{endpoint}/{database}')
    connection= engine.connect()
    
    if scheduleid_list:
        assert type(scheduleid_list)==list,'scheduleid_list只接受list'
        sql_str = f"""
        SELECT *
        FROM CBA_Data.Playbyplay plays1
        WHERE plays1.scheduleid IN ('{"','".join(str(i) for i in scheduleid_list)}')
        """
    elif teamid:
        sql_str = f"""
        SELECT *
        FROM CBA_Data.Playbyplay plays1
        WHERE plays1.scheduleid IN (
            SELECT DISTINCT plays2.scheduleid
            FROM CBA_Data.Playbyplay plays2
            WHERE plays2.teamid = {teamid}
        )
        """
    else:
        sql_str = f"""
        SELECT *
        FROM CBA_Data.Playbyplay plays1
        
        """
    team_events = pd.read_sql(sql_str, connection)
    # plays.to_sql(name='Playbyplay',con=connection,index=False,if_exists='replace')
    connection.close()
    return team_events

# Scraping

In [6]:
df_schedule = get_schedule()

In [7]:
df_schedule

Unnamed: 0,轮次,日期,主队,比分,客队,战报,统计,组图,地点,电视,...,比分_link,客队_link,战报_link,统计_link,组图_link,地点_link,电视_link,SinaGame_ID,客队ID,主队ID
0,第1轮,2019-11-01 19:35:00,广东,107:98,辽宁,战报,统计,组图,东莞,,...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://cba.sports.sina.com.cn/cba/team/show/9/,http://sports.sina.com.cn/basketball/cba/2019-...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://slide.sports.sina.com.cn/cba/slide_2_79...,,,18381,9,1
1,第1轮,2019-11-02 19:35:00,江苏,96:97,浙江,战报,统计,组图,苏州,,...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://cba.sports.sina.com.cn/cba/team/show/6/,http://sports.sina.com.cn/basketball/cba/2019-...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://slide.sports.sina.com.cn/cba/slide_2_78...,,,18382,6,2
2,第1轮,2019-11-02 19:35:00,四川,87:134,新疆,战报,统计,组图,成都,,...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://cba.sports.sina.com.cn/cba/team/show/11/,http://sports.sina.com.cn/basketball/cba/2019-...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://slide.sports.sina.com.cn/cba/slide_2_78...,,,18383,11,182
3,第1轮,2019-11-02 19:35:00,山东,108:95,八一,战报,统计,组图,济南,,...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://cba.sports.sina.com.cn/cba/team/show/3/,http://sports.sina.com.cn/basketball/cba/2019-...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://slide.sports.sina.com.cn/cba/slide_2_78...,,,18384,3,15
4,第1轮,2019-11-02 19:35:00,北控,108:103,深圳,战报,统计,组图,北京,,...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://cba.sports.sina.com.cn/cba/team/show/8/,http://sports.sina.com.cn/basketball/cba/2019-...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://slide.sports.sina.com.cn/cba/slide_2_79...,,,18385,8,369
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
470,半决赛,2020-08-06 20:00:00,北京,90:86,广东,战报,统计,组图,青岛,,...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://cba.sports.sina.com.cn/cba/team/show/1/,https://sports.sina.com.cn/basketball/cba/2020...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://slide.sports.sina.com.cn/cba/slide_2_79...,,,19014,1,10
471,半决赛,2020-08-07 20:00:00,辽宁,119:113,新疆,战报,统计,组图,青岛,,...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://cba.sports.sina.com.cn/cba/team/show/11/,https://sports.sina.com.cn/basketball/cba/2020...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://slide.sports.sina.com.cn/cba/slide_2_79...,,,19015,11,9
472,半决赛,2020-08-08 20:00:00,广东,88:85,北京,战报,统计,组图,青岛,,...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://cba.sports.sina.com.cn/cba/team/show/10/,https://sports.sina.com.cn/basketball/cba/2020...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://slide.sports.sina.com.cn/cba/slide_2_79...,,,19016,10,1
473,总决赛,2020-08-11 20:00:00,广东,110:88,辽宁,战报,统计,组图,青岛,,...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://cba.sports.sina.com.cn/cba/team/show/9/,https://sports.sina.com.cn/basketball/cba/2020...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://slide.sports.sina.com.cn/cba/slide_2_79...,,,19017,9,1


In [9]:
list(set(df_schedule['日期'].dt.date))

[datetime.date(2020, 6, 21),
 datetime.date(2020, 7, 24),
 datetime.date(2020, 7, 27),
 datetime.date(2020, 7, 21),
 datetime.date(2020, 7, 17),
 datetime.date(2020, 6, 22),
 datetime.date(2020, 7, 1),
 datetime.date(2020, 7, 10),
 datetime.date(2019, 12, 5),
 datetime.date(2020, 7, 12),
 datetime.date(2020, 7, 14),
 datetime.date(2019, 11, 29),
 datetime.date(2020, 1, 17),
 datetime.date(2019, 11, 6),
 datetime.date(2019, 12, 20),
 datetime.date(2020, 6, 23),
 datetime.date(2019, 11, 13),
 datetime.date(2020, 7, 15),
 datetime.date(2020, 1, 19),
 datetime.date(2020, 1, 1),
 datetime.date(2020, 1, 21),
 datetime.date(2019, 11, 23),
 datetime.date(2019, 11, 24),
 datetime.date(2020, 7, 26),
 datetime.date(2019, 12, 29),
 datetime.date(2019, 11, 15),
 datetime.date(2019, 12, 6),
 datetime.date(2019, 12, 12),
 datetime.date(2019, 12, 11),
 datetime.date(2019, 11, 16),
 datetime.date(2019, 11, 3),
 datetime.date(2020, 1, 3),
 datetime.date(2020, 1, 14),
 datetime.date(2020, 8, 7),
 datetim

In [24]:
dates_with_games = [x for x in list(set(df_schedule['日期'].dt.date))]
full_schedule = []
for date in tqdm(dates_with_games):
    full_schedule.extend(scrape_schedule_by_date(str(date),401))
    time.sleep(np.random.rand()*3)

100%|██████████| 115/115 [04:46<00:00,  2.49s/it]


In [25]:
len(full_schedule)

475

In [26]:
full_schedule = pd.DataFrame(full_schedule)

full_schedule[['主队', '主队ID']] = full_schedule['主队'].str.split('|', expand=True)
full_schedule[['客队', '客队ID']] = full_schedule['客队'].str.split('|', expand=True)
full_schedule[['比分', 'scheduleid']] = full_schedule['比分'].str.split('|', expand=True)

full_schedule['主队ID']=full_schedule['主队ID'].apply(lambda x: re.findall('teamid=(.*)',x)[0])
full_schedule['客队ID']=full_schedule['客队ID'].apply(lambda x: re.findall('teamid=(.*)',x)[0])
full_schedule['leagueid']=full_schedule['直播/回放'].apply(lambda x: re.findall('leagueid=(.*)&',x)[0])
full_schedule['scheduleid']=full_schedule['scheduleid'].apply(lambda x: re.findall('scheduleid=(.*)',x)[0])

full_schedule.sort_values(by='比赛日',inplace=True)

In [129]:
full_schedule.loc[full_schedule['比赛日']>'2020-06-20',]

Unnamed: 0,比赛日,阶段,时间,状态,主队,比分,客队,直播/回放,主队ID,客队ID,scheduleid,leagueid
234,2020-06-20 11:00,常规赛,11:00,已完赛,南京同曦宙光,95-112,浙江广厦控股,技术统计|//sportsdata.misports.cn/beitai/cba/live?...,29133,29128,100040015,401
235,2020-06-20 12:30,常规赛,12:30,已完赛,苏州肯帝亚,105-98,深圳马可波罗,技术统计|//sportsdata.misports.cn/beitai/cba/live?...,29118,29131,100040016,401
236,2020-06-20 15:30,常规赛,15:30,已完赛,青岛国信双星,99-113,新疆伊力特,技术统计|//sportsdata.misports.cn/beitai/cba/live?...,29135,29117,100040017,401
237,2020-06-20 16:30,常规赛,16:30,已完赛,广东东莞银行,105-82,山西汾酒股份,技术统计|//sportsdata.misports.cn/beitai/cba/live?...,29124,29132,100040019,401
238,2020-06-20 19:35,常规赛,19:35,已完赛,北京首钢,91-82,辽宁本钢,技术统计|//sportsdata.misports.cn/beitai/cba/live?...,29115,29129,100040018,401
...,...,...,...,...,...,...,...,...,...,...,...,...
233,2020-08-06 20:00,季后赛,20:00,已完赛,北京首钢,90-86,广东东莞银行,技术统计|//sportsdata.misports.cn/beitai/cba/live?...,29115,29124,100042088,401
155,2020-08-07 20:00,季后赛,20:00,已完赛,辽宁本钢,119-113,新疆伊力特,技术统计|//sportsdata.misports.cn/beitai/cba/live?...,29129,29117,100042098,401
209,2020-08-08 20:00,季后赛,20:00,已完赛,广东东莞银行,88-85,北京首钢,技术统计|//sportsdata.misports.cn/beitai/cba/live?...,29124,29115,100042089,401
379,2020-08-11 20:00,季后赛,20:00,已完赛,广东东莞银行,110-88,辽宁本钢,技术统计|//sportsdata.misports.cn/beitai/cba/live?...,29124,29129,100042217,401


In [134]:
df_all_events = get_games_events()
df_all_events.shape

(166090, 14)

In [135]:
df_all_events = df_all_events.merge(right=full_schedule[['scheduleid','主队', '比分', '客队']],on='scheduleid')

In [136]:
df_all_events=df_all_events.loc[df_all_events['scheduleid']>='100040015']

In [137]:
# 获取每节或每个加时结束时的比分
df_all_events  = df_all_events.query("eventtypeid in ('15')")

In [138]:
df_all_events[['主队得分','客队得分']] = df_all_events['score'].str.split(':', expand=True)

In [139]:
df_all_events[['主队得分','客队得分']] = df_all_events[['主队得分','客队得分']].astype(int)

In [140]:
df_all_events.columns

Index(['eventid', 'teamid', 'shotcoordx', 'shotcoordy', 'eventtypeid',
       'quarternum', 'minutes', 'seconds', 'quarter', 'style', 'scheduleid',
       'event', 'score', 'shotevent', '主队', '比分', '客队', '主队得分', '客队得分'],
      dtype='object')

In [141]:
df_all_events=df_all_events[['quarternum', 'scheduleid', '主队', '比分', '客队', '主队得分', '客队得分']]

In [142]:
df_all_events_home = df_all_events[['quarternum', 'scheduleid', '主队', '主队得分']]
df_all_events_away = df_all_events[['quarternum', 'scheduleid', '客队', '客队得分']]

df_all_events_home = df_all_events_home.rename(columns={'主队':'team','主队得分':'score'})

df_all_events_away = df_all_events_away.rename(columns={'客队':'team','客队得分':'score'})

In [143]:
df_quarter_score=pd.concat([df_all_events_home,df_all_events_away])
df_quarter_score

Unnamed: 0,quarternum,scheduleid,team,score
104285,4,100040015,南京同曦宙光,95
104389,3,100040015,南京同曦宙光,72
104473,2,100040015,南京同曦宙光,49
104570,1,100040015,南京同曦宙光,25
104667,4,100040016,苏州肯帝亚,105
...,...,...,...,...
165662,1,100042217,辽宁本钢,24
165751,4,100042218,广东东莞银行,113
165824,3,100042218,广东东莞银行,93
165910,2,100042218,广东东莞银行,66


In [144]:
df_quarter_score=df_quarter_score.loc[df_quarter_score.query("quarternum in ('1','2','3','4')").index]

In [145]:
df_avg_score = df_quarter_score.groupby(['team','quarternum']).mean().reset_index()

In [146]:
df_shifted = df_avg_score.merge(df_avg_score.shift(1,fill_value=0),left_index=True,right_index=True)
df_shifted.loc[df_shifted['quarternum_x']=='1','score_y']=0
df_shifted

Unnamed: 0,team_x,quarternum_x,score_x,team_y,quarternum_y,score_y
0,上海久事,1,22.500000,0,0,0.000000
1,上海久事,2,47.187500,上海久事,1,22.500000
2,上海久事,3,70.125000,上海久事,2,47.187500
3,上海久事,4,96.933333,上海久事,3,70.125000
4,九台农商银行,1,23.882353,上海久事,4,0.000000
...,...,...,...,...,...,...
75,辽宁本钢,4,114.350000,辽宁本钢,3,85.619048
76,青岛国信双星,1,21.684211,辽宁本钢,4,0.000000
77,青岛国信双星,2,46.473684,青岛国信双星,1,21.684211
78,青岛国信双星,3,69.421053,青岛国信双星,2,46.473684


In [147]:
df_shifted.head(20)

Unnamed: 0,team_x,quarternum_x,score_x,team_y,quarternum_y,score_y
0,上海久事,1,22.5,0,0,0.0
1,上海久事,2,47.1875,上海久事,1,22.5
2,上海久事,3,70.125,上海久事,2,47.1875
3,上海久事,4,96.933333,上海久事,3,70.125
4,九台农商银行,1,23.882353,上海久事,4,0.0
5,九台农商银行,2,48.470588,九台农商银行,1,23.882353
6,九台农商银行,3,72.117647,九台农商银行,2,48.470588
7,九台农商银行,4,97.823529,九台农商银行,3,72.117647
8,八一南昌,1,22.8125,九台农商银行,4,0.0
9,八一南昌,2,43.8125,八一南昌,1,22.8125


In [148]:
df_shifted['diff'] = df_shifted['score_x']-df_shifted['score_y']

In [149]:
df_shifted.columns

Index(['team_x', 'quarternum_x', 'score_x', 'team_y', 'quarternum_y',
       'score_y', 'diff'],
      dtype='object')

In [150]:
df_shifted.loc[df_shifted['team_x']=='广东东莞银行',]

Unnamed: 0,team_x,quarternum_x,score_x,team_y,quarternum_y,score_y,diff
40,广东东莞银行,1,30.909091,山西汾酒股份,4,0.0,30.909091
41,广东东莞银行,2,59.227273,广东东莞银行,1,30.909091,28.318182
42,广东东莞银行,3,87.954545,广东东莞银行,2,59.227273,28.727273
43,广东东莞银行,4,116.181818,广东东莞银行,3,87.954545,28.227273


In [151]:
df_results =df_shifted[['team_x', 'quarternum_x','diff']]

In [152]:
df_results

Unnamed: 0,team_x,quarternum_x,diff
0,上海久事,1,22.500000
1,上海久事,2,24.687500
2,上海久事,3,22.937500
3,上海久事,4,26.808333
4,九台农商银行,1,23.882353
...,...,...,...
75,辽宁本钢,4,28.730952
76,青岛国信双星,1,21.684211
77,青岛国信双星,2,24.789474
78,青岛国信双星,3,22.947368


In [153]:
teamscore_by_quarter = df_results.pivot(index='team_x',columns='quarternum_x',values='diff')
teamscore_by_quarter

quarternum_x,1,2,3,4
team_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
上海久事,22.5,24.6875,22.9375,26.808333
九台农商银行,23.882353,24.588235,23.647059,25.705882
八一南昌,22.8125,21.0,24.625,22.5625
北京控股,25.388889,24.944444,24.166667,24.5
北京首钢,23.15,24.2,24.25,22.844444
南京同曦宙光,26.1875,26.9375,25.3125,25.9625
四川五粮金樽,17.875,18.4375,23.3125,20.9375
天津先行者,20.4375,22.875,23.375,24.1875
山东西王,22.470588,22.529412,24.470588,23.764706
山西汾酒股份,23.764706,22.0,22.764706,22.941176


In [157]:
teamscore_by_quarter.rank(ascending=False).loc[['广东东莞银行','辽宁本钢']]

quarternum_x,1,2,3,4
team_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
广东东莞银行,1.0,2.0,2.0,2.0
辽宁本钢,5.0,1.0,1.0,1.0


In [154]:
teamscore_by_quarter.loc[['广东东莞银行','辽宁本钢']]

quarternum_x,1,2,3,4
team_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
广东东莞银行,30.909091,28.318182,28.727273,28.227273
辽宁本钢,27.333333,29.0,29.285714,28.730952


### Ignore below

### Ignore below

### Ignore below

### Ignore below

### Ignore below

### Ignore below

### Ignore below

### Ignore below

### Ignore below

### Ignore below