In [1]:
# from CBAStats.Player import *
# from CBAStats.Team import *
# from CBAStats.Player import stats_output
from pathlib import Path
from sqlalchemy import create_engine
import pymysql
import requests
from bs4 import BeautifulSoup
# import lxml.html as lh
import pandas as pd
import datetime
import numpy as np
import re
import time

In [2]:
def get_page_content(url, encoding='UTF-8', header={
            'User-Agent': r'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) '
                          r'Chrome/41.0.2227.1 Safari/537.36'}, timeout=30):
    """
    Download a web page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to download.
    encoding : str or None
        Encoding hint handed to BeautifulSoup (``from_encoding``) when it
        decodes the downloaded bytes. Pass None to rely on auto-detection.
    header : dict
        HTTP request headers. The default dict is shared between calls; this
        function only reads it and never modifies it.
    timeout : float
        Seconds to wait for the server before giving up, so that a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The page parsed with the built-in "html.parser" backend.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or an HTTP error status.
    """
    # The context manager closes the session (and its pooled connections)
    # even when the request raises.
    with requests.Session() as session:
        response = session.get(url, headers=header, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of parsing an error page.
        response.raise_for_status()
        raw_html = response.content

    # The parser receives raw bytes, so the encoding hint must be given to
    # BeautifulSoup itself; setting response.encoding would only affect
    # response.text, which is not used here.
    return BeautifulSoup(raw_html, "html.parser", from_encoding=encoding)

# 爬取参数列表

In [8]:
# 链接中有几个参数，qleagueid是赛季，qmonth是月，qteamid是球队
# 空出qmonth和qteamid则可以无差别选取某赛季的所有比赛
# qleagueid并不是逐一递增或递减的，如20192020赛季是205，20182019是198

def get_params(default_schedule_url = "http://cba.sports.sina.com.cn/cba/schedule/all/"):
    """
    Read the selectable filter values (season, month, team) from the schedule page.

    Every <select> element on the page becomes one entry of the result, keyed
    by the element's ``name`` attribute. Each entry maps the visible text of an
    <option> to its ``value`` attribute. Pick the labels you want from this
    mapping and pass them to get_url to assemble the target URL.
    """
    schedule_page = get_page_content(url=default_schedule_url)

    return {
        select_tag['name']: {
            option_tag.text: option_tag['value']
            for option_tag in select_tag.find_all('option')
        }
        for select_tag in schedule_page.find_all('select')
    }

# 爬取赛程、url

In [9]:
def get_url(leagueid = '19-20',month = '全部',teamid ='全部',
            param_dict = None,
            schedule_url = "http://cba.sports.sina.com.cn/cba/schedule/all/"):
    """
    Assemble the schedule-page URL for the chosen season, month and team.

    The URL carries three query parameters: qleagueid (season), qmonth (month)
    and qteamid (team). qleagueid values are not consecutive, e.g. season
    2019-2020 is 205 while 2018-2019 is 198, so the labels are translated
    through the option mapping scraped from the schedule page.

    Parameters
    ----------
    leagueid : str
        Season label as shown on the page, e.g. '19-20'.
    month : str
        Month label, e.g. '11'; '全部' selects every month.
    teamid : str
        Team label; '全部' selects every team.
    param_dict : dict or None
        Mapping in the shape returned by get_params. When None (the default)
        it is fetched from ``schedule_url``; passing an already-fetched mapping
        avoids that extra request.
    schedule_url : str
        Base address of the schedule page, used both for the option lookup
        and as the prefix of the returned URL.

    Returns
    -------
    str
        The schedule URL including its query string.

    Raises
    ------
    KeyError
        If a label is not among the options available for its parameter.
    """
    if param_dict is None:
        param_dict = get_params(default_schedule_url = schedule_url)

    requested_labels = {'qleagueid': leagueid, 'qmonth': month, 'qteamid': teamid}
    query_values = {}
    for param_name, label in requested_labels.items():
        options = param_dict.get(param_name, {})
        if label not in options:
            # Still a KeyError, as before, but now it says what would have been valid.
            raise KeyError(
                f"{label!r} is not a valid option for {param_name}; "
                f"available options: {list(options)}"
            )
        query_values[param_name] = options[label]

    qleagueid = query_values['qleagueid']
    qmonth = query_values['qmonth']
    qteamid = query_values['qteamid']
    scrape_url = f"{schedule_url}?qleagueid={qleagueid}&qmonth={qmonth}&qteamid={qteamid}"

    return scrape_url

In [13]:
def scrape_schedule(leagueid = '19-20',month = '全部',teamid ='全部', 
                    only_show_params = False,param_url = "http://cba.sports.sina.com.cn/cba/schedule/all/"):
    """
    Scrape the game schedule together with the links found in each schedule cell.

    leagueid, month and teamid only accept the option labels offered by the
    schedule page. Call scrape_schedule(only_show_params=True) to list them;
    in that mode no schedule is scraped and only the option mapping is returned.

    In the page's query string qleagueid is the season, qmonth the month and
    qteamid the team. qleagueid values are not consecutive, e.g. season
    2019-2020 is 205 while 2018-2019 is 198.

    Parameters
    ----------
    leagueid : str
        Season label, e.g. '19-20'.
    month : str
        Month label, e.g. '11'; '全部' selects every month.
    teamid : str
        Team label; '全部' selects every team.
    only_show_params : bool
        When True, return the available option mapping instead of the schedule.
    param_url : str
        Page from which the option mapping is read.

    Returns
    -------
    dict or pandas.DataFrame
        The option mapping when ``only_show_params`` is True. Otherwise a
        DataFrame with one row per schedule-table row, holding the cell texts
        (columns named after the table headers), the first link of each cell
        (``<header>_link`` columns, '' when the cell has no link), and the
        derived integer columns ``Game_ID``, ``客队ID`` and ``主队ID``.

    Raises
    ------
    IndexError
        If the page has fewer than two tables, or a statistics link carries
        no game id.
    ValueError
        If a table row does not have one cell per header.
    KeyError
        If a label is invalid or a team name is missing from the option mapping.
    """

    param_dict = get_params(default_schedule_url = param_url)

    if only_show_params:
        return param_dict

    # Build the target URL. Note that get_url resolves the labels on its own;
    # param_url is not forwarded to it.
    schedule_url = get_url(leagueid = leagueid,month = month,teamid =teamid)

    # Download and parse the whole schedule page.
    page_content = get_page_content(url=schedule_url)

    # The schedule page contains two tables: the first lists the games of the
    # current round, the second lists every game of the season.
    tables = page_content.find_all("table")
    if len(tables) < 2:
        raise IndexError(
            f"expected at least 2 tables on {schedule_url}, found {len(tables)}"
        )
    target_table = tables[1]

    # Column names come from the table header.
    headers = [th.text for th in target_table.find('thead').find_all('th')]

    # Collect the text and the first link of every cell, one list per table row,
    # so the column count follows the header instead of a hard-coded width.
    text_rows = []
    link_rows = []
    for tr in target_table.find('tbody').find_all('tr'):
        tds = tr.find_all('td')
        if not tds:
            # Rows without data cells contribute nothing.
            continue
        if len(tds) != len(headers):
            raise ValueError(
                f"schedule row has {len(tds)} cells but the header has {len(headers)} columns"
            )
        row_text = []
        row_links = []
        for td in tds:
            anchor = td.find('a',href=True)
            row_text.append(str(td.text).strip())
            row_links.append(anchor['href'].strip() if anchor else '')
        text_rows.append(row_text)
        link_rows.append(row_links)

    df_schedule_text = pd.DataFrame(data=text_rows, columns=headers)
    df_schedule_link = pd.DataFrame(data=link_rows, columns=[header + '_link' for header in headers])

    # Both frames share the same default row index, so an index join lines up
    # each row's texts with its links.
    df_schedule_full = pd.merge(df_schedule_text, df_schedule_link, left_index=True, right_index=True)

    # The game id is the number following "show/" in the statistics link.
    game_id_pattern = re.compile(r'show[/](\d+)[/]')
    df_schedule_full['Game_ID'] = df_schedule_full['统计_link'].apply(lambda x: int(game_id_pattern.findall(x)[0]))

    # Team ids are looked up by team name in the scraped option mapping.
    team_ids = param_dict['qteamid']
    df_schedule_full['客队ID'] = df_schedule_full['客队'].apply(lambda x: int(team_ids[x]))
    df_schedule_full['主队ID'] = df_schedule_full['主队'].apply(lambda x: int(team_ids[x]))

    return df_schedule_full

In [14]:
# List the selectable season / month / team options instead of scraping the schedule.
scrape_schedule(only_show_params = True)

{'qleagueid': {'19-20': '205',
  '18-19': '198',
  '17-18': '189',
  '16-17': '180',
  '15-16': '171',
  '14-15': '158',
  '13-14': '136',
  '12-13': '107',
  '11-12': '83',
  '10-11': '69',
  '09-10': '56',
  '08-09': '44',
  '07-08': '9',
  '06-07': '2',
  '05-06': '1'},
 'qmonth': {'全部': '',
  '11': '11',
  '12': '12',
  '01': '01',
  '02': '02',
  '03': '03',
  '04': '04',
  '05': '05',
  '06': '06',
  '07': '07',
  '08': '08',
  '09': '09',
  '10': '10'},
 'qteamid': {'全部': '',
  '广东': '1',
  '江苏': '2',
  '八一': '3',
  '上海': '4',
  '浙江': '6',
  '福建': '7',
  '深圳': '8',
  '辽宁': '9',
  '北京': '10',
  '新疆': '11',
  '吉林': '12',
  '广州': '13',
  '山西': '14',
  '山东': '15',
  '广厦': '16',
  '天津': '113',
  '青岛': '114',
  '四川': '182',
  '同曦': '368',
  '北控': '369'}}

In [15]:
# Scrape the schedule with the default arguments (leagueid='19-20', month='全部', teamid='全部').
scrape_schedule()

Unnamed: 0,轮次,日期,主队,比分,客队,战报,统计,组图,地点,电视,...,比分_link,客队_link,战报_link,统计_link,组图_link,地点_link,电视_link,Game_ID,客队ID,主队ID
0,第1轮,2019-11-01 19:35,广东,107:98,辽宁,战报,统计,组图,东莞,,...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://cba.sports.sina.com.cn/cba/team/show/9/,http://sports.sina.com.cn/basketball/cba/2019-...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://slide.sports.sina.com.cn/cba/slide_2_79...,,,18381,9,1
1,第1轮,2019-11-02 19:35,江苏,96:97,浙江,战报,统计,组图,苏州,,...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://cba.sports.sina.com.cn/cba/team/show/6/,http://sports.sina.com.cn/basketball/cba/2019-...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://slide.sports.sina.com.cn/cba/slide_2_78...,,,18382,6,2
2,第1轮,2019-11-02 19:35,四川,87:134,新疆,战报,统计,组图,成都,,...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://cba.sports.sina.com.cn/cba/team/show/11/,http://sports.sina.com.cn/basketball/cba/2019-...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://slide.sports.sina.com.cn/cba/slide_2_78...,,,18383,11,182
3,第1轮,2019-11-02 19:35,山东,108:95,八一,战报,统计,组图,济南,,...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://cba.sports.sina.com.cn/cba/team/show/3/,http://sports.sina.com.cn/basketball/cba/2019-...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://slide.sports.sina.com.cn/cba/slide_2_78...,,,18384,3,15
4,第1轮,2019-11-02 19:35,北控,108:103,深圳,战报,统计,组图,北京,,...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://cba.sports.sina.com.cn/cba/team/show/8/,http://sports.sina.com.cn/basketball/cba/2019-...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://slide.sports.sina.com.cn/cba/slide_2_79...,,,18385,8,369
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
464,12进8,2020-08-01 20:00,北控,108:106,吉林,战报,统计,组图,,,...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://cba.sports.sina.com.cn/cba/team/show/12/,https://sports.sina.com.cn/basketball/cba/2020...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://slide.sports.sina.com.cn/cba/slide_2_79...,,,19007,12,369
465,1/4决赛,2020-08-02 15:00,广东,VS,青岛,--,统计,--,青岛,,...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://cba.sports.sina.com.cn/cba/team/show/114/,,http://cba.sports.sina.com.cn/cba/schedule/sho...,,,,19008,114,1
466,1/4决赛,2020-08-02 20:00,北京,VS,福建,--,统计,--,青岛,,...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://cba.sports.sina.com.cn/cba/team/show/7/,,http://cba.sports.sina.com.cn/cba/schedule/sho...,,,,19009,7,10
467,1/4决赛,2020-08-03 15:00,新疆,VS,北控,--,统计,--,青岛,,...,http://cba.sports.sina.com.cn/cba/schedule/sho...,http://cba.sports.sina.com.cn/cba/team/show/369/,,http://cba.sports.sina.com.cn/cba/schedule/sho...,,,,19010,369,11
