In [1]:
import os
import pandas as pd
import re
from cbastats import DBHelper
import datetime
import pytz

# from cbastats import Scraper
from cbastats import ScraperMongo
from cbastats.Team import *
from cbastats.Player import *

# Scrape current schedule

In [2]:
# Build the Sina scraper from the module-level settings exposed by ScraperMongo
# (schedule base URL, encoding, parser and request headers).
sina_scraper = ScraperMongo.SinaScraper(
    ScraperMongo.SINA_SCHEDULE_BASE_URL,
    ScraperMongo.ENCODING,
    ScraperMongo.PARSER,
    ScraperMongo.HEADERS,
)

In [3]:
# Inspect the query parameters held by the scraper (displayed as the cell output).
sina_scraper.scraper_params

{'qleagueid': {'20-21': '206',
  '19-20': '205',
  '18-19': '198',
  '17-18': '189',
  '16-17': '180',
  '15-16': '171',
  '14-15': '158',
  '13-14': '136',
  '12-13': '107',
  '11-12': '83',
  '10-11': '69',
  '09-10': '56',
  '08-09': '44',
  '07-08': '9',
  '06-07': '2',
  '05-06': '1'},
 'qmonth': {'全部': '',
  '11': '11',
  '12': '12',
  '01': '01',
  '02': '02',
  '03': '03',
  '04': '04',
  '05': '05',
  '06': '06',
  '07': '07',
  '08': '08',
  '09': '09',
  '10': '10'},
 'qteamid': {'全部': '',
  '广东': '1',
  '江苏': '2',
  '上海': '4',
  '浙江': '6',
  '福建': '7',
  '深圳': '8',
  '辽宁': '9',
  '北京': '10',
  '新疆': '11',
  '吉林': '12',
  '广州': '13',
  '山西': '14',
  '山东': '15',
  '广厦': '16',
  '天津': '113',
  '青岛': '114',
  '四川': '182',
  '同曦': '368',
  '北控': '369'}}

In [4]:
# Scrape the schedule; the result supports len() and integer indexing
# (presumably a list with one dict per game -- TODO confirm in ScraperMongo).
scraped_schedule = sina_scraper.scrape_schedule()

# Number of schedule entries that were scraped.
len(scraped_schedule)

351

In [5]:
# Peek at the first scraped entry to check which fields it carries.
scraped_schedule[0]

{'轮次': '第1轮',
 '日期': Timestamp('2020-10-17 16:00:00+0800', tz='Asia/Shanghai'),
 '主队': '江苏',
 '比分': '94:102',
 '客队': '青岛',
 '战报': '战报',
 '统计': '统计',
 '组图': '组图',
 '地点': '诸暨',
 '电视': '',
 '轮次_link': '',
 '日期_link': '',
 '主队_link': 'http://cba.sports.sina.com.cn/cba/team/show/2/',
 '比分_link': 'http://cba.sports.sina.com.cn/cba/schedule/show/19143/',
 '客队_link': 'http://cba.sports.sina.com.cn/cba/team/show/114/',
 '战报_link': 'https://sports.sina.com.cn/basketball/cba/2020-10-17/doc-iiznezxr6527423.shtml',
 '统计_link': 'http://cba.sports.sina.com.cn/cba/schedule/show/19143/',
 '组图_link': 'http://slide.sports.sina.com.cn/cba/slide_2_792_255844.html',
 '地点_link': '',
 '电视_link': '',
 'GameID_Sina': '19143',
 '客队ID': '114',
 '主队ID': '2',
 '赛季': '20-21',
 '详细统计': '',
 '比赛回合': ''}

# Connect Database

In [6]:
# Open a MongoDB connection with the username, password and endpoint exposed by DBHelper.
mongodbio = DBHelper.MongoDBHelper()
client = mongodbio.create_connection(
    DBHelper.MONGODB_USERNAME,
    DBHelper.MONGODB_PWD,
    DBHelper.MONGODB_ENDPOINT,
)

# Work against the 'cbaStats' database and show which collections it holds.
db = client['cbaStats']

db.list_collection_names()

existing database ['cbaStats', 'admin', 'local']


['cbaGamesStaging', 'seasonTeamStats', 'seasonPlayerStats', 'cbaGames']

In [7]:
# Handles to the games collection and its staging counterpart.
coll_cbaGames, coll_cbaGamesStaging = db['cbaGames'], db['cbaGamesStaging']

# Insert new games

In [8]:
# Hand the scraped schedule to the helper together with the games collection and
# the staging collection. The helper is implemented in DBHelper (not shown here);
# judging by the log it prints, it refreshes staging and then decides which
# records still need to go into cbaGames -- confirm details in DBHelper.
result = mongodbio.insert_new_games(scraped_schedule,coll_cbaGames,coll_cbaGamesStaging)

----------clean up staging collection----------
cbaGamesStaging operation acknowledged!
0 records were deleted from cbaGamesStaging .
cbaGames has 351 docs.
cbaGamesStaging has 0 docs.
----------insert records into staging collection----------
cbaGamesStaging operation acknowledged!
351 records were inserted into cbaGamesStaging.
----------checking what records to insert into production----------
Production is up-to-date
cbaGames has 351 docs.


# Update Game Stats within each game

In [9]:
# Timezone objects. timezone_est is not used in this cell; it is kept defined in
# case other cells rely on it.
timezone_est = pytz.timezone('US/Eastern')
timezone_utc = pytz.timezone('UTC')

# Ask for the current instant directly in UTC. A bare datetime.now() returns the
# machine's local wall-clock time without tzinfo, so labelling that value as UTC
# afterwards is only correct when the machine itself runs on UTC; anywhere else
# the timestamp (and the "$lt" date filter built from it) is off by the local
# UTC offset.
current_time = datetime.datetime.now(timezone_utc)
current_time

datetime.datetime(2021, 1, 14, 4, 12, 13, 602925, tzinfo=<UTC>)

In [10]:
# Select games dated before current_time whose '详细统计' field is still the
# empty string, and materialise the result as a list.
schedule_to_scrape = list(
    mongodbio.select_records(
        coll_cbaGames,
        filter={
            '详细统计': '',
            '日期': {"$lt": current_time},
        },
    )
)

In [11]:
# Scrape the stats for the selected games. The result is used as a pandas
# DataFrame with a 'GameID_Sina' column by the following cell
# (presumably one row per player per game -- TODO confirm in ScraperMongo).
games_stats = sina_scraper.scrape_games(schedule_to_scrape)

100%|██████████| 9/9 [00:24<00:00,  2.69s/it]


In [12]:
# Group the scraped rows by game so that each game's rows can be stored on its
# own document. A single groupby pass replaces re-filtering the whole frame once
# per game id. The orient is spelled out as 'records': the abbreviated form
# 'record' was deprecated and is rejected by newer pandas releases.
game_stats_dict = {
    gameid: rows.to_dict('records')
    for gameid, rows in games_stats.groupby('GameID_Sina', sort=False)
}
unique_gameids = list(game_stats_dict)

# Write each game's rows into the '详细统计' field of the matching cbaGames
# document, keeping every update result for later inspection.
results = []
for game_id, game_stats in game_stats_dict.items():
    print(game_id)
    results.append(
        coll_cbaGames.update_one(
            {'GameID_Sina': game_id},
            {"$set": {"详细统计": game_stats}},
        )
    )

19650
19644
19649
19645
19642
19643
19648
19646
19647


# Update Calculated Stats

### Get GameStats

In [13]:
# Show which season the scraper reports as current; the same attribute is used
# as the '赛季' filter when game stats are pulled in the following cell.
sina_scraper.current_season

'20-21'

In [14]:
# Fetch every game of the current season whose '详细统计' field is not the empty string.
all_data = mongodbio.select_records(
    coll_cbaGames,
    filter={'详细统计': {"$ne": ''}, '赛季': sina_scraper.current_season},
)

# Flatten the per-game stat rows into one list, then build a single DataFrame from it.
all_games_stats = [row for game in all_data for row in game['详细统计']]

df_all_games_stats = pd.DataFrame(all_games_stats)

In [15]:
# List the columns of the flattened stats frame.
df_all_games_stats.columns

Index(['球员_link', '球员', '号码', '出场时间', '首发', '进攻篮板', '防守篮板', '助攻', '犯规', '抢断',
       '失误', '盖帽', '扣篮', '被侵', '快攻', '得分', '轮次', '日期', '赛季', 'GameID_Sina',
       '球队ID', '对手ID', '球队', '对手', '地点', '2分中', '2分投', '3分中', '3分投', '罚球中',
       '罚球投'],
      dtype='object')

### Calculate Team Stats

In [16]:
teams = Team('', df_all_games_stats)

# Place the five team-level metrics side by side, one column per metric.
team_metrics = [teams.mov, teams.tm_pace, teams.tm_ortg, teams.tm_drtg, teams.tm_nrtg]
teams_df = pd.concat(team_metrics, axis=1)
teams_df.columns = ['场均净胜分MOV', 'Pace', 'OffensiveRating', 'DefensiveRating', 'NetRating']

# Rank by net rating, highest first; round only for display.
teams_df = teams_df.sort_values(by='NetRating', ascending=False)
teams_df.round(1)

Unnamed: 0_level_0,场均净胜分MOV,Pace,OffensiveRating,DefensiveRating,NetRating
球队,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
辽宁,14.9,94.1,125.2,109.1,16.1
广东,14.3,102.0,122.6,108.2,14.4
浙江,10.5,95.6,120.0,108.8,11.2
新疆,4.7,95.3,115.4,110.3,5.1
山东,4.4,94.8,113.1,108.5,4.7
吉林,2.3,91.2,115.6,113.0,2.6
青岛,2.1,95.3,112.0,109.8,2.3
广厦,2.1,95.6,118.1,116.0,2.2
上海,0.0,93.9,115.1,115.0,0.0
广州,-0.4,90.8,110.9,111.3,-0.4


### Insert Current Season Team Stats

In [17]:
# List the collections currently present in the database.
db.list_collection_names()

['cbaGamesStaging', 'seasonTeamStats', 'seasonPlayerStats', 'cbaGames']

In [18]:
coll_teamStats = db['seasonTeamStats']

# Replace the stored team stats with the freshly computed ones.
# reset_index() turns the frame's index (the team name) into a regular column so
# that every stored record says which team it belongs to; to_dict('records')
# does not include the index. The orient is spelled 'records' in full because
# the abbreviation 'record' was deprecated and is rejected by newer pandas.
mongodbio.delete_records(coll_teamStats, {})
mongodbio.insert_records(coll_teamStats, teams_df.reset_index().to_dict('records'))

seasonTeamStats operation acknowledged!
19 records were deleted from seasonTeamStats .
seasonTeamStats operation acknowledged!
19 records were inserted into seasonTeamStats.


<pymongo.results.InsertManyResult at 0x7f0015a938c0>

### Calculate Player Stats

In [19]:
players = Player('', df_all_games_stats)

# Place the four player-level ratings side by side, one column per rating.
player_ratings = [players.plr_ortg, players.plr_drtg, players.plr_nrtg, players.plr_usg]
players_df = pd.concat(player_ratings, axis=1)
players_df.columns = ['OffensiveRating', 'DefensiveRating', 'NetRating', 'UsagePercent']
players_df = players_df.sort_values(by=['NetRating', 'UsagePercent'], ascending=False)

# Attach the players' total stats (joined on the index), move the index into
# regular columns, then order by team, net rating and usage, all descending.
players_df = pd.merge(players_df, players.plr_total_stats, left_index=True, right_index=True)
players_df = players_df.reset_index()
players_df = players_df.sort_values(by=['球队', 'NetRating', 'UsagePercent'], ascending=False)
players_df

Unnamed: 0,球员,球队,OffensiveRating,DefensiveRating,NetRating,UsagePercent,出场时间,首发,进攻篮板,防守篮板,...,扣篮,被侵,得分,2分中,2分投,3分中,3分投,罚球中,罚球投,出场
266,约翰逊,青岛,117.488857,110.833265,6.655592,30.451736,340,6,58,103,...,7,112,257,91,142,0,0,75,144,14
39,刘传兴,青岛,117.678038,114.377164,3.300874,15.417408,622,18,78,143,...,15,88,230,79,131,0,0,72,102,28
250,王睿泽,青岛,117.550862,115.157858,2.393004,18.534374,189,5,6,18,...,2,6,93,17,31,19,39,2,3,9
158,斯蒂马克,青岛,114.949967,115.797674,-0.847707,28.323284,126,0,26,45,...,2,22,84,32,58,3,6,11,25,9
318,赵泰隆,青岛,113.804436,115.800382,-1.995946,14.128267,446,16,6,25,...,1,16,156,18,34,37,96,9,17,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,黄旭,上海,,123.747940,,11.114661,12,0,0,0,...,0,0,2,1,1,0,2,0,0,4
138,张春军,上海,,114.875565,,10.232545,126,0,4,13,...,0,8,17,4,10,3,15,0,0,8
82,周彦旭,上海,,,,,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
242,王旭,上海,,,,,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Insert Current Season Player Stats

In [20]:
coll_playerStats = db['seasonPlayerStats']

# Replace the stored player stats with the freshly computed ones.
# The orient is spelled 'records' in full: the abbreviation 'record' was
# deprecated and is rejected by newer pandas releases.
mongodbio.delete_records(coll_playerStats, {})
mongodbio.insert_records(coll_playerStats, players_df.to_dict('records'))

seasonPlayerStats operation acknowledged!
379 records were deleted from seasonPlayerStats .
seasonPlayerStats operation acknowledged!
380 records were inserted into seasonPlayerStats.


<pymongo.results.InsertManyResult at 0x7f0015a6dbe0>

# Update play by play