In [2]:
import os
from bs4 import BeautifulSoup, Comment
import asyncio
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout
import time
import re
import pandas as pd
import numpy as np
from datetime import datetime
from tqdm import tqdm
import requests

In [26]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE

In [3]:
def logo(team):
    """Build the ESPN CDN logo URL for a team.

    ``team`` is stripped of surrounding whitespace and used as a key into the
    module-level ``ABBREV`` object (defined elsewhere in the notebook); the
    value found there is spliced into the URL as the image file stem.
    A key missing from ``ABBREV`` propagates whatever error that lookup raises.
    """
    key = team.strip()
    return f'https://a.espncdn.com/i/teamlogos/nba/500/{ABBREV[key]}.png'

In [4]:
def name(url):
    """Derive a file name from a URL.

    Returns the text after the final ``/`` with every ``-`` replaced by ``_``
    (e.g. ``.../2023-nba-finals.html`` -> ``2023_nba_finals.html``).  A URL
    ending in ``/`` yields an empty string.
    """
    last_segment = url.rsplit('/', 1)[-1]
    return last_segment.replace('-', '_')

In [None]:
def name_csv(url):
    """Like ``name(url)``, but with every ``html`` substring turned into ``csv``.

    Note that the substitution is applied to the whole file name, not just a
    trailing extension.
    """
    return name(url).replace('html', 'csv')

In [5]:
def save(link,directory,sleep=10,name=name):
    """Return the text of ``link``, downloading it only when no cached copy exists.

    The cache file is ``directory/name(link)``.  On a cache hit the stored file
    is read and ``"found!"`` is printed.  On a miss the function waits
    ``sleep`` seconds, downloads the page, stores it, and prints
    ``"fetching from web"``.

    Parameters
    ----------
    link : str
        URL to fetch.
    directory : str
        Directory holding the cache files (it is not created here).
    sleep : int or float, default 10
        Seconds to pause before issuing the HTTP request.
    name : callable, default ``name``
        Maps ``link`` to the cache file name.

    Returns
    -------
    str
        The page text.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.  Nothing is written to
        the cache in that case, so an error page can no longer be stored and
        served forever as if it were the real content.
    requests.Timeout
        If the server does not respond within the timeout below.
    """
    save_path = os.path.join(directory, name(link))

    if os.path.exists(save_path):
        # NOTE(review): cache files created earlier on a platform whose default
        # encoding is not UTF-8 may need to be regenerated.
        with open(save_path, 'r', encoding='utf-8') as f:
            text = f.read()
        print("found!")
        return text

    # Pause before hitting the network so consecutive misses are throttled.
    time.sleep(sleep)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(link, timeout=30)
    # Fail loudly instead of caching an error page.
    response.raise_for_status()
    text = response.text
    # Explicit UTF-8 so non-ASCII page content round-trips on every platform.
    with open(save_path, "w", encoding='utf-8') as f:
        f.write(text)
    print("fetching from web")
    return text

In [6]:
def save_tag(link,directory,tag,sleep=10,name=name):
    """Return the markup of the element with ``id == tag`` on ``link``, cached on disk.

    The cache file is ``directory/name(link)``.  On a hit the stored markup is
    read back.  On a miss the page is downloaded, the element is located with
    BeautifulSoup, the function sleeps ``sleep`` seconds, and the element's
    markup is stored.

    Parameters
    ----------
    link : str
        URL to fetch.
    directory : str
        Directory holding the cache files (it is not created here).
    tag : str
        Value of the ``id`` attribute of the element to extract.
    sleep : int or float, default 10
        Seconds to pause after the HTTP request.
    name : callable, default ``name``
        Maps ``link`` to the cache file name.

    Returns
    -------
    str or None
        The element's markup as a string, on both the cache-hit and the
        cache-miss path (previously a miss returned a bs4 element while a hit
        returned a string).  ``None`` when the page has no such element; in
        that case nothing is cached, whereas the literal text ``"None"`` used
        to be written to the cache file.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status; nothing is cached.
    requests.Timeout
        If the server does not respond within the timeout below.
    """
    save_path = os.path.join(directory, name(link))

    if os.path.exists(save_path):
        with open(save_path, 'r', encoding='utf-8') as f:
            return f.read()

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(link, timeout=30)
    # Fail loudly instead of parsing (and caching) an error page.
    response.raise_for_status()
    element = BeautifulSoup(response.text, 'html.parser').find(id = tag)
    # Throttle: pause after each real request, as the original did.
    time.sleep(sleep)
    if element is None:
        # Do not poison the cache with a placeholder for a missing element.
        return None
    text = str(element)
    # Explicit UTF-8 so non-ASCII page content round-trips on every platform.
    with open(save_path, "w", encoding='utf-8') as f:
        f.write(text)
    return text

In [7]:
async def savePath(link,directory,name,tag):
    """Return the HTML for ``link``, using ``directory/name(link)`` as a cache.

    On a cache hit the stored file is returned.  On a miss the HTML is
    obtained by awaiting ``get_html(link, tag)`` (defined elsewhere in the
    notebook) and written to the cache file before being returned.

    Parameters
    ----------
    link : str
        URL to fetch.
    directory : str
        Directory holding the cache files (it is not created here).
    name : callable
        Maps ``link`` to the cache file name.
    tag : object
        Passed through unchanged to ``get_html``.

    Returns
    -------
    str
        The cached or freshly fetched HTML.
    """
    save_path = os.path.join(directory, name(link))

    if os.path.exists(save_path):
        # Explicit UTF-8: the result no longer depends on the platform default
        # encoding, which can fail on non-ASCII characters in the page.
        with open(save_path, 'r', encoding='utf-8') as f:
            return f.read()

    html = await get_html(link, tag)
    with open(save_path, "w", encoding='utf-8') as f:
        f.write(html)
    return html

In [8]:
def getSeed(string):
    """Return, as an ``int``, the text between the first ``(`` and the first ``)``.

    For example ``'Boston Celtics (1)'`` gives ``1``.  ``int`` raises
    ``ValueError`` when the enclosed text is not an integer literal.
    """
    open_at = string.find('(')
    close_at = string.find(')')
    return int(string[open_at + 1:close_at])
def getTeam(string):
    """Return the part of ``string`` that precedes the first ``(``.

    Whitespace before the parenthesis is kept, so ``'Boston Celtics (1)'``
    gives ``'Boston Celtics '``.  When ``string`` contains no ``(`` the whole
    string is returned; the previous slice-based version silently dropped the
    last character in that case because ``str.find`` returned ``-1``.
    """
    return string.partition('(')[0]

In [9]:
def pbp(url):
    """List URLs derived from the "box" links on the page at ``url``.

    Prints ``url``, loads the page through ``save(url, DIR)``, and scans every
    ``<a>`` inside the element with id ``div_other_scores``.  For each link
    whose ``href`` contains ``"box"``, the href is prefixed with ``BASE`` and
    ``'boxscores'`` is replaced by ``'boxscores/pbp'``.  ``DIR`` and ``BASE``
    are module-level names defined elsewhere in the notebook.
    """
    print(url)
    soup = BeautifulSoup(save(url, DIR), 'html.parser')
    other_scores = soup.find(id = 'div_other_scores')
    pbp_links = []
    for anchor in other_scores.find_all('a'):
        href = anchor['href']
        if "box" in href:
            full_link = BASE + href
            pbp_links.append(full_link.replace('boxscores', 'boxscores/pbp'))
    return pbp_links

In [10]:
def getImg(url,home=True):
    """Return the ``src`` of one of the first two images in the page's ``content`` element.

    The page is loaded through ``save(url, DIR2)`` (``DIR2`` is defined
    elsewhere in the notebook).  With ``home`` truthy the second ``<img>`` is
    used, otherwise the first.
    """
    soup = BeautifulSoup(save(url, DIR2), 'html.parser')
    images = soup.find(id = 'content').find_all('img')
    position = 1 if home else 0
    return images[position]['src']

In [11]:
def getScore(url,game,winner=True):
    """Return one score from the ``div_other_scores`` element of the page at ``url``.

    Rows are selected by CSS class: ``'winner'`` when ``winner`` is truthy,
    ``'loser'`` otherwise.  ``game`` is 1-based: the row at position
    ``game - 1`` is used, and the text of its first ``td`` with class
    ``'right'`` is converted to ``int``.  The page is loaded through
    ``save(url, DIR)`` (``DIR`` is defined elsewhere in the notebook).
    """
    if winner:
        row_class = 'winner'
    else:
        row_class = 'loser'
    soup = BeautifulSoup(save(url, DIR), 'html.parser')
    scoreboard = soup.find(id = 'div_other_scores')
    rows = scoreboard.find_all('tr', class_=row_class)
    score_cell = rows[game - 1].find('td', class_='right')
    return int(score_cell.text)

In [12]:
def seriesSum(url,games):
    """Total the winner and loser scores of games ``1..games`` on the page at ``url``.

    Parameters
    ----------
    url : str
        Page passed to ``getScore``.
    games : int
        Number of games to add up, starting from game 1.

    Returns
    -------
    list
        ``[winner_total, loser_total]``.

    Notes
    -----
    ``getScore`` takes a 1-based game number (it indexes ``[game - 1]``).  The
    previous loop passed ``0..games-1``, so game 0 wrapped around to the last
    row (index ``-1``) and game number ``games`` was never requested.  The
    totals were only right when ``games`` equalled the number of rows on the
    page; iterating ``1..games`` is right in every case.
    """
    winner_total = 0
    loser_total = 0
    for game in range(1, games + 1):
        winner_total += getScore(url, game, True)
        loser_total += getScore(url, game, False)
    return [winner_total, loser_total]

In [13]:
def mov(url,games):
    """Return the per-game difference between the two ``seriesSum`` totals.

    Computes ``(winner_total - loser_total) / games`` rounded to two decimal
    places.  ``games == 0`` raises ``ZeroDivisionError``.
    """
    winner_points, loser_points = seriesSum(url, games)
    margin_per_game = (winner_points - loser_points) / games
    return round(margin_per_game, 2)

In [14]:
def home(url,team):
    """Tell whether ``team`` matches the second link in the page's first ``<h2>``.

    The page is loaded through ``save(url, DIR)`` (``DIR`` is defined
    elsewhere in the notebook).  ``team`` is stripped of surrounding
    whitespace before the comparison.
    # NOTE(review): presumably the second link names the home team — confirm
    # against the page layout.
    """
    soup = BeautifulSoup(save(url, DIR), 'html.parser')
    link_texts = [anchor.text for anchor in soup.h2.find_all('a')]
    return team.strip() == link_texts[1]

In [15]:
def statsString(url):
    """Return, as a string, the element with id ``all_game-summary`` from the page at ``url``.

    The page is loaded through ``save(url, DIR2)`` (``DIR2`` is defined
    elsewhere in the notebook).  When the element is absent the result is the
    string ``'None'``.
    """
    soup = BeautifulSoup(save(url, DIR2), 'html.parser')
    return str(soup.find(id = 'all_game-summary'))

In [16]:
def ties(url):
    """Return the integer that follows the ``Ties<`` marker in ``statsString(url)``.

    The search window runs from the marker up to (and including the ``<`` of)
    the next ``</td>``; the first ``>digits<`` run inside it is converted to
    ``int``.
    """
    summary = statsString(url)
    start = summary.find('Ties<')
    stop = summary.find('</td>', start) + 1
    window = summary[start:stop]
    found = re.search(r'>(\d+)<', window)
    return int(found.group(1))

In [17]:
def leads(url):
    """Return the integer that follows the ``Lead changes<`` marker in ``statsString(url)``.

    The search window runs from the marker up to (and including the ``<`` of)
    the next ``</td>``; the first ``>digits<`` run inside it is converted to
    ``int``.
    """
    summary = statsString(url)
    marker_at = summary.find('Lead changes<')
    cell_close = summary.find('</td>', marker_at)
    fragment = summary[marker_at:cell_close + 1]
    return int(re.search(r'>(\d+)<', fragment).group(1))

In [18]:
def convertTime(string):
    """Convert a ``'minutes:seconds'`` string into a pandas ``Timedelta``.

    The text before the first ``:`` is parsed as an integer number of minutes
    and the text after it as a (possibly fractional) number of seconds; any
    further ``:``-separated fields are ignored.
    """
    fields = string.split(':')
    minute_count = int(fields[0])
    second_count = float(fields[1])
    minute_span = pd.to_timedelta(minute_count, unit='m')
    second_span = pd.to_timedelta(second_count, unit='s')
    return minute_span + second_span

In [19]:
def tied(url,convert=True):
    """Return the ``m:ss.f`` clock value that follows ``Game tied`` in ``statsString(url)``.

    The search window runs from the marker up to (and including the ``<`` of)
    the next ``</td>``.  With ``convert`` truthy the matched text is passed
    through ``convertTime``; otherwise the raw matched string is returned.
    """
    summary = statsString(url)
    start = summary.find('Game tied')
    cell_close = summary.find('</td>', start)
    clock = re.search(r'\b\d+:\d+\.\d+\b', summary[start:cell_close + 1]).group(0)
    if convert:
        return convertTime(clock)
    return clock

In [20]:
def awayLed(url,convert = True):
    """Return the ``m:ss.f`` clock value after the FIRST ``led`` in ``statsString(url)``.

    The search window runs from that occurrence up to (and including the
    ``<`` of) the next ``</td>``.  With ``convert`` truthy the matched text is
    passed through ``convertTime``; otherwise the raw string is returned.
    # NOTE(review): presumably the first "led" entry belongs to the away team
    # — confirm against the page layout.
    """
    summary = statsString(url)
    first_led = summary.find('led')
    cell_close = summary.find('</td>', first_led)
    window = summary[first_led:cell_close + 1]
    clock = re.search(r'\b\d+:\d+\.\d+\b', window).group(0)
    if convert:
        return convertTime(clock)
    return clock

In [21]:
def homeLed(url,convert = True):
    """Return the ``m:ss.f`` clock value after the SECOND ``led`` in ``statsString(url)``.

    The second occurrence is located by searching again from one character
    past the first.  The search window runs from it up to (and including the
    ``<`` of) the next ``</td>``.  With ``convert`` truthy the matched text is
    passed through ``convertTime``; otherwise the raw string is returned.
    # NOTE(review): presumably the second "led" entry belongs to the home team
    # — confirm against the page layout.
    """
    summary = statsString(url)
    first_led = summary.find('led')
    second_led = summary.find('led', first_led + 1)
    cell_close = summary.find('</td>', second_led)
    window = summary[second_led:cell_close + 1]
    clock = re.search(r'\b\d+:\d+\.\d+\b', window).group(0)
    if convert:
        return convertTime(clock)
    return clock

In [22]:
def addPBP(df,game):
    """Return a copy of ``df`` with five derived columns placed right after ``g{game}``.

    Each new column is produced by applying one extractor to every value of
    the ``g{game}`` column.  In order, the columns and extractors are:
    ``tie{game}`` (``ties``), ``leads{game}`` (``leads``), ``tied{game}``
    (``tied``), ``homeLed{game}`` (``homeLed``) and ``awayLed{game}``
    (``awayLed``).  The input frame itself is not modified.
    """
    result = df.copy()
    source = f'g{game}'
    additions = (
        (f'tie{game}', ties),
        (f'leads{game}', leads),
        (f'tied{game}', tied),
        (f'homeLed{game}', homeLed),
        (f'awayLed{game}', awayLed),
    )
    for offset, (label, extractor) in enumerate(additions, start=1):
        position = result.columns.get_loc(source) + offset
        result.insert(position, label, result[source].apply(extractor))
    return result

In [23]:
def seriesName(string):
    """Map a full series title to its short code.

    Both conferences' first rounds map to ``'R1'`` and both semifinals to
    ``'R2'``; the conference finals map to ``'ECF'`` / ``'WCF'`` and
    ``'Finals'`` maps to itself.  Any other title raises ``KeyError``.
    """
    titles_by_code = {
        'R1': ('Eastern Conf First Round', 'Western Conf First Round'),
        'R2': ('Eastern Conf Semifinals', 'Western Conf Semifinals'),
        'ECF': ('Eastern Conf Finals',),
        'WCF': ('Western Conf Finals',),
        'Finals': ('Finals',),
    }
    code_by_title = {
        title: code
        for code, titles in titles_by_code.items()
        for title in titles
    }
    return code_by_title[string]

In [24]:
def zCol(df,col,label,neg=False,drop=False):
    """Insert a z-score column for ``col`` immediately after it and return ``df``.

    The z-score is ``(value - mean) / std`` using pandas' default
    ``Series.mean()`` and ``Series.std()``.  ``df`` is modified in place and
    also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend (mutated in place).
    col : str
        Name of the numeric source column.
    label : str
        Name of the new column; it must not already exist in ``df``.
    neg : bool, default False
        When true, the z-scores are negated.
    drop : bool, default False
        Accepted for backward compatibility; it has no effect.
        NOTE(review): the original evaluated ``df.drop(columns=[label])`` and
        discarded the result, so this flag never removed anything.  The
        intended target is not recoverable from the code, so the no-op is
        kept rather than guessed at.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now containing ``label``.
    """
    values = df[col]
    # Compute mean/std once for the whole column instead of once per row
    # inside a row-wise apply.
    z_scores = (values - values.mean()) / values.std()
    if neg:
        z_scores = -z_scores
    df.insert(df.columns.get_loc(col) + 1, label, z_scores)
    return df

In [25]:
def analytic(df,indexCol,cols,weights,label):
    """Insert the negated weighted sum of ``cols`` right after ``indexCol`` and return ``df``.

    The new column is ``-(sum_i df[cols[i]] * weights[i])``, with ``i``
    running over the positions of ``weights``.  ``df`` is modified in place
    and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend (mutated in place).
    indexCol : str
        Existing column after which the new column is placed.
    cols : sequence of str
        Columns to combine; must be at least as long as ``weights``.
    weights : sequence of numbers
        Weight applied to the column at the same position in ``cols``.
    label : str
        Name of the new column; it must not already exist in ``df``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now containing ``label``.

    Notes
    -----
    The accumulator is built on ``df.index``.  It used to be a hard-coded
    60-row frame with a 0..59 index, which produced NaN for frames with more
    than 60 rows or with a non-default index.
    """
    total = pd.Series(0, index=df.index)
    for i, weight in enumerate(weights):
        total = total + df[cols[i]] * weight
    df.insert(df.columns.get_loc(indexCol) + 1, label, -total)
    return df