In [95]:
import asyncio
import os
import re
import time
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout

In [96]:
# One-time setup, run from a shell: playwright install firefox && playwright install-deps firefox

In [135]:
# Cache folders for scraped HTML fragments: DATA_DIR holds team "#meta" blocks,
# YR_DIR holds league-year "#div_playoff_results" tables.
DATA_DIR = "../sbpgs"
YR_DIR = "../leagueYrs"
# Playoff field size per season; entry i belongs to season 1966 + i (see playoff_size).
PLAYOFF_SIZES = [
    size
    for size, n_seasons in ((4, 4), (8, 8), (10, 12), (12, 30), (14, 5))
    for _ in range(n_seasons)
]
# (size, season) pairs for 1966-2023; zip stops at the shorter input.
year_sizes = list(zip(PLAYOFF_SIZES, range(1966, 2024)))
print(year_sizes)

[(4, 1966), (4, 1967), (4, 1968), (4, 1969), (8, 1970), (8, 1971), (8, 1972), (8, 1973), (8, 1974), (8, 1975), (8, 1976), (8, 1977), (10, 1978), (10, 1979), (10, 1980), (10, 1981), (10, 1982), (10, 1983), (10, 1984), (10, 1985), (10, 1986), (10, 1987), (10, 1988), (10, 1989), (12, 1990), (12, 1991), (12, 1992), (12, 1993), (12, 1994), (12, 1995), (12, 1996), (12, 1997), (12, 1998), (12, 1999), (12, 2000), (12, 2001), (12, 2002), (12, 2003), (12, 2004), (12, 2005), (12, 2006), (12, 2007), (12, 2008), (12, 2009), (12, 2010), (12, 2011), (12, 2012), (12, 2013), (12, 2014), (12, 2015), (12, 2016), (12, 2017), (12, 2018), (12, 2019), (14, 2020), (14, 2021), (14, 2022), (14, 2023)]


In [98]:
def playoff_size(year):
    """Look up the playoff field size recorded in PLAYOFF_SIZES for `year`.

    The table starts at season 1966, so the lookup index is `year - 1966`.
    No range check is performed: the list indexing rules apply as-is.
    """
    offset = year - 1966
    return PLAYOFF_SIZES[offset]

In [99]:
# Adapted from a DataQuest tutorial.
async def get_html(url, selector, sleep=5, retries=3):
    """Fetch the inner HTML of `selector` on `url` using headless Firefox.

    Args:
        url: Page to open.
        selector: CSS selector of the element whose inner HTML is wanted.
        sleep: Base delay in seconds; attempt i waits `sleep * i` before
            loading the page (this also spaces out consecutive requests).
        retries: Maximum number of attempts.

    Returns:
        The element's inner HTML, or None if every attempt timed out
        (or `retries` is 0).
    """
    html = None
    for attempt in range(1, retries + 1):
        # Linear back-off. asyncio.sleep yields to the event loop, whereas
        # time.sleep would freeze every other coroutine in the kernel.
        await asyncio.sleep(sleep * attempt)

        try:
            async with async_playwright() as p:
                browser = await p.firefox.launch()
                try:
                    page = await browser.new_page()
                    await page.goto(url)
                    print(await page.title())
                    html = await page.inner_html(selector)
                finally:
                    # Release the browser process even when the page load fails.
                    await browser.close()
        except PlaywrightTimeout:
            print(f"Timeout error on {url}")
            continue
        else:
            break
    return html

In [100]:
# Disk cache in front of get_html: each page fragment is downloaded at most once.
async def savePath(link, directory, name, tag):
    """Return the HTML fragment for `link`, downloading it only on a cache miss.

    Args:
        link: URL of the page.
        directory: Folder holding the cached fragments (created if missing).
        name: Callable mapping `link` to the cache file name.
        tag: CSS selector passed to get_html on a cache miss.

    Returns:
        The cached or freshly downloaded HTML string.

    Raises:
        RuntimeError: if the download produced nothing. No file is written in
            that case, so a later call retries instead of reading an empty cache.
    """
    save_path = os.path.join(directory, name(link))
    if os.path.exists(save_path):
        with open(save_path, "r", encoding="utf-8") as f:
            return f.read()

    html = await get_html(link, tag)
    if html is None:
        # Check before opening the file: opening in write mode would create an
        # empty file that every later call would treat as a valid cache entry.
        raise RuntimeError(f"Could not fetch {tag!r} from {link}; nothing was cached")
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(save_path, "w", encoding="utf-8") as f:
        f.write(html)
    return html

In [101]:
def getResult(index):
    """Score a position counted from the front of a list.

    Negative -> 0, 0 -> 5, 1 -> 4, 2..3 -> 3, 4..7 -> 2, anything larger -> 1.
    """
    if index < 0:
        return 0
    # (inclusive upper bound, score) pairs, checked from best to worst.
    for upper_bound, score in ((0, 5), (1, 4), (3, 3), (7, 2)):
        if index <= upper_bound:
            return score
    return 1

In [102]:
def getResult2(index, length):
    """Score a position counted from the back of a list of `length` entries.

    Negative index -> 0, last entry -> 5, second to last -> 4, otherwise
    3 if within the last four, 2 if within the last eight, else 1.
    An index past the end also lands in the "last four" bucket.
    """
    if index < 0:
        return 0
    if index == length - 1:
        return 5
    if index == length - 2:
        return 4
    if index >= length - 4:
        return 3
    return 2 if index >= length - 8 else 1

In [103]:
def getYearURL(year):
    """Build the pro-football-reference.com league index URL for `year`."""
    site_root = "https://www.pro-football-reference.com"
    return site_root + f"/years/{year}/index.htm"

In [104]:
async def getPlayoffTeamsArr(url):
    """List absolute team-page URLs found in a season's playoff results.

    Links come from findLosers followed by findWinner, so the links from the
    last "winner" cell end up at the back of the list. Only hrefs containing
    "/teams/" are kept.
    """
    site_root = "https://www.pro-football-reference.com"
    loser_links = await findLosers(url)
    winner_links = await findWinner(url)
    team_urls = []
    for anchor in loser_links + winner_links:
        href = anchor["href"]
        if "/teams/" in href:
            team_urls.append(site_root + href)
    return team_urls

In [105]:
async def findWinner(url):
    """Return the anchor tags inside the last "winner" cell of the playoff table.

    The "#div_playoff_results" fragment for `url` is loaded through savePath
    (cached under YR_DIR).

    Returns:
        A list of <a> tags found anywhere inside the last
        <td data-stat="winner"> cell.

    Raises:
        IndexError: if the fragment has no <td data-stat="winner"> cell.
    """
    html = BeautifulSoup(await savePath(url, YR_DIR, nameYear, "#div_playoff_results"))
    winner = html.find_all("td", {'data-stat': 'winner'})[-1]
    # Search the cell itself instead of each of its children: find_all only
    # looks at descendants, so the per-child search skipped an <a> sitting
    # directly in the cell and required every child to support find_all.
    return list(winner.find_all("a"))

In [106]:
async def findLosers(url):
    """Return every anchor tag found in the "loser" cells of the playoff table.

    The "#div_playoff_results" fragment for `url` is loaded through savePath
    (cached under YR_DIR). Anchors are returned in document order as one flat
    list; the list is empty when there are no "loser" cells.
    """
    html = BeautifulSoup(await savePath(url, YR_DIR, nameYear, "#div_playoff_results"))
    losers = html.find_all("td", {'data-stat': 'loser'})
    # The previous version also looked up the last "winner" cell here and threw
    # the result away; that lookup could only raise, so it has been dropped.
    return [anchor for cell in losers for anchor in cell.find_all("a")]

In [107]:
async def findResult(teamURL, year):
    """Score how far the team at `teamURL` got in the `year` playoffs.

    Args:
        teamURL: Absolute team-season URL, in the form produced by
            getPlayoffTeamsArr.
        year: Season to look up.

    Returns:
        "undefined" for seasons outside 1970-2022; otherwise the getResult2
        score of the team's position in that season's playoff list
        (0 when the team is not in the list).
    """
    # Reject unsupported seasons before touching PLAYOFF_SIZES: the lookup
    # would otherwise raise IndexError for far-future years or silently wrap
    # around for years before 1966.
    if year > 2022 or year < 1970:
        return "undefined"
    size = playoff_size(year)
    # NOTE(review): `size` comes from the static table, not from len(arr);
    # the score is off if a season's real field differs from the table — confirm.
    arr = await getPlayoffTeamsArr(getYearURL(year))
    index = arr.index(teamURL) if teamURL in arr else -1
    return getResult2(index, size)

In [108]:
def getYear(url):
    """Parse the four characters right before the first ".htm" in `url` as an int."""
    ext_at = url.find(".htm")
    return int(url[ext_at - 4:ext_at])

In [148]:
def twoDigitYear(year):
    """Return the last two characters of `year` rendered as text."""
    text = str(year)
    return text[-2:]

In [109]:
def getAbbrv(url):
    """Return the part of `url` between "teams/" and the "/YYYY.htm" suffix.

    Raises:
        ValueError: if `url` contains no "teams/" or no ".htm".
    """
    start = url.index("teams/") + 6
    # Step back over the five characters of "/YYYY" that precede ".htm".
    stop = url.index(".htm") - 5
    return url[start:stop]

In [110]:
def nameYear(link):
    """Build the cache file name for a league-year page: "<YYYY>league.htm".

    The year is the four characters that follow "years/" in `link`.
    """
    start = link.find("years") + 6
    year_text = link[start:start + 4]
    return f"{year_text}league.htm"

In [111]:
def nameTeam(link):
    """Build the cache file name for a team-season page: "<abbrv><year>.htm"."""
    return f"{getAbbrv(link)}{getYear(link)}.htm"

In [112]:
def getWins(string):
    """Parse the integer that precedes the first "-" in a record such as "14-2-0"."""
    dash_at = string.find("-")
    return int(string[:dash_at])

In [113]:
def getGames(string):
    """Add up the three numbers of a "W-L-T" record string."""
    wins = getWins(string)
    after_wins = string[string.find("-") + 1:]
    losses = getWins(after_wins)
    ties = int(after_wins[after_wins.find("-") + 1:])
    return sum((wins, losses, ties))

In [114]:
# Rewrites the year at the end of a team-season URL to reach a neighbouring season.
def getNext(url, diff):
    """Return `url` with its year shifted by `diff` seasons.

    Returns "" when the shifted year falls outside 1950-2023.
    """
    target_year = getYear(url) + diff
    if target_year < 1950 or target_year > 2023:
        return ""
    # Everything before the "/YYYY.htm" suffix.
    team_root = url[:url.find(".htm") - 5]
    return f"{team_root}/{target_year}.htm"

In [115]:
def searchHTML(string, html, div):
    """Find the first `div` element whose markup contains `string`.

    Args:
        string: Text to look for in each element's string form (markup included).
        html: Parsed document exposing `find_all`.
        div: Tag name passed to `find_all`.

    Returns:
        Index of the first matching element among `html.find_all(div)`,
        or -1 when none matches.
    """
    for i, element in enumerate(html.find_all(div)):
        # Membership test instead of `find(...) > 0`, which ignored a match
        # located at offset 0.
        if string in str(element):
            return i
    return -1

In [116]:
async def searchMeta(url, div, string):
    """Return the text of the first `div` element in a team page's "#meta" block
    whose markup contains `string`, or "undefined" when there is none.

    The fragment is loaded through savePath (cached under DATA_DIR).
    """
    raw = await savePath(url, DATA_DIR, nameTeam, "#meta")
    soup = BeautifulSoup(raw)
    position = searchHTML(string, soup, div)
    if position >= 0:
        return soup.find_all(div)[position].getText()
    return "undefined"

In [117]:
# Get the team's record from the cached "#meta" HTML.
async def getRec(link):
    """Return the text between ": " and the first "," of the "Record" paragraph.

    Returns "undefined" unchanged when the page has no "Record" paragraph,
    matching the other meta getters; slicing the sentinel used to yield the
    meaningless string "ndefine".
    """
    ret = await searchMeta(link, "p", "Record")
    if ret == "undefined":
        return ret
    return ret[ret.find(":") + 2:ret.find(",")]

In [118]:
async def getFullName(url):
    """Return the text of the second <span> in the team page's "#meta" block.

    NOTE(review): callers treat this as the full team name — confirm against
    the page layout.
    """
    raw = await savePath(url, DATA_DIR, nameTeam, "#meta")
    spans = BeautifulSoup(raw).find_all("span")
    return spans[1].getText()

In [143]:
def getNickname(year, name):
    """Build a short label such as "'89 49ers": two-digit year plus the last word of `name`."""
    return f"'{twoDigitYear(year)} {name.split()[-1]}"

In [119]:
async def getDivision(url):
    """Return the text between the first tab and "Div" in the "Record" paragraph.

    Returns "undefined" when the page has no "Record" paragraph.
    """
    record_line = await searchMeta(url, "p", "Record")
    if record_line != "undefined":
        start = record_line.find("\t") + 1
        stop = record_line.find("Div") - 1
        return record_line[start:stop]
    return record_line

In [120]:
async def getConf(url):
    """Return the first three characters of getDivision's result, or "undefined"."""
    division = await getDivision(url)
    return division if division == "undefined" else division[0:3]

In [121]:
async def getCoach(url):
    """Return the text between the first newline and " (" in the "Coach" paragraph.

    Returns "undefined" when the page has no "Coach" paragraph.
    """
    coach_line = await searchMeta(url, "p", "Coach")
    if coach_line != "undefined":
        start = coach_line.find("\n") + 1
        stop = coach_line.find("(") - 1
        return coach_line[start:stop]
    return coach_line

In [182]:
# Can be buggy for older seasons.
async def getSBOdds(url):
    """Return the odds text that follows "Bowl " in the "Preseason Odds" paragraph.

    The value ends at the first ";" if there is one, otherwise at the end of
    the paragraph. Returns "undefined" when the paragraph is missing.
    """
    odds_line = await searchMeta(url, "p", "Preseason Odds")
    if odds_line == "undefined":
        return odds_line
    semicolon_at = odds_line.find(";")
    stop = semicolon_at if semicolon_at > 0 else len(odds_line)
    start = odds_line.find("Bowl") + 5
    return odds_line[start:stop]

In [123]:
# Spot check: preseason Super Bowl odds string parsed from the rai/1980 team page.
await getSBOdds("https://www.pro-football-reference.com/teams/rai/1980.htm")

'+350'

In [124]:
# Can be buggy for older seasons.
async def getOverUnder(url):
    """Return the number after "O/U: " in the team page's meta block as a float.

    Returns "undefined" when no paragraph mentions "O/U:".
    """
    line = await searchMeta(url, "p", "O/U:")
    if line == "undefined":
        return line
    value_start = line.find("O/U:") + 5
    return float(line[value_start:])

In [125]:
# Spot check: a page with no paragraph mentioning "Odds" yields "undefined".
await searchMeta("https://www.pro-football-reference.com/teams/clt/1970.htm","p","Odds")

'undefined'

In [126]:
# Spot check: preseason over/under parsed as a float from the kan/2021 team page.
await getOverUnder("https://www.pro-football-reference.com/teams/kan/2021.htm")

12.5

In [127]:
async def getPFRank(url):
    """Extract the rank text from the "Points For" paragraph.

    The slice starts two characters after the first ")" and stops three
    characters before the first "of". Returns "undefined" when the paragraph
    is missing.
    """
    points_line = await searchMeta(url, "p", "Points For")
    if points_line != "undefined":
        start = points_line.find(")") + 2
        stop = points_line.find("of") - 3
        return points_line[start:stop]
    return points_line

In [128]:
async def getPARank(url):
    """Extract the rank text from the "Points Against" paragraph.

    The slice starts two characters after the first ")" and stops three
    characters before the first "of". Returns "undefined" when the paragraph
    is missing.
    """
    points_line = await searchMeta(url, "p", "Points Against")
    if points_line != "undefined":
        start = points_line.find(")") + 2
        stop = points_line.find("of") - 3
        return points_line[start:stop]
    return points_line

In [129]:
async def getExpRec(url):
    """Return everything after ": " in the "Expected W-L" paragraph, or "undefined"."""
    expected_line = await searchMeta(url, "p", "Expected W-L")
    if expected_line != "undefined":
        value_start = expected_line.find(":") + 2
        return expected_line[value_start:]
    return expected_line

In [130]:
async def getSRS(url):
    """Return the text between the first ": " and " (" of the paragraph whose
    markup contains "#srs", or "undefined" when there is no such paragraph.
    """
    srs_line = await searchMeta(url, "p", "#srs")
    if srs_line != "undefined":
        start = srs_line.find(":") + 2
        stop = srs_line.find("(") - 1
        return srs_line[start:stop]
    return srs_line

In [131]:
async def getSOS(url):
    """Return the text after "SOS: " in the paragraph whose markup contains
    "#sos", with the paragraph's final character dropped.

    Returns "undefined" when there is no such paragraph.
    """
    sos_line = await searchMeta(url, "p", "#sos")
    if sos_line != "undefined":
        value_start = sos_line.find("SOS: ") + 5
        return sos_line[value_start:-1]
    return sos_line

In [132]:
async def scrape_season(season):
    """Return the playoff team URLs for `season`, as produced by getPlayoffTeamsArr."""
    return await getPlayoffTeamsArr(getYearURL(season))

In [133]:
# season = 2022;
# html = await scrape_season(season);
# html;

In [187]:
# Seasons 1989 through 2021 (the range end is exclusive).
SEASONS = list(range(1989,2022));
# Rebinds SEASONS: from here on it holds one list of playoff team URLs per
# season, with the links from the last "winner" cell at the end of each list.
SEASONS = [await scrape_season(yr) for yr in SEASONS]

In [188]:
# The last URL of each season's list comes from the final "winner" cell.
winners = [season_teams[-1] for season_teams in SEASONS]

In [199]:
# Spot check: odds from the page one season after ram/1999, reached through getNext.
(await getSBOdds(getNext("https://www.pro-football-reference.com/teams/ram/1999.htm",1)))

'+300'

In [200]:
simple = [];
for winner in winners:
    full = await getFullName(winner)
    year = getYear(winner)
    odds = await getSBOdds(getNext(winner,1))
    wins = await getOverUnder(getNext(winner,1))
    simple.append([year,
                    getNickname(year,full),
                    getWins(await getRec(getNext(winner,1))),
                    await findResult(getNext(winner,1),year +1),
                     odds,
                     int(odds[1:]),
                     wins])

In [201]:
# Display the collected rows.
simple

[[1989, "'89 49ers", 14, 3, '+350', 350, 11.5],
 [1990, "'90 Giants", 8, 0, '+400', 400, 11.0],
 [1991, "'91 Redskins", 9, 2, '+600', 600, 11.5],
 [1992, "'92 Cowboys", 12, 5, '+350', 350, 11.5],
 [1993, "'93 Cowboys", 12, 3, '+300', 300, 11.0],
 [1994, "'94 49ers", 11, 2, '+200', 200, 12.5],
 [1995, "'95 Cowboys", 10, 2, '+600', 600, 10.5],
 [1996, "'96 Packers", 13, 4, '+250', 250, 12.0],
 [1997, "'97 Broncos", 14, 5, '+600', 600, 11.0],
 [1998, "'98 Broncos", 6, 0, '+500', 500, 10.5],
 [1999, "'99 Rams", 10, 1, '+300', 300, 11.0],
 [2000, "'00 Ravens", 10, 2, '+800', 800, 11.0],
 [2001, "'01 Patriots", 9, 0, '+2000', 2000, 8.0],
 [2002, "'02 Buccaneers", 7, 0, '+800', 800, 10.5],
 [2003, "'03 Patriots", 14, 5, '+600', 600, 10.5],
 [2004, "'04 Patriots", 10, 2, '+500', 500, 11.0],
 [2005, "'05 Steelers", 8, 0, '+1200', 1200, 10.5],
 [2006, "'06 Colts", 13, 2, '+800', 800, 10.5],
 [2007, "'07 Giants", 12, 2, '+2000', 2000, 8.5],
 [2008, "'08 Steelers", 9, 0, '+1000', 1000, 10.5],
 [20

In [202]:
# Label the collected rows, write them to sbOdds.csv, then display the frame.
sf = pd.DataFrame(simple)
sf.columns = [
    "Year",
    "Team",
    "N_Wins",
    "N_Round",
    "Odds_Str",
    "Odds",
    "Over-Under",
]
# sf.columns = ["Year", "Team","Prev_Round","Round","Next_Round","Prev_Wins","W","Next_Wins"]
sf.to_csv("sbOdds.csv", header=True, index=False)
sf

Unnamed: 0,Year,Team,N_Wins,N_Round,Odds_Str,Odds,Over-Under
0,1989,'89 49ers,14,3,350,350,11.5
1,1990,'90 Giants,8,0,400,400,11.0
2,1991,'91 Redskins,9,2,600,600,11.5
3,1992,'92 Cowboys,12,5,350,350,11.5
4,1993,'93 Cowboys,12,3,300,300,11.0
5,1994,'94 49ers,11,2,200,200,12.5
6,1995,'95 Cowboys,10,2,600,600,10.5
7,1996,'96 Packers,13,4,250,250,12.0
8,1997,'97 Broncos,14,5,600,600,11.0
9,1998,'98 Broncos,6,0,500,500,10.5


In [107]:
# NOTE(review): `allTeams` is not defined in any cell visible in this notebook;
# this cell appears to rely on leftover kernel state — TODO confirm or remove.
df = pd.DataFrame(allTeams)

In [49]:
# Export a frame to the user's Desktop.
# NOTE(review): `testDF` is not defined in any cell visible in this notebook —
# TODO confirm which frame is meant before running.
root_path = "~/Desktop"
# Expand "~" explicitly so the path is valid for any consumer, not only pandas.
output_path = Path(root_path).expanduser() / "output4.csv"
# output_file_full_path = ("/","output")
testDF.to_csv(output_path, index=False)