In [15]:
import asyncio
import os
import re
import time
from datetime import datetime

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout

In [16]:
# One-time setup (Playwright CLI command, not a pip subcommand): !playwright install-deps firefox

In [17]:
# Cache directories for downloaded HTML fragments: team pages are stored under
# DATA_DIR and league-year pages under YR_DIR (see the savePath callers).
DATA_DIR = "sbpgs"
YR_DIR = "leagueYrs"

# Playoff field size per season; entry 0 corresponds to 1966 (see playoff_size).
# NOTE(review): this list has 59 entries while the pairing below spans 58 years
# (1966-2023), so zip() drops the final entry — confirm the last run length.
PLAYOFF_SIZES = [4] * 4 + [8] * 8 + [10] * 12 + [12] * 30 + [14] * 5

# (size, year) pairs, printed as a quick visual check of the table.
year_sizes = [(size, year) for size, year in zip(PLAYOFF_SIZES, range(1966, 2024))]
print(year_sizes)

[(4, 1966), (4, 1967), (4, 1968), (4, 1969), (8, 1970), (8, 1971), (8, 1972), (8, 1973), (8, 1974), (8, 1975), (8, 1976), (8, 1977), (10, 1978), (10, 1979), (10, 1980), (10, 1981), (10, 1982), (10, 1983), (10, 1984), (10, 1985), (10, 1986), (10, 1987), (10, 1988), (10, 1989), (12, 1990), (12, 1991), (12, 1992), (12, 1993), (12, 1994), (12, 1995), (12, 1996), (12, 1997), (12, 1998), (12, 1999), (12, 2000), (12, 2001), (12, 2002), (12, 2003), (12, 2004), (12, 2005), (12, 2006), (12, 2007), (12, 2008), (12, 2009), (12, 2010), (12, 2011), (12, 2012), (12, 2013), (12, 2014), (12, 2015), (12, 2016), (12, 2017), (12, 2018), (12, 2019), (14, 2020), (14, 2021), (14, 2022), (14, 2023)]


In [18]:
def playoff_size(year):
    """Return the playoff field size recorded for ``year``.

    The value is looked up in ``PLAYOFF_SIZES``, whose first entry
    corresponds to the 1966 season.

    Args:
        year: Season year as an integer.

    Returns:
        The entry of ``PLAYOFF_SIZES`` for that season.

    Raises:
        IndexError: If ``year`` is outside the range covered by the table.
    """
    offset = year - 1966
    # A negative offset would silently wrap around to the end of the list and
    # return the size of an unrelated season, so reject it explicitly.
    if offset < 0:
        raise IndexError(f"no playoff size recorded for {year}")
    return PLAYOFF_SIZES[offset]

In [19]:
# Adapted from a DataQuest example (per the original author's note).
async def get_html(url, selector, sleep=5, retries=3):
    """Fetch ``url`` in headless Firefox and return the inner HTML of ``selector``.

    Args:
        url: Page to load.
        selector: CSS selector of the element whose inner HTML is wanted.
        sleep: Base delay in seconds; attempt ``i`` (1-based) waits
            ``sleep * i`` seconds before loading the page, including the
            first attempt.
        retries: Maximum number of attempts.

    Returns:
        The element's inner HTML as a string, or ``None`` if every attempt
        ended in a Playwright timeout. Other exceptions propagate.
    """
    html = None
    for attempt in range(1, retries + 1):
        # Non-blocking delay: time.sleep() here would stall the whole event
        # loop (and the notebook kernel) for the duration of the back-off.
        await asyncio.sleep(sleep * attempt)

        try:
            async with async_playwright() as p:
                browser = await p.firefox.launch()
                try:
                    page = await browser.new_page()
                    await page.goto(url)
                    print(await page.title())
                    html = await page.inner_html(selector)
                finally:
                    # Release the browser process even when the load fails.
                    await browser.close()
        except PlaywrightTimeout:
            print(f"Timeout error on {url}")
            continue
        else:
            break
    return html

In [20]:
async def savePath(link, directory, name, tag):
    """Return the HTML fragment for ``link``, using an on-disk cache.

    Args:
        link: URL of the page.
        directory: Directory holding the cached fragments.
        name: Callable mapping ``link`` to the cache file name.
        tag: CSS selector passed to ``get_html`` on a cache miss.

    Returns:
        The cached or freshly downloaded HTML fragment as a string.

    Raises:
        RuntimeError: If the page could not be downloaded (``get_html``
            returned ``None``). Nothing is written to the cache in that case.
    """
    save_path = os.path.join(directory, name(link))

    # Cache hit: reuse the fragment saved by an earlier run.
    if os.path.exists(save_path):
        with open(save_path, "r", encoding="utf-8") as f:
            return f.read()

    html = await get_html(link, tag)
    # Check before opening the file: opening first would leave an empty cache
    # file behind, which later runs would read back as a valid (empty) page.
    if html is None:
        raise RuntimeError(f"could not download {link!r} (selector {tag!r})")

    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(save_path, "w", encoding="utf-8") as f:
        f.write(html)
    return html

In [21]:
def getResult(index):
    """Map a 0-based list position to a score from 0 to 5.

    Negative positions score 0, position 0 scores 5, position 1 scores 4,
    positions 2-3 score 3, positions 4-7 score 2 and anything later scores 1.
    """
    if index < 0:
        return 0
    if index == 0:
        return 5
    # (inclusive upper bound, score) pairs checked in ascending order.
    for upper_bound, score in ((1, 4), (3, 3), (7, 2)):
        if index <= upper_bound:
            return score
    return 1

In [22]:
def getResult2(index, length):
    """Score a list position by its distance from the end of a list.

    Negative positions score 0. Otherwise the last position
    (``length - 1``) scores 5, the one before it 4, positions from
    ``length - 4`` upward 3, positions from ``length - 8`` upward 2, and
    everything earlier 1.
    """
    if index < 0:
        return 0
    if index == length - 1:
        return 5
    if index == length - 2:
        return 4
    # (inclusive lower bound, score) pairs checked from the end backwards.
    for lower_bound, score in ((length - 4, 3), (length - 8, 2)):
        if index >= lower_bound:
            return score
    return 1

In [23]:
def getYearURL(year):
    """Build the pro-football-reference.com season index URL for ``year``."""
    site_root = "https://www.pro-football-reference.com"
    return f"{site_root}/years/{year}/index.htm"

In [24]:
async def getPlayoffTeamsArr(url):
    """Return absolute team-page URLs found in a season's playoff results.

    The links come from ``findLosers(url)`` followed by ``findWinner(url)``,
    in that order; only hrefs containing "/teams/" are kept, each prefixed
    with the site root.
    """
    site_root = "https://www.pro-football-reference.com"
    loser_links = await findLosers(url)
    winner_links = await findWinner(url)
    anchors = loser_links + winner_links
    return [site_root + a["href"] for a in anchors if "/teams/" in a["href"]]

In [25]:
async def findWinner(url):
    """Return the ``<a>`` tags inside the last "winner" cell of the playoff table.

    The "#div_playoff_results" fragment of ``url`` is loaded through
    ``savePath`` (cached under ``YR_DIR``).

    Returns:
        A list of anchor tags found in the last ``td[data-stat=winner]`` cell.

    Raises:
        IndexError: If the fragment contains no winner cells.
    """
    page = await savePath(url, YR_DIR, nameYear, "#div_playoff_results")
    soup = BeautifulSoup(page)
    winner_cells = soup.find_all("td", {"data-stat": "winner"})
    # Search the cell itself rather than each of its children: iterating the
    # children fails on text nodes and misses an anchor that is a direct child.
    return list(winner_cells[-1].find_all("a"))

In [26]:
async def findLosers(url):
    """Return every ``<a>`` tag inside the "loser" cells of the playoff table.

    The "#div_playoff_results" fragment of ``url`` is loaded through
    ``savePath`` (cached under ``YR_DIR``).

    Returns:
        A flat list of anchor tags, in the order the loser cells appear.
    """
    page = await savePath(url, YR_DIR, nameYear, "#div_playoff_results")
    soup = BeautifulSoup(page)
    loser_cells = soup.find_all("td", {"data-stat": "loser"})
    # Flatten in one pass; the previous stray winner lookup was a discarded
    # expression and has been removed.
    return [anchor for cell in loser_cells for anchor in cell.find_all("a")]

In [None]:
async def findResult(teamURL, year):
    """Score how ``teamURL`` placed in the playoffs of ``year``.

    Args:
        teamURL: Absolute team-page URL, as produced by ``getPlayoffTeamsArr``.
        year: Season year.

    Returns:
        ``"undefined"`` for seasons outside 1970-2022; otherwise the
        ``getResult2`` score of the team's position in that season's playoff
        list (0 when the team is not in the list).
    """
    # Guard first: the size lookup below must not run for unsupported years,
    # where it would raise or index the wrong season.
    if year > 2022 or year < 1970:
        return "undefined"
    teams = await getPlayoffTeamsArr(getYearURL(year))
    index = teams.index(teamURL) if teamURL in teams else -1
    return getResult2(index, playoff_size(year))

In [None]:
def getYear(url):
    """Return the four characters preceding ".htm" in ``url`` as an int."""
    suffix_at = url.find(".htm")
    return int(url[suffix_at - 4:suffix_at])

In [None]:
def getAbbrv(url):
    """Return the text between "teams/" and the "/YYYY.htm" tail of ``url``.

    Raises ValueError (from ``str.index``) if either marker is missing.
    """
    start = url.index("teams/") + 6
    # Step back over the slash plus four characters that precede ".htm".
    stop = url.index(".htm") - 5
    return url[start:stop]

In [None]:
def nameYear(link):
    """Build the cache file name for a league-year page.

    Takes the four characters following "years/" in ``link`` and appends
    "league.htm".
    """
    start = link.find("years") + 6
    return f"{link[start:start + 4]}league.htm"

In [None]:
def nameTeam(link):
    """Build the cache file name for a team page: abbreviation + year + ".htm"."""
    return f"{getAbbrv(link)}{getYear(link)}.htm"

In [None]:
def getWins(string):
    """Return the integer that precedes the first "-" in ``string``."""
    dash_at = string.find("-")
    return int(string[:dash_at])

In [None]:
def getGames(string):
    """Sum the three dash-separated integers of a record string.

    The first two numbers are read with ``getWins``; the third is whatever
    follows the second "-".
    """
    wins = getWins(string)
    after_wins = string[string.find("-") + 1:]
    losses = getWins(after_wins)
    ties = int(after_wins[after_wins.find("-") + 1:])
    return sum((wins, losses, ties))

In [None]:
def getNext(url, diff):
    """Return the URL of the same team page shifted by ``diff`` seasons.

    The year is taken from ``getYear(url)``; an empty string is returned when
    the shifted year falls outside 1950-2023.
    """
    target_year = getYear(url) + diff
    if not (1950 <= target_year <= 2023):
        return ""
    # Everything before the "/YYYY.htm" tail of the original URL.
    prefix = url[:url.find(".htm") - 5]
    return f"{prefix}/{target_year}.htm"

In [None]:
def searchHTML(string, html, div):
    """Return the index of the first ``div`` element whose markup contains ``string``.

    Args:
        string: Substring to look for in ``str(element)``.
        html: Object exposing ``find_all(div)`` (e.g. a BeautifulSoup tree).
        div: Tag name passed to ``find_all``.

    Returns:
        The 0-based position of the first matching element, or -1 if none
        matches.
    """
    for position, element in enumerate(html.find_all(div)):
        # Plain containment: the earlier `find(...) > 0` test ignored a match
        # located at the very start of the element's text.
        if string in str(element):
            return position
    return -1

In [None]:
async def searchMeta(url, div, string):
    """Return the text of the first ``div`` element in a team's "#meta" block
    whose markup contains ``string``.

    The fragment is loaded through ``savePath`` (cached under ``DATA_DIR``).
    Returns the literal string "undefined" when no element matches.
    """
    soup = BeautifulSoup(await savePath(url, DATA_DIR, nameTeam, "#meta"))
    position = searchHTML(string, soup, div)
    if position >= 0:
        return soup.find_all(div)[position].getText()
    return "undefined"

In [None]:
async def getRec(link):
    """Return the record text from a team page's "Record" paragraph.

    The result is the text between the first ": " and the first "," of that
    paragraph. Returns "undefined" when the paragraph is missing, matching the
    other metadata getters (previously the sentinel itself was sliced,
    yielding "ndefine").
    """
    ret = await searchMeta(link, "p", "Record")
    if ret == "undefined":
        return ret
    return ret[ret.find(":") + 2:ret.find(",")]

In [None]:
async def getFullName(url):
    """Return the text of the second ``<span>`` in the team page's "#meta" block.

    NOTE(review): presumably this span holds the full team name — confirm
    against the page markup.
    """
    soup = BeautifulSoup(await savePath(url, DATA_DIR, nameTeam, "#meta"))
    spans = soup.find_all("span")
    return spans[1].getText()

In [None]:
async def getDivision(url):
    """Extract the division text from the team page's "Record" paragraph.

    Returns the slice starting after the first tab and ending one character
    before "Div", or "undefined" when the paragraph is missing.
    """
    text = await searchMeta(url, "p", "Record")
    if text != "undefined":
        start = text.find("\t") + 1
        stop = text.find("Div") - 1
        return text[start:stop]
    return text

In [None]:
async def getConf(url):
    """Return the first three characters of ``getDivision(url)``.

    Passes "undefined" through unchanged.
    """
    division = await getDivision(url)
    return division if division == "undefined" else division[:3]

In [None]:
async def getCoach(url):
    """Extract the coach text from the team page's "Coach" paragraph.

    Returns the slice starting after the first newline and ending one
    character before the first "(", or "undefined" when the paragraph is
    missing.
    """
    text = await searchMeta(url, "p", "Coach")
    if text != "undefined":
        start = text.find("\n") + 1
        stop = text.find("(") - 1
        return text[start:stop]
    return text

In [None]:
# Author's caveat: can be buggy for older seasons.
async def getSBOdds(url):
    """Extract the odds text from the team page's "Preseason Odds" paragraph.

    Returns the slice starting five characters after "Bowl" and ending at the
    first ";" (capped at the last character), or "undefined" when the
    paragraph is missing.
    """
    text = await searchMeta(url, "p", "Preseason Odds")
    if text != "undefined":
        stop = min(len(text) - 1, text.find(";"))
        start = text.find("Bowl") + 5
        return text[start:stop]
    return text

In [None]:
# Ad-hoc spot check of getSBOdds on a single team page (teams/rai/1980).
await getSBOdds("https://www.pro-football-reference.com/teams/rai/1980.htm")

In [None]:
# Author's caveat: can be buggy for older seasons.
async def getOverUnder(url):
    """Return the number following "O/U:" in the team page's metadata as a float.

    Returns the string "undefined" when no paragraph contains "O/U:".
    """
    text = await searchMeta(url, "p", "O/U:")
    if text != "undefined":
        value_start = text.find("O/U:") + 5
        return float(text[value_start:])
    return text

In [None]:
# Ad-hoc spot check: raw text of the first "#meta" paragraph mentioning "Odds"
# on a single team page (teams/clt/1970).
await searchMeta("https://www.pro-football-reference.com/teams/clt/1970.htm","p","Odds")

In [None]:
# Ad-hoc spot check of getOverUnder on a single team page (teams/kan/2021).
await getOverUnder("https://www.pro-football-reference.com/teams/kan/2021.htm")

In [None]:
async def getPFRank(url):
    """Extract the rank text from the team page's "Points For" paragraph.

    Returns the slice starting two characters after the first ")" and ending
    three characters before "of", or "undefined" when the paragraph is missing.
    """
    text = await searchMeta(url, "p", "Points For")
    if text != "undefined":
        start = text.find(")") + 2
        stop = text.find("of") - 3
        return text[start:stop]
    return text

In [None]:
async def getPARank(url):
    """Extract the rank text from the team page's "Points Against" paragraph.

    Returns the slice starting two characters after the first ")" and ending
    three characters before "of", or "undefined" when the paragraph is missing.
    """
    text = await searchMeta(url, "p", "Points Against")
    if text != "undefined":
        start = text.find(")") + 2
        stop = text.find("of") - 3
        return text[start:stop]
    return text

In [None]:
async def getExpRec(url):
    """Return the text after the first ": " of the "Expected W-L" paragraph.

    Returns "undefined" when the paragraph is missing.
    """
    text = await searchMeta(url, "p", "Expected W-L")
    if text != "undefined":
        value_start = text.find(":") + 2
        return text[value_start:]
    return text

In [None]:
async def getSRS(url):
    """Extract the value from the paragraph whose markup contains "#srs".

    Returns the slice starting two characters after the first ":" and ending
    one character before the first "(", or "undefined" when no paragraph
    matches.
    """
    text = await searchMeta(url, "p", "#srs")
    if text != "undefined":
        start = text.find(":") + 2
        stop = text.find("(") - 1
        return text[start:stop]
    return text

In [None]:
async def getSOS(url):
    """Extract the value from the paragraph whose markup contains "#sos".

    Returns the text following "SOS: " with the final character dropped, or
    "undefined" when no paragraph matches.
    """
    text = await searchMeta(url, "p", "#sos")
    if text != "undefined":
        value_start = text.find("SOS: ") + 5
        return text[value_start:-1]
    return text

In [None]:
async def scrape_season(season):
    """Return the playoff team-page URLs for ``season`` (see getPlayoffTeamsArr)."""
    return await getPlayoffTeamsArr(getYearURL(season))

In [None]:
# season = 2022;
# html = await scrape_season(season);
# html;

In [None]:
SEASONS = list(range(1970,2022));
SEASONS = [await scrape_season(yr) for yr in SEASONS]

In [None]:
# for arr in SEASONS:
#     for team in arr:
#         for i in list(range(-1,2)):
#             await getRec(getNext(team,i))

In [1]:
# Display the scraped URLs. SEASONS must already exist in this kernel; the
# recorded run of this cell raised NameError because it did not.
SEASONS

NameError: name 'SEASONS' is not defined

In [182]:
simple = [];
for arrays in SEASONS:
    for team in arrays:
        year = getYear(team)
        odds = await getSBOdds(team)
        wins = await getOverUnder(team)
        simple.append([year,
                         await getFullName(team),
#                          await findResult(getNext(team,-1), year-1),
                         await findResult(team, year),
                         await findResult(getNext(team,+1), year+1),
#                          getWins(await getRec(getNext(team,-1))),
                         getWins(await getRec(team)),
                         getWins(await getRec(getNext(team,+1)))])

In [185]:
# Name the columns at construction so the cell also works when `simple` is
# empty (assigning six names to a zero-column frame raises ValueError).
# With the previous-season fields enabled in `simple`, the columns would be:
# ["Year", "Team","Prev_Round","Round","Next_Round","Prev_Wins","W","Next_Wins"]
sf = pd.DataFrame(
    simple,
    columns=["Year", "Team", "Round", "Next_Round", "W", "Next_Wins"],
)
sf

Unnamed: 0,Year,Team,Round,Next_Round,W,Next_Wins
0,1970,Cincinnati Bengals,2,0,8,4
1,1970,Detroit Lions,2,0,10,7
2,1970,Miami Dolphins,2,4,10,10
3,1970,Minnesota Vikings,2,2,12,11
4,1970,Oakland Raiders,3,0,8,8
...,...,...,...,...,...,...
573,2021,Tampa Bay Buccaneers,2,1,13,8
574,2021,Kansas City Chiefs,3,5,12,14
575,2021,San Francisco 49ers,3,3,10,13
576,2021,Cincinnati Bengals,4,3,10,12


In [107]:
# NOTE(review): `allTeams` is not defined in any visible cell — confirm where
# it comes from before relying on this cell after a kernel restart.
df = pd.DataFrame(allTeams)

In [49]:
from pathlib import Path

# Path() does not expand "~" by itself; without expanduser() the CSV would be
# written under a literal "~" directory relative to the working directory.
root_path = "~/Desktop"
output_path = Path(root_path).expanduser() / 'output4.csv'
# NOTE(review): `testDF` is not defined in any visible cell — confirm its origin.
# index=False writes the frame without the row-index column.
testDF.to_csv(output_path, index=False)