In [127]:
import os
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout
import time
import re
import pandas as pd
import numpy as np
from datetime import datetime

In [128]:
# !pip playwright install-deps firefox

In [250]:
DATA_DIR = "../sbpgs";
YR_DIR = "../leagueYrs";
PLAYOFF_SIZES = [4] * 4 + [8] * 8 + [10] * 4 + [16] + [10] * 8 + [12] * 30 + [14] * 5;
year_sizes = list(zip(PLAYOFF_SIZES,list(range(1966,2024))))
print(year_sizes)

[(4, 1966), (4, 1967), (4, 1968), (4, 1969), (8, 1970), (8, 1971), (8, 1972), (8, 1973), (8, 1974), (8, 1975), (8, 1976), (8, 1977), (10, 1978), (10, 1979), (10, 1980), (10, 1981), (16, 1982), (10, 1983), (10, 1984), (10, 1985), (10, 1986), (10, 1987), (10, 1988), (10, 1989), (10, 1990), (12, 1991), (12, 1992), (12, 1993), (12, 1994), (12, 1995), (12, 1996), (12, 1997), (12, 1998), (12, 1999), (12, 2000), (12, 2001), (12, 2002), (12, 2003), (12, 2004), (12, 2005), (12, 2006), (12, 2007), (12, 2008), (12, 2009), (12, 2010), (12, 2011), (12, 2012), (12, 2013), (12, 2014), (12, 2015), (12, 2016), (12, 2017), (12, 2018), (12, 2019), (12, 2020), (14, 2021), (14, 2022), (14, 2023)]


In [130]:
def playoff_size(year):
    return PLAYOFF_SIZES[year - 1966]

In [131]:
# DataQuest function
# add sleep to asking again
# selector defines the section of html you're looking at
async def get_html(url,selector,sleep =5, retries =3):
    html = None
    for i in range(1, retries+1):
        time.sleep(sleep *i)
        
        try:
            async with async_playwright() as p:
#                 chromium is open sourced version of chrome
                browser = await p.firefox.launch()
                page = await browser.new_page()
                await page.goto(url)
                print(await page.title())
                html = await page.inner_html(selector)
        except PlaywrightTimeout:
            print(f"Timeout error on {url}")
            continue
        else:
            break
    return html

In [143]:
def nameYear(link):
    i = link.find("years")+6;
    return link[i:i+4] + "league.htm";

In [144]:
def nameTeam(link):
    return getAbbrv(link)+str(getYear(link))+".htm";

In [132]:
# sus function, don't touch right now
async def savePath(link,directory,name,tag):
    save_path = os.path.join(directory, name(link))
    if not(os.path.exists(save_path)):
        html = await get_html(link, tag);
        with open(save_path, "w+") as f:
            f.write(html)
    else :
        with open(save_path, 'r') as f:
            html = f.read()
    return html

In [241]:
def convertResult(index):
    if index <= 0 :
        return "Missed"
    elif index == 5 :
      return "Won SB"
    elif index == 4:
      return "Lost SB"
    elif index == 3:
      return "Lost Title"
    elif index == 2:
      return "Lost Div"
    else :
      return "Lost WC"

In [134]:
def getResult2(index,length):
    if index < 0 :
        return 0
    elif index == length-1 :
      return 5
    elif index == length-2 :
      return 4
    elif index >= length-4 :
      return 3
    elif index >= length-8 :
      return 2
    else :
      return 1

In [135]:
def getYearURL(year):
    base = f"https://www.pro-football-reference.com";
    url = f"{base}/years/{year}/index.htm"
    return url

In [136]:
def twoDigitYear(year):
    return str(year)[-2:]

In [137]:
async def getPlayoffTeamsArr(url):
    base = f"https://www.pro-football-reference.com";
    a_tags = (await findLosers(url)) + (await findWinner(url));
    hrefs = [a["href"]  for a in a_tags];
    teams = [l for l in hrefs if "/teams/" in l];
    finalTeams = [base + t for t in teams]
    return finalTeams;

In [138]:
async def findWinner(url):
    html = BeautifulSoup(await savePath(url, YR_DIR, nameYear,"#div_playoff_results"))
    winner = html.find_all("td",{'data-stat': 'winner'})[-1]
    a_tag = [div.find_all("a") for div in winner]
    return sum(a_tag,[])

In [139]:
async def findLosers(url):
    html = BeautifulSoup(await savePath(url, YR_DIR, nameYear,"#div_playoff_results"))
    losers = html.find_all("td",{'data-stat': 'loser'})
    html.find_all("td",{'data-stat': 'winner'})[-1]
    a_tags = [div.find_all("a") for div in losers]
    return sum(a_tags,[])

In [140]:
async def findResult(teamURL,year):
    yearURL = getYearURL(year)
    size = playoff_size(year)
    if year > 2022 or year < 1970 :
        return "undefined"
    arr = await getPlayoffTeamsArr(yearURL)
    index = arr.index(teamURL) if teamURL in arr else -1
    return getResult2(index,size)

In [141]:
def getYear(url):
    yr = int(url[url.find(".htm")-4:url.find(".htm")]);
    return yr;

In [142]:
def getAbbrv(url):
    abbrv = url[url.index("teams/")+6:url.index(".htm")-5];
    return abbrv;

In [145]:
def getWins(string):
    w = int(string[:string.find("-")])
    return w;

In [146]:
def getGames(string):
    w = getWins(string)
    rest = string[string.find("-") + 1:]
    l = getWins(rest)
    t = int(rest[rest.find("-") + 1:])
    return w + l + t

In [147]:
# sus url manipulation to get the next year
def getNext(url,diff):
    base = url[:url.find(".htm")-5]
    yr = getYear(url)+diff;
    if((yr > 2023) or (yr < 1950)):
        return "";
    return f"{base}/{yr}.htm";

In [148]:
url = "https://www.pro-football-reference.com/years/2021/index.htm"
html = BeautifulSoup(await savePath(url, YR_DIR, nameYear,"#div_playoff_results"))
losers = html.find_all("td",{'data-stat': 'loser'})
html.find_all("td",{'data-stat': 'winner'})[-1]
# all_tags = losers.append(html.find_all("td",{'data-stat': 'winner'})[-1])
a_tags = [div.find_all("a") for div in losers]
sum(a_tags,[])

[<a href="/teams/nwe/2021.htm">New England Patriots</a>,
 <a href="/teams/rai/2021.htm">Las Vegas Raiders</a>,
 <a href="/teams/pit/2021.htm">Pittsburgh Steelers</a>,
 <a href="/teams/dal/2021.htm">Dallas Cowboys</a>,
 <a href="/teams/phi/2021.htm">Philadelphia Eagles</a>,
 <a href="/teams/crd/2021.htm">Arizona Cardinals</a>,
 <a href="/teams/oti/2021.htm">Tennessee Titans</a>,
 <a href="/teams/gnb/2021.htm">Green Bay Packers</a>,
 <a href="/teams/buf/2021.htm">Buffalo Bills</a>,
 <a href="/teams/tam/2021.htm">Tampa Bay Buccaneers</a>,
 <a href="/teams/kan/2021.htm">Kansas City Chiefs</a>,
 <a href="/teams/sfo/2021.htm">San Francisco 49ers</a>,
 <a href="/teams/cin/2021.htm">Cincinnati Bengals</a>]

In [149]:
def searchHTML(string,html,div):
    arr = html.find_all(div)
    for i,p in enumerate(arr):
        if str(p).find(string)>0 :
            return i
    return -1

In [150]:
async def searchMeta(url,div,string):
    html = BeautifulSoup(await savePath(url,DATA_DIR,nameTeam,"#meta"))
    index = searchHTML(string,html,div)
    if index < 0 :
        return "undefined"
    return html.find_all(div)[index].getText();

In [151]:
#get record from html
async def getRec(link):
    ret = await searchMeta(link,"p","Record")
    ret = ret[ret.find(":")+2:ret.find(",")]
    return ret

In [152]:
async def getFullName(url):
    html = BeautifulSoup(await savePath(url,DATA_DIR,nameTeam,"#meta"))
    ret = html.find_all("span")[1].getText()
    return ret

In [202]:
def getNick(name):
    return name.split()[-1];

In [201]:
def getNickname(year, name):
    return "'" + twoDigitYear(year) + " " + getNick(name)

In [154]:
async def getDivision(url):
    ret = await searchMeta(url,"p","Record")
    if ret == "undefined" :
        return ret;
    return ret[ret.find("\t")+1:ret.find("Div")-1]

In [155]:
async def getConf(url):
    ret = await getDivision(url)
    if ret == "undefined" :
        return ret;
    return ret[0:3]

In [156]:
async def getCoach(url):
    ret = await searchMeta(url,"p","Coach")
    if ret == "undefined" :
        return ret;
    return ret[ret.find("\n")+1:ret.find("(")-1]

In [157]:
# can be buggy for older seasons
async def getSBOdds(url):
    ret = await searchMeta(url,"p","Preseason Odds")
    if ret == "undefined" :
        return ret;
    endI = min(len(ret) - 1, ret.find(";"))
    return ret[ret.find("Bowl")+5:endI]

In [158]:
await getSBOdds("https://www.pro-football-reference.com/teams/rai/1980.htm")

'+350'

In [159]:
# can be buggy for older seasons
async def getOverUnder(url):
    ret = await searchMeta(url,"p","O/U:")
    if ret == "undefined" :
        return ret;
    return float(ret[ret.find("O/U:")+5:])

In [160]:
await searchMeta("https://www.pro-football-reference.com/teams/clt/1970.htm","p","Odds")

'undefined'

In [161]:
await getOverUnder("https://www.pro-football-reference.com/teams/kan/2021.htm")

12.5

In [162]:
async def getPFRank(url):
    ret = await searchMeta(url,"p","Points For")
    if ret == "undefined" :
        return ret;
    return ret[ret.find(")")+2:ret.find("of")-3]

In [163]:
async def getPARank(url):
    ret = await searchMeta(url,"p","Points Against")
    if ret == "undefined" :
        return ret;
    return ret[ret.find(")")+2:ret.find("of")-3]

In [164]:
async def getExpRec(url):
    ret = await searchMeta(url,"p","Expected W-L")
    if ret == "undefined" :
        return ret;
    return ret[ret.find(":")+2:]

In [165]:
async def getSRS(url):
    ret = await searchMeta(url,"p","#srs")
    if ret == "undefined" :
        return ret;
    return ret[ret.find(":")+2:ret.find("(")-1]

In [166]:
async def getSOS(url):
    ret = await searchMeta(url,"p","#sos")
    if ret == "undefined" :
        return ret;
    return ret[ret.find("SOS: ")+5:-1]

In [167]:
async def scrape_season(season):
    url = getYearURL(season)
    finalTeams = await getPlayoffTeamsArr(url)
    return finalTeams

In [168]:
# season = 2022;
# html = await scrape_season(season);
# html;

In [180]:
SEASONS = list(range(1971,2023));
SEASONS = [await scrape_season(yr) for yr in SEASONS];

In [None]:
await findResult(getNext(team,-1),year -1);

### Playoff Turnover

In [253]:
simple = [];
for teamArr in SEASONS:
    count = 0;
    farthestRound = 0;
    year = getYear(teamArr[0]);
    yearArr = [year,0,0,0,[],[]];
    topTeams = [];
    allTeams = [];
    for team in reversed(teamArr):
        prevResult = await findResult(getNext(team,-1),year -1);
        if prevResult == 0:
            full = await getFullName(team)
            yearArr[1] = yearArr[1] + 1;
            yearArr[5].append(getNick(full));
            res = await findResult(team,year);
            if res >= yearArr[2]:
                yearArr[2] = res
                yearArr[3] = convertResult(res);
                yearArr[4].append(getNick(full))
    simple.append(yearArr)

In [254]:
sf = pd.DataFrame(simple);
sf.columns = ["Year", "New_Teams","Farthest Round","Round Name","Top Teams","All Teams"]
sf.to_csv('playoffTurnover.csv', index=False, header=True)
sf

Unnamed: 0,Year,New_Teams,Farthest Round,Round Name,Top Teams,All Teams
0,1971,3,2,Lost Div,"[Redskins, Browns, Chiefs]","[Redskins, Browns, Chiefs]"
1,1972,3,3,Lost Title,[Steelers],"[Steelers, Packers, Raiders]"
2,1973,3,4,Lost SB,[Vikings],"[Vikings, Bengals, Rams]"
3,1974,2,2,Lost Div,"[Bills, Cardinals]","[Bills, Cardinals]"
4,1975,3,4,Lost SB,[Cowboys],"[Cowboys, Bengals, Colts]"
5,1976,2,2,Lost Div,"[Patriots, Redskins]","[Patriots, Redskins]"
6,1977,2,4,Lost SB,[Broncos],"[Broncos, Bears]"
7,1978,5,3,Lost Title,[Oilers],"[Oilers, Patriots, Falcons, Dolphins, Eagles]"
8,1979,3,3,Lost Title,[Buccaneers],"[Buccaneers, Chargers, Bears]"
9,1980,5,5,Won SB,[Raiders],"[Raiders, Browns, Falcons, Bills, Vikings]"


### Playoff Round Heatmap

In [193]:
simple = [];
for teamArr in SEASONS:
    for team in teamArr:
        full = await getFullName(team)
        year = getYear(team)
        simple.append([year,
                        getNickname(year,full),
                        await findResult(getNext(team,-1),year -1),
                        await findResult(team,year)])

In [219]:
sf = pd.DataFrame(simple);
sf.columns = ["Year", "Team","Prev_Round","Round"]
sf

ValueError: Length mismatch: Expected axis has 12 elements, new values have 4 elements

In [191]:
start = [];

# for teams that made the playoffs
for val in range(0, 6):
    df = sf[sf['Prev_Round'] == val]
    percents = df['Prev_Round'].value_counts(normalize=True).sort_index().tolist()
    start.append(percents)
start
# # Print the collected percentage breakdowns as a list of lists
# start = [percent for percent in start if percent]
# rounded = [[round(num, 3) for num in sublist] for sublist in start]
# mtrx = np.array(rounded)
# df = pd.DataFrame(mtrx)
# rows = ['Missed','Lost WC', 'Lost Div','Lost Title', 'Lost SB','Won SB']
# cols = ['Missed', 'Lost WC', 'Lost Div','Lost Title', 'Lost SB','Won SB']
# df.index = rows;
# df.columns = cols;
# print(df)
# df.to_csv('playoffhangover.csv', index=True, header=True)
# # print(rounded)

[[0.3622641509433962,
  0.36981132075471695,
  0.1509433962264151,
  0.07547169811320754,
  0.04150943396226415],
 [0.273972602739726,
  0.410958904109589,
  0.1917808219178082,
  0.0684931506849315,
  0.0547945205479452],
 [0.22772277227722773,
  0.3069306930693069,
  0.22772277227722773,
  0.10891089108910891,
  0.12871287128712872],
 [0.16666666666666666,
  0.3055555555555556,
  0.2638888888888889,
  0.08333333333333333,
  0.18055555555555555],
 [0.16216216216216217,
  0.40540540540540543,
  0.1891891891891892,
  0.13513513513513514,
  0.10810810810810811],
 [0.1388888888888889,
  0.3333333333333333,
  0.19444444444444445,
  0.1388888888888889,
  0.19444444444444445]]

In [45]:
df = sf[sf['Prev_Round'] == 0]
percents = df['Round'].value_counts(normalize=True).sort_index().tolist()
print(percents)

[0.3682170542635659, 0.37209302325581395, 0.1434108527131783, 0.07364341085271318, 0.04263565891472868]


In [49]:
from pathlib import Path
root_path = "~/Desktop"
output_path = Path(root_path, 'output4.csv')
# output_file_full_path = ("/","output")
testDF.to_csv(output_path, index = None)