 Let's analyze some data...

In [87]:
!pip list

Package                 Version
----------------------- -----------
asttokens               3.0.0
comm                    0.2.3
contourpy               1.3.3
cycler                  0.12.1
debugpy                 1.8.16
decorator               5.2.1
et_xmlfile              2.0.0
executing               2.2.0
fonttools               4.59.1
ipykernel               6.30.1
ipython                 9.4.0
ipython_pygments_lexers 1.1.1
jedi                    0.19.2
jupyter_client          8.6.3
jupyter_core            5.8.1
kiwisolver              1.4.9
matplotlib              3.10.5
matplotlib-inline       0.1.7
nest-asyncio            1.6.0
numpy                   2.3.2
openpyxl                3.1.5
packaging               25.0
pandas                  2.3.2
parso                   0.8.5
pathlib                 1.0.1
pexpect                 4.9.0
pillow                  11.3.0
pip                     25.2
platformdirs            4.3.8
prompt_toolkit          3.0.51
psutil                  7.

I created a new environment for this project, so I ran "!pip list" to see what is already here.

I am running "!pip install" with the libraries that I know I will be using for sure. I may need to add more later, but this will get me started.

In [88]:
!pip install pandas tqdm numpy matplotlib seaborn pathlib openpyxl



Trust but verify: check that they all installed correctly by running "!pip list" again.

In [89]:
!pip list

Package                 Version
----------------------- -----------
asttokens               3.0.0
comm                    0.2.3
contourpy               1.3.3
cycler                  0.12.1
debugpy                 1.8.16
decorator               5.2.1
et_xmlfile              2.0.0
executing               2.2.0
fonttools               4.59.1
ipykernel               6.30.1
ipython                 9.4.0
ipython_pygments_lexers 1.1.1
jedi                    0.19.2
jupyter_client          8.6.3
jupyter_core            5.8.1
kiwisolver              1.4.9
matplotlib              3.10.5
matplotlib-inline       0.1.7
nest-asyncio            1.6.0
numpy                   2.3.2
openpyxl                3.1.5
packaging               25.0
pandas                  2.3.2
parso                   0.8.5
pathlib                 1.0.1
pexpect                 4.9.0
pillow                  11.3.0
pip                     25.2
platformdirs            4.3.8
prompt_toolkit          3.0.51
psutil                  7.

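To start, I will combine my spreadsheets to have a cleaner workflow.
I have ADP (average draft position) sheets for 2016-2024 and season-end sheets showing where each player finished in 2016-2024. I can combine each set across years, tagging every row with its year, so that I end up with 2 working sheets.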

In [90]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import re, csv

def extract_year(path):
    """Return the first ``20xx`` year found in the file name of ``path``.

    Only ``path.name`` (the final path component) is searched; parent
    directories are ignored.

    Args:
        path: A path-like object exposing a ``.name`` attribute.

    Returns:
        The matched four-digit year as an ``int``.

    Raises:
        ValueError: If the file name contains no ``20`` followed by two digits.
    """
    filename = path.name
    found = re.search(r"(20\d{2})", filename)
    if found is None:
        raise ValueError(f"No year in filename: {filename}")
    return int(found.group(1))

def read_csv_resilient(path: Path) -> pd.DataFrame:
    """Robust CSV reader for messy FantasyPros exports."""
    # 1) Try the fast path
    try:
        return pd.read_csv(path)
    except Exception:
        pass

    # 2) Sniff dialect + use python engine (more forgiving)
    try:
        sample = path.read_text(encoding="utf-8", errors="replace")[:10000]
        dialect = csv.Sniffer().sniff(sample)
        return pd.read_csv(
            path,
            engine="python",
            sep=dialect.delimiter,
            quotechar=dialect.quotechar,
            skip_blank_lines=True
        )
    except Exception:
        pass

    # 3) Heavy-duty fallback: assume comma, keep quotes, skip only bad lines
    return pd.read_csv(
        path,
        engine="python",
        sep=",",
        quotechar='"',
        skip_blank_lines=True,
        on_bad_lines="skip"   # logs silently; change to 'warn' if you want messages
    )

def load_yearly_csvs(folder: Path, desc: str) -> list:
    """Read every ``*.csv`` in ``folder`` and tag each frame with its year.

    Files are processed in sorted name order. The year comes from
    ``extract_year`` applied to each file and is stored in a ``Year`` column.

    Args:
        folder: Directory that holds one CSV per season.
        desc: Label shown on the tqdm progress bar.

    Returns:
        A list with one ``DataFrame`` per CSV file.

    Raises:
        ValueError: If ``folder`` contains no CSV files (or a file name has
            no year, as raised by ``extract_year``).
    """
    frames = []
    for csv_path in tqdm(sorted(folder.glob("*.csv")), desc=desc):
        year = extract_year(csv_path)
        frame = read_csv_resilient(csv_path)
        frame["Year"] = year
        frames.append(frame)
    if not frames:
        # pd.concat would fail with an opaque "No objects to concatenate".
        raise ValueError(f"No CSV files found in {folder}")
    return frames


def drop_unnamed_columns(frame: pd.DataFrame) -> pd.DataFrame:
    """Return ``frame`` without columns whose label contains "Unnamed"."""
    # str() keeps the check safe if a label is not a string.
    return frame[[c for c in frame.columns if "Unnamed" not in str(c)]]


# === COMBINE ADPs ===
adp_folder = Path("data/average_draft_pos")
adp_dfs = load_yearly_csvs(adp_folder, desc="Combining ADPs")

adps_all = drop_unnamed_columns(pd.concat(adp_dfs, ignore_index=True))  # cleanup
adps_all.to_csv("fantasypros_adp_2016_2024.csv", index=False)
print("✅ Saved fantasypros_adp_2016_2024.csv")

# === COMBINE SEASON END POINTS (Leaders) ===
leaders_folder = Path("data/season_end_points")
leaders_dfs = load_yearly_csvs(leaders_folder, desc="Combining Leaders")

leaders_all = drop_unnamed_columns(pd.concat(leaders_dfs, ignore_index=True))
leaders_all.to_csv("fantasypros_leaders_2016_2024.csv", index=False)
print("✅ Saved fantasypros_leaders_2016_2024.csv")


Combining ADPs: 100%|██████████| 9/9 [00:00<00:00, 159.35it/s]


✅ Saved fantasypros_adp_2016_2024.csv


Combining Leaders: 100%|██████████| 9/9 [00:00<00:00, 288.78it/s]

✅ Saved fantasypros_leaders_2016_2024.csv





In [91]:
# Columns carried into the slim ADP file
# (adjust names if slightly different in your CSVs).
adp_keep_cols = ['Year', 'Player', 'POS', 'Rank']

# Load the combined ADP export and select the columns in a single step.
adps_all = pd.read_csv("fantasypros_adp_2016_2024.csv").loc[:, adp_keep_cols]

# Write the reduced table next to the full one.
adps_all.to_csv("fantasypros_adp_slim.csv", index=False)


In [92]:
# Columns carried into the slim leaders file
# (adjust names if slightly different in your CSVs).
leaders_keep_cols = ['#', 'Player', 'Pos', 'Year', 'AVG', 'TTL']

# Load the combined leaders export and select the columns in a single step.
points_all = pd.read_csv("fantasypros_leaders_2016_2024.csv").loc[:, leaders_keep_cols]

# Write the reduced table next to the full one.
points_all.to_csv("fantasypros_leaders_slim.csv", index=False)



In [93]:
adps_all = pd.read_csv("fantasypros_adp_slim.csv")

# Order rows by season and then by rank, so that the first 150 rows
# of every season are that season's 150 best-ranked players.
adps_ranked = adps_all.sort_values(by=["Year", "Rank"])
adps_top150 = adps_ranked.groupby("Year").head(150)
adps_top150 = adps_top150.reset_index(drop=True)

adps_top150.to_csv("fantasypros_adp_top150.csv", index=False)
print("✅ Saved fantasypros_adp_top150.csv")


✅ Saved fantasypros_adp_top150.csv


In [94]:
points_all = pd.read_csv("fantasypros_leaders_slim.csv")

# Order rows by season and then by the "#" column, so that the first
# 150 rows of every season are that season's first 150 entries.
points_ranked = points_all.sort_values(by=["Year", "#"])
points_top150 = points_ranked.groupby("Year").head(150)
points_top150 = points_top150.reset_index(drop=True)

points_top150.to_csv("fantasypros_leaders_top150.csv", index=False)
print("✅ Saved fantasypros_leaders_top150.csv")


✅ Saved fantasypros_leaders_top150.csv


In [95]:
import pandas as pd

dfs = []
# The context manager closes the workbook handle once all sheets are read,
# so the .xlsx file is not left open for the lifetime of the kernel.
with pd.ExcelFile("data/league_history/actual_draft_history_cleaned.xlsx") as xls:
    # Grab only sheets whose name is all digits (i.e. the year sheets)
    year_sheets = [s for s in xls.sheet_names if s.isdigit()]

    for s in year_sheets:
        df = pd.read_excel(xls, sheet_name=s)
        df["Year"] = int(s)  # add year column from sheet name
        dfs.append(df)

# One table covering every year sheet, with a fresh 0..n-1 index.
drafts_all = pd.concat(dfs, ignore_index=True)

print("done")


done


In [96]:
import pandas as pd
import numpy as np

FILE = "data/league_history/actual_draft_history_cleaned.xlsx"
SHEET = "2024"   # change to whichever year you want

# header=None: treat every row as data, so columns are labelled 0, 1, 2, ...
df = pd.read_excel(FILE, sheet_name=SHEET, header=None)

# Count rows whose first four cells are all filled and whose first cell is
# numeric. The fill test comes first, so the numeric test only runs on rows
# that passed it.
count = sum(
    1
    for _, row in df.iterrows()
    if row.iloc[:4].notna().sum() == 4 and pd.api.types.is_number(row.iloc[0])
)

print(f"Rows with 4 filled cells & first numeric: {count}")


Rows with 4 filled cells & first numeric: 238


In [99]:
import pandas as pd

FILE = "data/league_history/actual_draft_history_cleaned.xlsx"
DRAFT_COLUMNS = ["PickInRound","Player","Team","Manager"]


def is_pick_number(value) -> bool:
    """Return True when ``value`` is a non-negative whole number.

    Accepts ints and digit-only strings (as ``str(value).isdigit()`` does)
    and additionally integral floats such as ``3.0``. pandas stores a numeric
    column that contains blanks as float64, so without the float case every
    pick in such a sheet would be rejected.
    """
    if isinstance(value, float):
        # NaN and infinities are not integral, so they are rejected here.
        return value.is_integer() and value >= 0
    return str(value).isdigit()


all_years = []
# The context manager closes the workbook handle once all sheets are read.
with pd.ExcelFile(FILE) as xls:
    year_sheets = [s for s in xls.sheet_names if s.isdigit()]

    for s in year_sheets:
        df = pd.read_excel(xls, sheet_name=s, header=None)

        # only the first four columns hold draft data; slicing keeps the
        # header assignment below valid if a sheet has extra trailing columns
        df = df.iloc[:, :len(DRAFT_COLUMNS)]

        # keep only rows where first cell is an integer
        df = df[df.iloc[:, 0].apply(is_pick_number)].copy()

        # assign proper headers
        df.columns = DRAFT_COLUMNS

        # normalise picks to integers so 1, 1.0 and "1" are all stored as 1
        df["PickInRound"] = df["PickInRound"].astype("int64")

        # add Year and OverallPick
        df["Year"] = int(s)
        df["OverallPick"] = range(1, len(df)+1)

        all_years.append(df)

drafts_all = pd.concat(all_years, ignore_index=True)

print(drafts_all.head(100).to_string(index=False))
# Save to CSV
output_file = "cleaned_draft_history.csv"
drafts_all.to_csv(output_file, index=False)

print(f"Saved cleaned draft history to {output_file}")


PickInRound                      Player                            Team Manager  Year  OverallPick
          1  Christian McCaffrey SF, RB              The Bowers that be     Ant  2024            1
          2         Breece Hall NYJ, RB                  Breeces Pieces    Adam  2024            2
          3         CeeDee Lamb Dal, WR                 Spin that Drake   Tyler  2024            3
          4      Bijan Robinson Atl, RB                         DreamZz    Nate  2024            4
          5         Tyreek Hill Mia, WR                        The Hill   Peete  2024            5
          6   Amon-Ra St. Brown Det, WR             Mostert On The Beat     Vic  2024            6
          7       Ja'Marr Chase Cin, WR The Battle of the JPs commences    Josh  2024            7
          8     Jonathan Taylor Ind, RB             Joel Embiid Express Rashide  2024            8
          9      Saquon Barkley Phi, RB               Mack Attack Combo     Mac  2024            9
         1