In [None]:
# STEP 1: Import libraries
import os
from io import StringIO
from urllib.parse import quote

import pandas as pd  # type: ignore
import requests  # type: ignore
from bs4 import BeautifulSoup  # type: ignore
from dotenv import load_dotenv # type: ignore
from sqlalchemy import create_engine # type: ignore
from tqdm import tqdm  # type: ignore

# STEP 2: Load environment variables
# load_dotenv() is called first so that settings kept in a local .env file
# are visible to the os.getenv() lookups below.
load_dotenv()

# PostgreSQL connection settings; each is None when its variable is unset
# (except DB_PORT, which has a fallback).
# NOTE(review): host/port are read from DB_* variables while the database
# name and credentials are read from PG_* variables — confirm this matches
# the .env file.
DB_HOST = os.getenv("DB_HOST")
# The fallback is a string so DB_PORT has the same type whether or not the
# variable is set (os.getenv returns a str for variables that are present).
DB_PORT = os.getenv("DB_PORT", "5432")
DB_NAME = os.getenv("PG_DB")
DB_USER = os.getenv("PG_USER")
DB_PASS = os.getenv("PG_PASSWORD")

# STEP 3: Scrape standings from Baseball-Reference
# One standings page is requested per season. Every <table> whose id
# contains "standings" is parsed into a DataFrame, tagged with its season in
# a 'Year' column, and collected in all_data.
years = list(range(2011, 2025))
all_data = []

print("🔄 Scraping standings from Baseball-Reference...")

for year in tqdm(years):
    url = f"https://www.baseball-reference.com/leagues/MLB/{year}-standings.shtml"
    try:
        # Without a timeout a stalled connection would hang the loop forever.
        response = requests.get(url, timeout=30)
        # An error page (e.g. 429 or 5xx) has no standings tables; parsing it
        # would silently drop the season, so report the failure instead.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"⚠️ Failed to fetch standings for {year}: {e}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    for table in soup.find_all("table"):
        if "standings" not in table.get("id", ""):
            continue
        try:
            # Wrap the markup in StringIO: passing literal HTML to read_html
            # is deprecated and emits a FutureWarning for every table.
            df = pd.read_html(StringIO(str(table)))[0]
            # Only tables that carry a wins column are kept.
            if 'W' in df.columns:
                df['Year'] = year
                all_data.append(df)
        except Exception as e:
            # Best effort: one unparsable table must not abort the whole run.
            print(f"⚠️ Failed to read table for {year}: {e}")

# STEP 4: Combine all years into a single DataFrame
# pd.concat raises an opaque "No objects to concatenate" error for an empty
# list, so fail early with a message that says what actually went wrong.
if not all_data:
    raise ValueError(
        "No standings tables were scraped, so there is nothing to combine."
    )
records_df = pd.concat(all_data, ignore_index=True)

# STEP 5: Clean and standardize
# Replace the site's abbreviated column headers with descriptive names.
records_df = records_df.rename(columns={
    'Tm': 'Team',
    'W': 'Wins',
    'L': 'Losses',
    'W-L%': 'Win_Percentage'
})
# Keep only rows that have a team name.
records_df = records_df[records_df['Team'].notna()]

print("✅ Sample of cleaned data:")
print(records_df.head())

# STEP 6: Save locally to CSV
# The file is written to the current working directory; index=False keeps
# the DataFrame's row index out of the output.
csv_path = "mlb_team_records_2011_2024.csv"
records_df.to_csv(csv_path, index=False)
print("✅ CSV file saved as mlb_team_records_2011_2024.csv")

# STEP 7: Upload to PostgreSQL
# Fail early with a clear message instead of building a URL that contains
# the literal text "None" for a setting that was never provided.
_missing_settings = [
    name
    for name, value in (
        ("DB_HOST", DB_HOST),
        ("DB_NAME", DB_NAME),
        ("DB_USER", DB_USER),
    )
    if value is None
]
if _missing_settings:
    raise RuntimeError(
        f"Missing database settings: {', '.join(_missing_settings)}"
    )

# Percent-encode the credentials: characters such as "@", ":" or "/" in a
# user name or password would otherwise be read as URL delimiters.
_credentials = quote(DB_USER, safe="")
if DB_PASS is not None:
    # An unset password is omitted rather than sent as the text "None".
    _credentials += ":" + quote(DB_PASS, safe="")

connection_string = (
    f"postgresql+psycopg2://{_credentials}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)
engine = create_engine(connection_string)

# if_exists="replace" discards any existing sql_project.mlb_team_records
# table and recreates it from records_df on every run.
records_df.to_sql("mlb_team_records", engine, schema="sql_project", if_exists="replace", index=False)
print("✅ Data loaded to sql_project.mlb_team_records in PostgreSQL.")


  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(

                Team  Wins  Losses  Win_Percentage    GB  Year
0   New York Yankees    97      65           0.599    --  2011
1     Tampa Bay Rays    91      71           0.562   6.0  2011
2     Boston Red Sox    90      72           0.556   7.0  2011
3  Toronto Blue Jays    81      81           0.500  16.0  2011
4  Baltimore Orioles    69      93           0.426  28.0  2011



