# Imports

In [183]:
import pandas as pd
import psycopg2
import psycopg2.extras

from dotenv import load_dotenv
import os
from bs4 import BeautifulSoup
import requests

# Constants

In [31]:
dotenv_path = ("/Users/maukanmir/Documents/Machine-Learning/Web-Scraping-Code/Player-Salaries/dot.env")
load_dotenv(dotenv_path)

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")
TABLE_NAME = "player_stats"

# Functions

In [32]:
def extract_nba_stats(html_content, year):
  
  soup = BeautifulSoup(html_content, 'html.parser')
  master_df = []
  tds = soup.find_all("td")
  player_stats = [td.text.strip() for td in tds]
  player_columns = ["Rank", "Name", "Team", "GP", "MPG", "PPG", "FGM", "FGA", "FG%", "3PM", "3PA", "3P%", "FTM", "FTA", "FT%", "ORB", "DRB", "RPG", "APG", "SPG", "BPG", "TOV", "PF"]

  count = 0
  player_dict = {}
  for idx, player_stat in enumerate(player_stats):
    
    if count == 22:
      master_df.append(player_dict)
      player_dict = {}
      count = 0
    else:
      col = player_columns[count]
      player_dict[col] = player_stat
      count +=1
  
  df = pd.DataFrame(master_df)
  df["season"] = year
  return df

In [33]:
years = [str(year+1) for year in range(1990, 2024)]
pages = range(1, 4)

In [36]:
all_data= pd.DataFrame()
for year in years:
  for page in pages:
    base_url = f"https://basketball.realgm.com/nba/stats/{year}/Averages/Qualified/points/All/desc/{page}/Regular_Season"
    response = requests.get(base_url)
    try:
      df = extract_nba_stats(response.text, year)
      all_data = pd.concat([all_data, df])
    except Exception as error:
      print(f" THe error is {error}")

In [39]:
all_data.drop("Rank", inplace=True, axis=1)

In [169]:
all_data['id'] = range(1, len(all_data) + 1)

In [160]:
cols_to_change = [ col for col in all_data.columns if col not in ["Team", "Name", "GP", "season"]]

for col in cols_to_change:
  all_data[col] = all_data[col].apply(lambda x: float(x))

all_data["GP"] = all_data["GP"].apply(lambda x: int(x))
all_data["season"] = all_data["season"].apply(lambda x: int(x))

In [184]:
try:
    with psycopg2.connect(
        host=DB_HOST,
        database=DB_NAME,
        user=DB_USER,
        password=DB_PASSWORD
    ) as conn:
        with conn.cursor() as cur:
            cur.execute(f"DROP TABLE IF EXISTS {TABLE_NAME};")
            cur.execute(f"""
                CREATE TABLE {TABLE_NAME} (
                    id INTEGER PRIMARY KEY,
                    Name VARCHAR(255),
                    Team VARCHAR(10),
                    GP INTEGER,
                    MPG FLOAT,
                    PPG FLOAT,
                    FGM FLOAT,
                    FGA FLOAT,
                    "FG%" FLOAT,
                    "3PM" FLOAT,
                    "3PA" FLOAT,
                    "3P%" FLOAT,
                    FTM FLOAT,
                    FTA FLOAT,
                    "FT%" FLOAT,
                    ORB FLOAT,
                    DRB FLOAT,
                    RPG FLOAT,
                    APG FLOAT,
                    SPG FLOAT,
                    BPG FLOAT,
                    TOV FLOAT,
                    season INTEGER
                );
            """)

            insert_query = f'''
                INSERT INTO {TABLE_NAME} (
                    id, Name, Team, GP, MPG, PPG, FGM, FGA, "FG%", "3PM", "3PA", "3P%", 
                    FTM, FTA, "FT%", ORB, DRB, RPG, APG, SPG, BPG, TOV, season
                ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            '''
            rows_to_insert = [
                (   
                    row["id"],
                    row['Name'],
                    row['Team'],
                    row['GP'],
                    row['MPG'],
                    row['PPG'],
                    row['FGM'],
                    row['FGA'],
                    row['FG%'],
                    row['3PM'],
                    row['3PA'],
                    row['3P%'],
                    row['FTM'],
                    row['FTA'],
                    row['FT%'],
                    row['ORB'],
                    row['DRB'],
                    row['RPG'],
                    row['APG'],
                    row['SPG'],
                    row['BPG'],
                    row['TOV'],
                    row['season']
                )
                for index, row in all_data.iterrows()
            ]

            cur.executemany(insert_query, rows_to_insert)
            conn.commit()
            print("Data successfully written to the database.")
except Exception as e:
    print(f"Database operation failed. Error: {e}")


Database operation failed. Error: tuple index out of range


In [None]:

# # Create Chrome options if needed (optional)
# opts = Options()
# opts.add_argument("--headless")  # Example option, run Chrome in headless mode

# # Initialize the WebDriver using webdriver-manager
# web_driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)

# # Example usage
# web_driver.get("https://www.nba.com/stats/leaders?SeasonType=Regular+Season&Season=2023-24")
# print(web_driver.title)  # Prints the title of the page