# Imports

In [104]:
import os
import pandas as pd
import psycopg2

from dotenv import load_dotenv
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine

In [105]:
# Location of the env file that holds the database credentials. The historical
# absolute path stays as the default; set the DOTENV_PATH environment variable
# to run the notebook from another machine or checkout.
dotenv_path = os.getenv(
    "DOTENV_PATH",
    "/Users/maukanmir/Documents/Machine-Learning/Web-Scraping-Code/Player-Salaries/dot.env",
)
load_dotenv(dotenv_path)

# Connection settings; each one is None when the variable is not defined.
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")
TABLE_NAME = "team_salaries"

# Honour DB_PORT when it is set; when it is missing or empty the URL carries
# only the host, exactly as before.
# NOTE(review): the password is interpolated verbatim, so characters such as
# "@" or "/" would need URL-encoding — confirm the credentials are URL-safe.
db_location = f"{DB_HOST}:{DB_PORT}" if DB_PORT else DB_HOST
engine = create_engine(f'postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{db_location}/{DB_NAME}')

# Functions

In [185]:
def extract_player_info(html_content, year):
    """Parse a player salary ranking page into a DataFrame.

    Every ``<li class="list-group-item">`` element is a candidate row. The
    player name is read from its ``<div class="link">``, the salary from its
    ``<span class="medium">`` and the comma-separated team/position pair from
    its first ``<small>`` element.

    Args:
        html_content: HTML text of the ranking page.
        year: Season identifier; anything ``int()`` accepts.

    Returns:
        A DataFrame with the columns ``player``, ``team``, ``pos``,
        ``salary`` and ``season``. List items missing the name, salary or
        team/position element are skipped. ``pos`` is ``None`` when the
        team/position text contains no comma. The columns are present even
        when no player row is found.

    Raises:
        ValueError: If a salary is not an integer once "$" and "," are
            removed, or if ``year`` cannot be converted with ``int()``.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    player_data = []

    for item in soup.find_all('li', class_='list-group-item'):
        name_div = item.find('div', class_='link')
        salary_span = item.find('span', class_='medium')
        team_position_small = item.find('small')

        # Skip list items that do not carry all three pieces of a player row
        # instead of failing on a None lookup.
        if name_div is None or salary_span is None or team_position_small is None:
            continue

        # Strip each piece so the position does not keep the blank that
        # follows the comma.
        pieces = [piece.strip() for piece in team_position_small.text.split(",")]
        salary_text = salary_span.text.strip().replace("$", "").replace(",", "")

        player_data.append({
            'player': name_div.text.strip(),
            'team': pieces[0],
            'pos': pieces[1] if len(pieces) > 1 else None,
            'salary': int(salary_text),
        })

    # Naming the columns keeps the schema stable when the page has no rows.
    df = pd.DataFrame(player_data, columns=['player', 'team', 'pos', 'salary'])
    df["season"] = int(year)
    return df

def extract_team_info(html_content, year):
    """Parse a team salary page into a DataFrame of team names and salaries.

    The text of every ``<td>`` is joined with single spaces and the result is
    split on ".". The chunk before the first "." is discarded and each
    remaining chunk is reduced to its first four space-separated tokens. The
    leading alphabetic tokens of a chunk form the team name, and the token
    right after them is taken as the salary when it contains "$".

    Args:
        html_content: HTML text of the salary page.
        year: Season label, stored unchanged in the ``season`` column.

    Returns:
        A DataFrame with the columns ``Team``, ``salary`` and ``season``, one
        row per chunk that starts with an alphabetic token. ``salary`` is
        missing (NaN) for a row whose name is not followed by a "$" token.
        The columns are present even when no row is found.

    Raises:
        ValueError: If a "$" token is not an integer once "$" and "," are
            removed.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    cell_texts = [td.text.strip() for td in soup.find_all("td")]
    row_chunks = " ".join(cell_texts).split(".")[1:]

    rows = []
    for chunk in row_chunks:
        tokens = chunk.strip().split(" ")[:4]

        # Collect the run of alphabetic tokens at the start of the chunk.
        # Walking forward avoids both the out-of-range tokens[idx + 1] lookup
        # and the tokens[-1] wraparound of a manual index comparison.
        name_tokens = []
        for token in tokens:
            if not token.isalpha():
                break
            name_tokens.append(token)
        if not name_tokens:
            continue

        row = {"Team": " ".join(name_tokens)}
        remainder = tokens[len(name_tokens):]
        if remainder and "$" in remainder[0]:
            row["salary"] = int(remainder[0].replace("$", "").replace(",", ""))
        rows.append(row)

    # Naming the columns keeps the schema stable when the page has no rows.
    master_df = pd.DataFrame(rows, columns=["Team", "salary"])
    master_df["season"] = year
    return master_df
    

# Grabbing Player Salaries

In [19]:
# Seasons to scrape, as the strings interpolated into the ranking URL.
years = [str(year) for year in range(2011, 2025)]

# Collect one frame per season and concatenate once at the end instead of
# re-copying the accumulated frame on every iteration.
season_frames = []
for year in years:
    url = f"https://www.spotrac.com/nba/rankings/player/_/year/{year}/sort/cash_total"
    # A timeout stops a stalled request from hanging the cell; raising on an
    # HTTP error stops an error page from being parsed as an empty season.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    df = extract_player_info(response.text, year)
    season_frames.append(df)

all_data = pd.concat(season_frames)

In [30]:
# Number the combined rows 1..N in their current order; the value is used as
# the table's primary key when the rows are inserted.
all_data["id"] = range(1, all_data.shape[0] + 1)

# Save To Postgres

In [36]:
# Recreate the table named by TABLE_NAME and insert every row of `all_data`.
# NOTE(review): TABLE_NAME is "team_salaries", and the later "Download Dataset
# into DB" cell writes to the same table with if_exists='replace', so these
# player rows are overwritten on a full run — confirm whether a separate
# player table was intended.
conn = None
try:
    conn = psycopg2.connect(
        host=DB_HOST,
        port=DB_PORT or None,  # None/empty falls back to the driver default
        database=DB_NAME,
        user=DB_USER,
        password=DB_PASSWORD,
    )
    # Using the connection as a context manager only scopes a transaction
    # (commit on success, rollback on error). It does not close the
    # connection, which is why the finally clause below does so explicitly.
    with conn:
        # A plain cursor is enough for DDL and inserts.
        with conn.cursor() as cur:
            cur.execute(f"DROP TABLE IF EXISTS {TABLE_NAME};")

            cur.execute(f"""
            CREATE TABLE {TABLE_NAME} (
                id INTEGER PRIMARY KEY,
                player VARCHAR(255),
                team VARCHAR(12),
                pos  VARCHAR(20),
                salary INTEGER,
                season INTEGER
            );
        """)

            insert_query = f'INSERT INTO {TABLE_NAME} (id, player, team, pos, salary, season) VALUES (%s, %s, %s, %s, %s, %s)'
            insert_columns = ["id", "player", "team", "pos", "salary", "season"]
            # itertuples yields plain tuples in column order without building
            # a Series per row.
            rows_to_insert = list(all_data[insert_columns].itertuples(index=False, name=None))
            cur.executemany(insert_query, rows_to_insert)

    print("Data written to the database.")
except Exception as e:
    # Best-effort cell: report the failure instead of stopping the notebook.
    print(f"Database Failed to upload Data. The error is: {e}")
finally:
    if conn is not None:
        conn.close()
        

Data written to the database.


# NBA Team Salaries

In [141]:
# Season labels in "YYYY-YYYY" form ("1990-1991" through "2023-2024"), as
# interpolated into the team salary URLs.
years = [f"{start}-{start + 1}" for start in range(1990, 2024)]

In [180]:
# Collect one frame per season and concatenate once at the end instead of
# re-copying the accumulated frame on every iteration.
team_frames = []
for year in years:
    base_url = f"https://hoopshype.com/salaries/{year}/"
    # Request this season's page (base_url), not the `url` variable left over
    # from the player-salary loop. A timeout stops a stalled request from
    # hanging the cell; raising on an HTTP error stops an error page from
    # being parsed as an empty season.
    response = requests.get(base_url, timeout=30)
    response.raise_for_status()
    df = extract_team_info(response.text, year)
    team_frames.append(df)

all_data = pd.concat(team_frames)

In [127]:
# Team name (as it appears in the "Team" column of the scraped team salary
# data) -> three-letter team abbreviation. Only these 30 names are covered;
# looking up any other name raises KeyError.
nba_team_abbreivated = {
  "Atlanta":"ATL",
  "Cleveland": "CLE",
  "New York": "NYK",
  "Charlotte": "CHA",
  "Detroit": "DET",
  "Dallas": "DAL",
  "Philadelphia": "PHI",
  "Milwaukee": "MIL",
  "Phoenix":"PHX",
  "Brooklyn":"BKN",
  "Boston":"BOS",
  "Portland":"POR",
  "Golden State":"GSW",
  "San Antonio":"SAS",
  "Indiana":"IND",
  "Utah":"UTA",  # was "UT", the only two-letter code in the table
  "Oklahoma City":"OKC",
  "Houston":"HOU",
  "Denver":"DEN",
  "LA Clippers":"LAC",
  "Chicago":"CHI",
  "Washington":"WAS",
  "Sacramento":"SAC",
  "Miami":"MIA",
  "Minnesota":"MIN",
  "Orlando":"ORL",
  "New Orleans":"NOP",
  "Memphis":"MEM",
  "Toronto":"TOR",
  "LA Lakers":"LAL"
}

In [177]:
# Replace each team name with its abbreviation. The lookup goes through
# __getitem__ so a name missing from the table raises KeyError instead of
# becoming NaN. Running this cell twice fails for the same reason: the
# abbreviations are not keys of the table.
all_data["Team"] = all_data["Team"].map(nba_team_abbreivated.__getitem__)

# Download Dataset into DB

In [173]:
# Write `all_data` to the TABLE_NAME table through the SQLAlchemy engine,
# replacing any existing table of that name; the DataFrame index is not stored.
try:
    all_data.to_sql(name=TABLE_NAME, con=engine, if_exists="replace", index=False)
except Exception as write_error:
    # Best-effort cell: report the failure instead of stopping the notebook.
    print(f"Database operation failed. Error: {write_error}")
else:
    print("Data successfully written to the database.")

Data successfully written to the database.


In [183]:

# Scratch check: fetch one season's team salary page and tokenise it the same
# way extract_team_info does, so the intermediate values can be inspected.
base_url = "https://hoopshype.com/salaries/2023-2024/"
# A timeout stops a stalled request from hanging the cell; raising on an HTTP
# error stops an error page from being tokenised as if it were the table.
response = requests.get(base_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
tds = soup.find_all("td")
stats = [td.text.strip() for td in tds]
# Joining the cells and splitting on "." yields one chunk per "." in the
# text; the chunk before the first "." is dropped.
sep_blocks = " ".join(stats).split(".")[1:]
# Keep only the first four space-separated tokens of each chunk.
teams_salaries = [block.strip().split(" ")[:4] for block in sep_blocks]

In [184]:
# Scratch check: turn each token list in `teams_salaries` into a
# {"Team": ..., "salary": ...} dict, using the same rules as extract_team_info.
df = []
for tokens in teams_salaries:
    record = {}
    for position in range(len(tokens)):
        token = tokens[position]
        # At position 0 this index wraps around to the last token of the list.
        previous_is_word = tokens[position - 1].isalpha()

        if "$" in token:
            # Only a "$" token that directly follows a word is a salary. The
            # dict may already be in `df`; it is updated in place.
            if previous_is_word:
                digits = token.replace("$", "").replace(",", "")
                record["salary"] = int(digits)
            continue

        if not token.isalpha():
            continue

        following = tokens[position + 1]
        if following.isalpha():
            # Two consecutive words form a two-word team name.
            record["Team"] = token + " " + following
        elif previous_is_word:
            # Second word of a two-word name: already recorded above.
            continue
        else:
            record["Team"] = token
        df.append(record)
df

[{'Team': 'Golden State', 'salary': 209354737},
 {'Team': 'LA Clippers', 'salary': 201366679},
 {'Team': 'Phoenix', 'salary': 193838882},
 {'Team': 'Milwaukee', 'salary': 187346674},
 {'Team': 'Boston', 'salary': 186940921},
 {'Team': 'Denver', 'salary': 180922992},
 {'Team': 'Miami', 'salary': 177143542},
 {'Team': 'LA Lakers', 'salary': 169876920},
 {'Team': 'Dallas', 'salary': 167755884},
 {'Team': 'New Orleans', 'salary': 167403924},
 {'Team': 'Cleveland', 'salary': 166874287},
 {'Team': 'Minnesota', 'salary': 166434327},
 {'Team': 'Philadelphia', 'salary': 166271894},
 {'Team': 'Chicago', 'salary': 165630436},
 {'Team': 'Portland', 'salary': 165263993},
 {'Team': 'New York', 'salary': 164990518},
 {'Team': 'Toronto', 'salary': 163054678},
 {'Team': 'Memphis', 'salary': 162649524},
 {'Team': 'Oklahoma City', 'salary': 162515272},
 {'Team': 'Atlanta', 'salary': 159153393},
 {'Team': 'Brooklyn', 'salary': 155015136},
 {'Team': 'Sacramento', 'salary': 153564021},
 {'Team': 'Washington