# Imports

In [1]:
import os
import pandas as pd
import psycopg2

from dotenv import load_dotenv
import requests
from bs4 import BeautifulSoup

In [2]:
# Location of the dotenv file that holds the database credentials.
# Set the DOTENV_PATH environment variable to point somewhere else; the
# original machine-specific path is kept as the default.
dotenv_path = os.getenv(
    "DOTENV_PATH",
    "/Users/maukanmir/Documents/Machine-Learning/Web-Scraping-Code/Player-Salaries/dot.env",
)
load_dotenv(dotenv_path)

# Connection settings read from the environment (None when a variable is unset).
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")

# Destination table for the scraped salaries.
TABLE_NAME = "player_salaries"

# Functions

In [17]:
def extract_player_info(html_content, year):
    """Parse one season's ranking page into a DataFrame of player salaries.

    Players are read from ``<li class="list-group-item">`` elements: the name
    from ``<div class="link">``, the salary from ``<span class="medium">`` and
    the "team, position" text from the first ``<small>`` element.

    Parameters
    ----------
    html_content : str
        HTML source of the page.
    year : str or int
        Season label; converted with ``int()`` and stored in ``season``.

    Returns
    -------
    pandas.DataFrame
        One row per parsed player with the columns ``player``, ``team``,
        ``pos``, ``salary`` (int) and ``season`` (int). When nothing could be
        parsed the frame is empty and only has the ``season`` column.

    Notes
    -----
    List items that lack a name, a team/position line, or a salary that parses
    as an integer are skipped instead of raising. ``team`` and ``pos`` are
    stripped of surrounding whitespace; ``pos`` is ``None`` when the
    team/position text contains no comma.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    player_data = []

    for player in soup.find_all('li', class_='list-group-item'):
        name_div = player.find('div', class_='link')
        salary_span = player.find('span', class_='medium')
        team_position_small = player.find('small')

        # Not every list item is a complete player row; skip partial ones
        # rather than failing on a missing element.
        if name_div is None or salary_span is None or team_position_small is None:
            continue

        salary_text = salary_span.text.strip().replace("$", "").replace(",", "")
        try:
            salary = int(salary_text)
        except ValueError:
            # Salary text that is not a plain amount cannot be stored.
            continue

        # "TEAM, POS" -> ["TEAM", "POS"]; strip so "POS" has no leading space.
        block = [piece.strip() for piece in team_position_small.text.split(",")]

        player_data.append({
            'player': name_div.text.strip(),
            'team': block[0],
            'pos': block[1] if len(block) > 1 else None,
            'salary': salary,
        })

    df = pd.DataFrame(player_data)
    df["season"] = int(year)
    return df


# Grabbing Player Salaries

In [19]:
# Seasons to scrape, as the strings used in the ranking-page URL.
years = [str(year) for year in range(2011, 2025)]

# Collect one frame per season and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop copies all earlier rows each time.
season_frames = []
for year in years:
  url = f"https://www.spotrac.com/nba/rankings/player/_/year/{year}/sort/cash_total"
  # Bound the wait and stop on an HTTP error instead of parsing an error page
  # into an empty frame.
  response = requests.get(url, timeout=30)
  response.raise_for_status()
  df = extract_player_info(response.text, year)
  season_frames.append(df)

# ignore_index gives the combined frame a unique 0..N-1 index.
all_data = pd.concat(season_frames, ignore_index=True)

In [30]:
# Number the rows 1..N in their current order to serve as a surrogate key.
row_count = len(all_data)
all_data["id"] = range(1, row_count + 1)

# Save To Postgres

In [36]:
# Recreate TABLE_NAME and load every row of `all_data` into it.
# WARNING: this drops any existing table of that name first.
conn = None
try:
  conn = psycopg2.connect(
    host=DB_HOST,
    port=DB_PORT,  # None (unset) falls back to the driver default
    database=DB_NAME,
    user=DB_USER,
    password=DB_PASSWORD,
  )
  # `with conn` commits when the block succeeds and rolls back on an error;
  # it does NOT close the connection, so that happens in `finally`.
  with conn:
    # A plain cursor is enough: nothing is fetched, only DDL and INSERTs run.
    with conn.cursor() as cur:
      cur.execute(f"DROP TABLE IF EXISTS {TABLE_NAME};")

      cur.execute(f"""
          CREATE TABLE {TABLE_NAME} (
              id INTEGER PRIMARY KEY,
              player VARCHAR(255),
              team VARCHAR(12),
              pos  VARCHAR(20),
              salary INTEGER,
              season INTEGER
          );
      """)

      insert_query = f'INSERT INTO {TABLE_NAME} (id, player, team, pos, salary, season) VALUES (%s, %s, %s, %s, %s, %s)'
      rows_to_insert = [
        (row["id"], row['player'], row['team'], row['pos'], row['salary'], row['season'])
        for index, row in all_data.iterrows()
      ]
      cur.executemany(insert_query, rows_to_insert)

  print("Data written to the database.")
except Exception as e:
  # Report the failure without stopping the notebook.
  print(f"Database Failed to upload Data. The error is: {e}")
finally:
  if conn is not None:
    conn.close()
        

Data written to the database.


# Test

In [3]:
# Probe the per-player salaries page for the 1990-91 season.
url = "https://hoopshype.com/salaries/players/1990-1991/"
# Without a timeout, requests can wait on an unresponsive server indefinitely.
response = requests.get(url, timeout=30)

In [5]:
# Parse the fetched page and list the <td> cells matched by an empty class filter.
soup = BeautifulSoup(response.text, 'html.parser')
# find_all is the current name; findAll is a deprecated alias kept from BS3.
soup.find_all("td", class_="")

In [11]:
# Fetch and parse the salaries page for the 1990-91 season.
url = "https://hoopshype.com/salaries/1990-1991/"
# Without a timeout, requests can wait on an unresponsive server indefinitely.
response = requests.get(url, timeout=30)
soup = BeautifulSoup(response.text, 'html.parser')

In [20]:
# Collect the stripped text of every table cell, in document order.
# find_all is the current name; findAll is a deprecated alias kept from BS3.
tds = soup.find_all("td")
stats = [td.text.strip() for td in tds]

In [66]:
# Glue all cell texts into one string and cut it at every "."; the piece
# before the first "." is discarded.
joined_cells = " ".join(stats)
sep_blocks = joined_cells.split(".")[1:]

# Keep the first four space-separated tokens of each piece.
teams_salaries = []
for block in sep_blocks:
  tokens = block.strip().split(" ")
  teams_salaries.append(tokens[:4])

In [67]:
# Preview the first three token lists to sanity-check the split.
teams_salaries[:3]

[['Cleveland', '$14,403,000', '$33,829,743', '2'],
 ['New', 'York', '$13,290,000', '$31,215,535'],
 ['Detroit', '$12,910,000', '$30,322,989', '4']]

In [93]:
# Rebuild one {"Team", "salary"} record per scraped row. Each row is a list of
# space-split tokens, so a two-word team name arrives as two tokens.
# NOTE: `df` is a plain list of dicts here, not a DataFrame.
df = []
for salary in teams_salaries:
  teams = {}
  for idx, part in enumerate(salary):
    part = part.replace("$", "").replace(",", "")
    # Raw neighbouring tokens; "" stands in for a missing neighbour so the
    # first token does not wrap around to the end of the list and the last
    # token does not index past it.
    prev_token = salary[idx - 1] if idx > 0 else ""
    next_token = salary[idx + 1] if idx + 1 < len(salary) else ""

    if part.isalpha():
      if next_token.isalpha():
        # First word of a two-word name such as "New York".
        teams["Team"] = part + " " + next_token
      elif not prev_token.isalpha():
        # Single-word name. A word that follows another word is the tail of
        # a two-word name that was already recorded, so it is ignored.
        teams["Team"] = part
    elif part.isnumeric() and prev_token.isalpha():
      # Only the amount directly after the team name is kept; later numeric
      # tokens follow another number and are skipped.
      teams["salary"] = part

  # Add each non-empty record exactly once, after the row is fully parsed.
  if teams:
    df.append(teams)

In [94]:
# Display the parsed team records (a list of dicts).
df

[{'Team': 'Cleveland', 'salary': '14403000'},
 {'Team': 'New York', 'salary': '13290000'},
 {'Team': 'Detroit', 'salary': '12910000'},
 {'Team': 'LA Lakers', 'salary': '12120000'},
 {'Team': 'Atlanta', 'salary': '11761000'},
 {'Team': 'Dallas', 'salary': '11693000'},
 {'Team': 'Philadelphia', 'salary': '11640000'},
 {'Team': 'Milwaukee', 'salary': '11595000'},
 {'Team': 'Phoenix', 'salary': '11463000'},
 {'Team': 'Brooklyn', 'salary': '11410000'},
 {'Team': 'Boston', 'salary': '11256000'},
 {'Team': 'Portland', 'salary': '11215000'},
 {'Team': 'Golden State', 'salary': '11150000'},
 {'Team': 'San Antonio', 'salary': '11057000'},
 {'Team': 'Indiana', 'salary': '10981000'},
 {'Team': 'Utah', 'salary': '10695000'},
 {'Team': 'Oklahoma City', 'salary': '10590000'},
 {'Team': 'Houston', 'salary': '10500000'},
 {'Team': 'Charlotte', 'salary': '10417000'},
 {'Team': 'Denver', 'salary': '10335000'},
 {'Team': 'LA Clippers', 'salary': '10245000'},
 {'Team': 'Chicago', 'salary': '10040000'},
 {'