# Imports

In [1]:
import os
import pandas as pd
import psycopg2

from dotenv import load_dotenv
import requests
from bs4 import BeautifulSoup

In [2]:
# Location of the dotenv file holding the database credentials.
# The original absolute path stays as the default so existing setups keep
# working; set PLAYER_SALARIES_DOTENV to point at a different file when the
# notebook runs on another machine.
dotenv_path = os.getenv(
    "PLAYER_SALARIES_DOTENV",
    "/Users/maukanmir/Documents/Machine-Learning/Web-Scraping-Code/Player-Salaries/dot.env",
)
load_dotenv(dotenv_path)

# Connection settings read from the environment; each is None when unset.
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")

# Destination table for the scraped salary rows.
TABLE_NAME = "player_salaries"

# Functions

In [17]:
def extract_player_info(html_content, year):
    """Parse a salary-ranking HTML page into a DataFrame of player rows.

    Every ``<li class="list-group-item">`` element is treated as one player.
    The name is read from ``<div class="link">``, the salary from
    ``<span class="medium">`` and the comma-separated "team, position" text
    from the first ``<small>`` element inside that item.

    Args:
        html_content: HTML markup of the page, as a string.
        year: Season the page belongs to; anything accepted by ``int()``.

    Returns:
        A DataFrame with the columns ``player``, ``team``, ``pos``,
        ``salary`` (int) and ``season`` (int). Items that lack a name,
        salary or team/position element, or whose salary is not a whole
        number, are skipped. A page without usable items yields an empty
        frame that still has all five columns.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    player_data = []

    for player in soup.find_all('li', class_='list-group-item'):
        name_div = player.find('div', class_='link')
        salary_span = player.find('span', class_='medium')
        team_position_small = player.find('small')

        # Incomplete list items are skipped instead of raising
        # AttributeError on a missing element.
        if name_div is None or salary_span is None or team_position_small is None:
            continue

        # Strip each piece so the position does not keep the space that
        # follows the comma; a missing position becomes None rather than
        # an IndexError.
        parts = [part.strip() for part in team_position_small.text.strip().split(",")]
        team = parts[0]
        pos = parts[1] if len(parts) > 1 else None

        salary_text = salary_span.text.strip().replace("$", "").replace(",", "")
        try:
            salary = int(salary_text)
        except ValueError:
            # Non-numeric salary text: drop this row, keep the rest of the page.
            continue

        player_data.append({
            'player': name_div.text.strip(),
            'team': team,
            'pos': pos,
            'salary': salary,
        })

    # Explicit columns keep the schema stable even when no rows were parsed.
    df = pd.DataFrame(player_data, columns=['player', 'team', 'pos', 'salary'])
    df["season"] = int(year)
    return df


# Grabbing Player Salaries

In [19]:
# Seasons to scrape, as strings because they are interpolated into the URL.
years = [str(year) for year in range(2011, 2025)]

# Collect one frame per season and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows on
# every iteration.
season_frames = []
for year in years:
  url = f"https://www.spotrac.com/nba/rankings/player/_/year/{year}/sort/cash_total"
  # The timeout stops a stalled connection from hanging the notebook.
  response = requests.get(url, timeout=30)
  # Fail loudly on an HTTP error instead of parsing an error page into an
  # empty frame.
  response.raise_for_status()
  season_frames.append(extract_player_info(response.text, year))

# ignore_index gives the combined frame a unique 0..n-1 index rather than
# repeating each season's own 0-based index.
all_data = pd.concat(season_frames, ignore_index=True)

In [30]:
# Number the rows 1..n; this column is later used as the table's primary key.
row_count = len(all_data)
all_data["id"] = range(1, row_count + 1)

# Save To Postgres

In [36]:
# Recreate TABLE_NAME and bulk-insert every row of all_data.
# The connection is opened outside the `with` statement because psycopg2's
# connection context manager only commits or rolls back the transaction; it
# does not close the connection, so that is done in `finally`.
conn = None
try:
  conn = psycopg2.connect(
      host=DB_HOST,
      port=DB_PORT,  # was loaded from the environment but never used before
      database=DB_NAME,
      user=DB_USER,
      password=DB_PASSWORD
  )
  # Leaving this block commits on success and rolls back on an exception.
  with conn:
    # A plain cursor is enough: nothing is fetched, and the previous
    # psycopg2.extras.DictCursor needed an `import psycopg2.extras` that
    # the notebook never performs.
    with conn.cursor() as cur:
      cur.execute(f"DROP TABLE IF EXISTS {TABLE_NAME};")

      cur.execute(f"""
            CREATE TABLE {TABLE_NAME} (
                id INTEGER PRIMARY KEY,
                player VARCHAR(255),
                team VARCHAR(12),
                pos  VARCHAR(20),
                salary INTEGER,
                season INTEGER
            );
        """)

      insert_query = f'INSERT INTO {TABLE_NAME} (id, player, team, pos, salary, season) VALUES (%s, %s, %s, %s, %s, %s)'
      # int() guarantees plain Python integers are handed to the driver.
      rows_to_insert = [
          (int(row["id"]), row['player'], row['team'], row['pos'], int(row['salary']), int(row['season']))
          for _, row in all_data.iterrows()
      ]
      cur.executemany(insert_query, rows_to_insert)

  # Reached only after the transaction block above committed.
  print("Data written to the database.")
except Exception as e:
  print(f"Database Failed to upload Data. The error is: {e}")
finally:
  if conn is not None:
    conn.close()
        

Data written to the database.


# Test

In [3]:
# Exploratory fetch of the HoopsHype 1990-91 salary page.
# Plain string: the URL has no placeholders, so the f-prefix was unnecessary.
url = "https://hoopshype.com/salaries/players/1990-1991/"
# The timeout keeps a stalled connection from hanging the kernel.
response = requests.get(url, timeout=30)

In [5]:
# Parse the fetched page with Python's built-in HTML parser.
soup = BeautifulSoup(markup=response.text, features='html.parser')

In [6]:
# Print each top-level node of the parsed document, one after another.
for node in soup.children:
  print(node)

html


[if IE 8]> <html lang="en-US" class="no-js ie-browser lt-ie9"> <![endif]


[if IE 9]> <html lang="en-US" class="no-js ie-browser ie9"> <![endif]


[if gt IE 9]><!
 
<html class="no-js" lang="en-US"> <!--<![endif]-->
<head>
<!-- OneTrust Cookies Consent Notice start for hoopshype.com -->
<script charset="UTF-8" data-domain-script="b547768e-cbe2-4f5f-8ebd-dc914559b41e" src="https://cdn.cookielaw.org/scripttemplates/otSDKStub.js" type="text/javascript"></script>
<script type="text/javascript">
function OptanonWrapper() { }
</script>
<!-- OneTrust Cookies Consent Notice end for hoopshype.com -->
<script>(function(H){H.className=H.className.replace(/\bno-js\b/,'js')})(document.documentElement)</script>
<meta charset="utf-8"/>
<meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<link href="http://gmpg.org/xfn/11" rel="profile"/>
<link href="https://hoopshype.com/xmlrpc.php" rel="pingback"/>
<!--[if lt IE 9]>
  

In [10]:
# find_all is the current name; findAll is a deprecated camelCase alias, and
# extract_player_info already uses find_all.
soup.find_all("td", class_="")

[<td class="">
 							1990/91						</td>,
 <td class="">
 							1990/91(*)						</td>,
 <td class="" data-value="4250000" style="color:black">
 							$4,250,000						</td>,
 <td class="" data-value="9982396" style="color:black">
 							$9,982,396						</td>,
 <td class="" data-value="3785000" style="color:black">
 							$3,785,000						</td>,
 <td class="" data-value="8890204" style="color:black">
 							$8,890,204						</td>,
 <td class="" data-value="3175000" style="color:black">
 							$3,175,000						</td>,
 <td class="" data-value="7457437" style="color:black">
 							$7,457,437						</td>,
 <td class="" data-value="2900000" style="color:black">
 							$2,900,000						</td>,
 <td class="" data-value="6811517" style="color:black">
 							$6,811,517						</td>,
 <td class="" data-value="2850000" style="color:black">
 							$2,850,000						</td>,
 <td class="" data-value="6694077" style="color:black">
 							$6,694,077						</td>,
 <td class="" data-value="2720000" style=