In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

def extraction_webscrapping(url, output_file, header_tag, keep_columns=None):
    response = requests.get(url)
    if response.status_code == 200:
        print("Successfully fetched the webpage.")
    else:
        print(f"Failed to fetch the webpage: {response.status_code}")

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table')
    
    headers = []
    for th in table.find('thead').find_all(header_tag):
        headers.append(th.text.strip())
    print(f"Headers found: {headers}")
    
    rows = []
    for tr in table.find('tbody').find_all('tr'):
        cells = tr.find_all('td')
        row = [cell.text.strip() for cell in cells]
        rows.append(row)
    
    df = pd.DataFrame(rows, columns=headers)
    
    if keep_columns:
        print(f"Filtering to keep columns: {keep_columns}")
        df = df[keep_columns]
    
    output_path = os.path.join('csv', output_file)
    #df.to_csv(,output_file, index=False)
    df.to_csv(output_path, index=False)
    print(f"Data successfully scraped and saved to {output_file}")
    print(f"Data successfully scraped")

def extract_wnba_player_salaries(url, output_file):
    response = requests.get(url)
    if response.status_code == 200:
        print("Successfully fetched the webpage.")
    else:
        print(f"Failed to fetch the webpage: {response.status_code}")

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table')
    
    headers = ["Player", "2024 Salary"]
    rows = []
    
    for tr in table.find('tbody').find_all('tr'):
        cells = tr.find_all('td')
        name = cells[0].text.strip().split('\n')[0]  # Only take first part for the name
        salary = cells[1].text.strip().split()[0]    # Only take the salary amount
        row = [name, salary]
        rows.append(row)
    
    df = pd.DataFrame(rows, columns=headers)
    df.to_csv(output_file, index=False)
    print(f"Data successfully scraped and saved to {output_file}")

#Number of points of a player (per game)
extraction_webscrapping("https://www.teamrankings.com/nba/player-stat/points", 'player-stat_2024.csv', 'th')
#Effective field goal % (per game):
extraction_webscrapping("https://www.teamrankings.com/nba/player-stat/efg-percentage", 'efg-percentage_2024.csv', 'th')
#Assists (per game):
extraction_webscrapping("https://www.teamrankings.com/nba/player-stat/assists", 'assists_2024.csv', 'th')
#Win score (per game):
extraction_webscrapping("https://www.teamrankings.com/nba/player-stat/win-score", 'win_score_2024.csv', 'th')
#Minutes played (per game):
extraction_webscrapping("https://www.teamrankings.com/nba/player-stat/minutes-played",'minutes_played.csv','th')
# 'rebounds-offensive.csv'
extraction_webscrapping("https://www.teamrankings.com/nba/player-stat/rebounds-offensive",'rebounds-offensive.csv','th')
# 'rebounds-defensive.csv'
extraction_webscrapping("https://www.teamrankings.com/nba/player-stat/rebounds-defensive",'rebounds-defensive.csv','th')
# 'blocks.csv'
extraction_webscrapping("https://www.teamrankings.com/nba/player-stat/blocks",'blocks.csv','th')
# steals
extraction_webscrapping("https://www.teamrankings.com/nba/player-stat/steals",'steals.csv','th')

Successfully fetched the webpage.
Headers found: ['Rank', 'Player', 'Team', 'Pos', 'Value']
Data successfully scraped and saved to player-stat_2024.csv
Data successfully scraped
Successfully fetched the webpage.
Headers found: ['Rank', 'Player', 'Team', 'Pos', 'Value']
Data successfully scraped and saved to efg-percentage_2024.csv
Data successfully scraped
Successfully fetched the webpage.
Headers found: ['Rank', 'Player', 'Team', 'Pos', 'Value']
Data successfully scraped and saved to assists_2024.csv
Data successfully scraped
Successfully fetched the webpage.
Headers found: ['Rank', 'Player', 'Team', 'Pos', 'Value']
Data successfully scraped and saved to win_score_2024.csv
Data successfully scraped
Successfully fetched the webpage.
Headers found: ['Rank', 'Player', 'Team', 'Pos', 'Value']
Data successfully scraped and saved to minutes_played.csv
Data successfully scraped
Successfully fetched the webpage.
Headers found: ['Rank', 'Player', 'Team', 'Pos', 'Value']
Data successfully scrap

In [9]:
import pandas as pd

############  READ THE CSV ###########
#Assists per game
df_assists = pd.read_csv(os.path.join('csv', 'assists_2024.csv'))
df_assists.rename(columns={'Value':'AST' } , inplace=True)

#Points per game
df_player_stats = pd.read_csv(os.path.join('csv', 'player-stat_2024.csv'))
df_player_stats.rename(columns={'Value':'PTS' } , inplace=True)

#Games Played  --> Minutes Played ?!
df_minutes_played = pd.read_csv(os.path.join('csv', 'minutes_played.csv'))
df_player_stats.rename(columns={'Value':'Minutes_played_per_game' } , inplace=True)

#Offensive rebounds (per game):
df_OR = pd.read_csv(os.path.join('csv', 'rebounds-offensive.csv'))
df_OR.rename(columns={'Value':'ORB' } , inplace=True)

#Defensive rebounds (per game):
df_DRB = pd.read_csv(os.path.join('csv', 'rebounds-defensive.csv'))
df_DRB.rename(columns={'Value':'DRB' } , inplace=True)

#Blocks (per game):
df_BLK = pd.read_csv(os.path.join('csv', 'blocks.csv'))
df_BLK.rename(columns={'Value':'BLK' } , inplace=True)

#Steal (per game):
df_STL = pd.read_csv(os.path.join('csv', 'steals.csv'))
df_STL.rename(columns={'Value':'STL' } , inplace=True)

############  Merge ###########

#Merge vs points and assist (merged):
df_points_and_assists = df_assists.merge(df_player_stats, on=["Player" , "Pos", "Team"])

#Final result: df_points_and_assists merged with df_OR (offensive rebounds) sorted by Points_per_game:
second_merge = df_points_and_assists.merge(df_OR , on=["Player" , "Pos", "Team"], how='outer')
second_merge = second_merge.dropna()

second_merge.rename(columns={'Value':'Minutes_played_per_game' } , inplace=True)
second_merge.sort_values(by=['PTS']).reset_index()
# Remove of columns : 
second_merge.drop(columns = ['Rank_x', 'Rank_y','Rank','Team','Pos'] , axis=1, inplace=True)
# Rounds every numerical columns to 1 decimal
second_merge = second_merge.round(1)

#Finally, Offensive PER computation
second_merge['O_PER'] = round((second_merge['AST'] + second_merge['PTS']  + second_merge['ORB'])/3 ,1)
display("Points_and_assist merged with offensive rebounds and sorted by Points_per_game PTS (offensive PER rounded):")
display(second_merge)
#Final csv offense
second_merge.to_csv('nba_offensive_per.csv')
second_merge = second_merge.head(50)
second_merge.to_csv('nba_top_50_offensive_per.csv')

##### Defense Merge #####
# df_BLK and df_DRB
defense = df_BLK.merge(df_DRB, on=["Player" , "Pos", "Team"], how='outer')
defense = defense.merge(df_STL , on=["Player" , "Pos", "Team"], how='outer')
defense = defense.dropna()
defense.drop(columns = ['Rank_x' ,'Rank_y' ,'Rank' ,'Pos','Team'] , axis=1, inplace=True)
#a) Merge of Pdf_BLK and df_DRB
defense['D_PER'] = round((defense['BLK'] + defense['DRB']  + defense['STL'])/3 ,1)
display('defense ',defense)
defense.to_csv('nba_player_defensive_2024.csv')
display("nba_player_defensive_2024 " , defense)
defense = defense.head(50)
defense.to_csv('nba_player_top_50_defensive_2024.csv')


'Points_and_assist merged with offensive rebounds and sorted by Points_per_game PTS (offensive PER rounded):'

Unnamed: 0,Player,AST,PTS,ORB,O_PER
0,Aaron Gordon,3.7,13.9,2.5,6.7
2,Alperen Sengun,5.0,21.1,2.9,9.7
6,Anthony Davis,3.5,25.0,3.2,10.6
10,Bam Adebayo,3.9,19.3,2.2,8.5
35,Deni Avdija,3.8,14.7,1.1,6.5
42,Domantas Sabonis,8.2,19.4,3.6,10.4
48,Evan Mobley,3.0,15.8,2.3,7.0
49,Franz Wagner,3.8,19.7,1.0,8.2
51,Giannis Antetokounmpo,6.5,30.4,2.7,13.2
61,Jaden Ivey,3.8,15.4,1.1,6.8


'defense '

Unnamed: 0,Player,BLK,DRB,STL,D_PER
4,Alperen Sengun,0.73,6.44,1.21,2.8
5,Amen Thompson,0.61,4.21,1.26,2.0
6,Andre Drummond,0.63,5.56,0.93,2.4
8,Anthony Davis,2.33,9.75,1.13,4.4
10,Ausar Thompson,0.94,4.33,1.08,2.1
12,Bam Adebayo,0.9,8.03,1.06,3.3
21,CJ McCollum,0.64,3.72,0.97,1.8
36,DeMar DeRozan,0.58,3.77,1.14,1.8
45,Domantas Sabonis,0.6,10.1,0.94,3.9
53,Evan Mobley,1.58,7.06,0.89,3.2


'nba_player_defensive_2024 '

Unnamed: 0,Player,BLK,DRB,STL,D_PER
4,Alperen Sengun,0.73,6.44,1.21,2.8
5,Amen Thompson,0.61,4.21,1.26,2.0
6,Andre Drummond,0.63,5.56,0.93,2.4
8,Anthony Davis,2.33,9.75,1.13,4.4
10,Ausar Thompson,0.94,4.33,1.08,2.1
12,Bam Adebayo,0.9,8.03,1.06,3.3
21,CJ McCollum,0.64,3.72,0.97,1.8
36,DeMar DeRozan,0.58,3.77,1.14,1.8
45,Domantas Sabonis,0.6,10.1,0.94,3.9
53,Evan Mobley,1.58,7.06,0.89,3.2
