## NBA PLAYER INJURY OFFICIAL DAILY REPORT - DATA EXTRACTION MODULE
### This module extracts data from the NBA Official Daily Player Injury Report (2023-24 and 2024-25 seasons)
### Every season seems to have a different format, especially in what table.extract() returns from the PDF document
### Injury reports from the NBA are published as PDF files at URLs of the following form:
### URL: https://ak-static.cms.nba.com/referee/injury/Injury-Report_{date_str}_10AM.pdf
#### Authored by Murali Balasubramanian, DBA, Walsh College, MI, USA

## The code below is only valid for the 2024-25 NBA season
### Also beware: the All-Star break from 12-Feb-25 to 18-Feb-25 has no bulletins, so the API call will fail for this period

In [88]:
import pandas as pd
import requests
from io import BytesIO
import pdfplumber
from datetime import datetime, timedelta
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def generate_injury_report_urls(start_date, end_date):
    """
    Build one 10AM injury-report URL per calendar day in a date range.

    Args:
        start_date (str): First day of the range, in YYYY-MM-DD format
        end_date (str): Last day of the range (inclusive), in YYYY-MM-DD format

    Returns:
        list: URLs in chronological order; empty when start_date is after end_date
    """
    first_day = datetime.strptime(start_date, "%Y-%m-%d")
    last_day = datetime.strptime(end_date, "%Y-%m-%d")

    # Both bounds are midnight timestamps, so the difference is a whole number of days.
    day_count = (last_day - first_day).days + 1
    date_strings = (
        (first_day + timedelta(days=offset)).strftime("%Y-%m-%d")
        for offset in range(day_count)
    )
    return [
        f"https://ak-static.cms.nba.com/referee/injury/Injury-Report_{date_str}_10AM.pdf"
        for date_str in date_strings
    ]
    
def split_name(full_name):
    """
    Split a full name into last name and first name.

    Handles two formats:
      * "Last, First" - split at the FIRST comma only, so any text after a
        second comma (e.g. a suffix) stays in the first-name part instead of
        being dropped.
      * "First Last"  - the last whitespace-separated token is the last name.

    Args:
        full_name (str): Name to split; may be None or empty.

    Returns:
        list: [last_name, first_name]. Both are "" for None, empty or
        whitespace-only input. A single-token name is returned as the last
        name with an empty first name.
    """
    if not full_name:
        return ["", ""]

    cleaned = full_name.strip()
    if not cleaned:
        # Whitespace-only input carries no name at all.
        return ["", ""]

    # Handle "Last, First" format
    last_part, comma, first_part = cleaned.partition(',')
    if comma:
        return [last_part.strip(), first_part.strip()]

    # Handle "First Last" format
    parts = cleaned.split()
    if len(parts) >= 2:
        return [parts[-1], ' '.join(parts[:-1])]

    # If we can't split the name, return it as last name
    return [cleaned, ""]
    
def _cell(row, index):
    """Return row[index], or "" when that column is missing or empty (None/"")."""
    if index < len(row) and row[index]:
        return row[index]
    return ""


def parse_injury_pdf(pdf_url, game_date=None):
    """
    Download one injury-report PDF and extract the player rows of its first game date.

    Column layout used here (2024-25 season format): index 1 = game date,
    index 5 = page title text, index 6 = player name, index 7 = current
    status, index 8 = reason.

    Parsing stops as soon as a second row with a parseable game date is
    found, so only the first game date of the report is returned.

    Args:
        pdf_url (str): URL of the injury-report PDF.
        game_date (str, optional): Value written to 'Game Date' for player
            rows seen before any date row; overwritten by the first date
            parsed from the PDF (formatted YYYY-MM-DD).

    Returns:
        pd.DataFrame: Columns 'Game Date', 'Last Name', 'First Name',
        'Current Status', 'Reason'. Empty when the HTTP status is not 200,
        when no rows are found, or when any exception is raised (the error
        is printed, not re-raised).
    """
    try:
        print(f"Attempting to fetch: {pdf_url}")

        # Setup retry strategy for requests
        retry_strategy = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504]
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)

        # Use the session as a context manager so it is closed after the
        # download; the body is fully read by get() (no streaming).
        with requests.Session() as session:
            session.mount("http://", adapter)
            session.mount("https://", adapter)
            response = session.get(pdf_url, timeout=30)

        if response.status_code != 200:
            print(f"Failed to fetch: {pdf_url}")
            return pd.DataFrame()

        with pdfplumber.open(BytesIO(response.content)) as pdf:
            data_list = []
            date_encountered = False
            date_row_data = None

            for page in pdf.pages:
                # Build the table grid from every edge drawn on the page.
                tables = page.find_tables({
                    "vertical_strategy": "explicit",
                    "horizontal_strategy": "explicit",
                    "explicit_vertical_lines": [edge["x0"] for edge in page.edges],
                    "explicit_horizontal_lines": [edge["top"] for edge in page.edges],
                    "snap_tolerance": 5,
                    "join_tolerance": 5
                })

                for table in tables:
                    for row in table.extract():
                        # Skip empty rows
                        if not any(row):
                            print("  → Skipping empty row")
                            continue

                        # Skip page title row. _cell() guards the index, so a
                        # row with only 5 columns no longer raises IndexError
                        # (which used to discard the whole PDF).
                        if 'Injury Repor' in _cell(row, 5).strip():
                            continue

                        date_text = _cell(row, 1).strip()

                        # Skip header row (2nd column is 'GameDate')
                        if date_text == 'GameDate':
                            continue

                        # A non-empty 2nd column is expected to be a game date
                        if date_text:
                            parsed_date = None
                            # Two-digit year first, then four-digit year
                            for date_format in ('%m/%d/%y', '%m/%d/%Y'):
                                try:
                                    parsed_date = datetime.strptime(date_text, date_format)
                                    break
                                except ValueError:
                                    continue

                            if parsed_date is None:
                                # Not a date: the row is skipped entirely
                                print("  ✗ Failed to parse date with both formats")
                                continue

                            # If we already encountered a date, stop processing
                            if date_encountered:
                                print(f" Found next game date: {row[1]}")
                                print(" Stopping processing as we've reached next date")
                                return pd.DataFrame(data_list)

                            # Remember the player carried on the date row so the
                            # generic player branch below does not add it twice.
                            date_row_data = {
                                'player_name': _cell(row, 6),
                                'current_status': _cell(row, 7),
                                'reason': _cell(row, 8)
                            }

                            date_encountered = True
                            game_date = parsed_date.strftime('%Y-%m-%d')
                            print(f" Set game date: {game_date}")

                            # Only add date row if it contains player information
                            # AND we haven't added any rows yet
                            if date_row_data['player_name'] and not data_list:
                                name_parts = split_name(date_row_data['player_name'])
                                data_list.append({
                                    'Game Date': game_date,
                                    'Last Name': name_parts[0],
                                    'First Name': name_parts[1],
                                    'Current Status': date_row_data['current_status'],
                                    'Reason': date_row_data['reason']
                                })

                        # Process player data rows (need all 9 columns and a name)
                        if len(row) >= 9 and row[6]:
                            # Skip rows that have a name but no status
                            if not row[7]:
                                continue

                            # Skip if this is the same as the date row
                            if date_row_data and row[6] == date_row_data['player_name']:
                                continue

                            name_parts = split_name(row[6])
                            data_list.append({
                                'Game Date': game_date,
                                'Last Name': name_parts[0],
                                'First Name': name_parts[1],
                                'Current Status': row[7],
                                'Reason': _cell(row, 8)
                            })

            # Convert list to DataFrame at the end
            if data_list:
                return pd.DataFrame(data_list)
            return pd.DataFrame()

    except Exception as e:
        # Best-effort boundary: one bad report must not abort a batch run.
        print(f" Error parsing {pdf_url}: {str(e)}") # Error note if pdf_url is inaccessible
        return pd.DataFrame()
        
def batch_parse_injury_reports(start_date, end_date):
    """
    Parse every daily injury report in a date range and stack the results.

    Args:
        start_date (str): First day of the range, in YYYY-MM-DD format
        end_date (str): Last day of the range, in YYYY-MM-DD format

    Returns:
        pd.DataFrame: All non-empty per-day frames concatenated with a fresh
        index, or an empty DataFrame when no report produced any rows
    """
    frames = []

    for report_url in generate_injury_report_urls(start_date, end_date):
        # Visual separator between reports in the console output
        print(f"\n{'='*50}")
        report_df = parse_injury_pdf(report_url)
        if report_df.empty:
            continue
        frames.append(report_df)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


In [48]:
# Example usage
# Parse one 10AM report per day from start_date through end_date (both
# inclusive) into a single DataFrame, then print its first rows.
start_date = "2025-02-19"
end_date = "2025-06-22"
df_injury_2024_25 = batch_parse_injury_reports(start_date, end_date)
print(df_injury_2024_25.head())


🔍 Attempting to fetch: https://ak-static.cms.nba.com/referee/injury/Injury-Report_2025-02-19_10AM.pdf
  ✓ Successfully parsed date: 2025-02-19 00:00:00
  ✓ Successfully parsed date: 2025-02-20 00:00:00
📅 Found next game date: 02/20/2025
👋 Stopping processing as we've reached next game

🔍 Attempting to fetch: https://ak-static.cms.nba.com/referee/injury/Injury-Report_2025-02-20_10AM.pdf
  ✓ Successfully parsed date: 2025-02-20 00:00:00
  ✓ Successfully parsed date: 2025-02-21 00:00:00
📅 Found next game date: 02/21/2025
👋 Stopping processing as we've reached next game

🔍 Attempting to fetch: https://ak-static.cms.nba.com/referee/injury/Injury-Report_2025-02-21_10AM.pdf
  ✓ Successfully parsed date: 2025-02-21 00:00:00
  ✓ Successfully parsed date: 2025-02-22 00:00:00
📅 Found next game date: 02/22/2025
👋 Stopping processing as we've reached next game

🔍 Attempting to fetch: https://ak-static.cms.nba.com/referee/injury/Injury-Report_2025-02-22_10AM.pdf
  ✓ Successfully parsed date: 2025-0

In [49]:
# (rows, columns) of the frame returned by batch_parse_injury_reports.
df_injury_2024_25.shape

(4208, 5)

In [51]:
# Display the first ten parsed rows.
df_injury_2024_25.head(10)

Unnamed: 0,Game Date,Last Name,First Name,Current Status,Reason
0,2025-02-19,Ball,LaMelo,Probable,Injury/Illness-RightAnkle;Soreness
1,2025-02-19,Mann,Tre,Out,Injury/Illness-Disc;Herniation
2,2025-02-19,Miller,Brandon,Out,Injury/Illness-RightWrist;Ligament\nRepair
3,2025-02-19,Okogie,Josh,Out,Injury/Illness-LeftHamstring;Strain
4,2025-02-19,Williams,Grant,Out,Injury/Illness-RightACL;Repair
5,2025-02-19,Williams,Mark,Probable,ReturntoCompetition\nReconditioning
6,2025-02-19,James,LeBron,Questionable,Injury/Illness-LeftFoot;Soreness
7,2025-02-19,Kleber,Maxi,Out,Injury/Illness-RightFoot;Surgery\nRecovery
8,2025-02-20,Davison,JD,Out,GLeague-Two-Way
9,2025-02-20,Peterson,Drew,Out,GLeague-Two-Way


In [47]:
# Keep an independent copy of the parsed frame under a second name.
# NOTE(review): this cell is numbered In [47], i.e. it ran before the batch
# cell In [48] - presumably it copied the result of an earlier run; confirm.
df_injury_24_25 = df_injury_2024_25.copy()

In [56]:
# Write df_injury_24_25 to an Excel workbook (relative path), without the index column.
df_injury_24_25.to_excel("NBA_Injury_Report_2024_25.xlsx", index=False)

In [52]:
# Append the rows of df_injury_2024_25 to df_injury_24_25 and renumber the index.
df_injury_24_25 = pd.concat([df_injury_24_25,df_injury_2024_25],ignore_index=True)

In [53]:
# (rows, columns) of the combined 2024-25 frame.
df_injury_24_25.shape

(11173, 5)

## Revised code follows to extract the NBA 2022-23 season: the subtle format change relative to the 2024-25 season requires a different table extraction module

## This code is for the 2022-23 season until May 2nd, 2023
### Notes: From the beginning of the 22-23 season, the old format was followed until May 2nd, 2023
### From May 3rd, 2023, the new format was adopted, which is valid through 24-25.

In [86]:
import pandas as pd
import requests
from io import BytesIO
import pdfplumber
from datetime import datetime, timedelta
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def generate_injury_report_urls(start_date, end_date):
    """
    Build one 10AM injury-report URL per calendar day in a date range.

    Args:
        start_date (str): First day of the range, in YYYY-MM-DD format
        end_date (str): Last day of the range (inclusive), in YYYY-MM-DD format

    Returns:
        list: URLs in chronological order; empty when start_date is after end_date
    """
    first_day = datetime.strptime(start_date, "%Y-%m-%d")
    last_day = datetime.strptime(end_date, "%Y-%m-%d")

    # Both bounds are midnight timestamps, so the difference is a whole number of days.
    day_count = (last_day - first_day).days + 1
    date_strings = (
        (first_day + timedelta(days=offset)).strftime("%Y-%m-%d")
        for offset in range(day_count)
    )
    return [
        f"https://ak-static.cms.nba.com/referee/injury/Injury-Report_{date_str}_10AM.pdf"
        for date_str in date_strings
    ]
    
def split_name(full_name):
    """
    Split a full name into last name and first name.

    Handles two formats:
      * "Last, First" - split at the FIRST comma only, so any text after a
        second comma (e.g. a suffix) stays in the first-name part instead of
        being dropped.
      * "First Last"  - the last whitespace-separated token is the last name.

    Args:
        full_name (str): Name to split; may be None or empty.

    Returns:
        list: [last_name, first_name]. Both are "" for None, empty or
        whitespace-only input. A single-token name is returned as the last
        name with an empty first name.
    """
    if not full_name:
        return ["", ""]

    cleaned = full_name.strip()
    if not cleaned:
        # Whitespace-only input carries no name at all.
        return ["", ""]

    # Handle "Last, First" format
    last_part, comma, first_part = cleaned.partition(',')
    if comma:
        return [last_part.strip(), first_part.strip()]

    # Handle "First Last" format
    parts = cleaned.split()
    if len(parts) >= 2:
        return [parts[-1], ' '.join(parts[:-1])]

    # If we can't split the name, return it as last name
    return [cleaned, ""]
    
def _cell(row, index):
    """Return row[index], or "" when that column is missing or empty (None/"")."""
    if index < len(row) and row[index]:
        return row[index]
    return ""


def parse_injury_pdf(pdf_url, game_date=None):
    """
    Download one injury-report PDF and extract the player rows of its first game date.

    Column layout used here (2022-23 season format): the game date is split
    across indexes 2 and 3 (the two cells are concatenated before parsing),
    index 7 = player name, index 8 = current status, index 9 = reason.

    Parsing stops as soon as a second row with a parseable game date is
    found, so only the first game date of the report is returned. Every
    table, row and decision is printed for debugging.

    Args:
        pdf_url (str): URL of the injury-report PDF.
        game_date (str, optional): Value written to 'Game Date' for player
            rows seen before any date row; overwritten by the first date
            parsed from the PDF (formatted YYYY-MM-DD).

    Returns:
        pd.DataFrame: Columns 'Game Date', 'Last Name', 'First Name',
        'Current Status', 'Reason'. Empty when the HTTP status is not 200,
        when no rows are found, or when any exception is raised (the error
        is printed, not re-raised).
    """
    try:
        print(f"Attempting to fetch: {pdf_url}")

        # Setup retry strategy for requests
        retry_strategy = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504]
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)

        # Use the session as a context manager so it is closed after the
        # download; the body is fully read by get() (no streaming).
        with requests.Session() as session:
            session.mount("http://", adapter)
            session.mount("https://", adapter)
            response = session.get(pdf_url, timeout=30)

        if response.status_code != 200:
            print(f"Failed to fetch: {pdf_url}")
            return pd.DataFrame()

        with pdfplumber.open(BytesIO(response.content)) as pdf:
            data_list = []
            date_encountered = False
            date_row_data = None

            for page in pdf.pages:
                # Build the table grid from every edge drawn on the page.
                tables = page.find_tables({
                    "vertical_strategy": "explicit",
                    "horizontal_strategy": "explicit",
                    "explicit_vertical_lines": [edge["x0"] for edge in page.edges],
                    "explicit_horizontal_lines": [edge["top"] for edge in page.edges],
                    "snap_tolerance": 5,
                    "join_tolerance": 5
                })

                print(f" Found {len(tables)} tables on page") # For debugging

                for table in tables:
                    table_data = table.extract()
                    print(f"Table shape: {len(table_data)} rows") # For debugging

                    for i, row in enumerate(table_data):
                        print(f" Row {i} contents:") # For debugging
                        print(f" Length: {len(row)}") # For debugging
                        print(f" Row data: {row}") # For debugging

                        # Skip empty rows
                        if not any(row):
                            print("  → Skipping empty row")
                            continue

                        # No title-row check here: not relevant for the 22-23 layout.

                        # _cell() guards every index, so short rows and None
                        # cells no longer raise (which used to discard the
                        # whole PDF via the outer except).
                        date_head = _cell(row, 2).strip()

                        # Skip header row (3rd column is 'Game')
                        if date_head == 'Game':
                            print("  → Skipping header row") # For debugging
                            continue

                        # A non-empty 3rd column is expected to start a game date
                        if date_head:
                            # The date text spans the 3rd and 4th columns
                            string_date = date_head + _cell(row, 3).strip()
                            print("  → Found potential date row") # For debugging
                            print(f"  Date string: {string_date}") # For debugging

                            parsed_date = None
                            # Two-digit year first, then four-digit year
                            for date_format in ('%m/%d/%y', '%m/%d/%Y'):
                                try:
                                    parsed_date = datetime.strptime(string_date, date_format)
                                    break
                                except ValueError:
                                    continue

                            if parsed_date is None:
                                # Not a date: the row is skipped entirely
                                print("  ✗ Failed to parse date with both formats")
                                continue

                            # If we already encountered a date, stop processing
                            if date_encountered:
                                print(f" Found next game date: {_cell(row, 2) + _cell(row, 3)}")
                                print(" Stopping processing as we've reached next date")
                                return pd.DataFrame(data_list)

                            # Remember the player carried on the date row so the
                            # generic player branch below does not add it twice.
                            date_row_data = {
                                'player_name': _cell(row, 7),
                                'current_status': _cell(row, 8),
                                'reason': _cell(row, 9)
                            }

                            date_encountered = True
                            game_date = parsed_date.strftime('%Y-%m-%d')
                            print(f" Set game date: {game_date}")

                            # Only add date row if it contains player information
                            # AND we haven't added any rows yet
                            if date_row_data['player_name'] and not data_list:
                                name_parts = split_name(date_row_data['player_name'])
                                data_list.append({
                                    'Game Date': game_date,
                                    'Last Name': name_parts[0],
                                    'First Name': name_parts[1],
                                    'Current Status': date_row_data['current_status'],
                                    'Reason': date_row_data['reason']
                                })

                        # Process player data rows (need at least 9 columns and a name)
                        if len(row) >= 9 and row[7]:  # Check 8th column (index 7)
                            # Skip rows that have a name but no status
                            if not row[8]:
                                print("  → Skipping incomplete player data")
                                continue

                            # Skip if this is the same as the date row
                            if date_row_data and row[7] == date_row_data['player_name']:
                                print("  → Skipping duplicate date row entry")
                                continue

                            name_parts = split_name(row[7])
                            data_list.append({
                                'Game Date': game_date,
                                'Last Name': name_parts[0],
                                'First Name': name_parts[1],
                                'Current Status': row[8],
                                # The reason column (index 9) may be absent on a 9-column row
                                'Reason': _cell(row, 9)
                            })

            # Convert list to DataFrame at the end
            if data_list:
                return pd.DataFrame(data_list)
            return pd.DataFrame()

    except Exception as e:
        # Best-effort boundary: one bad report must not abort a batch run.
        print(f" Error parsing {pdf_url}: {str(e)}") # Error note if pdf_url is inaccessible
        return pd.DataFrame()
        
def batch_parse_injury_reports(start_date, end_date):
    """
    Parse every daily injury report in a date range and stack the results.

    Args:
        start_date (str): First day of the range, in YYYY-MM-DD format
        end_date (str): Last day of the range, in YYYY-MM-DD format

    Returns:
        pd.DataFrame: All non-empty per-day frames concatenated with a fresh
        index, or an empty DataFrame when no report produced any rows
    """
    frames = []

    for report_url in generate_injury_report_urls(start_date, end_date):
        # Visual separator between reports in the console output
        print(f"\n{'='*50}")
        report_df = parse_injury_pdf(report_url)
        if report_df.empty:
            continue
        frames.append(report_df)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


In [90]:
# (rows, columns) of the 2022-23 frame.
# NOTE(review): df_injury_2022_23 is not assigned in any visible cell -
# presumably it comes from a batch_parse_injury_reports(...) call; confirm.
df_injury_2022_23.shape

(175, 5)

In [74]:
# Keep an independent copy of the parsed 2022-23 frame under a second name.
df_injury_22_23 = df_injury_2022_23.copy()

In [91]:
# Append the rows of df_injury_2022_23 to df_injury_22_23 and renumber the index.
df_injury_22_23 = pd.concat([df_injury_22_23,df_injury_2022_23],ignore_index=True)

In [92]:
# (rows, columns) of the combined 2022-23 frame.
df_injury_22_23.shape

(9281, 5)

In [94]:
# Display the last rows of the combined 2022-23 frame.
df_injury_22_23.tail()

Unnamed: 0,Game Date,Last Name,First Name,Current Status,Reason
9276,2023-06-12,Adebayo,Bam,Available,Injury/Illness-RightShoulder;\nDiscomfort
9277,2023-06-12,Herro,Tyler,Out,Injury/Illness-RightHand;Surgery
9278,2023-06-12,Oladipo,Victor,Out,Injury/Illness-LeftKnee;Surgery
9279,2023-06-12,Vincent,Gabe,Available,Injury/Illness-LeftAnkle;Soreness
9280,2023-06-12,Zeller,Cody,Available,Injury/Illness-Nasal;Fracture


In [95]:
# Write df_injury_22_23 to an Excel workbook (relative path), without the index column.
df_injury_22_23.to_excel("NBA_Injury_Report_2022_23.xlsx", index=False)