In [22]:
import datetime as dt   # To work with date and time objects
import logging          # For logging information, warnings, and errors
import random           # For generating random numbers (used in backoff delay)
import time             # For adding delays (sleep)
from urllib.parse import urljoin  # For resolving hrefs against a base URL

import requests as req  # Used for making HTTP requests
from googlesearch import search  # To perform Google searches and retrieve URLs
from lxml import html   # For parsing HTML content using XPath

# Root logging setup: show records at INFO level and above.
logging.basicConfig(level=logging.INFO)
# Logger named after this module; the functions below report through it.
logger = logging.getLogger(__name__)

# One shared requests.Session so every fetch goes through the same session
# and connections can be reused across requests.
session = req.Session()

# User-Agent string imitating a desktop Chrome browser.
_BROWSER_USER_AGENT = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/111.0.0.0 Safari/537.36")

# Headers sent with every request; the browser-like User-Agent is meant to
# lower the chance of the scraped sites blocking the client.
DEFAULT_HEADERS = {"User-Agent": _BROWSER_USER_AGENT}

def safe_xpath(context, query: str) -> str:
    """
    Safely run an XPath query and return its first result as stripped text.

    XPath evaluation does not always produce a list of strings: a query
    such as ``string(...)`` yields a single string, and a query that selects
    nodes yields elements. All of these are reduced to one stripped string
    here instead of raising or returning a single character.

    Args:
        context: Any object exposing ``xpath(query)`` (for example an lxml
            element or document).
        query (str): The XPath query to run.

    Returns:
        str: The first result with surrounding whitespace stripped.
             Returns an empty string if the query produced nothing
             (any falsy result, e.g. an empty list or empty string).
    """
    result = context.xpath(query)
    # Any falsy result (empty list, empty string, ...) means "nothing found".
    if not result:
        return ""

    # Node-set queries return a list; scalar queries return the value itself.
    first = result[0] if isinstance(result, (list, tuple)) else result

    if isinstance(first, str):
        return first.strip()

    # Element-like results: use their text content when they expose it.
    text_content = getattr(first, "text_content", None)
    if callable(text_content):
        return text_content().strip()

    # Numbers / booleans from scalar XPath expressions.
    return str(first).strip()


def parse_sherdog_fighter(url: str, timeout: float = 10.0) -> dict:
    """
    Parse a Sherdog fighter profile page into a dictionary.

    The page is fetched with the shared session, parsed with lxml, and
    queried with XPath for biography fields, the win/loss breakdown and
    one entry per row of the fight-history table.

    Args:
        url (str): The URL of the Sherdog fighter profile.
        timeout (float): Seconds to wait for the server before giving up
            (default 10). Without a timeout the request could block
            indefinitely on an unresponsive host.

    Returns:
        dict: Fighter details with nested 'wins' and 'losses' dictionaries
              and a 'fights' list. Text fields missing from the page are
              empty strings; missing breakdown entries are "0".

    Raises:
        requests.RequestException: If the request fails, times out or
            returns an error status (the error is logged, then re-raised).
    """
    try:
        response = session.get(url, headers=DEFAULT_HEADERS, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions.
        response.raise_for_status()
    except req.RequestException as e:
        logger.error(f"Error fetching Sherdog URL {url}: {e}")
        raise

    xml = html.document_fromstring(response.content)

    def nth_or_zero(values, index):
        # Breakdown cells are positional; a missing cell counts as "0".
        return values[index].strip() if len(values) > index else "0"

    def breakdown(total_query, detail_values):
        # Positions 0-3 are read as KO/TKO, submissions, decisions, others.
        return {
            'total': safe_xpath(xml, total_query),
            'ko/tko': nth_or_zero(detail_values, 0),
            'submissions': nth_or_zero(detail_values, 1),
            'decisions': nth_or_zero(detail_values, 2),
            'others': nth_or_zero(detail_values, 3),
        }

    # Per-method win/loss counts, in page order.
    wins_detailed = xml.xpath("//div[@class='wins']/div[@class='meter']/div[1]/text()")
    losses_detailed = xml.xpath("//div[@class='loses']/div[@class='meter']/div[1]/text()")

    # Scope nickname/nationality lookups to the bio box when it exists;
    # otherwise fall back to searching the whole document.
    bio_list = xml.xpath("//div[@class='fighter-info']")
    bio = bio_list[0] if bio_list else xml

    fighter = {
        'name': safe_xpath(xml, "//span[@class='fn']/text()"),
        'nickname': safe_xpath(bio, ".//span[@class='nickname']/em/text()"),
        'nationality': safe_xpath(bio, ".//strong[@itemprop='nationality']/text()"),
        'birthplace': safe_xpath(xml, "//span[@class='locality']/text()"),
        'birthdate': safe_xpath(xml, "//span[@itemprop='birthDate']/text()"),
        'age': safe_xpath(xml, "//span[@itemprop='birthDate']/preceding-sibling::b/text()"),
        'height': safe_xpath(xml, "//b[@itemprop='height']/text()"),
        'weight': safe_xpath(xml, "//b[@itemprop='weight']/text()"),
        'association': safe_xpath(xml, "//span[@itemprop='memberOf']/a/span/text()"),
        'weight_class': safe_xpath(xml, "//div[@class='association-class']/a/text()"),
        'wins': breakdown("//div[@class='winloses win']/span[2]/text()", wins_detailed),
        'losses': breakdown("//div[@class='winloses lose']/span[2]/text()", losses_detailed),
        'fights': []
    }

    # Every non-header row of the fight-history table is one fight.
    fight_rows = xml.xpath("//table[@class='new_table fighter']/tr[not(@class='table_head')]")
    for row in fight_rows:
        event_href = safe_xpath(row, "td[3]/a/@href")
        fighter['fights'].append({
            'name': safe_xpath(row, "td[3]/a/descendant-or-self::*/text()"),
            'date': safe_xpath(row, "td[3]/span/text()"),
            # Only build an absolute URL when the row actually has a link;
            # otherwise the bare domain would masquerade as an event URL.
            'url': "https://www.sherdog.com" + event_href if event_href else "",
            'result': safe_xpath(row, "td[1]/span/text()"),
            'method': safe_xpath(row, "td[4]/b/text()"),
            'referee': safe_xpath(row, "td[4]/span/a/text()"),
            'round': safe_xpath(row, "td[5]/text()"),
            'time': safe_xpath(row, "td[6]/text()"),
            'opponent': safe_xpath(row, "td[2]/a/text()")
        })
    return fighter


def get_ufc_stats(url: str, timeout: float = 10.0) -> dict:
    """
    Extract UFC fighter statistics from a UFC.com athlete page.

    The page is fetched with the shared session and parsed with lxml.
    Values are read by position from several XPath result lists, so the
    mapping below depends on the order in which the page lists them.

    Args:
        url (str): The URL of the UFC athlete profile.
        timeout (float): Seconds to wait for the server before giving up
            (default 10). Without a timeout the request could block
            indefinitely on an unresponsive host.

    Returns:
        dict: Bio fields ('status', 'fight_style', 'octagondebut', 'reach',
              'legreach') plus nested 'strikes' and 'takedowns'
              dictionaries. Missing statistics default to "0" and missing
              bio fields to "Unknown".

    Raises:
        requests.RequestException: If the request fails, times out or
            returns an error status (the error is logged, then re-raised).
    """
    try:
        response = session.get(url, headers=DEFAULT_HEADERS, timeout=timeout)
        response.raise_for_status()
    except req.RequestException as e:
        logger.error(f"Error fetching UFC URL {url}: {e}")
        raise

    xml = html.document_fromstring(response.content)

    def pick(values, index, default="0"):
        # Positional lookup with a default for pages that lack the entry.
        return values[index].strip() if len(values) > index else default

    def leading_token(values, index):
        # Keep only the text before the first space of the entry.
        if len(values) > index and values[index]:
            return values[index].split(" ")[0]
        return "0"

    # Raw value lists, in document order.
    distance = xml.xpath("//div[@class='c-stat-3bar__value']/text()")
    stats = xml.xpath("//div[@class='c-stat-compare__number']/text()")
    bio = xml.xpath("//div[@class='c-bio__text']/text()")
    # Text of every <dd> element; empty elements count as "0".
    str_tds = [item.text.strip() if item.text else "0" for item in xml.xpath("//dd")]

    # NOTE(review): all indices below assume the current page layout
    # (order of <dd>, compare-number and bio entries) - confirm if the
    # site markup changes.
    fighter_stats = {
        'status': pick(bio, 0, "Unknown"),
        'fight_style': pick(bio, 3, "Unknown"),
        'octagondebut': pick(bio, 8, "Unknown"),
        'reach': pick(bio, 9, "Unknown"),
        'legreach': pick(bio, 10, "Unknown"),
        'strikes': {
            'attempted': pick(str_tds, 1),
            'landed': pick(str_tds, 0),
            'standing': leading_token(distance, 0),
            'clinch': leading_token(distance, 1),
            'ground': leading_token(distance, 2),
            'striking defense': pick(stats, 4),
            'strikes per minute': pick(stats, 0)
        },
        'takedowns': {
            'attempted': pick(str_tds, 3),
            'landed': pick(str_tds, 2),
            'takedown defense': pick(stats, 5),
            'subs per 15min': pick(stats, 3)
        }
    }
    return fighter_stats


def get_sherdog_link(query: str, max_retries: int = 3) -> str:
    """
    Searches for the Sherdog fighter profile link using a Google search query.

    "Sherdog" is appended to the query and the first few results are
    scanned, in order, for a fighter profile URL. Failed attempts are
    retried with a randomized, exponentially growing delay. No delay is
    spent after the final attempt, since nothing follows it but the error.

    Args:
        query (str): The fighter's name or search query.
        max_retries (int): Maximum number of search attempts (default is 3).

    Returns:
        str: A valid Sherdog fighter profile URL.

    Raises:
        Exception: If a valid Sherdog link is not found within the retry limit.
    """
    search_query = f"{query} Sherdog"

    for attempt in range(1, max_retries + 1):
        try:
            # Ask for several results so the filter below has alternatives
            # when the top hit is a news or event page.
            search_results = list(search(search_query, num_results=5))
            if search_results:
                for url in search_results:
                    if "sherdog.com/fighter/" in url and "/news/" not in url:
                        return url
                logger.warning(f"No valid Sherdog fighter profile URL found for query: {query}")
            else:
                logger.warning(f"No search results found for query: {query}")
        except Exception as e:
            # Best effort: any search failure just counts as a failed attempt.
            logger.error(f"Error retrieving search results for '{query}': {e}")

        # Back off only when another attempt will actually follow.
        if attempt < max_retries:
            backoff_delay = random.uniform(1, 5) * (2 ** attempt)
            logger.info(f"Retrying in {backoff_delay:.2f} seconds... (attempt {attempt}/{max_retries})")
            time.sleep(backoff_delay)

    raise Exception(f"Sherdog link not found for query: {query}")


def get_ufc_link(query: str) -> str:
    """
    Find a fighter's UFC.com athlete page through a Google search.

    The query is suffixed with " UFC.com", up to five results are requested,
    and the first URL containing "ufc.com/athlete/" is returned.

    Args:
        query (str): The fighter's name or search query.

    Returns:
        str: The UFC athlete profile URL.

    Raises:
        Exception: If the search itself fails (logged, then re-raised) or
            if none of the results is an athlete page.
    """
    try:
        # Materialize inside the try so errors raised while the results
        # are being produced are logged as well.
        candidates = list(search(query + " UFC.com", num_results=5))
    except Exception as e:
        logger.error(f"Error during Google search for UFC link with query '{query}': {e}")
        raise

    # First result that looks like an athlete profile, if any.
    athlete_url = next((link for link in candidates if "ufc.com/athlete/" in link), None)
    if athlete_url is None:
        raise Exception("UFC link not found!")
    return athlete_url


def get_fighter(query: str) -> dict:
    """
    Build a combined fighter record from Sherdog and UFC.com.

    Both profile URLs are looked up first; the Sherdog profile is then
    parsed and the UFC statistics are merged into that same dictionary.

    Args:
        query (str): The fighter's name or search query.

    Returns:
        dict: The Sherdog fighter dictionary updated with the UFC statistics.
    """
    # Resolve both profile URLs before fetching either page.
    sherdog_url = get_sherdog_link(query)
    ufc_url = get_ufc_link(query)

    # Sherdog data is the base record; UFC stats are layered on top.
    profile = parse_sherdog_fighter(sherdog_url)
    ufc_stats = get_ufc_stats(ufc_url)
    profile.update(ufc_stats)
    return profile


def get_upcoming_event_links(timeout: float = 10.0) -> list:
    """
    Retrieves the upcoming UFC event links from the UFC events page.

    The events page is fetched with the shared session and the hrefs found
    under the "events-list-upcoming" section are resolved against the site
    root, so root-relative, relative and already-absolute hrefs all yield
    well-formed URLs (plain concatenation would produce a double slash for
    hrefs that start with "/").

    Args:
        timeout (float): Seconds to wait for the server before giving up
            (default 10). Without a timeout the request could block
            indefinitely on an unresponsive host.

    Returns:
        list: A list of fully-qualified URLs for upcoming UFC events.

    Raises:
        requests.RequestException: If the request fails, times out or
            returns an error status (the error is logged, then re-raised).
    """
    base_url = 'https://www.ufc.com/'
    try:
        response = session.get(base_url + 'events', headers=DEFAULT_HEADERS, timeout=timeout)
        response.raise_for_status()
    except req.RequestException as e:
        logger.error(f"Error fetching UFC events page: {e}")
        raise

    xml = html.document_fromstring(response.content)
    # hrefs of the event cards inside the "upcoming" section of the page.
    links = xml.xpath("//details[@id='events-list-upcoming']/div/div/div/div/div/section/ul/li/article/div[1]/div/a/@href")
    return [urljoin(base_url, link) for link in links]


def get_ufc_link_event(query: str) -> str:
    """
    Find a UFC.com event page through a Google search.

    The query is suffixed with " UFC", up to five results are requested,
    and the first URL containing "ufc.com/event/" is returned.

    Args:
        query (str): The event name or search query.

    Returns:
        str: The UFC event URL.

    Raises:
        Exception: If the search itself fails (logged, then re-raised) or
            if none of the results is an event page.
    """
    try:
        # Materialize inside the try so errors raised while the results
        # are being produced are logged as well.
        candidates = list(search(query + " UFC", num_results=5))
    except Exception as e:
        logger.error(f"Error during Google search for UFC event with query '{query}': {e}")
        raise

    # Keep result order; the earliest matching URL wins.
    event_urls = [link for link in candidates if "ufc.com/event/" in link]
    if not event_urls:
        raise Exception("UFC event link not found!")
    return event_urls[0]


def get_ranking(fight, corner: str) -> str:
    """
    Extracts ranking information for a fighter from a fight element based on the corner.

    The red corner's ranking is read from the first ranking cell and any
    other corner value from the second. Only a leading '#' is removed from
    the label; labels that do not start with '#' (for example a
    non-numeric marker, if the page uses one) are returned intact rather
    than losing their first character.

    Args:
        fight: The HTML element representing a fight card (any object
            exposing ``xpath(path)``).
        corner (str): The corner identifier ('red' or 'blue').

    Returns:
        str: The ranking without its '#' prefix, or "Unranked" if the
             ranking is missing or blank.
    """
    if corner == 'red':
        path = "div/div/div/div[2]/div[2]/div[2]/div[1]/span/text()"
    else:
        path = "div/div/div/div[2]/div[2]/div[2]/div[2]/span/text()"

    matches = fight.xpath(path)
    if not matches:
        return "Unranked"

    # Trim first so surrounding whitespace cannot hide the '#' prefix.
    label = matches[0].strip()
    if label.startswith("#"):
        label = label[1:].strip()
    return label or "Unranked"


def get_name(fight, corner: str) -> str:
    """
    Read a fighter's name from a fight card element.

    The name is first looked up in the <span> children of the corner's
    link; when that yields nothing, the text directly inside the link is
    used instead.

    Args:
        fight: The HTML element representing a fight card.
        corner (str): The corner identifier ('red' or 'blue'); anything
            other than 'red' is treated as the blue corner.

    Returns:
        str: The text parts joined with single spaces and stripped; an
             empty string when neither lookup finds any text.
    """
    span_path = (
        "div/div/div/div[2]/div[2]/div[5]/div[1]/a/span/text()"
        if corner == 'red'
        else "div/div/div/div[2]/div[2]/div[5]/div[3]/a/span/text()"
    )

    from_spans = " ".join(fight.xpath(span_path)).strip()
    if from_spans:
        return from_spans

    # Fallback: same location without the <span> step.
    link_path = span_path.replace("/span", "")
    return " ".join(fight.xpath(link_path)).strip()


def parse_event(url: str, past: bool = True, timeout: float = 10.0) -> dict:
    """
    Parse a UFC event page to extract event details and fight card information.

    The page is fetched with the shared session and parsed with lxml. The
    event name, date, venue and location come from the hero header; each
    fight-card list item then becomes one entry in 'fights'. With
    ``past=True`` the outcome fields (round, time, method and per-corner
    result) are added to every fight.

    Args:
        url (str): The URL of the UFC event page.
        past (bool): Determines if detailed fight results should be extracted (default True).
        timeout (float): Seconds to wait for the server before giving up
            (default 10). Without a timeout the request could block
            indefinitely on an unresponsive host.

    Returns:
        dict: Keys 'name', 'date' (YYYY-MM-DD in the machine's local
              timezone, or "" when unavailable), 'location', 'venue' and
              'fights'.

    Raises:
        requests.RequestException: If the request fails, times out or
            returns an error status (the error is logged, then re-raised).
    """
    try:
        response = session.get(url, headers=DEFAULT_HEADERS, timeout=timeout)
        response.raise_for_status()
    except req.RequestException as e:
        logger.error(f"Error fetching UFC event URL {url}: {e}")
        raise

    xml = html.document_fromstring(response.content)

    def first_stripped(values):
        # First text node without surrounding whitespace, or "" if absent.
        return values[0].strip() if values else ""

    def corner_details(fight, corner, column):
        # The red corner is read from column 1 and the blue corner from
        # column 3, for both the athlete link and the odds.
        return {
            'name': get_name(fight, corner),
            'ranking': get_ranking(fight, corner),
            'odds': safe_xpath(fight, f"div/div/div/div[4]/div[2]/span[{column}]/span/text()"),
            'link': safe_xpath(fight, f"div/div/div/div[2]/div[2]/div[5]/div[{column}]/a/@href")
        }

    # Event title: "<prefix>: <first headliner> vs. <last headliner>".
    prefix = safe_xpath(xml, "//div[@class='c-hero__header']/div[1]/div/h1/text()")
    names = xml.xpath("//div[@class='c-hero__header']/div[2]/span/span/text()")
    event_name = f"{prefix}: {names[0].strip()} vs. {names[-1].strip()}" if names else prefix

    # The page carries the start time as a Unix timestamp attribute.
    timestamp = xml.xpath("//div[@class='c-hero__bottom-text']/div[1]/@data-timestamp")
    try:
        date = dt.datetime.fromtimestamp(int(timestamp[0])).strftime("%Y-%m-%d") if timestamp else ""
    except (ValueError, OverflowError, OSError):
        # Non-numeric or out-of-range timestamps leave the date blank.
        date = ""

    # "Venue, Location": split on the first comma only so a location that
    # itself contains commas is kept whole.
    location_text = safe_xpath(xml, "//div[@class='c-hero__bottom-text']/div[2]/div/text()")
    venue_part, _, location_part = location_text.partition(",")

    event = {
        'name': event_name,
        'date': date,
        'location': location_part.strip(),
        'venue': venue_part.strip(),
        'fights': []
    }

    bout_suffix = " Bout"
    fights_html = xml.xpath("//div[@class='fight-card']/div/div/section/ul/li")
    for fight in fights_html:
        weightclass_text = safe_xpath(fight, "div/div/div/div[2]/div[2]/div[1]/div[2]/text()")
        # Drop the trailing " Bout" only when it is really there; slicing
        # blindly would cut into labels that end differently.
        if weightclass_text.endswith(bout_suffix):
            weightclass_text = weightclass_text[:-len(bout_suffix)]

        fight_details = {
            'weightclass': weightclass_text,
            'red corner': corner_details(fight, 'red', 1),
            'blue corner': corner_details(fight, 'blue', 3)
        }

        if past:
            result = fight.xpath("div/div/div/div[2]//div[@class='c-listing-fight__outcome-wrapper']/div/text()")
            method = fight.xpath("div//div[@class='c-listing-fight__result-text method']/text()")
            finished_round = fight.xpath("div//div[@class='c-listing-fight__result-text round']/text()")
            finished_time = fight.xpath("div//div[@class='c-listing-fight__result-text time']/text()")

            fight_details['round'] = first_stripped(finished_round)
            fight_details['time'] = first_stripped(finished_time)
            fight_details['method'] = first_stripped(method)
            # Outcomes are only trusted when both corners have one.
            if len(result) >= 2:
                fight_details['red corner']['result'] = result[0].strip()
                fight_details['blue corner']['result'] = result[1].strip()
            else:
                fight_details['red corner']['result'] = ""
                fight_details['blue corner']['result'] = ""

        event['fights'].append(fight_details)
    return event


def get_upcoming_events() -> dict:
    """
    Collect every upcoming UFC event, keyed by event name.

    Each upcoming event link is parsed without result details
    (``past=False``). An event that fails to parse is logged and skipped,
    so one bad page does not abort the whole run.

    Returns:
        dict: A dictionary where keys are event names and values are event details.
    """
    events_by_name = {}
    for url in get_upcoming_event_links():
        try:
            details = parse_event(url, past=False)
            # Events sharing a name overwrite each other; the last one wins.
            events_by_name[details['name']] = details
        except Exception as e:
            logger.error(f"Error parsing event at {url}: {e}")
    return events_by_name


def get_event(query: str) -> dict:
    """
    Look up a UFC event by search query and return its parsed details.

    The event URL is resolved through a Google search and the page is then
    parsed with result details included (the ``parse_event`` default).

    Args:
        query (str): The event name or search query.

    Returns:
        dict: A dictionary containing the event details.
    """
    # Search first, then parse whatever event page was found.
    return parse_event(get_ufc_link_event(query))


In [24]:
# Example: look up a fighter by surname. This runs Google searches and fetches
# the Sherdog and UFC.com profile pages, so it needs network access.
get_fighter("Adesanya")

{'name': 'Israel Adesanya',
 'nickname': 'The Last Stylebender',
 'nationality': 'Nigeria',
 'birthplace': 'Lagos',
 'birthdate': 'Jul 22, 1989',
 'age': '35',
 'height': '6\'4"',
 'weight': '185 lbs',
 'association': 'City Kickboxing',
 'weight_class': 'Middleweight',
 'wins': {'total': '24',
  'ko/tko': '16',
  'submissions': '0',
  'decisions': '8',
  'others': '0'},
 'losses': {'total': '5',
  'ko/tko': '2',
  'submissions': '1',
  'decisions': '2',
  'others': '0'},
 'fights': [{'name': 'UFC Fight Night 250 - Adesanya vs. Imavov',
   'date': 'Feb / 01 / 2025',
   'url': 'https://www.sherdog.com/events/UFC-Fight-Night-250-Adesanya-vs-Imavov-105177',
   'result': 'loss',
   'method': 'TKO (Punches)',
   'referee': 'Marc Goddard',
   'round': '2',
   'time': '0:30',
   'opponent': 'Nassourdine Imavov'},
  {'name': 'UFC 305 - Du Plessis vs. Adesanya',
   'date': 'Aug / 17 / 2024',
   'url': 'https://www.sherdog.com/events/UFC-305-Du-Plessis-vs-Adesanya-102302',
   'result': 'loss',
  