In [4]:
#Matches Fixtures

import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import re
import os

def scrape_ipl_matches_selenium():
    """Scrape the IPL fixtures page with headless Chrome and save the rows to CSV.

    The fixtures page is loaded through Selenium, the rendered HTML is parsed
    with BeautifulSoup, and one row per match item is collected (running
    number, match label, teams, date, time, venue). If that class-based pass
    yields no rows, a looser scan over schedule-like containers looks for
    items whose text contains a date or time pattern.

    Returns:
        None. Progress is printed to stdout. When rows were collected they are
        written to "ipl_matches_Fixtures.csv" inside "Ipl Matches (2025)".
        Any exception is caught and printed with its traceback, and the
        browser is always closed.
    """
    # Single definition of the output location so the write and the
    # confirmation message cannot drift apart.
    output_dir = "Ipl Matches (2025)"
    output_path = os.path.join(output_dir, "ipl_matches_Fixtures.csv")

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36")

    driver = None

    try:
        print("Initializing Chrome WebDriver...")
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

        url = "https://www.iplt20.com/matches/fixtures"
        print(f"Accessing website: {url}")
        driver.get(url)

        # Wait until the schedule container is present in the DOM
        print("Waiting for page to load...")
        wait = WebDriverWait(driver, 20)
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "vn-sheduleList")))

        time.sleep(5)  # Additional wait to ensure dynamic content loads

        # Parse the rendered page
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        print(f"Page title: {soup.title.text if soup.title else 'No title'}")

        # Find all match items - first identify the main container
        match_container = soup.select_one('.vn-sheduleList')
        if not match_container:
            print("Main schedule container not found!")
            return

        # Now find all individual match items within the container
        match_items = match_container.select('.vn-sheduleListItem')
        print(f"Found {len(match_items)} individual matches")

        if not match_items:
            # Fall back to every <li> inside the container
            match_items = match_container.select('li')
            print(f"Alternative approach: Found {len(match_items)} list items as potential matches")

        match_data = []

        for index, match in enumerate(match_items, 1):
            # One bad item must not abort the whole run, so each item is
            # processed inside its own try block.
            try:
                # Extract match number
                match_no_elem = match.select_one('.vn-matchOrder')
                match_no = match_no_elem.text.strip() if match_no_elem else f"Match {index}"

                # Extract date
                date_elem = match.select_one('.vn-matchDate')
                date = date_elem.text.strip() if date_elem else "N/A"

                # Extract time
                time_elem = match.select_one('.vn-matchTime')
                time_text = time_elem.text.strip() if time_elem else "N/A"

                # Extract venue
                venue = extract_venue(match)

                # Extract both team names as a single "A vs B" string
                teams_info = extract_teams_improved(match)

                match_data.append({
                    "No": index,
                    "Match_No": match_no,
                    "Team1_Team2": teams_info,
                    "Date": date,
                    "Time": time_text,
                    "Venue": venue
                })

                print(f"Processed match {index}: {teams_info} on {date} at {venue}")

            except Exception as e:
                print(f"Error processing match {index}: {e}")

        # If still no matches found, try a more comprehensive approach
        if not match_data:
            print("Trying a more comprehensive approach to identify matches...")

            # Identify the overall schedule container - might have different class names
            schedule_containers = soup.select('[class*=schedule], [class*=fixture], [class*=match-list]')

            for container in schedule_containers:
                # Find potential match items
                potential_matches = container.find_all(['div', 'li', 'article'], class_=True)

                for item in potential_matches:
                    # Check if this item contains date and teams - signs it might be a match
                    item_text = item.get_text()
                    # Skip very short text items that can't be matches
                    if len(item_text) < 20:
                        continue

                    # Look for date patterns
                    date_pattern = re.search(r'([A-Z]{3},?\s+[A-Z]{3}\s+\d{1,2})', item_text)
                    # Look for time patterns
                    time_pattern = re.search(r'(\d{1,2}:\d{2}\s*[AP]M)', item_text, re.IGNORECASE)

                    if date_pattern or time_pattern:
                        try:
                            index = len(match_data) + 1
                            date = date_pattern.group(1) if date_pattern else "N/A"
                            time_text = time_pattern.group(1) if time_pattern else "N/A"

                            # Use improved team extraction
                            teams_info = extract_teams_improved(item)

                            # Extract venue
                            venue = extract_venue(item)

                            # Prefer a "Match <n>" label found in the text over the running index
                            match_no = f"Match {index}"
                            match_no_pattern = re.search(r'Match\s+(\d+)', item_text, re.IGNORECASE)
                            if match_no_pattern:
                                match_no = f"Match {match_no_pattern.group(1)}"

                            match_data.append({
                                "No": index,
                                "Match_No": match_no,
                                "Team1_Team2": teams_info,
                                "Date": date,
                                "Time": time_text,
                                "Venue": venue
                            })

                            print(f"Processed match {index} (comprehensive approach): {teams_info} on {date} at {venue}")
                        except Exception as e:
                            print(f"Error in comprehensive approach: {e}")

        if match_data:
            df = pd.DataFrame(match_data)
            print(f"\nMatch data scraped successfully: {len(match_data)} matches found")
            print(df.head(10))  # Show first 10 matches

            # Create the folder only once there is something to write, so a
            # failed run does not leave an empty directory behind.
            os.makedirs(output_dir, exist_ok=True)
            df.to_csv(output_path, index=False)
            print(f"Data saved to {output_path}")
        else:
            print("No match data was extracted. Please check the HTML structure.")

    except Exception as e:
        print(f"An error occurred during Selenium scraping: {e}")
        import traceback
        traceback.print_exc()

    finally:
        # Always release the browser, including on the early return above
        if driver:
            driver.quit()

def extract_teams_improved(match_element):
    """Return the two teams of a match element as a "<team1> vs <team2>" string.

    Several strategies are tried in order and the first one that yields two
    names wins:
      1. ``alt`` text of team/logo images.
      2. Dedicated team containers found by ``find_team_containers``.
      3. Known IPL team names/abbreviations found in the element's text,
         reported in the order they appear in that text.
      4. A "<a> vs <b>" / "<a> versus <b>" pattern in the text.
      5. Team elements addressed by specific class names.
      6. Any element whose class mentions "team".
      7. The first two distinct 2-4 letter upper-case tokens in the text.

    Args:
        match_element: Parsed HTML element supporting ``select``,
            ``select_one`` and ``get_text``.

    Returns:
        str: "<team1> vs <team2>", "<team1> vs Unknown Team" when only one
        team could be identified, or "Teams Not Found".
    """
    # Strategy 1: team images with alt text
    team_images = match_element.select('img[class*=team], img[class*=Team], img[alt*=logo], img[alt*=team]')

    if len(team_images) >= 2:
        team1 = clean_team_name(team_images[0].get('alt', ''))
        team2 = clean_team_name(team_images[1].get('alt', ''))

        if team1 and team2:
            return f"{team1} vs {team2}"

    # Strategy 2: dedicated team containers
    team_containers = find_team_containers(match_element)
    if team_containers and len(team_containers) >= 2:
        teams = []
        for container in team_containers[:2]:  # Only the first two containers
            team_name = extract_team_from_container(container)
            if team_name:
                teams.append(team_name)

        if len(teams) == 2:
            return f"{teams[0]} vs {teams[1]}"

    all_text = match_element.get_text()

    # Known IPL team full names and abbreviations
    ipl_teams = {
        'MI': ['Mumbai Indians', 'MI'],
        'CSK': ['Chennai Super Kings', 'CSK'],
        'RCB': ['Royal Challengers Bengaluru', 'Royal Challengers Bangalore', 'RCB'],
        'KKR': ['Kolkata Knight Riders', 'KKR'],
        'DC': ['Delhi Capitals', 'DC'],
        'SRH': ['Sunrisers Hyderabad', 'SRH'],
        'PBKS': ['Punjab Kings', 'PBKS'],
        'RR': ['Rajasthan Royals', 'RR'],
        'GT': ['Gujarat Titans', 'GT'],
        'LSG': ['Lucknow Super Giants', 'LSG']
    }

    # Strategy 3: record where each known team first appears in the text, then
    # order the teams by that position. Iterating the table alone would report
    # them in table order and could swap team1 and team2.
    first_seen = {}
    for abbr, names in ipl_teams.items():
        for name in names:
            hit = re.search(r'\b' + re.escape(name) + r'\b', all_text)
            if hit and (abbr not in first_seen or hit.start() < first_seen[abbr]):
                first_seen[abbr] = hit.start()

    found_teams = sorted(first_seen, key=first_seen.get)

    if len(found_teams) >= 2:
        return f"{found_teams[0]} vs {found_teams[1]}"

    # Strategy 4: explicit "vs" / "versus" wording
    vs_pattern = re.search(r'([A-Za-z\s]+)\s+(?:vs\.?|versus)\s+([A-Za-z\s]+)', all_text, re.IGNORECASE)
    if vs_pattern:
        team1 = clean_team_name(vs_pattern.group(1))
        team2 = clean_team_name(vs_pattern.group(2))
        if team1 and team2:
            return f"{team1} vs {team2}"

    # Strategy 5: team elements addressed by class name
    team1_elem = match_element.select_one('.vn-teamname1, .team1-name, .team-name-1, .vn-teamCode1')
    team2_elem = match_element.select_one('.vn-teamname2, .team2-name, .team-name-2, .vn-teamCode2')

    if team1_elem and team2_elem:
        team1 = clean_team_name(team1_elem.text)
        team2 = clean_team_name(team2_elem.text)
        if team1 and team2:
            return f"{team1} vs {team2}"

    # Strategy 6: anything whose class mentions "team"
    all_potential_teams = match_element.select('[class*=team], [class*=Team]')
    teams_text = []
    for team in all_potential_teams:
        cleaned = clean_team_name(team.text)  # cleaned once per element
        if cleaned:
            teams_text.append(cleaned)

    # Remove duplicates while preserving order
    teams_text = list(dict.fromkeys(teams_text))

    if len(teams_text) >= 2:
        return f"{teams_text[0]} vs {teams_text[1]}"
    elif len(teams_text) == 1:
        # Only one team found: look for its opponent after a "vs" marker
        vs_text = re.search(rf'{re.escape(teams_text[0])}\s+(?:vs\.?|versus)\s+([A-Za-z\s]+)', all_text, re.IGNORECASE)
        if vs_text:
            team2 = clean_team_name(vs_text.group(1))
            if team2:
                return f"{teams_text[0]} vs {team2}"

        # Otherwise accept any other known abbreviation present in the text
        for abbr in ipl_teams:
            if abbr != teams_text[0] and re.search(r'\b' + re.escape(abbr) + r'\b', all_text):
                return f"{teams_text[0]} vs {abbr}"

        return f"{teams_text[0]} vs Unknown Team"

    # Strategy 7 (last resort): first two distinct capitalised abbreviations
    abbrs = re.findall(r'\b[A-Z]{2,4}\b', all_text)
    unique_abbrs = list(dict.fromkeys(abbrs))

    if len(unique_abbrs) >= 2:
        return f"{unique_abbrs[0]} vs {unique_abbrs[1]}"

    return "Teams Not Found"

def clean_team_name(name_text):
    """Normalise raw team text (for example an image ``alt`` value) into a team name.

    Filler words such as "logo", "team", "image", "alt", "vs" and "versus" are
    removed case-insensitively, trailing punctuation is dropped, and a name
    longer than 20 characters is collapsed to the initials of its words when
    that gives at least two letters.

    Returns an empty string for empty or missing input.
    """
    if not name_text:
        return ""

    noise_words = re.compile(r'logo|team|image|alt|vs\.?|versus', flags=re.IGNORECASE)
    cleaned = noise_words.sub('', name_text.strip()).strip()

    # Drop punctuation left dangling at the end (e.g. a trailing "-")
    cleaned = re.sub(r'[^\w\s]+$', '', cleaned).strip()

    # Long names are shortened to initials, e.g. three words -> three letters
    if len(cleaned) > 20:
        initials = ''.join(part[0] for part in cleaned.split())
        if len(initials) >= 2:
            return initials

    return cleaned

def find_team_containers(match_element):
    """Locate the elements inside a match item that are likely to hold the teams.

    Lookup order:
      1. A known wrapper element; its direct ``div``/``span`` children are
         returned when there are at least two of them.
      2. Elements carrying explicit team-name classes.
      3. Logo container elements.

    Returns the first result with two or more elements, otherwise an empty list.
    """
    wrapper_selectors = (
        '.vn-teamVs', '.team-vs', '.match-teams', '.fixture-teams',
        '[class*=team-container]', '[class*=teamContainer]',
    )
    for css in wrapper_selectors:
        wrapper = match_element.select_one(css)
        if not wrapper:
            continue
        # Only direct children are considered as team slots
        direct_children = wrapper.find_all(['div', 'span'], recursive=False)
        if len(direct_children) >= 2:
            return direct_children

    # Fallbacks: explicit team-name elements first, then logo containers
    fallback_selectors = (
        '.vn-teamname1, .vn-teamname2, .team1-name, .team2-name',
        '[class*=logo-container], [class*=logoContainer]',
    )
    for css in fallback_selectors:
        candidates = match_element.select(css)
        if len(candidates) >= 2:
            return candidates

    return []

def extract_team_from_container(container):
    """Pull a cleaned team name out of a single team container element.

    Sources are tried in order: the ``alt`` text of the first image, the text
    of the first element whose class mentions name/title, and finally the
    container's own text when it is shorter than 30 characters. An empty
    string is returned when none of these apply.
    """
    logo = container.select_one('img')
    alt_text = logo.get('alt') if logo else None
    if alt_text:
        return clean_team_name(alt_text)

    label = container.select_one('[class*=name], [class*=Name], [class*=title], [class*=Title]')
    if label:
        return clean_team_name(label.text)

    # Long text blocks are unlikely to be just a team name
    raw_text = container.get_text().strip()
    return clean_team_name(raw_text) if len(raw_text) < 30 else ""

def extract_venue(match_element):
    """Return the venue text for a match element, or "N/A" when none is found.

    Strategies are tried in order and the first hit wins:
      1. Known venue CSS selectors, starting with ``.vn-matchVenue``.
      2. The first span/div/p whose text mentions a stadium-type word or one
         of the listed cities.
      3. Regular expressions over the element's full text.
      4. The first "last child" div/span/p whose text does not look like a
         time, a match label or a "vs" line.
    """
    # Strategy 1: class-based lookup, primary selector first
    venue_selectors = (
        '.vn-matchVenue',
        '.venue', '.match-venue', '.fixture-venue',
        '[class*=venue]', '[class*=Venue]', '.vn-venue',
        '.vn-venueDesc', '.vn-venueTitle',
    )
    for css in venue_selectors:
        node = match_element.select_one(css)
        if node and node.text.strip():
            return node.text.strip()

    # Strategy 2: keyword scan over generic text elements
    place_words = ('stadium', 'arena', 'ground')
    host_cities = (
        'delhi', 'mumbai', 'kolkata', 'bangalore', 'bengaluru', 'chennai',
        'ahmedabad', 'pune', 'hyderabad', 'jaipur', 'mohali', 'dharamsala',
        'lucknow', 'guwahati', 'chandigarh', 'indore',
    )
    for node in match_element.find_all(['span', 'div', 'p']):
        candidate = node.text.strip()
        if not candidate or len(candidate) <= 3:
            continue
        lowered = candidate.lower()
        mentions_place = any(word in lowered for word in place_words)
        if mentions_place or any(city in lowered for city in host_cities):
            return candidate

    # Strategy 3: regex patterns over the whole text
    full_text = match_element.get_text()
    venue_regexes = (
        r'at\s+([A-Za-z\s\.]+Stadium)',
        r'at\s+([A-Za-z\s\.]+Ground)',
        r'at\s+([A-Za-z\s\.]+Arena)',
        r'venue:?\s+([^,\n]+)',
        r'stadium:?\s+([^,\n]+)',
    )
    for venue_regex in venue_regexes:
        found = re.search(venue_regex, full_text, re.IGNORECASE)
        if found:
            return found.group(1).strip()

    # Strategy 4: trailing elements that are clearly not something else
    for node in match_element.select('div:last-child, span:last-child, p:last-child'):
        candidate = node.text.strip()
        if not candidate or len(candidate) <= 3:
            continue  # empty or too short
        if re.search(r'\d{1,2}:\d{2}', candidate):
            continue  # looks like a time
        if re.search(r'match', candidate, re.IGNORECASE):
            continue  # match reference
        if re.search(r'vs', candidate, re.IGNORECASE):
            continue  # team vs team line
        return candidate

    return "N/A"

# Run the fixtures scraper only when this code is executed directly
# (__name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    scrape_ipl_matches_selenium()

Initializing Chrome WebDriver...
Accessing website: https://www.iplt20.com/matches/fixtures
Waiting for page to load...
Page title: IPL 2025 Fixtures | Complete Match Schedule | IPLT20
Found 0 individual matches
Alternative approach: Found 16 list items as potential matches
Processed match 1: Lucknow Super Giants vs RCB on MAY, FRI 9 at Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow
Processed match 2: Sunrisers Hyderabad vs KKR on MAY, SAT 10 at Rajiv Gandhi International Stadium, Hyderabad
Processed match 3: Punjab Kings vs Mumbai Indians on MAY, SUN 11 at Narendra Modi Stadium, Ahmedabad
Processed match 4: Delhi Capitals vs Gujarat Titans on MAY, SUN 11 at Arun Jaitley Stadium, Delhi
Processed match 5: Chennai Super Kings vs Rajasthan Royals on MAY, MON 12 at MA Chidambaram Stadium, Chennai
Processed match 6: RCB vs Sunrisers Hyderabad on MAY, TUE 13 at M Chinnaswamy Stadium, Bengaluru
Processed match 7: Gujarat Titans vs Lucknow Super Giants on MAY, WED 14 at 

In [5]:
#Matches Results

import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import re
import os


def scrape_ipl_results_selenium():
    """Scrape the IPL results page with headless Chrome and save the rows to CSV.

    The results page is loaded through Selenium and scrolled in steps, and the
    rendered HTML is dumped to "page_source.html" for debugging and parsed
    with BeautifulSoup. For each match item the match label, teams, scores,
    result, date, time, venue and super-over information are collected using
    the extraction helpers called below.

    Returns:
        None. Progress is printed to stdout. When rows were collected they are
        written to "ipl_match_results.csv" inside "Ipl Matches (2025)". Any
        exception is caught and printed with its traceback, and the browser is
        always closed.
    """
    # Single definition of the output location so the write and the
    # confirmation message cannot drift apart.
    output_dir = "Ipl Matches (2025)"
    output_path = os.path.join(output_dir, "ipl_match_results.csv")

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36")

    driver = None

    try:
        print("Initializing Chrome WebDriver...")
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

        url = "https://www.iplt20.com/matches/results"
        print(f"Accessing website: {url}")
        driver.get(url)

        # Wait until the schedule container is present in the DOM
        print("Waiting for page to load...")
        wait = WebDriverWait(driver, 20)
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "vn-sheduleList")))

        # Scroll down in steps so content that loads on scroll gets a chance to render
        scroll_height = 500
        for i in range(10):
            driver.execute_script(f"window.scrollTo(0, {scroll_height * i});")
            time.sleep(0.5)

        time.sleep(5)  # Additional wait to ensure dynamic content loads

        # Take ONE snapshot of the rendered page so the debug dump is exactly
        # the HTML that gets parsed below (two separate reads could differ if
        # the page changes in between).
        page_html = driver.page_source

        # Save page source for debugging if needed
        with open("page_source.html", "w", encoding="utf-8") as f:
            f.write(page_html)

        # Parse the rendered page
        soup = BeautifulSoup(page_html, 'html.parser')
        print(f"Page title: {soup.title.text if soup.title else 'No title'}")

        # Find all match items
        match_container = soup.select_one('.vn-sheduleList')
        if not match_container:
            print("Main schedule container not found!")
            return

        match_items = match_container.select('.vn-sheduleListItem')
        print(f"Found {len(match_items)} individual matches")

        if not match_items:
            # Fall back to every <li> inside the container
            match_items = match_container.select('li')
            print(f"Alternative approach: Found {len(match_items)} list items as potential matches")

        match_data = []

        for index, match in enumerate(match_items, 1):
            # One bad item must not abort the whole run, so each item is
            # processed inside its own try block.
            try:
                # Extract basic match info
                match_no_elem = match.select_one('.vn-matchOrder')
                match_no = match_no_elem.text.strip() if match_no_elem else f"Match {index}"

                date = extract_date(match)
                time_text = extract_time(match)
                venue = extract_venue(match)

                # Extract both team names as a single "A vs B" string
                teams_info = extract_teams_improved(match)

                # Check for super over
                is_super_over, super_over_info = check_super_over(match)

                # Extract the match result text
                result = extract_match_result(match)

                # Add super over information to result if applicable
                if is_super_over:
                    result = f"{result} - {super_over_info}"

                # Extract the two team scores
                team1_score, team2_score = extract_team_scores(match)

                # Debug output to help identify the match
                print(f"\nProcessing match {index}: {teams_info}")
                print(f"HTML snippet: {match.select_one('[class*=team]')}")
                print(f"Extracted scores: Team1={team1_score}, Team2={team2_score}")

                # If super over, append the super over scores when both were found
                if is_super_over:
                    team1_so_score, team2_so_score = extract_super_over_scores(match)
                    if team1_so_score and team2_so_score:
                        team1_score = f"{team1_score} (Super Over: {team1_so_score})"
                        team2_score = f"{team2_score} (Super Over: {team2_so_score})"

                match_data.append({
                    "No": index,
                    "Match_No": match_no,
                    "Team1_Team2": teams_info,
                    "Team1_Score": team1_score,
                    "Team2_Score": team2_score,
                    "Result": result,
                    "Date": date,
                    "Time": time_text,
                    "Venue": venue,
                    "Is_Super_Over": "Yes" if is_super_over else "No"
                })

                print(f"Processed match {index}: {teams_info} - Result: {result}")
                print(f"Scores: {team1_score} vs {team2_score}")
                print(f"Date: {date}, Time: {time_text}, Venue: {venue}")
                print(f"Super Over: {'Yes' if is_super_over else 'No'}")

            except Exception as e:
                print(f"Error processing match {index}: {e}")
                import traceback
                traceback.print_exc()

        if match_data:
            # Ensure the directory exists
            os.makedirs(output_dir, exist_ok=True)

            df = pd.DataFrame(match_data)
            print(f"\nMatch data scraped successfully: {len(match_data)} matches found")
            print(df.head(10))  # Show first 10 matches

            # Save data to CSV in the specified folder
            df.to_csv(output_path, index=False)
            print(f"Data saved to {output_path}")
        else:
            print("No match data was extracted. Please check the HTML structure.")

    except Exception as e:
        print(f"An error occurred during Selenium scraping: {e}")
        import traceback
        traceback.print_exc()

    finally:
        # Always release the browser, including on the early return above
        if driver:
            driver.quit()


def extract_date(match_element):
    """Return the date text for a match element, or "Date Not Available".

    Strategies are tried in order and the first hit wins:
      1. Known date CSS selectors, starting with ``.vn-matchDate``.
      2. Date-shaped patterns in the element's full text, from most to least
         specific.
      3. The first span/div/p whose text contains a month name as a whole word.

    Args:
        match_element: Parsed HTML element supporting ``select_one``,
            ``get_text`` and ``find_all``.

    Returns:
        str: The stripped date text, or "Date Not Available".
    """
    # Strategy 1: class-based lookup, primary selector first
    date_selectors = (
        '.vn-matchDate',
        '.date', '.match-date', '.fixture-date',
        '[class*=date]', '[class*=Date]', '.vn-date',
    )
    for selector in date_selectors:
        date_elem = match_element.select_one(selector)
        if date_elem and date_elem.text.strip():
            return date_elem.text.strip()

    # Strategy 2: look for date patterns in all text
    all_text = match_element.get_text()

    # Month names, full or abbreviated. Used so that "month + day" only fires
    # on real months; an unrestricted word+number pattern would also accept
    # things like "Match 12".
    month = (
        r'(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|'
        r'Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)'
    )

    date_patterns = [
        # Standard date format: Saturday, April 09, 2022
        r'([A-Za-z]+,\s+[A-Za-z]+\s+\d{1,2},\s+\d{4})',
        # Shortened format: Sat, Apr 09, 2022
        r'([A-Za-z]{3},\s+[A-Za-z]{3}\s+\d{1,2},\s+\d{4})',
        # Another format: 09 April, 2022
        r'(\d{1,2}\s+[A-Za-z]+,?\s+\d{4})',
        # DD/MM/YYYY or MM/DD/YYYY
        r'(\d{1,2}/\d{1,2}/\d{4})',
        # DD-MM-YYYY or MM-DD-YYYY
        r'(\d{1,2}-\d{1,2}-\d{4})',
        # Day and month only: Saturday, April 09
        r'([A-Za-z]+,\s+[A-Za-z]+\s+\d{1,2})',
        # Month and day: April 09 (the trailing \b stops "April 2022" from
        # being cut down to "April 20")
        r'(?i)\b(' + month + r'\s+\d{1,2})\b',
    ]

    for pattern in date_patterns:
        match = re.search(pattern, all_text)
        if match:
            return match.group(1).strip()

    # Strategy 3: any element mentioning a month as a whole word. A plain
    # substring test would also hit words such as "Summary" ("mar").
    month_word = re.compile(r'\b' + month + r'\b', re.IGNORECASE)
    for elem in match_element.find_all(['span', 'div', 'p']):
        text = elem.text.strip()
        if month_word.search(text):
            return text

    return "Date Not Available"

def extract_time(match_element):
    """Return the start-time text for a match element, or "Time Not Available".

    Strategies are tried in order and the first hit wins:
      1. The ``.vn-matchTime`` element.
      2. Alternative time CSS selectors, skipping purely numeric text.
      3. Time-shaped patterns in the element's full text, from most to least
         specific.

    Args:
        match_element: Parsed HTML element supporting ``select_one`` and
            ``get_text``.

    Returns:
        str: The stripped time text, or "Time Not Available".
    """
    # Strategy 1: the expected class
    time_elem = match_element.select_one('.vn-matchTime')
    if time_elem and time_elem.text.strip():
        return time_elem.text.strip()

    # Strategy 2: alternative time classes
    for time_class in ['.time', '.match-time', '.fixture-time',
                       '[class*=time]', '[class*=Time]', '.vn-time']:
        time_elem = match_element.select_one(time_class)
        if time_elem and time_elem.text.strip() and not time_elem.text.strip().isdigit():
            # Avoid elements that are just numbers (likely not time)
            return time_elem.text.strip()

    # Strategy 3: look for time patterns in all text
    all_text = match_element.get_text()

    # Ordered from most to least specific. The bare HH:MM / HH.MM pattern must
    # stay last: it matches inside every other form, so anything listed after
    # it could never be reached. For the same reason no separate "at HH:MM"
    # pattern is needed.
    time_patterns = [
        r'(\d{1,2}:\d{2}\s*[AP]M)',  # 7:30 PM
        r'(\d{1,2}[:.]\d{2}\s*(?:hours|hrs))',  # 19:30 hours
        r'(\d{1,2}[:.]\d{2}\s*[AP]M\s*(?:IST|local\s*time))',  # 7.30 PM IST
        r'(\d{1,2}[:.]\d{2}\s*[AP]M)',  # 7.30 PM (keeps the AM/PM suffix)
        r'(\d{1,2}[:.]\d{2})',  # 19:30 (24hr format)
    ]

    for pattern in time_patterns:
        match = re.search(pattern, all_text, re.IGNORECASE)
        if match:
            return match.group(1).strip()

    return "Time Not Available"

def check_super_over(match_element):
    """Detect whether a match went to a super over and describe it.

    The element's text is lower-cased and searched for super-over keywords.
    When one is present, the most specific matching phrase (running up to the
    next full stop) is used as the description; if no phrase pattern matches,
    the description is the plain string "Super Over".

    Args:
        match_element: Parsed HTML element supporting ``get_text``.

    Returns:
        tuple[bool, str]: ``(is_super_over, super_over_info)``. The info string
        is empty when no super over was detected; otherwise it is taken from
        the lower-cased text with only its first letter capitalised.
    """
    all_text = match_element.get_text().lower()

    # Check for super over keywords
    is_super_over = any(term in all_text for term in [
        'super over', 'superover', 'super-over', 'tie-breaker',
        'after tie', 'after a tie', 'decided in super over'
    ])

    super_over_info = ""

    if is_super_over:
        # Specific phrasings first, generic "super over ..." last: the generic
        # pattern matches whenever any specific one does, so listing it first
        # would make the others unreachable.
        super_over_patterns = [
            r'(decided\s+by\s+super\s*over[^.]*)',
            r'(won\s+in\s+super\s*over[^.]*)',
            r'(after\s+super\s*over[^.]*)',
            r'(tied,\s*[^.]*super\s*over[^.]*)',
            r'(super\s*over[^.]*)',
        ]

        for pattern in super_over_patterns:
            match = re.search(pattern, all_text)
            if match:
                super_over_info = match.group(1).strip().capitalize()
                break

        # Keyword hit without a matching phrase (e.g. only "tie-breaker")
        if not super_over_info:
            super_over_info = "Super Over"

    return is_super_over, super_over_info

def extract_super_over_scores(match_element):
    """Return the two super-over scores found in a match element's text.

    The text following a "Super Over" marker (up to the next full stop or the
    end of the text) is examined first, e.g. "Super Over: MI 11/1, CSK 10/2".
    If that section does not yield two scores, the whole text is searched for
    two scores appearing after "super over".

    Returns a ``(first_score, second_score)`` tuple, or ``("", "")`` when two
    scores cannot be found.
    """
    text = match_element.get_text()

    section = re.search(r'super\s*over:?\s*(.*?)(?:\.|$)', text, re.IGNORECASE)
    if section is not None:
        segment = section.group(1)

        # Plain "runs/wickets" scores inside the section
        slash_scores = re.findall(r'(\d+/\d+)', segment)
        if len(slash_scores) >= 2:
            first, second = slash_scores[:2]
            return first, second

        # Scores prefixed by a team abbreviation, also allowing "N for M"
        labelled = re.findall(r'([A-Z]{2,4})\s*(\d+/\d+|\d+\s+for\s+\d+)', segment)
        if len(labelled) >= 2:
            return clean_score(labelled[0][1]), clean_score(labelled[1][1])

    # Fall back to two scores anywhere after "super over"
    anywhere = re.search(
        r'super\s*over.*?(\d+/\d+|\d+\s+for\s+\d+).*?(\d+/\d+|\d+\s+for\s+\d+)',
        text,
        re.IGNORECASE,
    )
    if anywhere is None:
        return "", ""

    return clean_score(anywhere.group(1)), clean_score(anywhere.group(2))

def extract_match_result(match_element):
    """Work out the result line for a match element.

    Strategies are tried in a fixed order and the first hit wins:
    dedicated result/status elements, a "<team> Won by <n> wickets/runs"
    phrase, "<team> beat <team>", "<team> defeated <team>", a fixed list of
    special outcomes, and finally any "<team> win/won/victory/triumph"
    mention.

    Args:
        match_element: Parsed element exposing ``select_one()`` and
            ``get_text()``.

    Returns:
        The result string (passed through ``clean_result_text`` except for
        the special outcomes), or ``"Result Not Available"``.
    """
    candidate_selectors = (
        '.vn-matchResult', '.result', '.match-result', '.vn-result',
        '[class*=result]', '[class*=Result]', '.vn-matchStatus', '.status',
        '.vn-winningTeam', '.winning-team', '.winner',
    )
    for css in candidate_selectors:
        node = match_element.select_one(css)
        if node and node.text.strip():
            return clean_result_text(node.text.strip())

    # Nothing structured was found: fall back to scanning the raw text.
    all_text = match_element.get_text()

    margin_hit = re.search(
        r'([A-Za-z\s]+)\s+Won\s+by\s+(\d+)\s+([Ww]ickets?|[Rr]uns)', all_text
    )
    if margin_hit:
        winner, margin, unit = margin_hit.groups()
        return clean_result_text(f"{winner.strip()} Won by {margin} {unit}")

    # "A beat B" is checked before "A defeated B".
    head_to_head = (
        ('beat', r'([A-Za-z\s]+)\s+beat\s+([A-Za-z\s]+)'),
        ('defeated', r'([A-Za-z\s]+)\s+defeated\s+([A-Za-z\s]+)'),
    )
    for verb, pattern in head_to_head:
        pair_hit = re.search(pattern, all_text)
        if pair_hit:
            victor = pair_hit.group(1).strip()
            loser = pair_hit.group(2).strip()
            return clean_result_text(f"{victor} {verb} {loser}")

    # Outcomes without a winner are returned verbatim from this list.
    lowered = all_text.lower()
    for outcome in ('Match tied', 'No result', 'Match abandoned',
                    'Match drawn', 'Match cancelled'):
        if outcome.lower() in lowered:
            return outcome

    victory_hit = re.search(
        r'([A-Z]{2,4}|[A-Za-z\s]+)\s+(?:win|won|victory|triumph)',
        all_text,
        re.IGNORECASE,
    )
    if victory_hit:
        return clean_result_text(f"{victory_hit.group(1)} won")

    return "Result Not Available"

def clean_result_text(result_text):
    """Strip clock times and time-zone codes from a result string.

    Time-zone and am/pm tokens are only removed when they stand alone as
    whole words, so ordinary words that merely contain those letters
    (e.g. "BEST", or "team" followed by "IST") are left intact.

    Args:
        result_text: Raw result text.

    Returns:
        The text without time information, with runs of whitespace collapsed
        to single spaces and surrounding whitespace removed.
    """
    # Full clock times such as "7:30 pm IST" or "3.30 PM".
    result = re.sub(r'\d{1,2}[:\.]\d{2}\s*(?:AM|PM|am|pm)(?:\s+IST)?', '', result_text)

    # A dangling meridiem followed by the zone ("pm IST"). The leading \b
    # stops this from eating the tail of a word such as "team IST".
    result = re.sub(r'\b(?:AM|PM|am|pm)\s+IST\b', '', result)

    # Stand-alone time-zone codes; \b keeps words like "WEST" untouched.
    result = re.sub(r'\b(?:IST|GMT|UTC|EST|PST)\b', '', result)

    # Collapse the gaps left behind by the removals.
    return re.sub(r'\s+', ' ', result).strip()



def extract_team_scores(match_element):
    """Find the two team scores for a match element.

    Scores are first collected from score-like elements; if that does not
    yield exactly two entries the element's text is parsed with a series of
    progressively looser patterns.

    Args:
        match_element: Parsed element exposing ``select()`` and
            ``get_text()``.

    Returns:
        A ``(score1, score2)`` tuple of strings. A missing score is reported
        as ``"N/A"``.
    """
    collected = []

    # NOTE: several of these selectors can match the same element (for
    # example '.vn-score' and '[class*=score]'), in which case its score is
    # collected once per matching selector.
    selector_list = (
        '.vn-score', '.score', '.match-score', '.team-score',
        '[class*=score]', '[class*=Score]', '.vn-teamScore',
    )
    for css in selector_list:
        for node in match_element.select(css):
            if not node:
                continue
            stripped = node.text.strip()
            # Only text containing at least one digit can be a score.
            if stripped and re.search(r'\d', stripped):
                collected.append(clean_score(stripped))

    # Also look inside per-team containers for a nested score element.
    team_boxes = match_element.select(
        '[class*=team-container], [class*=teamContainer], [class*=fixture-team]'
    )
    for box in team_boxes:
        node = box.select_one('[class*=score], [class*=Score]')
        if node and re.search(r'\d', node.text):
            collected.append(clean_score(node.text))

    if len(collected) == 2:
        return collected[0], collected[1]

    all_text = match_element.get_text()

    # "<ABBR> <runs[/wkts]> (<overs> OV)" - keep the first usable score seen
    # for each abbreviation; one-character scores are ignored.
    by_team = {}
    labelled = re.findall(
        r'([A-Z]{2,4})\s*(?:\s+|:|-)?\s*(\d+(?:/\d+)?)\s*(?:\(\s*(\d+(?:\.\d+)?)\s*OV\s*\))?',
        all_text,
    )
    for abbr, runs, overs in labelled:
        if abbr not in by_team and len(runs) >= 2:
            by_team[abbr] = f"{runs} ({overs} OV)" if overs else runs

    # Upper-case tokens in order of first appearance, minus known
    # non-team tokens.
    non_team_tokens = ['IST', 'PM', 'AM', 'OV']
    abbr_order = list(dict.fromkeys(
        token
        for token in re.findall(r'\b([A-Z]{2,4})\b', all_text)
        if token not in non_team_tokens
    ))

    if len(abbr_order) >= 2 and len(by_team) >= 2:
        first_abbr, second_abbr = abbr_order[0], abbr_order[1]
        if first_abbr in by_team and second_abbr in by_team:
            return by_team[first_abbr], by_team[second_abbr]

    # Scores that carry an explicit "(<overs> OV)" suffix, in text order.
    with_overs = [
        f"{runs} ({overs} OV)"
        for runs, overs in re.findall(
            r'(\d+(?:/\d+)?)\s*\(\s*(\d+(?:\.\d+)?)\s*OV\s*\)', all_text
        )
    ]
    if len(with_overs) >= 2:
        return with_overs[0], with_overs[1]

    # Fall back to whatever the element-based pass gathered.
    if len(collected) >= 2:
        return collected[0], collected[1]
    if len(collected) == 1:
        return collected[0], "N/A"

    # Last resort: any two stand-alone 2-3 digit numbers.
    bare_numbers = re.findall(r'(?<!\d)(\d{2,3})(?:/\d)?(?!\d)', all_text)
    if len(bare_numbers) >= 2:
        return bare_numbers[0], bare_numbers[1]

    return "N/A", "N/A"

def clean_score(score_text):
    """Normalise a score string to ``"runs[/wickets]"`` plus optional overs.

    Team abbreviations (2-4 upper-case letters) are dropped, ``"x for y"``
    and ``"x-y"`` become ``"x/y"``, and when an overs figure such as
    ``"20 OV"`` / ``"19.4 overs"`` is present it is appended as
    ``" (<overs> OV)"``.

    NOTE: this file defines ``clean_score`` a second time further down; the
    later definition replaces this one when the module is executed.

    Args:
        score_text: Raw score text.

    Returns:
        The normalised score, or ``score_text.strip()`` when the text
        contains no digits to build a score from.
    """
    # Remove team abbreviations first so they cannot interfere below.
    stripped = re.sub(r'\b[A-Z]{2,4}\b', '', score_text)
    stripped = re.sub(r'\s+for\s+', '/', stripped)
    stripped = stripped.replace('-', '/')

    runs_match = re.search(r'(\d+(?:/\d+)?)', stripped)
    if runs_match is None:
        # No digit anywhere, so there is nothing to normalise.
        return score_text.strip()

    cleaned = runs_match.group(1)

    # Overs are looked up in the original text because the abbreviation
    # removal above also deletes the "OV" marker.
    overs_match = re.search(r'(\d+(?:\.\d+)?)\s*(?:ov(?:er)?s?)', score_text, re.IGNORECASE)
    if overs_match:
        cleaned += f" ({overs_match.group(1)} OV)"
    return cleaned


def clean_score(score_text):
    """Normalise a score string and keep its overs in a readable form.

    ``"x for y"`` and ``"x-y"`` become ``"x/y"`` and all whitespace is
    removed from the score itself. A parenthesised overs figure such as
    ``"(20 OV)"`` is taken out before that normalisation and appended once,
    unchanged, after a single space. Squashing it together with the score
    and then appending it again produced results like
    ``"180/5(20OV) (20 OV)"``.

    This is the later of two ``clean_score`` definitions in this file, so it
    is the one in effect once the module has been executed.

    Args:
        score_text: Raw score text.

    Returns:
        The normalised score, followed by ``" (<overs>)"`` when a
        parenthesised overs figure was present.
    """
    overs_match = re.search(
        r'(\(\d+(?:\.\d+)?\s*(?:ov(?:er)?s?)?\))', score_text, re.IGNORECASE
    )
    if overs_match:
        overs = overs_match.group(1)
        # Cut the overs out so the whitespace removal cannot mangle them.
        remainder = score_text[:overs_match.start()] + score_text[overs_match.end():]
    else:
        overs = ""
        remainder = score_text

    score = re.sub(r'\s+for\s+', '/', remainder)
    score = score.replace('-', '/')
    score = re.sub(r'\s+', '', score)

    if not overs:
        return score
    return f"{score} {overs}" if score else overs

def extract_teams_improved(match_element):
    """Build a ``"<team1> vs <team2>"`` label for a match element.

    Strategies are tried in order and the first one that yields two teams
    wins: team image ``alt`` texts, dedicated team containers, known IPL
    team names/abbreviations found in the text, a "<a> vs <b>" phrase,
    numbered team-name elements, any element whose class mentions "team",
    and finally any two upper-case abbreviations.

    Teams recognised from the known-names table are reported in the order
    in which they first appear in the text, not in the table's own order,
    so the label follows the page's ordering.

    Args:
        match_element: Parsed element exposing ``select()``,
            ``select_one()`` and ``get_text()``.

    Returns:
        ``"<team1> vs <team2>"``, ``"<team> vs Unknown Team"`` when only one
        team can be identified, or ``"Teams Not Found"``.
    """
    # 1. Team images carrying the team name in their alt text.
    team_images = match_element.select(
        'img[class*=team], img[class*=Team], img[alt*=logo], img[alt*=team]'
    )
    if len(team_images) >= 2:
        team1 = clean_team_name(team_images[0].get('alt', ''))
        team2 = clean_team_name(team_images[1].get('alt', ''))
        if team1 and team2:
            return f"{team1} vs {team2}"

    # 2. Dedicated per-team containers.
    team_containers = find_team_containers(match_element)
    if team_containers and len(team_containers) >= 2:
        teams = [
            name
            for name in (extract_team_from_container(c) for c in team_containers[:2])
            if name
        ]
        if len(teams) == 2:
            return f"{teams[0]} vs {teams[1]}"

    all_text = match_element.get_text()

    # 3. Known IPL teams: full names and abbreviations.
    ipl_teams = {
        'MI': ['Mumbai Indians', 'MI'],
        'CSK': ['Chennai Super Kings', 'CSK'],
        # Both spellings of the franchise name are accepted.
        'RCB': ['Royal Challengers Bangalore', 'Royal Challengers Bengaluru', 'RCB'],
        'KKR': ['Kolkata Knight Riders', 'KKR'],
        'DC': ['Delhi Capitals', 'DC'],
        'SRH': ['Sunrisers Hyderabad', 'SRH'],
        'PBKS': ['Punjab Kings', 'PBKS'],
        'RR': ['Rajasthan Royals', 'RR'],
        'GT': ['Gujarat Titans', 'GT'],
        'LSG': ['Lucknow Super Giants', 'LSG']
    }

    # Record where each team is first mentioned so the result can follow
    # the order of appearance in the text.
    first_seen = {}
    for abbr, names in ipl_teams.items():
        positions = []
        for name in names:
            hit = re.search(r'\b' + re.escape(name) + r'\b', all_text)
            if hit:
                positions.append(hit.start())
        if positions:
            first_seen[abbr] = min(positions)

    found_teams = sorted(first_seen, key=first_seen.get)
    if len(found_teams) >= 2:
        return f"{found_teams[0]} vs {found_teams[1]}"

    # 4. An explicit "<a> vs <b>" / "<a> versus <b>" phrase.
    vs_pattern = re.search(
        r'([A-Za-z\s]+)\s+(?:vs\.?|versus)\s+([A-Za-z\s]+)', all_text, re.IGNORECASE
    )
    if vs_pattern:
        team1 = clean_team_name(vs_pattern.group(1))
        team2 = clean_team_name(vs_pattern.group(2))
        if team1 and team2:
            return f"{team1} vs {team2}"

    # 5. Numbered team-name elements.
    team1_elem = match_element.select_one('.vn-teamname1, .team1-name, .team-name-1, .vn-teamCode1')
    team2_elem = match_element.select_one('.vn-teamname2, .team2-name, .team-name-2, .vn-teamCode2')
    if team1_elem and team2_elem:
        team1 = clean_team_name(team1_elem.text)
        team2 = clean_team_name(team2_elem.text)
        if team1 and team2:
            return f"{team1} vs {team2}"

    # 6. Anything whose class mentions "team"; de-duplicated, order kept.
    all_potential_teams = match_element.select('[class*=team], [class*=Team]')
    teams_text = list(dict.fromkeys(
        name
        for name in (clean_team_name(team.text) for team in all_potential_teams)
        if name
    ))

    if len(teams_text) >= 2:
        return f"{teams_text[0]} vs {teams_text[1]}"

    if len(teams_text) == 1:
        known = teams_text[0]

        # Try to find the opponent right after a "vs" following this team.
        vs_text = re.search(
            rf'{re.escape(known)}\s+(?:vs\.?|versus)\s+([A-Za-z\s]+)',
            all_text,
            re.IGNORECASE,
        )
        if vs_text:
            team2 = clean_team_name(vs_text.group(1))
            if team2:
                return f"{known} vs {team2}"

        # Otherwise accept the first known abbreviation that is a different
        # team and occurs somewhere in the text.
        for abbr in ipl_teams:
            if abbr != known and re.search(r'\b' + re.escape(abbr) + r'\b', all_text):
                return f"{known} vs {abbr}"

        return f"{known} vs Unknown Team"

    # 7. Absolute last resort: the first two distinct upper-case tokens.
    unique_abbrs = list(dict.fromkeys(re.findall(r'\b[A-Z]{2,4}\b', all_text)))
    if len(unique_abbrs) >= 2:
        return f"{unique_abbrs[0]} vs {unique_abbrs[1]}"

    return "Teams Not Found"

def clean_team_name(name_text):
    """Tidy a raw team label.

    Removes the filler words "logo", "team", "image", "alt", "vs"/"vs." and
    "versus" wherever they occur (case-insensitively), trims trailing
    punctuation, and shortens labels longer than 20 characters to the
    initials of their words when that gives at least two letters.

    Args:
        name_text: Raw label; may be empty or ``None``.

    Returns:
        The cleaned name, or ``""`` when nothing usable remains.
    """
    if not name_text:
        return ""

    cleaned = re.sub(
        r'logo|team|image|alt|vs\.?|versus', '', name_text.strip(), flags=re.IGNORECASE
    ).strip()

    # Drop any run of non-word, non-space characters left at the very end.
    cleaned = re.sub(r'[^\w\s]+$', '', cleaned).strip()

    # Long labels are collapsed to initials, e.g. three words -> "XYZ".
    if len(cleaned) > 20:
        initials = ''.join(part[0] for part in cleaned.split())
        if len(initials) >= 2:
            return initials

    return cleaned

def find_team_containers(match_element):
    """Locate elements that are likely to hold one team each.

    Args:
        match_element: Parsed element exposing ``select()`` and
            ``select_one()``.

    Returns:
        A sequence of at least two elements from the first strategy that
        succeeds, or an empty list when none does.
    """
    # A wrapper element whose direct div/span children are the teams.
    wrapper_selectors = (
        '.vn-teamVs', '.team-vs', '.match-teams', '.fixture-teams',
        '[class*=team-container]', '[class*=teamContainer]',
    )
    for css in wrapper_selectors:
        wrapper = match_element.select_one(css)
        if not wrapper:
            continue
        direct_children = wrapper.find_all(['div', 'span'], recursive=False)
        if len(direct_children) >= 2:
            return direct_children

    # Otherwise fall back to team-name elements, then logo containers.
    fallback_selectors = (
        '.vn-teamname1, .vn-teamname2, .team1-name, .team2-name',
        '[class*=logo-container], [class*=logoContainer]',
    )
    for css in fallback_selectors:
        hits = match_element.select(css)
        if len(hits) >= 2:
            return hits

    return []

def extract_team_from_container(container):
    """Read a team name out of a single team container.

    The sources are tried in this order: the ``alt`` text of the first
    image, the text of the first element whose class mentions name/title,
    and the container's own text if it is shorter than 30 characters.

    Args:
        container: Parsed element exposing ``select_one()`` and
            ``get_text()``.

    Returns:
        The cleaned team name, or ``""`` when none of the sources applies.
    """
    logo = container.select_one('img')
    alt_text = logo.get('alt') if logo else None
    if alt_text:
        return clean_team_name(alt_text)

    label = container.select_one('[class*=name], [class*=Name], [class*=title], [class*=Title]')
    if label:
        return clean_team_name(label.text)

    # Long text is assumed to be more than just a team name, so it is skipped.
    raw = container.get_text().strip()
    return clean_team_name(raw) if len(raw) < 30 else ""

def extract_venue(match_element):
    """Find the venue text for a match element.

    Tries, in order: venue-specific CSS selectors, any span/div/p whose text
    mentions a venue keyword or a listed city, "at <name> Stadium"-style
    regexes over the whole text, and finally the text of a trailing child
    element that does not look like a time, a match reference or a "vs"
    line.

    Args:
        match_element: Parsed element exposing ``select()``,
            ``select_one()``, ``find_all()`` and ``get_text()``.

    Returns:
        The venue text, or ``"N/A"`` when every approach fails.
    """
    # Approaches 1 and 2: the expected class first, then alternatives.
    venue_selectors = (
        '.vn-matchVenue',
        '.venue', '.match-venue', '.fixture-venue',
        '[class*=venue]', '[class*=Venue]', '.vn-venue',
        '.vn-venueDesc', '.vn-venueTitle',
    )
    for css in venue_selectors:
        node = match_element.select_one(css)
        if node and node.text.strip():
            return node.text.strip()

    # Approach 3: the first element whose text mentions a venue word or city.
    venue_hints = (
        'stadium', 'arena', 'ground',
        'delhi', 'mumbai', 'kolkata', 'bangalore', 'bengaluru', 'chennai',
        'ahmedabad', 'pune', 'hyderabad', 'jaipur', 'mohali', 'dharamsala',
        'lucknow', 'guwahati', 'chandigarh', 'indore',
    )
    for node in match_element.find_all(['span', 'div', 'p']):
        candidate = node.text.strip()
        if len(candidate) > 3 and any(hint in candidate.lower() for hint in venue_hints):
            return candidate

    # Approach 4: regex patterns over the full text.
    all_text = match_element.get_text()
    venue_patterns = (
        r'at\s+([A-Za-z\s\.]+Stadium)',
        r'at\s+([A-Za-z\s\.]+Ground)',
        r'at\s+([A-Za-z\s\.]+Arena)',
        r'venue:?\s+([^,\n]+)',
        r'stadium:?\s+([^,\n]+)',
    )
    for pattern in venue_patterns:
        hit = re.search(pattern, all_text, re.IGNORECASE)
        if hit:
            return hit.group(1).strip()

    # Approach 5: a trailing child element that is not obviously something else.
    for node in match_element.select('div:last-child, span:last-child, p:last-child'):
        candidate = node.text.strip()
        if len(candidate) <= 3:
            continue  # empty or too short to be a venue
        if re.search(r'\d{1,2}:\d{2}', candidate):
            continue  # looks like a time
        if re.search(r'match', candidate, re.IGNORECASE):
            continue  # looks like a match reference
        if re.search(r'vs', candidate, re.IGNORECASE):
            continue  # looks like "team vs team"
        return candidate

    return "N/A"

# Entry point: call scrape_ipl_results_selenium() only when this code is run
# directly (as a script or notebook cell), not when it is imported.
if __name__ == "__main__":
    scrape_ipl_results_selenium()

Initializing Chrome WebDriver...
Accessing website: https://www.iplt20.com/matches/results
Waiting for page to load...
Page title: IPL 2025 Match Results | Full Scorecard & Summaries | IPLT20
Found 0 individual matches
Alternative approach: Found 58 list items as potential matches

Processing match 1: Punjab Kings vs Delhi Capitals
HTML snippet: <div class="vn-teamTitle"> <div class="vn-teamName"> <!-- ngIf: list.HomeTeamName == undefined || list.HomeTeamName == '' --> <!-- ngIf: list.HomeTeamName != undefined && list.HomeTeamName != '' --><h3 class="ng-binding ng-scope" ng-if="list.HomeTeamName != undefined &amp;&amp; list.HomeTeamName != ''">Punjab Kings</h3><!-- end ngIf: list.HomeTeamName != undefined && list.HomeTeamName != '' --> </div> <div class="vn-teamCode"> <!-- ngIf: list.HomeTeamID == list.FirstBattingTeamID --><h3 class="ng-binding ng-scope" ng-if="list.HomeTeamID == list.FirstBattingTeamID">PBKS</h3><!-- end ngIf: list.HomeTeamID == list.FirstBattingTeamID --> <!-- ngIf:

In [6]:
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import re
import os

def setup_driver():
    """Create a Chrome WebDriver configured for headless scraping.

    Returns:
        A ``webdriver.Chrome`` instance started with the options below and a
        driver binary obtained through ``ChromeDriverManager().install()``.
    """
    launch_flags = (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
    )

    options = Options()
    for flag in launch_flags:
        options.add_argument(flag)

    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=options)

def extract_officials_from_text(officials_text):
    """Split a combined officials string into its individual roles.

    The text is expected to read roughly
    ``"<on-field umpires> Third Umpire <name> [Match ]Referee <name>
    MOM|Man of the Match|Player of the Match <name>"``. The parts may be
    separated by spaces or by line breaks; matching is case-insensitive.

    Args:
        officials_text: Combined officials text; may be empty or ``None``.

    Returns:
        A dict with the keys ``"On_Field_Umpire"``, ``"Third_Umpire"``,
        ``"Referee"`` and ``"MOM"``. Roles that are not found are ``""``.
    """
    officials = {
        "On_Field_Umpire": "",
        "Third_Umpire": "",
        "Referee": "",
        "MOM": ""
    }

    if not officials_text:
        return officials

    # DOTALL lets the lazy captures run across line breaks; without it any
    # role label on a later line was never reached and the field stayed empty.
    flags = re.IGNORECASE | re.DOTALL

    # \b keeps the case-insensitive "MOM" from matching inside other words.
    award_label = r'(?:\bMOM\b|Man of the Match|Player of the Match)'

    # Everything before "Third Umpire" (or the whole text) is the on-field pair.
    on_field_match = re.search(r'(.*?)(?:Third Umpire|$)', officials_text, flags)
    if on_field_match:
        officials["On_Field_Umpire"] = on_field_match.group(1).strip()

    # Stop at "Match Referee" as well as "Referee" so the word "Match" does
    # not end up as part of the third umpire's name.
    third_match = re.search(
        r'Third Umpire\s+(.*?)(?:Match\s+Referee|Referee|$)', officials_text, flags
    )
    if third_match:
        officials["Third_Umpire"] = third_match.group(1).strip()

    referee_match = re.search(
        r'Referee\s+(.*?)(?:' + award_label + r'|$)', officials_text, flags
    )
    if referee_match:
        officials["Referee"] = referee_match.group(1).strip()

    # Only name-like characters are captured for the award winner.
    mom_match = re.search(award_label + r'\s+([A-Za-z\s\-\.]+)', officials_text, flags)
    if mom_match:
        officials["MOM"] = mom_match.group(1).strip()

    return officials

def clean_name(name_text):
    """Reduce a text fragment to just the person's name.

    Parenthesised notes are removed first. The result is then the first run
    of consecutive "name-like" words: a capital letter followed only by
    lower-case letters, or a single capital followed by a period.

    Args:
        name_text: Raw text containing a name.

    Returns:
        The name-like words joined by single spaces. If there are none, the
        first word when it is longer than two characters, else the first two
        words, else the text with parenthesised notes removed.
    """
    name_text = re.sub(r'\s*\([^)]*\)\s*', '', name_text)
    tokens = name_text.split()

    kept = []
    for token in tokens:
        if re.match(r'^[A-Z][a-z]*$|^[A-Z]\.$', token):
            kept.append(token)
        elif kept:
            # The name has ended; whatever follows is description.
            break

    if kept:
        return ' '.join(kept)

    # Nothing looked like a name: settle for the leading word(s).
    if tokens:
        if len(tokens[0]) > 2:
            return tokens[0]
        if len(tokens) > 1:
            return f"{tokens[0]} {tokens[1]}"

    return name_text

def combine_umpires(umpire1, umpire2):
    """Merge two umpire names into one comma-separated field.

    Args:
        umpire1: First umpire's raw name; falsy values are skipped.
        umpire2: Second umpire's raw name; falsy values are skipped.

    Returns:
        The names passed through ``clean_name`` and joined with ``", "``;
        a single name on its own, or ``""`` when neither is given.
    """
    cleaned = [clean_name(raw) for raw in (umpire1, umpire2) if raw]
    return ", ".join(cleaned)

def split_umpires(umpires_text):
    """Split a field that may hold more than one umpire.

    Args:
        umpires_text: Text containing one or more umpire names.

    Returns:
        A list of names. The text is split on the first of ``","``,
        ``" and "``, ``" & "`` or ``";"`` that occurs in it (every piece is
        returned, stripped). Without a separator, the first two runs of
        capitalised words are returned; failing that, ``[umpires_text, ""]``.
    """
    # Separators are checked in priority order, not order of appearance.
    delimiter = next(
        (sep for sep in (',', ' and ', ' & ', ';') if sep in umpires_text),
        None,
    )
    if delimiter is not None:
        return [piece.strip() for piece in umpires_text.split(delimiter)]

    # No separator: look for two distinct runs of capitalised words.
    name_runs = re.findall(r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)', umpires_text)
    if len(name_runs) >= 2:
        return name_runs[:2]

    return [umpires_text, ""]

def format_multiple_umpires(umpires_text):
    """Rewrite "A and B" / "A & B" as "A, B".

    Args:
        umpires_text: Text containing one or more umpire names.

    Returns:
        The pieces around the first of ``" and "`` or ``" & "`` found in the
        text, stripped and joined with ``", "``; the text unchanged when
        neither separator occurs.
    """
    for joiner in (' and ', ' & '):
        if joiner in umpires_text:
            return ", ".join(piece.strip() for piece in umpires_text.split(joiner))
    return umpires_text

def scrape_match_data(match_url, driver, match_index):
    """Scrape specific match data from a given URL"""
    try:
        print(f"Accessing: {match_url}")
        driver.get(match_url)
        
        # Wait for page to load - use more specific selector for match info
        wait = WebDriverWait(driver, 20)
        # Try multiple selectors that might indicate the page is loaded
        possible_selectors = [
            (By.CLASS_NAME, "sc-matchInfo"),
            (By.CLASS_NAME, "sc-matchHeader"),
            (By.CLASS_NAME, "sc-teamName"),
            (By.CLASS_NAME, "widget"),
            (By.CLASS_NAME, "matchDetails")
        ]
        
        for selector in possible_selectors:
            try:
                wait.until(EC.presence_of_element_located(selector))
                break
            except:
                continue
                
        # Additional wait for dynamic content
        time.sleep(3)
        
        # Click on "match summary" tab if available to ensure we get all data
        try:
            summary_tab = driver.find_element(By.XPATH, "//li[contains(text(), 'Match Summary') or contains(@class, 'summary')]")
            driver.execute_script("arguments[0].click();", summary_tab)
            time.sleep(2)
        except:
            pass
        
        # Get page source after everything has loaded
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # Extract match data with modified fields
        match_data = {
            "Match_No": str(match_index + 1),
            "On_Field_Umpire": "",
            "Third_Umpire": "",
            "Referee": "",
            "MOM": "",
            "Toss_Winner": "",
            "Toss_Decision": "",
            "Match_Winner": "",
            "Result_Margin": ""
        }
        
        # Try to find match officials using Angular attributes from the provided selectors
        umpire1_elem = soup.select_one('[ng-if*="matchSummary.Umpire1Name"]')
        umpire2_elem = soup.select_one('[ng-if*="matchSummary.Umpire2Name"]')
        umpire3_elem = soup.select_one('[ng-if*="matchSummary.Umpire3Name"]')
        referee_elem = soup.select_one('[ng-if*="matchSummary.Referee"]')
        mom_elem = soup.select_one('[ng-if*="matchSummary.MOM"]')
        
        # Extract values if elements found
        umpire1 = umpire1_elem.text.strip() if umpire1_elem else ""
        umpire2 = umpire2_elem.text.strip() if umpire2_elem else ""
        
        # Combine on-field umpires with comma separator
        if umpire1 and umpire2:
            match_data["On_Field_Umpire"] = f"{clean_name(umpire1)}, {clean_name(umpire2)}"
        elif umpire1:
            match_data["On_Field_Umpire"] = clean_name(umpire1)
        elif umpire2:
            match_data["On_Field_Umpire"] = clean_name(umpire2)
        
        # Set third umpire
        if umpire3_elem:
            match_data["Third_Umpire"] = clean_name(umpire3_elem.text.strip())
        
        # Set referee
        if referee_elem:
            match_data["Referee"] = clean_name(referee_elem.text.strip())
        
        # Set MOM
        if mom_elem:
            match_data["MOM"] = clean_name(mom_elem.text.strip())
        
        # If direct Angular elements not found, fall back to traditional methods
        if not match_data["On_Field_Umpire"] and not match_data["Third_Umpire"] and not match_data["Referee"] and not match_data["MOM"]:
            # Try to find match officials section
            officials_selectors = [
                '.match-officials',
                '.sc-matchOfficials',
                '#match-officials',
                '.officials-info',
                '.widgetContent'
            ]
            
            officials_section = None
            for selector in officials_selectors:
                section = soup.select_one(selector)
                if section:
                    officials_section = section.get_text()
                    break
                    
            # Extract officials properly from the officials section
            if officials_section:
                # First, try to extract using structured patterns
                on_field_pattern = r'(?:On-Field|On Field) Umpires?:\s*(.*?)(?=\s*(?:Third|$|\n))'
                third_pattern = r'Third Umpire:\s*(.*?)(?=\s*(?:Match Referee|Referee|$|\n))'
                referee_pattern = r'(?:Match )?Referee:\s*(.*?)(?=\s*(?:MOM|Man of the Match|Player of the Match|$|\n))'
                mom_pattern = r'(?:MOM|Man of the Match|Player of the Match):\s*([A-Za-z\s\-\.]+)'
                
                on_field_match = re.search(on_field_pattern, officials_section, re.IGNORECASE)
                if on_field_match:
                    on_field_text = on_field_match.group(1).strip()
                    # Split field umpires and join with comma
                    umpires = split_umpires(on_field_text)
                    if len(umpires) >= 2:
                        match_data["On_Field_Umpire"] = f"{clean_name(umpires[0])}, {clean_name(umpires[1])}"
                    elif len(umpires) == 1:
                        match_data["On_Field_Umpire"] = clean_name(umpires[0])
                
                third_match = re.search(third_pattern, officials_section, re.IGNORECASE)
                if third_match:
                    match_data["Third_Umpire"] = clean_name(third_match.group(1).strip())
                
                referee_match = re.search(referee_pattern, officials_section, re.IGNORECASE)
                if referee_match:
                    match_data["Referee"] = clean_name(referee_match.group(1).strip())
                
                mom_match = re.search(mom_pattern, officials_section, re.IGNORECASE)
                if mom_match:
                    match_data["MOM"] = clean_name(mom_match.group(1).strip())
                    
                # If structured patterns didn't work, try combining all officials search
                if not match_data["On_Field_Umpire"] and not match_data["Third_Umpire"] and not match_data["Referee"] and not match_data["MOM"]:
                    # Try to extract from the unstructured text
                    combined_officials = extract_officials_from_text(officials_section)
                    for key, value in combined_officials.items():
                        if value and not match_data[key]:
                            if key == "On_Field_Umpire":
                                # Format on-field umpires with comma
                                match_data[key] = format_multiple_umpires(clean_name(value))
                            else:
                                match_data[key] = clean_name(value)
            else:
                # Try to find officials in the general text
                all_text = soup.get_text()
                
                # Try looking for a section with officials
                officials_text_pattern = r'(?:Officials|Match Officials|Umpires)[:\s]+(.*?)(?=\n\n|\Z)'
                officials_section_match = re.search(officials_text_pattern, all_text, re.IGNORECASE)
                
                if officials_section_match:
                    officials_text = officials_section_match.group(1).strip()
                    combined_officials = extract_officials_from_text(officials_text)
                    for key, value in combined_officials.items():
                        if value:
                            if key == "On_Field_Umpire":
                                # Format on-field umpires with comma
                                match_data[key] = format_multiple_umpires(clean_name(value))
                            else:
                                match_data[key] = clean_name(value)
                else:
                    # Try individual patterns for each official type
                    on_field_pattern = r'(?:On-Field|On Field) Umpires?:\s*(.*?)(?=\s*(?:Third|$|\n))'
                    third_pattern = r'Third Umpire:\s*(.*?)(?=\s*(?:Match Referee|Referee|$|\n))'
                    referee_pattern = r'(?:Match )?Referee:\s*(.*?)(?=\s*(?:MOM|Man of the Match|Player of the Match|$|\n))'
                    mom_pattern = r'(?:MOM|Man of the Match|Player of the Match):\s*([A-Za-z\s\-\.]+)'
                    
                    on_field_match = re.search(on_field_pattern, all_text, re.IGNORECASE)
                    if on_field_match:
                        on_field_text = on_field_match.group(1).strip()
                        # Split field umpires and format with comma
                        umpires = split_umpires(on_field_text)
                        if len(umpires) >= 2:
                            match_data["On_Field_Umpire"] = f"{clean_name(umpires[0])}, {clean_name(umpires[1])}"
                        elif len(umpires) == 1:
                            match_data["On_Field_Umpire"] = clean_name(umpires[0])
                    
                    third_match = re.search(third_pattern, all_text, re.IGNORECASE)
                    if third_match:
                        match_data["Third_Umpire"] = clean_name(third_match.group(1).strip())
                    
                    referee_match = re.search(referee_pattern, all_text, re.IGNORECASE)
                    if referee_match:
                        match_data["Referee"] = clean_name(referee_match.group(1).strip())
                    
                    mom_match = re.search(mom_pattern, all_text, re.IGNORECASE)
                    if mom_match:
                        match_data["MOM"] = clean_name(mom_match.group(1).strip())
        
        # Check for any remaining official information in specific elements
        official_elements = soup.select('.official-name, .umpire-name, .referee-name, .mom-name')
        field_umpires = []
        
        for elem in official_elements:
            element_text = elem.text.strip()
            element_class = elem.get('class', [])
            
            if any('field' in cls.lower() for cls in element_class) or any('umpire' in cls.lower() for cls in element_class) and 'third' not in ' '.join(element_class).lower():
                field_umpires.append(clean_name(element_text))
            elif any('third' in cls.lower() for cls in element_class):
                if not match_data["Third_Umpire"]:
                    match_data["Third_Umpire"] = clean_name(element_text)
            elif any('referee' in cls.lower() for cls in element_class):
                if not match_data["Referee"]:
                    match_data["Referee"] = clean_name(element_text)
            elif any('mom' in cls.lower() for cls in element_class) or any('player-match' in cls.lower() for cls in element_class):
                if not match_data["MOM"]:
                    match_data["MOM"] = clean_name(element_text)
        
        # Combine field umpires if found
        if field_umpires and not match_data["On_Field_Umpire"]:
            match_data["On_Field_Umpire"] = ", ".join(field_umpires)
        
        # Special handling for the case where all officials are in one string
        if match_data["On_Field_Umpire"] and "Third Umpire" in match_data["On_Field_Umpire"]:
            # This indicates we have a combined string of all officials
            combined_officials = extract_officials_from_text(match_data["On_Field_Umpire"])
            
            # Format on-field umpires with comma
            on_field = combined_officials["On_Field_Umpire"]
            umpires = split_umpires(on_field)
            if len(umpires) >= 2:
                match_data["On_Field_Umpire"] = f"{clean_name(umpires[0])}, {clean_name(umpires[1])}"
            elif len(umpires) == 1:
                match_data["On_Field_Umpire"] = clean_name(umpires[0])
            
            if combined_officials["Third_Umpire"]:
                match_data["Third_Umpire"] = clean_name(combined_officials["Third_Umpire"])
            if combined_officials["Referee"]:
                match_data["Referee"] = clean_name(combined_officials["Referee"])
            if combined_officials["MOM"]:
                match_data["MOM"] = clean_name(combined_officials["MOM"])
        
        # Same check for Third_Umpire
        if match_data["Third_Umpire"] and "Referee" in match_data["Third_Umpire"]:
            combined_officials = extract_officials_from_text("Third Umpire " + match_data["Third_Umpire"])
            match_data["Third_Umpire"] = clean_name(combined_officials["Third_Umpire"])
            
            if combined_officials["Referee"]:
                match_data["Referee"] = clean_name(combined_officials["Referee"])
            if combined_officials["MOM"]:
                match_data["MOM"] = clean_name(combined_officials["MOM"])
        
        # Same check for Referee
        if match_data["Referee"] and ("MOM" in match_data["Referee"] or "Man of the Match" in match_data["Referee"]):
            combined_officials = extract_officials_from_text("Referee " + match_data["Referee"])
            match_data["Referee"] = clean_name(combined_officials["Referee"])
            
            if combined_officials["MOM"]:
                match_data["MOM"] = clean_name(combined_officials["MOM"])
        
        # Toss information - try to find a specific section first
        toss_elem = soup.select_one('[ng-if*="matchSummary.TossDetails"]')
        if toss_elem:
            toss_text = toss_elem.text.strip()
            toss_pattern = re.search(r'([A-Za-z\s]+) won the toss and (?:elected|chose|opted) to ([a-z]+)', toss_text, re.IGNORECASE)
            if toss_pattern:
                match_data["Toss_Winner"] = toss_pattern.group(1).strip()
                match_data["Toss_Decision"] = toss_pattern.group(2).strip().capitalize()
        
        if not match_data["Toss_Winner"] or not match_data["Toss_Decision"]:
            toss_selectors = [
                '.toss-info',
                '.sc-tossResult',
                '.match-toss'
            ]
            
            toss_section = None
            for selector in toss_selectors:
                section = soup.select_one(selector)
                if section:
                    toss_section = section.get_text()
                    break
            
            if toss_section:
                toss_pattern = re.search(r'([A-Za-z\s]+) won the toss and (?:elected|chose|opted) to ([a-z]+)', toss_section, re.IGNORECASE)
                if toss_pattern:
                    match_data["Toss_Winner"] = toss_pattern.group(1).strip()
                    match_data["Toss_Decision"] = toss_pattern.group(2).strip().capitalize()
            else:
                # Try to find toss info in general text
                all_text = soup.get_text()
                toss_pattern = re.search(r'([A-Za-z\s]+) won the toss and (?:elected|chose|opted) to ([a-z]+)', all_text, re.IGNORECASE)
                if toss_pattern:
                    match_data["Toss_Winner"] = toss_pattern.group(1).strip()
                    match_data["Toss_Decision"] = toss_pattern.group(2).strip().capitalize()
        
        # Result information - try multiple selectors
        result_selectors = [
            '.sc-matchStatus',
            '.match-result',
            '.sc-matchResult'
        ]
        
        result_text = ""
        for selector in result_selectors:
            result_elem = soup.select_one(selector)
            if result_elem and result_elem.text.strip():
                result_text = result_elem.text.strip()
                break
        
        # If no result found in specific selectors, try to get it from general text
        if not result_text:
            all_text = soup.get_text()
            result_patterns = [
                r'([A-Za-z\s]+) won by.*',
                r'([A-Za-z\s]+) beat.*',
                r'Match.*(?:tied|Tied)'
            ]
            
            for pattern in result_patterns:
                result_match = re.search(pattern, all_text, re.IGNORECASE)
                if result_match:
                    result_text = result_match.group(0)
                    break
        
        # Process result text to extract winner and margin
        if result_text:
            # Check for Super Over
            super_over = re.search(r'super over', result_text, re.IGNORECASE)
            
            # Extract winner
            winner_patterns = [
                r'([A-Za-z\s]+) won by',
                r'([A-Za-z\s]+) beat'
            ]
            
            for pattern in winner_patterns:
                winner_match = re.search(pattern, result_text, re.IGNORECASE)
                if winner_match:
                    match_data["Match_Winner"] = winner_match.group(1).strip()
                    break
            
            # Extract margin
            if super_over:
                match_data["Result_Margin"] = "Super Over"
            else:
                margin_patterns = [
                    r'won by\s+(\d+\s+(?:runs|wickets))',
                    r'beat\s+.*?\s+by\s+(\d+\s+(?:runs|wickets))'
                ]
                
                for pattern in margin_patterns:
                    margin_match = re.search(pattern, result_text, re.IGNORECASE)
                    if margin_match:
                        match_data["Result_Margin"] = margin_match.group(1).strip()
                        break
            
            # Check for tied match
            tied_match = re.search(r'match(?:.*?)tied', result_text, re.IGNORECASE)
            if tied_match and not match_data["Result_Margin"]:
                match_data["Result_Margin"] = "Tied"
        
        # Final clean-up - ensure all name fields are just names without descriptions
        for field in ["Third_Umpire", "Referee", "MOM"]:
            if match_data[field]:
                match_data[field] = clean_name(match_data[field])
        
        # Ensure on-field umpires are properly formatted with comma
        if " and " in match_data["On_Field_Umpire"] or " & " in match_data["On_Field_Umpire"]:
            match_data["On_Field_Umpire"] = format_multiple_umpires(match_data["On_Field_Umpire"])
        
        print(f"Extracted data for match {match_data['Match_No']}")
        return match_data
    
    except Exception as e:
        print(f"Error scraping match data: {e}")
        import traceback
        traceback.print_exc()
        return None

def scrape_multiple_matches(start_id, num_matches):
    """Scrape a run of consecutive match pages and write the results to CSV.

    Args:
        start_id: Match ID used for the first page that is fetched.
        num_matches: How many consecutive match IDs to visit.

    Returns:
        A DataFrame built from the successfully scraped matches, or None
        when no match produced any data.
    """
    output_dir = "Ipl Matches (2025)"
    browser = setup_driver()
    scraped_rows = []

    try:
        for index in range(num_matches):
            page_url = f"https://www.iplt20.com/match/2025/{start_id + index}"
            row = scrape_match_data(page_url, browser, index)

            if not row:
                print(f"Failed to scrape match {index + 1}")
            else:
                scraped_rows.append(row)
                print(f"Successfully scraped match {index + 1}/{num_matches}")

            # Pause 2-4 seconds between requests to avoid being blocked;
            # the jitter is derived from the current clock value modulo 2
            time.sleep(2 + (time.time() % 2))
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Always release the browser, even when the loop was interrupted
        browser.quit()

    # Nothing to persist: report it and bail out early
    if not scraped_rows:
        print("No match data was scraped")
        return None

    frame = pd.DataFrame(scraped_rows)

    # Make sure the target folder exists before writing the CSV
    os.makedirs(output_dir, exist_ok=True)
    csv_path = os.path.join(output_dir, "ipl_match_data_matches.csv")
    frame.to_csv(csv_path, index=False)
    print(f"\nData saved to {csv_path}")

    # Short run summary
    print(f"\nTotal matches scraped: {len(scraped_rows)}")
    print("\nSample data:")
    print(frame.head())

    return frame

def main():
    """Print the run banner and scrape the configured range of match pages."""
    for banner_line in ("IPL Match Data Scraper - Matches",
                        "===================================="):
        print(banner_line)

    first_match_id = 1799  # match ID of the first page to fetch
    match_count = 74  # number of consecutive matches to scrape (change as needed)

    print(f"Will scrape {match_count} matches starting from match ID {first_match_id}")

    # Kick off the scrape; the returned DataFrame is not used here
    scrape_multiple_matches(first_match_id, match_count)

# Start the match scraper only when this code is run directly, not when imported
if __name__ == "__main__":
    main()

IPL Match Data Scraper - Matches
Will scrape 74 matches starting from match ID 1799
Accessing: https://www.iplt20.com/match/2025/1799
Extracted data for match 1
Successfully scraped match 1/74
Accessing: https://www.iplt20.com/match/2025/1800
Extracted data for match 2
Successfully scraped match 2/74
Accessing: https://www.iplt20.com/match/2025/1801
Extracted data for match 3
Successfully scraped match 3/74
Accessing: https://www.iplt20.com/match/2025/1802
Extracted data for match 4
Successfully scraped match 4/74
Accessing: https://www.iplt20.com/match/2025/1803
Extracted data for match 5
Successfully scraped match 5/74
Accessing: https://www.iplt20.com/match/2025/1804
Extracted data for match 6
Successfully scraped match 6/74
Accessing: https://www.iplt20.com/match/2025/1805
Extracted data for match 7
Successfully scraped match 7/74
Accessing: https://www.iplt20.com/match/2025/1806
Extracted data for match 8
Successfully scraped match 8/74
Accessing: https://www.iplt20.com/match/2025

In [3]:
import os
import pandas as pd
import re

def standardize_role(role):
    """
    Standardize player roles into main categories:
    - Batsman
    - Bowler
    - All-Rounder
    - Wicketkeeper

    The value is converted to a string, stripped of surrounding whitespace
    and upper-cased before matching, so the comparison is case-insensitive
    and padded values such as " AR " are still recognised.

    Keyword (substring) checks run first, in priority order: wicketkeeper,
    all-rounder, bowler, batsman. If no keyword matches, the whole value is
    compared against a table of common abbreviations. Anything still
    unrecognised (including missing values such as None/NaN and the empty
    string) falls back to 'Batsman'.

    Args:
        role: Raw role value; any object is accepted and stringified.

    Returns:
        One of 'Batsman', 'Bowler', 'All-Rounder' or 'Wicketkeeper'.
    """
    # Strip first: the abbreviation table below needs an exact match, which
    # stray spaces/newlines from CSV cells would otherwise defeat
    role = str(role).strip().upper()

    # 'KEEP' also covers 'KEEPER'
    if any(token in role for token in ('WICKET', 'KEEP', 'WK')):
        return 'Wicketkeeper'
    # 'ALL' also covers 'ALL-ROUND'; a role naming both batting and bowling
    # is treated as an all-rounder
    if 'ALL' in role or 'ROUNDER' in role or ('BAT' in role and 'BOWL' in role):
        return 'All-Rounder'
    if any(token in role for token in ('BOWL', 'SPINNER', 'PACE', 'FAST', 'MEDIUM')):
        return 'Bowler'
    # 'BAT' also covers 'BATTER' and 'BATSMAN'
    if 'BAT' in role:
        return 'Batsman'

    # No keyword matched: check for common abbreviations (exact match only).
    # 'WK' and 'BAT' are omitted because the keyword checks above catch them.
    abbreviations = {
        'W/K': 'Wicketkeeper',
        'W-K': 'Wicketkeeper',
        'AR': 'All-Rounder',
        'A/R': 'All-Rounder',
        'A-R': 'All-Rounder',
        'BLR': 'Bowler',
        'BWL': 'Bowler',
        'B': 'Bowler',
        'BT': 'Batsman',
        'BTR': 'Batsman',
    }
    # Default to Batsman as most common fallback
    return abbreviations.get(role, 'Batsman')

def process_team_files():
    """Standardize the 'role' column of the team CSV files in Team_Player, in place.

    For every team file the 'role' column is rewritten with the category
    returned by standardize_role and the file is saved back to the same
    path. Two report files are then written to the same directory:
    'role_summary.txt' (overall and per-team role counts) and
    'team_role_comparison.csv' (one row per team, one column per role).

    The ten expected IPL team files are used when at least one of them
    exists; otherwise every CSV in the directory is processed, except the
    comparison report generated by a previous run.

    Returns:
        None. If the Team_Player directory does not exist, a message is
        printed and nothing is processed or written.
    """
    # Path to directory containing team CSV files
    directory = 'Team_Player'

    # Report files written by this function; they must never be picked up
    # as team files by the directory-listing fallback below
    generated_files = {'team_role_comparison.csv'}

    # Without the directory both the listing and the report writes would
    # fail with FileNotFoundError, so stop early with a clear message
    if not os.path.isdir(directory):
        print(f"Directory '{directory}' not found. Nothing to process.")
        return

    # Dictionary to store role counts for summary
    role_counts = {'Batsman': 0, 'Bowler': 0, 'All-Rounder': 0, 'Wicketkeeper': 0, 'Unknown': 0}
    team_stats = {}

    # List all team CSV files (assuming the 10 IPL team files)
    expected_teams = [
        'Chennai_Super_Kings.csv',
        'Delhi_Capitals.csv',
        'Gujarat_Titans.csv',
        'Kolkata_Knight_Riders.csv',
        'Lucknow_Super_Giants.csv',
        'Mumbai_Indians.csv',
        'Punjab_Kings.csv',
        'Rajasthan_Royals.csv',
        'Royal_Challengers_Bangalore.csv',
        'Sunrisers_Hyderabad.csv'
    ]

    # Check which files exist in the directory
    existing_files = [f for f in expected_teams if os.path.exists(os.path.join(directory, f))]
    if not existing_files:
        # Fallback: take every CSV in the directory, in a stable order,
        # skipping reports produced by an earlier run of this function
        existing_files = sorted(
            f for f in os.listdir(directory)
            if f.endswith('.csv') and f not in generated_files
        )

    print(f"Found {len(existing_files)} team files to process.")

    for file in existing_files:
        team_name = file.replace('.csv', '')
        # Registered before reading, so a file that is skipped or fails
        # still appears in the reports with all-zero counts
        team_stats[team_name] = {'Batsman': 0, 'Bowler': 0, 'All-Rounder': 0, 'Wicketkeeper': 0, 'Unknown': 0}

        file_path = os.path.join(directory, file)

        try:
            # Read the CSV file
            df = pd.read_csv(file_path)

            # Check if 'role' column exists
            if 'role' not in df.columns:
                print(f"Warning: 'role' column not found in {file}. Checking for similar columns...")
                # Try to find a column that might contain role information
                role_cols = [col for col in df.columns if 'role' in col.lower()]
                if role_cols:
                    print(f"Using column '{role_cols[0]}' as role column in {file}")
                    df['role'] = df[role_cols[0]]
                else:
                    print(f"No role column found in {file}. Skipping...")
                    continue

            # Standardize roles directly in the role column
            # (standardize_role stringifies its argument itself)
            df['role'] = df['role'].apply(standardize_role)

            # Count roles for this team
            role_counts_team = df['role'].value_counts().to_dict()
            for role, count in role_counts_team.items():
                if role in role_counts:
                    role_counts[role] += count
                    team_stats[team_name][role] = count

            # Save processed file back to the same location
            df.to_csv(file_path, index=False)
            print(f"Processed {file}: {len(df)} players")

        except Exception as e:
            # Best effort: report the problem and continue with the next file
            print(f"Error processing {file}: {e}")

    # Create a summary file in the same directory
    summary_path = os.path.join(directory, 'role_summary.txt')
    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write("IPL 2025 Player Role Summary\n")
        f.write("===========================\n\n")
        f.write("Overall Role Distribution:\n")
        for role, count in role_counts.items():
            f.write(f"{role}: {count} players\n")

        f.write("\nTeam-wise Role Distribution:\n")
        for team, roles in team_stats.items():
            f.write(f"\n{team}:\n")
            for role, count in roles.items():
                # Only list roles that actually occur in the team
                if count > 0:
                    f.write(f"  {role}: {count} players\n")

    # Create a team comparison CSV in the same directory
    comparison_path = os.path.join(directory, 'team_role_comparison.csv')
    comparison_df = pd.DataFrame.from_dict(team_stats, orient='index')
    comparison_df.to_csv(comparison_path)

    print(f"\nProcessing complete! All files updated in the '{directory}' directory.")
    print(f"Summary saved to '{summary_path}'")
    print(f"Team comparison saved to '{comparison_path}'")

# Run the role standardization only when this code is executed directly, not when imported
if __name__ == "__main__":
    process_team_files()

Found 9 team files to process.
Processed Chennai_Super_Kings.csv: 25 players
Processed Delhi_Capitals.csv: 22 players
Processed Gujarat_Titans.csv: 25 players
Processed Kolkata_Knight_Riders.csv: 21 players
Processed Lucknow_Super_Giants.csv: 24 players
Processed Mumbai_Indians.csv: 23 players
Processed Punjab_Kings.csv: 25 players
Processed Rajasthan_Royals.csv: 20 players
Processed Sunrisers_Hyderabad.csv: 20 players

Processing complete! All files updated in the 'Team_Player' directory.
Summary saved to 'Team_Player\role_summary.txt'
Team comparison saved to 'Team_Player\team_role_comparison.csv'
