In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [None]:
# List of years for which we want to scrape data
years = [1930, 1934, 1938, 1950, 1954, 1958, 1962, 1966, 1970, 1974, 1978, 
         1982, 1986, 1990, 1994, 1998, 2002, 2006, 2010, 2014, 2018, 2022]

# Base URL for Wikipedia squad pages
base_url = "https://en.wikipedia.org/wiki/{}_FIFA_World_Cup_squads"

# Data storage
all_player_data = []

for year in years:
    # Construct the URL for each year
    url = base_url.format(year)
    print(f"Scraping data for the year: {year}")
    
    # Fetch and parse the page
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve data for {year}. Status code: {response.status_code}")
        continue
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all squad tables
    tables = soup.find_all("table", {"class": "wikitable"})
    
    for table in tables:
        rows = table.find_all("tr")
        
        # Ensure the 'table' variable is correctly defined, and rows are being iterated correctly
        for row in table.find_all("tr", class_="nat-fs-player"):
            cols = row.find_all("td")
            if len(cols) > 3:
                # Locate the <th> tag and then the <a> tag within it safely
                th_element = row.find('th').find('a')
                player_name = th_element.text if th_element else "N/A"  # Get text from the <a> tag
                
                player = {
                    "Year": year,  # Add year field
                    "Name": player_name,
                    "Position": cols[1].text.strip(),
                    "Date of Birth": cols[2].text.strip() if len(cols) > 2 else "N/A",
                    "Caps": cols[3].text.strip() if len(cols) > 3 else "N/A",
                    "Club": cols[4].text.strip() if len(cols) > 4 else "N/A"
                }
                all_player_data.append(player)

    # Pause between requests to avoid overwhelming the server
    time.sleep(1)

# Convert to DataFrame
df = pd.DataFrame(all_player_data)

# Save to CSV for further analysis
df.to_csv("world_cup_players_all_years.csv", index=False)
print("Data scraping completed and saved to 'world_cup_players_all_years.csv'")


Scraping data for the year: 1930
Scraping data for the year: 1934
Scraping data for the year: 1938
Scraping data for the year: 1950
Scraping data for the year: 1954
Scraping data for the year: 1958
Scraping data for the year: 1962
Scraping data for the year: 1966
Scraping data for the year: 1970
Scraping data for the year: 1974
Scraping data for the year: 1978
Scraping data for the year: 1982
Scraping data for the year: 1986
Scraping data for the year: 1990
Scraping data for the year: 1994
Scraping data for the year: 1998
Scraping data for the year: 2002


KeyboardInterrupt: 

In [None]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10964 entries, 0 to 10963
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Year           10964 non-null  int64 
 1   Name           10964 non-null  object
 2   Position       10964 non-null  object
 3   Date of Birth  10964 non-null  object
 4   Caps           10964 non-null  object
 5   Club           10964 non-null  object
dtypes: int64(1), object(5)
memory usage: 514.1+ KB


In [None]:
df.head()

Unnamed: 0,Year,Name,Position,Date of Birth,Caps,Club
0,1930,Ángel Bossio,1GK,(1905-05-05)5 May 1905 (aged 25),16,Talleres (BA)
1,1930,Juan Botasso,1GK,(1908-10-23)23 October 1908 (aged 21),2,Argentino (Q)
2,1930,Roberto Cherro,4FW,(1907-02-23)23 February 1907 (aged 23),10,Boca Juniors
3,1930,Alberto Chividini,2DF,(1907-02-23)23 February 1907 (aged 23),2,Central Norte
4,1930,Attilio Demaría,4FW,(1909-03-19)19 March 1909 (aged 21),0,Estudiantil Porteño


In [None]:
df['Name'].value_counts()

Name
N/A                  18
Luis Suárez           6
Pepe                  6
Lothar Matthäus       5
Lionel Messi          5
                     ..
Salvador Salguero     1
Eusebio Acasuzo       1
Franco Selvaggi       1
Gabriele Oriali       1
Tadeusz Dolny         1
Name: count, Length: 8428, dtype: int64

In [2]:
fifa = pd.read_csv('player_info.csv')

In [None]:
gfyufyugyu yuogoygou

In [4]:
fifa[fifa['Name']=='Cristiano Ronaldo']

Unnamed: 0,Year,Name,Position,Date of Birth,Caps,Club
7550,2006,Cristiano Ronaldo,4FW,(1985-02-05)5 February 1985 (aged 21),32,Manchester United
8552,2010,Cristiano Ronaldo,4FW,(1985-02-05)5 February 1985 (aged 25),72,Real Madrid
9265,2014,Cristiano Ronaldo,4FW,(1985-02-05)5 February 1985 (aged 29),111,Real Madrid
9541,2018,Cristiano Ronaldo,4FW,(1985-02-05)5 February 1985 (aged 33),150,81
10892,2022,Cristiano Ronaldo,4FW,(1985-02-05)5 February 1985 (aged 37),191,117


In [None]:
fifa['Name'].value_counts()

Name
Pepe                 6
Luis Suárez          6
Rafael Márquez       5
Guillermo Ochoa      5
Antonio Carbajal     5
                    ..
Hugo Gastulo         1
Salvador Salguero    1
Eusebio Acasuzo      1
Franco Selvaggi      1
Paweł Janas          1
Name: count, Length: 8429, dtype: int64

In [None]:
['Cristiano Ronaldo', 'Lional Messi']

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Sample player name dataset, replace with fifa['Name'].unique()
# Ensure NaN values are filtered out
player_names = ['Cristiano Ronaldo', 'Lional Messi']  # Drop NaN values from the player names

# Function to format player name to Wikipedia URL format
def format_name_for_wiki(name):
    return name.replace(" ", "_")

# Function to parse career entries from a given start row
def parse_career_section(start_tr):
    career_data = []
    current_tr = start_tr.find_next_sibling("tr")
    while current_tr:
        # Check if we've reached a new section or header row
        header = current_tr.find("th")
        if header and header.text in ["International career", "Youth career", "Honours"]:
            break
        
        # Extract data from each column if available
        cols = current_tr.find_all("td")
        if len(cols) >= 4:
            years = cols[0].get_text(strip=True)
            team = cols[1].get_text(strip=True)
            apps = cols[2].get_text(strip=True) if cols[2].get_text(strip=True) != "" else "0"
            gls = cols[3].get_text(strip=True) if cols[3].get_text(strip=True) != "" else "0"
            career_data.append(f"{years}, {team}, {apps}, {gls}")
        
        # Move to the next row
        current_tr = current_tr.find_next_sibling("tr")
    
    return " | ".join(career_data) if career_data else None

def get_player_profile(player_name):
    player_url = f"https://en.wikipedia.org/wiki/{format_name_for_wiki(player_name)}"
    response = requests.get(player_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    profile = {'Name': player_name, 'URL': player_url}
    
    # Extract Date of Birth
    dob = soup.find('span', {'class': 'bday'})
    if dob:
        profile['Date of Birth'] = dob.text
    
    # Extract Height
    height = soup.find(string="Height")
    if height:
        profile['Height'] = height.find_next("td").text.strip()
    
    # Extract Position
    position = soup.find(string="Position(s)") or soup.find(string="Position")
    if position:
        profile['Position'] = position.find_next("td").text.strip()
    
    # Extract Caps (Appearances)
    caps_label = soup.find(string="National team caps and goals") or soup.find(string="Caps")
    if caps_label:
        caps_td = caps_label.find_next("td")
        if caps_td:
            profile['Caps'] = caps_td.text.strip().split()[0]  # Take the first part before "caps"
    
    # Extract Club
    club_info = soup.find(string="Club information") or soup.find(string="Current club") or soup.find(string="Current team")
    if club_info:
        profile['Club'] = club_info.find_next("td").text.strip()
    
    # Extract Senior Career
    senior_career_label = soup.find("th", text="Senior career")
    if senior_career_label:
        profile['Senior Career'] = parse_career_section(senior_career_label)
    
    # Extract International Career
    international_career_label = soup.find("th", text="International career")
    if international_career_label:
        profile['International Career'] = parse_career_section(international_career_label)

    return profile

# Scrape profiles and compile data
player_profiles = []
for player in player_names:
    try:
        profile = get_player_profile(player)
        player_profiles.append(profile)
        time.sleep(1)  # Delay to avoid overwhelming server
    except Exception as e:
        print(f"Error scraping {player}: {e}")

# Convert to DataFrame and save to CSV
player_df = pd.DataFrame(player_profiles)
player_df.to_csv("world_cup_players_career_data.csv", index=False)
print("Data scraping completed and saved to 'world_cup_players_career_data.csv'")
print(player_df.info())


  senior_career_label = soup.find("th", text="Senior career")
  international_career_label = soup.find("th", text="International career")


Data scraping completed and saved to 'world_cup_players_career_data.csv'
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Name           2 non-null      object
 1   URL            2 non-null      object
 2   Date of Birth  1 non-null      object
 3   Height         1 non-null      object
 4   Position       1 non-null      object
 5   Club           1 non-null      object
dtypes: object(6)
memory usage: 228.0+ bytes
None


In [6]:
player_df

Unnamed: 0,Name,URL,Date of Birth,Height,Position,Club
0,Cristiano Ronaldo,https://en.wikipedia.org/wiki/Cristiano_Ronaldo,1985-02-05,1.87 m (6 ft 2 in)[note 1],Forward,Al Nassr
1,Lional Messi,https://en.wikipedia.org/wiki/Lional_Messi,,,,


In [None]:
skldfgo sdfgsjgf

In [None]:
player_df

NameError: name 'player_df' is not defined

In [None]:
df.to_csv('player_info.csv', index=False)

In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def format_name_for_wiki(player_name):
    # Replace spaces with underscores and format the name to match Wikipedia URL structure
    return player_name.replace(" ", "_")

def scrape_player_info(player_name):
    # Format the player's name for the Wikipedia URL
    formatted_name = format_name_for_wiki(player_name)
    url = f'https://en.wikipedia.org/wiki/{formatted_name}'
    
    # Send a GET request to the Wikipedia page
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Initialize a dictionary to store player information
        player_info = {
            "Full name": None,
            "Date of birth": None,
            "Place of birth": None,
            "Height": None,
            "Position(s)": None,
            "Current team": None,
            "Number": None,
            "Senior career": None,           # DataFrame for Senior career
            "International career": None     # DataFrame for International career
        }
        
        # Find the table with class 'infobox' and locate each field
        infobox = soup.find('table', class_='infobox')
        if infobox:
            senior_career_section = False
            international_career_section = False
            career_data = []  # List to hold senior career rows for DataFrame
            international_data = []  # List to hold international career rows for DataFrame

            for row in infobox.find_all('tr'):
                header = row.find('th')
                data = row.find('td')
                
                # Extract general information before "Senior career" section
                if header and data:
                    field_name = header.text.strip()
                    field_value = data.text.strip()
                    
                    # Match each field and store it in the dictionary
                    if "Full name" in field_name:
                        player_info["Full name"] = field_value
                    elif "Date of birth" in field_name:
                        player_info["Date of birth"] = field_value
                    elif "Place of birth" in field_name:
                        player_info["Place of birth"] = field_value
                    elif "Height" in field_name:
                        player_info["Height"] = field_value
                    elif "Position(s)" in field_name:
                        player_info["Position(s)"] = field_value
                    elif "Current team" in field_name:
                        player_info["Current team"] = field_value
                    elif "Number" in field_name:
                        player_info["Number"] = field_value
                
                # Start capturing "Senior career" data
                if header and "Senior career*" in header.text:
                    senior_career_section = True
                    continue
                
                # Stop capturing "Senior career" data if "International career" is reached
                if senior_career_section and header and "International career" in header.text:
                    senior_career_section = False
                    international_career_section = True  # Start International career section
                    continue

                # Stop capturing "International career" data if "Medal record" is reached
                if international_career_section and header and "Medal record" in header.text:
                    international_career_section = False  # Stop International career section
                    break

                # Collect "Senior career" data
                if senior_career_section:
                    career_row = {"Years": None, "Team": None, "Apps": None, "Goals": None}
                    if header:
                        career_row["Years"] = header.text.strip()  # The years (e.g., 2004–2012)
                    
                    # Collect team, apps, and goals data from corresponding 'td' tags
                    data_cells = row.find_all('td', class_=lambda x: x and x.startswith('infobox-data'))
                    if data_cells:
                        career_row["Team"] = data_cells[0].text.strip() if len(data_cells) > 0 else None
                        career_row["Apps"] = data_cells[1].text.strip() if len(data_cells) > 1 else None
                        career_row["Goals"] = data_cells[2].text.strip() if len(data_cells) > 2 else None
                    
                    # Append each row to the career_data list
                    if any(career_row.values()):
                        career_data.append(career_row)

                # Collect "International career" data
                if international_career_section:
                    intl_row = {"Years": None, "Team": None, "Apps": None, "Goals": None}
                    if header:
                        intl_row["Years"] = header.text.strip()  # The years (e.g., 2005–2021)
                    
                    # Collect team, apps, and goals data from corresponding 'td' tags
                    data_cells = row.find_all('td', class_=lambda x: x and x.startswith('infobox-data'))
                    if data_cells:
                        intl_row["Team"] = data_cells[0].text.strip() if len(data_cells) > 0 else None
                        intl_row["Apps"] = data_cells[1].text.strip() if len(data_cells) > 1 else None
                        intl_row["Goals"] = data_cells[2].text.strip() if len(data_cells) > 2 else None
                    
                    # Append each row to the international_data list
                    if any(intl_row.values()):
                        international_data.append(intl_row)
            
            # Convert career_data and international_data lists to DataFrames and store them in player_info
            if career_data:
                player_info["Senior career"] = pd.DataFrame(career_data)
            if international_data:
                player_info["International career"] = pd.DataFrame(international_data)
        else:
            print("Infobox not found.")
        
        return player_info
    else:
        print(f"Failed to retrieve page for {player_name}")
        return None

# Example of using the function with a player name from your dataset
player_name = "Cristiano Ronaldo"  # Replace with a name from your dataset
extracted_info = scrape_player_info(player_name)

# Display general player information
print("General Information:")
print({k: v for k, v in extracted_info.items() if k not in ["Senior career", "International career"]})

# Display Senior career as a DataFrame
print("\nSenior Career DataFrame:")
print(extracted_info["Senior career"])

# Display International career as a DataFrame
print("\nInternational Career DataFrame:")
print(extracted_info["International career"])


General Information:
{'Full name': 'Cristiano Ronaldo dos Santos Aveiro[1]', 'Date of birth': '(1985-02-05) 5 February 1985 (age\xa039)[2]', 'Place of birth': 'Funchal, Madeira, Portugal', 'Height': '1.87\xa0m (6\xa0ft 2\xa0in)[note 1]', 'Position(s)': 'Forward', 'Current team': 'Al Nassr', 'Number': '7'}

Senior Career DataFrame:
       Years               Team  Apps  Goals
0      Years               Team  Apps  (Gls)
1  2002–2003      Sporting CP B     2    (0)
2  2002–2003        Sporting CP    25    (3)
3  2003–2009  Manchester United   196   (84)
4  2009–2018        Real Madrid   292  (311)
5  2018–2021           Juventus    98   (81)
6  2021–2022  Manchester United    40   (19)
7      2023–           Al Nassr    56   (55)

International Career DataFrame:
       Years          Team Apps  Goals
0       2001  Portugal U15    9    (7)
1  2001–2002  Portugal U17    7    (5)
2       2003  Portugal U20    5    (1)
3  2002–2003  Portugal U21   10    (3)
4       2004  Portugal U23    3   