In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [22]:
# Scrape every "wikitable" on the Wikipedia "<year> FIFA World Cup squads" pages,
# collect one record per data row, and save the combined table to CSV.

# List of years for which we want to scrape data
years = [1930, 1934, 1938, 1950, 1954, 1958, 1962, 1966, 1970, 1974, 1978,
         1982, 1986, 1990, 1994, 1998, 2002, 2006, 2010, 2014, 2018, 2022]

# Base URL for Wikipedia squad pages
base_url = "https://en.wikipedia.org/wiki/{}_FIFA_World_Cup_squads"

# (connect, read) timeouts in seconds. Without a timeout, requests waits
# indefinitely on a stalled connection and the cell never finishes.
REQUEST_TIMEOUT = (5, 30)

# Pause between consecutive requests to avoid overwhelming the server
REQUEST_PAUSE_SECONDS = 1

# Send a descriptive User-Agent instead of the library default.
# NOTE(review): Wikimedia's User-Agent policy asks for contact information in
# this string — add a project URL or e-mail address before heavy use.
REQUEST_HEADERS = {"User-Agent": "world-cup-squads-notebook/1.0 (python-requests)"}

# Rows need more than this many <td> cells to be treated as player rows
MIN_PLAYER_CELLS = 3

# Data storage
all_player_data = []

# A single session reuses the connection across all pages
with requests.Session() as session:
    session.headers.update(REQUEST_HEADERS)

    for index, year in enumerate(years):
        # Pause before every request except the first. Doing it here (rather
        # than at the end of the loop body) means failed requests are throttled
        # too, and no time is wasted sleeping after the last page.
        if index:
            time.sleep(REQUEST_PAUSE_SECONDS)

        # Construct the URL for each year
        url = base_url.format(year)
        print(f"Scraping data for the year: {year}")

        # Fetch the page; a network error skips this year instead of aborting
        # the whole run and losing the rows collected so far.
        try:
            response = session.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print(f"Failed to retrieve data for {year}. Error: {exc}")
            continue
        if response.status_code != 200:
            print(f"Failed to retrieve data for {year}. Status code: {response.status_code}")
            continue
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all squad tables
        tables = soup.find_all("table", {"class": "wikitable"})

        for table in tables:
            rows = table.find_all("tr")

            for row in rows[1:]:  # Skip header
                # NOTE(review): only <td> cells are collected. If the page marks
                # the player-name cell as <th>, cols[0] is not the name and the
                # mapping below is shifted — verify against the saved CSV.
                cols = row.find_all("td")
                if len(cols) <= MIN_PLAYER_CELLS:
                    continue

                # Indices 0-3 are guaranteed by the length check above; only the
                # fifth cell (Club) can be missing.
                player = {
                    "Year": year,  # Add year field
                    "Name": cols[0].text.strip(),
                    "Position": cols[1].text.strip(),
                    "Date of Birth": cols[2].text.strip(),
                    "Caps": cols[3].text.strip(),
                    "Club": cols[4].text.strip() if len(cols) > 4 else "N/A"
                }
                all_player_data.append(player)

# Convert to DataFrame
df = pd.DataFrame(all_player_data)

# Save to CSV for further analysis
df.to_csv("world_cup_players_all_years.csv", index=False)
print("Data scraping completed and saved to 'world_cup_players_all_years.csv'")


Scraping data for the year: 1930
Scraping data for the year: 1934
Scraping data for the year: 1938
Scraping data for the year: 1950
Scraping data for the year: 1954
Scraping data for the year: 1958
Scraping data for the year: 1962
Scraping data for the year: 1966
Scraping data for the year: 1970
Scraping data for the year: 1974
Scraping data for the year: 1978
Scraping data for the year: 1982
Scraping data for the year: 1986
Scraping data for the year: 1990
Scraping data for the year: 1994
Scraping data for the year: 1998
Scraping data for the year: 2002
Scraping data for the year: 2006
Scraping data for the year: 2010
Scraping data for the year: 2014
Scraping data for the year: 2018
Scraping data for the year: 2022
Data scraping completed and saved to 'world_cup_players_all_years.csv'


In [23]:
# Print a summary of the scraped frame: row count, column names, non-null counts, and dtypes.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11078 entries, 0 to 11077
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Year           11078 non-null  int64 
 1   Name           11078 non-null  object
 2   Position       11078 non-null  object
 3   Date of Birth  11078 non-null  object
 4   Caps           11078 non-null  object
 5   Club           11078 non-null  object
dtypes: int64(1), object(5)
memory usage: 519.4+ KB


In [11]:
# Write the player table in `df` to 'player_info.csv', omitting the index column.
# NOTE(review): this cell's execution count (11) is lower than the scraping
# cell's (22), so the file on disk may come from an earlier `df` — re-run
# the notebook top to bottom to confirm.
df.to_csv('player_info.csv', index=False)