In [2]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import seaborn as sns
import requests


In [3]:
# Column order of the DataFrame returned by get_matches.
_MATCH_COLUMNS = [
    'Home_Team', 'Score', 'Away_Team', 'Penalties', 'First_Team_Goals',
    'Second_Team_Goals', 'Date', 'Time', 'Stadium', 'City', 'Attendance',
    'Referee', 'Referee_City',
]


def _text_or(tag, default="N/A"):
    """Return ``tag.get_text()``, or ``default`` when ``tag`` is None (lookup found nothing)."""
    return tag.get_text() if tag is not None else default


def _format_team_goals(goal_cell):
    """Flatten one goals cell (``td.fhgoal`` / ``td.fagoal``) into a '; '-separated string.

    Two layouts are handled:

    * a ``div.plainlist`` wrapping a ``<ul>``: one entry per ``<li>``, built from
      the text of its first ``<a>`` followed by the text of every ``<span>``
      nested inside its ``span.fb-goal`` except the first one;
    * bare ``<a>`` / ``span.fb-goal`` elements directly in the cell, paired
      up by position.

    Returns an empty string when the cell is missing or holds no goals.
    """
    if goal_cell is None:
        return ""

    entries = []
    plainlist = goal_cell.find("div", class_="plainlist")
    if plainlist is not None:
        goal_list = plainlist.find("ul")
        items = goal_list.find_all("li") if goal_list is not None else []
        for item in items:
            scorer_link = item.find("a")
            if scorer_link is None:
                # No link to take the name from: keep the item's raw text
                # instead of crashing or silently dropping the goal.
                entries.append(item.get_text(" ", strip=True))
                continue
            goal_marker = item.find("span", class_="fb-goal")
            nested_spans = goal_marker.find_all("span") if goal_marker is not None else []
            # The first nested span is deliberately skipped; only the rest are kept.
            goal_info = " ".join(span.text for span in nested_spans[1:]).strip()
            entries.append(f"{scorer_link.text} {goal_info}")
    else:
        scorer_links = goal_cell.find_all("a")
        goal_markers = goal_cell.find_all("span", class_="fb-goal")
        if scorer_links and goal_markers:
            for position, scorer_link in enumerate(scorer_links):
                # There may be more links than goal markers; pad with "".
                goal_count = goal_markers[position].text if position < len(goal_markers) else ""
                entries.append(f"{scorer_link.text} {goal_count}")

    return "; ".join(entries).strip("; ")


def _parse_match(match):
    """Turn one ``div.footballbox`` element into a dict keyed by ``_MATCH_COLUMNS``.

    Any piece of markup that is absent yields a placeholder ("N/A", "NO" for
    the 'Penalties' field, "" for the goal strings) rather than an exception,
    so one irregular match box cannot abort the whole scrape.
    """
    goal_rows = match.find_all('tr', class_='fgoals')
    first_goal_row = goal_rows[0] if goal_rows else None

    # 'Penalties' is the <th> text of the *second* fgoals row when present.
    # NOTE(review): presumably the penalty shoot-out line -- confirm against the page markup.
    penalties = "NO"
    if len(goal_rows) > 1:
        penalties = _text_or(goal_rows[1].find("th"), "NO")

    home_goal_cell = first_goal_row.find("td", class_="fhgoal") if first_goal_row is not None else None
    away_goal_cell = first_goal_row.find("td", class_="fagoal") if first_goal_row is not None else None

    # Date and time: the <div>s inside the <time> element of div.fleft.
    left_box = match.find("div", class_='fleft')
    time_tag = left_box.find("time") if left_box is not None else None
    date_parts = time_tag.find_all("div") if time_tag is not None else []

    # Stadium/city, attendance and referee: the <div>s inside div.fright, by position.
    right_box = match.find('div', class_='fright')
    right_divs = right_box.find_all('div') if right_box is not None else []
    venue_span = right_divs[0].find('span') if right_divs else None
    venue_links = venue_span.find_all('a') if venue_span is not None else []
    referee_links = right_divs[2].find_all('a') if len(right_divs) > 2 else []

    return {
        'Home_Team': _text_or(match.find('th', class_='fhome')),
        'Score': _text_or(match.find('th', class_='fscore')),
        'Away_Team': _text_or(match.find('th', class_='faway')),
        'Penalties': penalties,
        'First_Team_Goals': _format_team_goals(home_goal_cell),
        'Second_Team_Goals': _format_team_goals(away_goal_cell),
        'Date': date_parts[0].text if date_parts else "N/A",
        'Time': date_parts[1].text if len(date_parts) > 1 else "N/A",
        'Stadium': venue_links[0].text if venue_links else "N/A",
        'City': venue_links[1].text if len(venue_links) > 1 else "N/A",
        'Attendance': right_divs[1].text if len(right_divs) > 1 else "N/A",
        'Referee': referee_links[0].text if referee_links else "N/A",
        'Referee_City': referee_links[1].get_text() if len(referee_links) > 1 else "N/A",
    }


def get_matches(year, timeout=30):
    """Scrape every match box from the English Wikipedia page of one FIFA World Cup.

    Downloads ``https://en.wikipedia.org/wiki/{year}_FIFA_World_Cup``, parses it
    with BeautifulSoup (using the ``lxml`` parser) and extracts one row per
    ``div.footballbox`` element.

    Parameters
    ----------
    year : int or str
        Tournament year, interpolated into the page URL.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per match box with the string columns ``Home_Team``, ``Score``,
        ``Away_Team``, ``Penalties``, ``First_Team_Goals``, ``Second_Team_Goals``,
        ``Date``, ``Time``, ``Stadium``, ``City``, ``Attendance``, ``Referee``
        and ``Referee_City``. Text is stored as scraped (not stripped or
        converted); missing pieces are filled with placeholders. The frame is
        empty, but still has these columns, when the page has no match boxes.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Without this check an
        error page would silently yield an empty frame.
    requests.RequestException
        On connection problems or when ``timeout`` expires.
    """
    url = f'https://en.wikipedia.org/wiki/{year}_FIFA_World_Cup'
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    # One record per match keeps all fields of a row together, so columns can
    # never drift out of alignment the way separate parallel lists could.
    records = [_parse_match(match) for match in soup.find_all('div', class_='footballbox')]
    return pd.DataFrame(records, columns=_MATCH_COLUMNS)

# Tournament years to scrape: 1930, 1934, 1938, then (after a jump to 1950)
# every fourth year up to and including 2022.
years = [1930, 1934, 1938, *range(1950, 2023, 4)]

# One DataFrame per tournament, in the same order as `years`.
# Each get_matches call downloads and parses a Wikipedia page.
fifa = list(map(get_matches, years))

# Stack all tournaments into a single frame with a fresh 0..n-1 index.
df_fifa = pd.concat(objs=fifa, ignore_index=True)


In [4]:
# Persist the scraped matches to disk; index=False leaves the row index out of the file.
df_fifa.to_csv(path_or_buf="Fifa_world_cup3.csv", index=False)

In [5]:
# Reload the saved matches, rebinding df_fifa to the on-disk copy.
# NOTE: with read_csv's default NA handling, the "N/A" placeholders and the
# empty goal strings written by the scraper come back as NaN, not as text.
df_fifa = pd.read_csv(filepath_or_buffer='Fifa_world_cup3.csv')
