**DESCRIPTION:**

This Jupyter Notebook collects the results of the World Cup matches played from 1930 to 2018 and the fixture of the Qatar 2022 World Cup. The web scraping is done with the BeautifulSoup library, and each of the two datasets is finally saved as a csv file.

# 2. Data collection of World Cup matches from 1930 to 2018 and the Qatar 2022 fixture with BeautifulSoup

In [1]:
# To install BeautifulSoup and the lxml parser used by this notebook
# !pip install bs4 lxml

**Libraries**

In [2]:
from bs4 import BeautifulSoup   #Web scraping

In [3]:
import requests   #Send requests to a web page

In [4]:
import pandas as pd   #DataFrame

## 2.1 Data collection of worldcup matches from 1930 to 2018 with BeautifulSoup

### 2.1.1 Web scraping

In [5]:
# World Cup years in 4-year steps, from 1930 up to and including 2018.
# NOTE: the fixed step also yields 1942 and 1946, when no tournament was
# played; presumably those years simply contribute no matches - TODO confirm.
years = list(range(1930, 2019, 4))
years

[1930,
 1934,
 1938,
 1942,
 1946,
 1950,
 1954,
 1958,
 1962,
 1966,
 1970,
 1974,
 1978,
 1982,
 1986,
 1990,
 1994,
 1998,
 2002,
 2006,
 2010,
 2014,
 2018]

In [6]:
# Function to extract the matches of every world cup
def get_matches(year):
    """Scrape the matches of one World Cup from its English Wikipedia page.

    Parameters
    ----------
    year : int or str
        Tournament year; it is inserted into the article URL and stored
        unchanged in the ``Year`` column.

    Returns
    -------
    pandas.DataFrame
        One row per ``<div class="footballbox">`` found on the page, with the
        columns ``First_team``, ``Score``, ``Second_team`` and ``Year``.
        The frame has no rows when the page contains no such box.

    Raises
    ------
    requests.exceptions.RequestException
        If the request times out, the connection fails, or the server
        answers with an HTTP error other than 404.
    """
    website = f'https://en.wikipedia.org/wiki/{year}_FIFA_World_Cup'   # website link

    # Without a timeout a stalled connection would hang the notebook forever.
    response = requests.get(website, timeout=60)
    # A 404 only means there is no article for that year, so it still falls
    # through to an empty frame. Any other HTTP error is raised instead of
    # being parsed as if it were the article, which would silently drop the
    # whole tournament from the dataset.
    if response.status_code != 404:
        response.raise_for_status()
    content = response.text   # Response in text format
    soup = BeautifulSoup(content, 'lxml')   # parse with the 'lxml' parser

    # Every match is rendered inside a <div class="footballbox">
    matches = soup.find_all('div', class_="footballbox")   #list

    # Empty lists
    first_team = []
    score = []
    second_team = []

    # Two teams and the score cell of every match box
    for match in matches:
        first_team.append(match.find('th', class_="fhome").get_text())
        score.append(match.find('th', class_="fscore").get_text())
        second_team.append(match.find('th', class_="faway").get_text())

    # Create DataFrame from the three parallel lists
    df_football = pd.DataFrame(
        {'First_team': first_team, 'Score': score, 'Second_team': second_team}
    )

    # Create a column for its year
    df_football['Year'] = year

    return df_football

In [7]:
# Scrape every tournament listed in "years"; the result is a list holding
# one DataFrame per year, in the same order as "years".
world_cups = list(map(get_matches, years))

In [8]:
# Showing all the matches of each World Cup
#world_cups

### 2.1.2 Save list in a DataFrame

In [9]:
# Stack the per-tournament frames of "world_cups" into a single table,
# renumbering the rows from 0 instead of keeping each frame's own index.
df_world_cups = pd.concat(objs=world_cups, axis=0, ignore_index=True)
df_world_cups

Unnamed: 0,First_team,Score,Second_team,Year
0,France,4–1,Mexico,1930
1,Argentina,1–0,France,1930
2,Chile,3–0,Mexico,1930
3,Chile,1–0,France,1930
4,Argentina,6–3,Mexico,1930
...,...,...,...,...
896,Russia,2–2 (a.e.t.),Croatia,2018
897,France,1–0,Belgium,2018
898,Croatia,2–1 (a.e.t.),England,2018
899,Belgium,2–0,England,2018


### 2.1.3 Export DataFrame as a csv file

In [10]:
# Export all World Cups to a csv file
#df_world_cups.to_csv('data/FIFA_Worldcup_historical_data.csv', index=False)

## 2.2 Data collection of fixture in the Qatar 2022 World Cup with BeautifulSoup

### 2.2.1 Web scraping

In [11]:
# website link: a web.archive.org snapshot (timestamp 20221115040351) of the
# 2022 World Cup Wikipedia page - presumably chosen so the score cells still
# hold the "Match N" fixture placeholders rather than final results.
website = 'https://web.archive.org/web/20221115040351/https://en.wikipedia.org/wiki/2022_FIFA_World_Cup'

In [12]:
# Download the page; the timeout keeps a stalled connection from hanging the notebook
response = requests.get(website, timeout=60)
# Fail loudly on an HTTP error instead of parsing an error page into an empty fixture
response.raise_for_status()
content = response.text
soup = BeautifulSoup(content, 'lxml')   #assigning the parse 'lxml'

# searching multiple items: every match sits inside a <div class="footballbox">
matches = soup.find_all('div', class_="footballbox")   #list

# Two teams and the score cell of every match box, as three parallel lists
first_team = [match.find('th', class_="fhome").get_text() for match in matches]
score = [match.find('th', class_="fscore").get_text() for match in matches]
second_team = [match.find('th', class_="faway").get_text() for match in matches]

# A dictionary
dict_football = {'First_team':first_team, 'Score':score, 'Second_team':second_team}

### 2.2.2 Save in a DataFrame

In [13]:
# Build the fixture table from the scraped columns and tag every row
# with the year of the tournament (2022).
df_fixture = pd.DataFrame(dict_football).assign(Year=2022)

In [14]:
# Fixture Qatar 2022: display the DataFrame as the cell output
df_fixture

Unnamed: 0,First_team,Score,Second_team,Year
0,Qatar,Match 1,Ecuador,2022
1,Senegal,Match 2,Netherlands,2022
2,Qatar,Match 18,Senegal,2022
3,Netherlands,Match 19,Ecuador,2022
4,Ecuador,Match 35,Senegal,2022
...,...,...,...,...
59,Winners Match 51,Match 59,Winners Match 52,2022
60,Winners Match 57,Match 61,Winners Match 58,2022
61,Winners Match 59,Match 62,Winners Match 60,2022
62,Losers Match 61,Match 63,Losers Match 62,2022


### 2.2.3 Export DataFrame as a csv file

In [15]:
# Export the Qatar 2022 fixture to a csv file
from pathlib import Path

# to_csv() does not create missing folders, so make sure "data/" exists first
Path('data').mkdir(parents=True, exist_ok=True)
df_fixture.to_csv('data/FIFA_Worldcup_2022_Qatar.csv', index=False)