## **IMPORTS**

In [1]:
import pandas as pd
from string import ascii_uppercase as alphabet
import pickle
from bs4 import BeautifulSoup
import requests

- I need to read the necessary data from a page that contains the basic data for the 2022 World Cup without any results. I do this with the `read_html` function provided by pandas.

### **WEBSCRAPING WORLD CUP 2022 GROUPS**

In [None]:
# Parse every HTML table on the archived snapshot of the 2022 World Cup
# article; the cells below pick tables out of the result by position.
all_tables = pd.read_html(
    io="https://web.archive.org/web/20221115040351/"
       "https://en.wikipedia.org/wiki/2022_FIFA_World_Cup"
)

In [None]:
# Spot-check two of the group tables (they sit at every 7th index from 12 to
# 61, the same positions the loop in the next cell walks over).
# Jupyter renders only the last bare expression of a cell, so display() is
# used to show both frames instead of silently discarding the first one.
display(all_tables[26], all_tables[61])

- Getting all groups and tidying each DataFrame (renaming the team column and dropping the qualification column)

In [None]:
# Build a {"Group A": DataFrame, "Group B": DataFrame, ...} mapping from the
# scraped tables. The group tables are taken from every 7th position of
# all_tables, starting at index 12 and ending at index 61.
dict_table = {}
for letter, i in zip(alphabet, range(12, 68, 7)):
    # rename() and drop() return new frames, so the entries of all_tables are
    # left untouched and this cell can safely be re-run. (Mutating them in
    # place made a second run fail with KeyError, because "Qualification" had
    # already been removed.)
    dict_table[f"Group {letter}"] = (
        all_tables[i]                          # pull this group's table
        .rename(columns={"Teamvte": "Team"})   # give the team column a clearer name
        .drop(columns="Qualification")
    )

In [None]:
# Show the group labels that were collected.
dict_table.keys()

In [None]:
# Inspect one group's cleaned table as a sanity check.
dict_table["Group A"]

- saving group data with pickle

In [None]:
# Serialize the whole group-name -> DataFrame mapping into one pickle file.
with open("world_cup_groups_2022", mode="wb") as groups_file:
    pickle.dump(dict_table, groups_file)

### **WEBSCRAPING ALL THE HISTORICAL DATA**

In [None]:
# Tournament years on the four-year cycle from 1930 through 2018; the list
# skips 1942 and 1946.
years = [*range(1930, 1939, 4), *range(1950, 2019, 4)]

In [None]:
def get_matches(year):
    """Scrape the match boxes of one World Cup edition from Wikipedia.

    Parameters
    ----------
    year : int
        Tournament year. ``2022`` is read from an archived snapshot of the
        article on web.archive.org; any other value is interpolated into the
        live ``{year}_FIFA_World_Cup`` article URL.

    Returns
    -------
    pandas.DataFrame
        One row per ``div.footballbox`` element on the page. The columns
        ``home``, ``score`` and ``away`` hold the raw text of the box's
        ``th.fhome``, ``th.fscore`` and ``th.faway`` cells (not stripped or
        otherwise cleaned); ``year`` repeats the ``year`` argument.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.Timeout
        If the server does not respond within the timeout.
    AttributeError
        If a footballbox lacks one of the three expected ``th`` cells.
    """
    if year == 2022:
        website = "https://web.archive.org/web/20221115040351/https://en.wikipedia.org/wiki/2022_FIFA_World_Cup"
    else:
        website = f"https://en.wikipedia.org/wiki/{year}_FIFA_World_Cup"
    headers = {"User-Agent": "Mozilla/5.0"}

    # Without a timeout a single stalled connection hangs the whole scrape.
    response = requests.get(website, headers=headers, timeout=60)
    # Fail loudly on an error page instead of parsing it into an empty (but
    # valid-looking) DataFrame that would end up in the CSV unnoticed.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "lxml")
    matches = soup.find_all("div", class_="footballbox")

    def cell_texts(css_class):
        # Text of the given header cell for every match box, in page order.
        return [match.find("th", class_=css_class).get_text() for match in matches]

    df_football = pd.DataFrame(
        {
            "home": cell_texts("fhome"),
            "score": cell_texts("fscore"),
            "away": cell_texts("faway"),
        }
    )
    df_football["year"] = year
    return df_football

In [None]:
# Smoke-test the scraper on a single edition. The year is passed as an int,
# like every other call site, so the resulting `year` column is numeric
# rather than a column of strings.
get_matches(2018)

In [None]:
# Scrape every edition listed in `years`: one DataFrame per tournament.
fifa = list(map(get_matches, years))

In [None]:
# Stack the per-tournament frames into one table with a fresh running index.
df_fifa = pd.concat(fifa, ignore_index = True)

In [None]:
# Save the combined historical matches as CSV, leaving out the index column.
df_fifa.to_csv("fifa_worldcup_historical_data.csv", index=False)

### **MISSING DATA**

- Some of the historical data is missing because the web pages are built with different HTML tags. Therefore, I decided to use the Selenium package to ensure all data is properly extracted.

In [None]:
import os
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Put your chromedriver path in the CHROMEDRIVER_PATH environment variable
# (or assign it to `path` here). The previous placeholder was a bare comment
# after `path =`, which is a syntax error and kept the whole cell from running.
path = os.environ.get("CHROMEDRIVER_PATH")
# When no path is configured, build the Service with its defaults and leave
# locating the driver to Selenium.
service = Service(executable_path=path) if path else Service()
driver = webdriver.Chrome(service=service)


def get_misssing_data(year):
    """Scrape match rows of one World Cup article with the Selenium driver.

    Loads ``https://en.wikipedia.org/wiki/{year}_FIFA_World_Cup`` in the
    module-level ``driver`` and reads every ``<tr style="font-size:90%">``
    row, taking the text of its first three ``<td>`` children as home team,
    score and away team.

    Parameters
    ----------
    year : int
        Tournament year; interpolated into the article URL and stored in the
        ``year`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``home``, ``score``, ``away`` (raw cell text) and ``year``,
        one row per matched table row.

    Raises
    ------
    selenium.common.exceptions.NoSuchElementException
        If a matched row has fewer than three ``<td>`` children.
    """
    web = f'https://en.wikipedia.org/wiki/{year}_FIFA_World_Cup'

    driver.get(web)
    # NOTE(review): this function used to run a first query,
    #   //td[@align="right"]/.. | //td[@style="text-align:right;"]/..
    # and then immediately overwrite its result with the query below, so only
    # the selector below ever took effect. The discarded query (a wasted scan
    # of the whole page) was removed; confirm which selector is intended.
    matches = driver.find_elements(by='xpath', value='//tr[@style="font-size:90%"]')

    home = []
    score = []
    away = []

    for match in matches:
        # Paths are relative to the row: 1st, 2nd and 3rd <td> child.
        home.append(match.find_element(by='xpath', value='./td[1]').text)
        score.append(match.find_element(by='xpath', value='./td[2]').text)
        away.append(match.find_element(by='xpath', value='./td[3]').text)

    dict_football = {'home': home, 'score': score, 'away': away}
    df_football = pd.DataFrame(dict_football)
    df_football['year'] = year
    time.sleep(2)  # pause before the caller requests the next page
    return df_football


# Tournament years to scrape (repeated here so this cell is self-contained).
years = [1930, 1934, 1938, 1950, 1954, 1958, 1962, 1966, 1970, 1974,
         1978, 1982, 1986, 1990, 1994, 1998, 2002, 2006, 2010, 2014,
         2018]

# Close the browser even if one of the page scrapes raises; otherwise the
# Chrome/chromedriver processes are left running after the error.
try:
    fifa = [get_misssing_data(year) for year in years]
finally:
    driver.quit()

# Note: `fifa` and `df_fifa` reuse the names from the requests-based scrape
# above and replace those results in the kernel.
df_fifa = pd.concat(fifa, ignore_index=True)
df_fifa.to_csv("fifa_worldcup_missing_data.csv", index=False)

### **GETTING 2022 FIXTURES**

In [None]:
# Scrape the 2022 match boxes; for 2022 get_matches reads the archived
# snapshot of the article rather than the live page.
df_fixture = get_matches(2022)

In [None]:
# Save the 2022 fixtures as CSV, leaving out the index column.
df_fixture.to_csv("fifa_worldcup_2022_fixtures.csv", index=False)