# 2023 FIFA Group_stage table_data extraction


In [1]:
#Importing required libraries 
import pandas as pd
import pickle
from string import ascii_uppercase as alphabet

In [2]:
#  pd.read_html parse HTML tables from the webpage and return a list of DataFrame objects.
tables = pd.read_html("https://web.archive.org/web/20230609142654/https://en.wikipedia.org/wiki/2023_FIFA_Women's_World_Cup")  

In [3]:
tables[10]  #Finding pattern for our dataset

Unnamed: 0,Pos,Teamvte,Pld,W,D,L,GF,GA,GD,Pts,Qualification
0,1,New Zealand (H),0,0,0,0,0,0,0,0,Advance to knockout stage
1,2,Norway,0,0,0,0,0,0,0,0,Advance to knockout stage
2,3,Philippines,0,0,0,0,0,0,0,0,
3,4,Switzerland,0,0,0,0,0,0,0,0,


In [4]:
tables[17] #Finding pattern for our dataset

Unnamed: 0,Pos,Teamvte,Pld,W,D,L,GF,GA,GD,Pts,Qualification
0,1,Australia (H),0,0,0,0,0,0,0,0,Advance to knockout stage
1,2,Republic of Ireland,0,0,0,0,0,0,0,0,Advance to knockout stage
2,3,Nigeria,0,0,0,0,0,0,0,0,
3,4,Canada,0,0,0,0,0,0,0,0,


In [5]:
tables[59]  #Finding pattern for our dataset

Unnamed: 0,Pos,Teamvte,Pld,W,D,L,GF,GA,GD,Pts,Qualification
0,1,Germany,0,0,0,0,0,0,0,0,Advance to knockout stage
1,2,Morocco,0,0,0,0,0,0,0,0,Advance to knockout stage
2,3,Colombia,0,0,0,0,0,0,0,0,
3,4,South Korea,0,0,0,0,0,0,0,0,


In [6]:
tables = pd.read_html("https://web.archive.org/web/20230609142654/https://en.wikipedia.org/wiki/2023_FIFA_Women's_World_Cup")
for i in range(10,60,7):   # Looping and extracting based on pattern matched for our prediction
    
    df = tables[i]
    df.rename(columns={df.columns[1]:'Team'}, inplace=True)
    df.pop('Qualification')

In [7]:
tables = pd.read_html("https://web.archive.org/web/20230609142654/https://en.wikipedia.org/wiki/2023_FIFA_Women's_World_Cup")

tables_2023 = {}   #Storing The entire content into dictionary with Group name as keys
for letter, i in zip(alphabet, range(10,60,7)): 
    df = tables[i]
    df.rename(columns={df.columns[1]:'Team'}, inplace=True)
    df.pop('Qualification')
    tables_2023[f'Group {letter}'] = df

In [9]:
tables_2023['Group A']   #Checking data

Unnamed: 0,Pos,Team,Pld,W,D,L,GF,GA,GD,Pts
0,1,New Zealand (H),0,0,0,0,0,0,0,0
1,2,Norway,0,0,0,0,0,0,0,0
2,3,Philippines,0,0,0,0,0,0,0,0
3,4,Switzerland,0,0,0,0,0,0,0,0


In [10]:
#data serialization using the pickle module(converts the object into a byte stream )
with open('tables_2023', 'wb') as output:
    pickle.dump(tables_2023, output)

# Scraping past_stats of FIFA match results


In [11]:
#Importing required libraries 
from bs4 import BeautifulSoup
import requests

In [12]:
#scrapes the data of football matches from Every year of FIFA Women's World Cup webpage on
# Wikipedia and returns the data as a DataFrame.

def extract_matches(year):
    web = f'https://en.wikipedia.org/wiki/{year}_FIFA_Women%27s_World_Cup'
    response = requests.get(web)
    content = response.text
    soup = BeautifulSoup(content, 'lxml')  #Webscraping using BeautifulSoup
    matches = soup.find_all('div', class_='footballbox')

    home = []
    score = []
    away = []

    # Data collection after identification of pattern in Elements of html page by inspecting 
    for match in matches:    
        home.append(match.find('th', class_='fhome').get_text())
        score.append(match.find('th', class_='fscore').get_text())
        away.append(match.find('th', class_='faway').get_text())

    football = {'home': home, 'score': score, 'away': away}  
    football = pd.DataFrame(football)  # Creating dataframe using data scraped
    football['year'] = year
    return football


In [13]:
years = [1991, 1995, 1999, 2003, 2007, 2011, 2015, 2019]

data = [extract_matches(year) for year in years]
frame= pd.concat(data, ignore_index=True)  #concatenates all the DataFrames into a single DataFrame
frame.to_csv("womens_fifa_pastdata.csv", index=False) # saves the final DataFrame as a CSV file

In [14]:
fixture = extract_matches(2023)
fixture.to_csv('fifa2023_fixtures.csv', index=False) #saves 2023 match fixtures DataFrame as a CSV file


# Cleaning _data for accurate  prediction


In [15]:
#Reads the contents of the CSV file into pandas DataFrame
past_data = pd.read_csv('womens_fifa_pastdata.csv')
fixtures_2023 = pd.read_csv('fifa2023_fixtures.csv')

In [16]:
past_data  #Checking data

Unnamed: 0,home,score,away,year
0,China,4–0,Norway,1991
1,Denmark,3–0,New Zealand,1991
2,Norway,4–0,New Zealand,1991
3,China,2–2,Denmark,1991
4,China,4–1,New Zealand,1991
...,...,...,...,...
279,Germany,1–2,Sweden,2019
280,England,1–2,United States,2019
281,Netherlands,1–0 (a.e.t.),Sweden,2019
282,England,1–2,Sweden,2019


In [17]:
fixtures_2023   #Checking data

Unnamed: 0,home,score,away,year
0,New Zealand,Match 1,Norway,2023
1,Philippines,Match 3,Switzerland,2023
2,New Zealand,Match 17,Philippines,2023
3,Switzerland,Match 18,Norway,2023
4,Switzerland,Match 33,New Zealand,2023
...,...,...,...,...
59,Winner Match 54,Match 60,Winner Match 56,2023
60,Winner Match 57,Match 61,Winner Match 58,2023
61,Winner Match 59,Match 62,Winner Match 60,2023
62,Loser Match 61,Match 63,Loser Match 62,2023


In [18]:
# Removes leading and trailing whitespace characters from the values in the columns mentiones 
fixtures_2023 ['home'] = fixtures_2023 ['home'].str.strip()
fixtures_2023 ['away'] = fixtures_2023 ['away'].str.strip()

In [19]:
# Removes any duplicate rows
past_data.drop_duplicates(inplace=True)
past_data.sort_values('year', inplace=True) #sorts the DataFrame based on the values in the 'year' column

In [20]:
# Renaming columns
past_data.rename(columns={'home': 'Favor_Team', 'away': 'Against_Team'}, inplace=True)

In [21]:
# Removes any non-digit and non-hyphen characters from the values in the 'score' column
past_data['score'] = past_data['score'].str.replace('[^\d–]', '', regex=True)
past_data['Favor_Team'] = past_data['Favor_Team'].str.strip() 
past_data['Against_Team'] = past_data['Against_Team'].str.strip()


#  Extracting Scored_goals and Conceded_goals from  column score and making new columns
past_data[['Scored_goals', 'Conceded_goals']] = past_data['score'].str.split('–', expand=True)
past_data.drop('score', axis=1, inplace=True)  #Droping column score

In [52]:
# Changing data types
past_data = past_data.astype({'Scored_goals': int, 'Conceded_goals':int})

past_data

Unnamed: 0,Favor_Team,Against_Team,year,Scored_goals,Conceded_goals
0,China,Norway,1991,4,0
25,Norway,United States,1991,1,2
24,Sweden,Germany,1991,4,0
23,Germany,United States,1991,2,5
22,Sweden,Norway,1991,1,4
...,...,...,...,...,...
252,Japan,Scotland,2019,2,1
253,England,Argentina,2019,1,0
254,Japan,England,2019,0,2
256,Canada,Cameroon,2019,1,0


In [22]:
#Saves the cleaned versions of the past_data and fixtures_2023 DataFrames as separate CSV files
past_data.to_csv('cleaned_pastdata.csv',index=False)
fixtures_2023.to_csv('cleaned_fixtures_2023.csv',index=False)

# Worldcup_winner prediction_process


In [23]:
#Importing required libraries 
from scipy.stats import poisson

In [24]:
#Loading files into pandas Dataframe 
tables_2023 = pickle.load(open('tables_2023','rb'))
past_data = pd.read_csv('cleaned_pastdata.csv')
fixtures_2023 = pd.read_csv('cleaned_fixtures_2023.csv')

In [25]:
#To screen the data required for our prediction
favor_Tm = past_data [['Favor_Team', 'Scored_goals', 'Conceded_goals']]
against_Tm = past_data [['Against_Team', 'Scored_goals', 'Conceded_goals']]

favor_Tm = favor_Tm.rename(columns={'Favor_Team':'Team', 'Scored_goals': 'GoalsScored', 'Conceded_goals': 'GoalsConceded'})
against_Tm = against_Tm.rename(columns={'Against_Team':'Team', 'Scored_goals': 'GoalsConceded', 'Conceded_goals': 'GoalsScored'})


In [26]:
#Groups the rows of the conatinated DataFrame by the 'Team' column using .groupby(['Team']).Then calculates the mean value 
Team_data= pd.concat([favor_Tm,against_Tm], ignore_index=True).groupby(['Team']).mean()

In [27]:
Team_data = Team_data.rename(columns={'GoalsScored':'GScore_index', 'GoalsConceded': 'Gconcede_index'})
Team_data

Unnamed: 0_level_0,GScore_index,Gconcede_index
Team,Unnamed: 1_level_1,Unnamed: 2_level_1
Argentina,0.555556,4.111111
Australia,1.461538,1.923077
Brazil,1.941176,1.176471
Cameroon,1.5,1.5
Canada,1.259259,1.925926
Chile,0.666667,1.666667
China,1.606061,0.969697
Chinese Taipei,0.5,3.75
Colombia,0.571429,1.285714
Costa Rica,1.0,1.333333


In [28]:
# Below function uses the Poisson distribution to calculate the probabilities of different goal outcomes and 
# assigns points accordingly to determine the likely outcome of the match.

#Below function utilizes the performance indices from Team_data to calculate the points for a home team and an away team

def points_calculate(home, away):
    if home in Team_data.index and away in Team_data.index:
        # goals_scored * goals_conceded
        lambda_home = Team_data.at[home,'GScore_index'] * Team_data.at[away,'Gconcede_index']
        lambda_away = Team_data.at[away,'GScore_index'] * Team_data.at[home,'Gconcede_index']
        prob_home, prob_away, prob_draw = 0, 0, 0
        for x in range(0,12):    #Taking cases diff goal combinations (considered 0-12 goals) home team
            for y in range(0, 12):    #Taking cases diff goal combinations (considered 0-12 goals) away team 
                prob = poisson.pmf(x, lambda_home) * poisson.pmf(y, lambda_away)
        
                if x == y:prob_draw += prob
                elif x > y:prob_home += prob
                else:prob_away += prob
        
        home_pnts =  prob_draw + 3 * prob_home  # 3 points for winning , 1 point for draw
        away_pnts = prob_draw + 3 * prob_away 
        return (home_pnts, away_pnts)
    else:
        return (0, 0)     # If any of the inputs teams are new team in FIFA..cannot be predicted

In [29]:
points_calculate('New Zealand','Norway') #Testing above function

(0.054817242336792256, 2.896025146966355)

In [30]:
# Split the fixtures_2023 DataFrame into separate subsets group stage, knockout stage, quarterfinals, semifinals, and the final.
fixture_group = fixtures_2023[:48].copy()
fixture_knockout = fixtures_2023[48:56].copy()
fixture_quarter = fixtures_2023[56:60].copy()
fixture_semi = fixtures_2023[60:62].copy()
fixture_final = fixtures_2023[62:].copy()

In [35]:
# Creating cluster_teams for each group table to conduct group stage matches only btw teams with each group
for group in tables_2023:
    group_teams = tables_2023[group]['Team'].values
    cluster_matches= fixture_group[fixture_group['home'].isin(group_teams)] # cluster_teams creation
    for index, row in cluster_matches.iterrows():
        home, away = row['home'], row['away']
        home_pts, away_pts = points_calculate(home, away) #Calculating points for match btw 2 teams with function points_calculate
        tables_2023[group].loc[tables_2023[group]['Team'] == home, 'Pts'] += home_pts
        tables_2023[group].loc[tables_2023[group]['Team'] == away, 'Pts'] += away_pts

    tables_2023[group] = tables_2023[group].sort_values('Pts', ascending=False).reset_index()
    tables_2023[group] = tables_2023[group][['Team', 'Pts']]
    tables_2023[group] = tables_2023[group].round(0)  #Rounding points obtained

In [36]:
# Extracting top 2 teams from each group and adding to knockout dataframe for next stage matches
for group in tables_2023:
    grp_winner = tables_2023[group].loc[0, 'Team']
    grp_runners_up = tables_2023[group].loc[1, 'Team']
    fixture_knockout.replace({f'Winner {group}':grp_winner,
                                 f'Runner-up {group}':grp_runners_up}, inplace=True)


In [39]:
fixture_knockout['win'] = '*'


In [41]:
fixture_knockout= fixture_knockout.drop('winner', axis=1)

In [42]:
fixture_knockout

Unnamed: 0,home,score,away,year,win
48,Switzerland,Match 49,Spain,2023,*
49,Costa Rica,Match 50,Norway,2023,*
50,United States,Match 51,Sweden,2023,*
51,Italy,Match 52,Netherlands,2023,*
52,China,Match 54,Nigeria,2023,*
53,Canada,Match 53,England,2023,*
54,Germany,Match 56,Brazil,2023,*
55,France,Match 55,Colombia,2023,*


In [43]:
# Updating tables with winners of each match based on the win prediction function ..points_calculate.. called for each match
def winner_predictor(df_fixtures):
    for index, row in df_fixtures.iterrows():
        home, away = row['home'], row['away']
        home_pts, away_pts = points_calculate(home, away)  # Calculating points for a match btw 2 teams
        if home_pts > away_pts :
            winner = home
        else:
            winner = away
        df_fixtures.loc[index, 'win'] = winner   # Winner updating in the table
    return df_fixtures

In [44]:
# Winner predictor function called for fixture_knockout..to update winner column
winner_predictor(fixture_knockout)

Unnamed: 0,home,score,away,year,win
48,Switzerland,Match 49,Spain,2023,Switzerland
49,Costa Rica,Match 50,Norway,2023,Norway
50,United States,Match 51,Sweden,2023,United States
51,Italy,Match 52,Netherlands,2023,Italy
52,China,Match 54,Nigeria,2023,China
53,Canada,Match 53,England,2023,England
54,Germany,Match 56,Brazil,2023,Germany
55,France,Match 55,Colombia,2023,France


In [45]:
fixture_quarter

Unnamed: 0,home,score,away,year
56,Winner Match 49,Match 57,Winner Match 51,2023
57,Winner Match 50,Match 58,Winner Match 52,2023
58,Winner Match 53,Match 59,Winner Match 55,2023
59,Winner Match 54,Match 60,Winner Match 56,2023


In [46]:
#Below function moves the winner teams to next league stage
def transit_table(table_round1, table_round2):
    for index, row in table_round1.iterrows():
        winner = table_round1.loc[index, 'win']
        match = table_round1.loc[index, 'score']

        table_round2.replace({f'Winner {match}':winner}, inplace=True)  #moving winners to next stage
    table_round2['win'] = '*'
    return table_round2

In [47]:
# transit_table function called for fixture_knockout & fixture_quarter..to update fixture_quarter table with winners from fixture_knockout
transit_table(fixture_knockout, fixture_quarter)

Unnamed: 0,home,score,away,year,win
56,Switzerland,Match 57,United States,2023,*
57,Norway,Match 58,Italy,2023,*
58,England,Match 59,France,2023,*
59,China,Match 60,Germany,2023,*


In [48]:
# Winner predictor function called for fixture_quarter..to update winner column
winner_predictor(fixture_quarter)

Unnamed: 0,home,score,away,year,win
56,Switzerland,Match 57,United States,2023,United States
57,Norway,Match 58,Italy,2023,Norway
58,England,Match 59,France,2023,France
59,China,Match 60,Germany,2023,Germany


In [49]:
# transit_table function called for fixture_quarter & fixture_semi..to update fixture_semi table with winners from fixture_quarter
transit_table(fixture_quarter, fixture_semi)

Unnamed: 0,home,score,away,year,win
60,United States,Match 61,Norway,2023,*
61,France,Match 62,Germany,2023,*


In [50]:
 winner_predictor(fixture_semi)

Unnamed: 0,home,score,away,year,win
60,United States,Match 61,Norway,2023,United States
61,France,Match 62,Germany,2023,Germany


In [51]:
transit_table(fixture_semi, fixture_final)

Unnamed: 0,home,score,away,year,win
62,Loser Match 61,Match 63,Loser Match 62,2023,*
63,United States,Match 64,Germany,2023,*


In [52]:
final_res=winner_predictor(fixture_final)

In [53]:
final_res.drop(62, inplace=True)

In [54]:
final_res  # FIFA Final match result details

Unnamed: 0,home,score,away,year,win
63,United States,Match 64,Germany,2023,United States


In [55]:
f"Predicted FIFA Women's World Cup 2023 Winner is {final_res.loc[63, 'win']} "

"Predicted FIFA Women's World Cup 2023 Winner is United States "