In [68]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [69]:
# Notebook-wide matplotlib font defaults; individual cells override sizes
# locally where they need smaller tick or bar labels.
font = dict(family='DejaVu Sans', weight='normal', size=20)

plt.rc('font', **font)

# Summary

In [None]:
# Load the per-tournament summary table (one row per World Cup) from the
# working directory and show it as the cell output.
summary_csv = 'WorldCupsSummary.csv'
hist_worldcup = pd.read_csv(summary_csv)

hist_worldcup

The table above gives a one-row summary of every World Cup in the dataset.

## Preprocessing

In [None]:
# Treat 'Germany FR' and 'Germany' as the same team: every cell whose value
# is exactly 'Germany FR' becomes 'Germany', in any column.
hist_worldcup = hist_worldcup.replace(to_replace='Germany FR', value='Germany')

hist_worldcup

## Analysis for Summary

This section looks at the following questions:

- Trends in live attendance over the years  
- Trends in the number of participating teams over the years  
- Trends in the number of goals scored over the years  
- Which national team has won the championship the most in history?  
- Analysis of the continents of champion teams  
- Which national teams frequently reach the finals or semi-finals?  
- What is the probability of a team winning the championship after reaching the finals?  
- How likely is the host country to reach the finals or semi-finals?  

### Trends in live attendance over the years

In [None]:
# One dot per tournament: the 'Attendance' column plotted against 'Year'.
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title("Attendance per cup")

hist_worldcup.plot(kind='scatter', x='Year', y='Attendance', s=100, ax=ax)

# Drop the frame and the axis captions; the title already says what is shown.
for side in ax.spines:
    ax.spines[side].set_visible(False)
ax.set(xlabel=None, ylabel=None)

ax.grid(visible=True)

# Smaller tick labels than the notebook-wide font size.
ax.tick_params(axis='both', which='major', labelsize=12)

# A tick every 5 years on x and every 500,000 on y.
ax.set_xticks(np.arange(1930, 2021, 5))
ax.set_yticks(np.arange(0, 4000000, 500000))

# Print full numbers instead of scientific notation.
ax.ticklabel_format(style='plain')

# Hide the tick marks themselves; the grid lines are enough.
ax.tick_params(bottom=False, left=False)

### Trends in the number of participating teams over the years  

In [None]:
# One dot per tournament: the 'QualifiedTeams' column plotted against 'Year'.
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title("Teams per cup")

hist_worldcup.plot(kind='scatter', x='Year', y='QualifiedTeams', s=100, ax=ax)

# Remove the frame and axis captions.
for side in ax.spines:
    ax.spines[side].set_visible(False)
ax.set(xlabel=None, ylabel=None)

ax.grid(visible=True)

# Smaller tick labels than the notebook-wide font size.
ax.tick_params(axis='both', which='major', labelsize=12)

# A tick every 5 years on x and every 2 teams on y.
year_ticks = np.arange(1930, 2021, 5)
team_ticks = np.arange(0, 40, 2)
ax.set_xticks(year_ticks)
ax.set_yticks(team_ticks)

# Print full numbers instead of scientific notation.
ax.ticklabel_format(style='plain')

# Hide the tick marks themselves; the grid lines are enough.
ax.tick_params(bottom=False, left=False)

### Trends in the number of goals scored over the years  

In [None]:
# One dot per tournament: the 'GoalsScored' column plotted against 'Year'.
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title("Goals per cup")

hist_worldcup.plot(kind='scatter', x='Year', y='GoalsScored', s=100, ax=ax)

# Remove the frame and axis captions.
for side in ax.spines:
    ax.spines[side].set_visible(False)
ax.set(xlabel=None, ylabel=None)

ax.grid(visible=True)

# Smaller tick labels than the notebook-wide font size.
ax.tick_params(axis='both', which='major', labelsize=12)

# A tick every 5 years on x and every 10 goals on y.
year_ticks = np.arange(1930, 2021, 5)
goal_ticks = np.arange(0, 200, 10)
ax.set_xticks(year_ticks)
ax.set_yticks(goal_ticks)

# Print full numbers instead of scientific notation.
ax.ticklabel_format(style='plain')

# Hide the tick marks themselves; the grid lines are enough.
ax.tick_params(bottom=False, left=False)

### Which national team has won the championship the most in history? 

In [None]:
# Bar chart of how many times each team appears in the 'Winner' column.
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title("Champion Number per Country")

# value_counts() sorts descending, so the most frequent winner comes first.
winner_order = hist_worldcup['Winner'].value_counts().index
sns.countplot(data=hist_worldcup, x='Winner', order=winner_order, ax=ax)

# Remove the frame and axis captions.
for side in ax.spines:
    ax.spines[side].set_visible(False)
ax.set(xlabel=None, ylabel=None)

# The y axis is hidden because each bar carries its own count label.
ax.tick_params(labelleft=False, left=False, labelsize=14)

for bars in ax.containers:
    ax.bar_label(bars, label_type='edge', fontsize=14)

### Analysis of the continents of champion teams  

In [None]:
# Bar chart of how many times each value appears in 'WinnerContinent'.
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title("Champion Number per Continent")

# value_counts() sorts descending, so the most frequent continent comes first.
continent_order = hist_worldcup['WinnerContinent'].value_counts().index
sns.countplot(data=hist_worldcup, x='WinnerContinent', order=continent_order, ax=ax)

# Remove the frame and axis captions.
for side in ax.spines:
    ax.spines[side].set_visible(False)
ax.set(xlabel=None, ylabel=None)

# The y axis is hidden because each bar carries its own count label.
ax.tick_params(labelleft=False, left=False, labelsize=14)

for bars in ax.containers:
    ax.bar_label(bars, label_type='edge', fontsize=14)

### Which national teams frequently reach the finals or semi-finals?  


In [None]:
# How often each team finished in the top four of a tournament.
fig, ax = plt.subplots(figsize=(28, 4))
ax.set_title("Semi Finalist per Country")

# A semi-finalist is any team listed as Winner, Second, Third or Fourth.
# The list holds one entry per placing, column by column, so a team appears
# once for every top-four finish.
placings = ['Winner', 'Second', 'Third', 'Fourth']
countries = [team for column in placings for team in hist_worldcup[column]]

# Most frequent team first.
appearance_order = pd.Series(countries).value_counts().index
sns.countplot(x=countries, order=appearance_order, ax=ax)

# Remove the frame and axis captions.
for side in ax.spines:
    ax.spines[side].set_visible(False)
ax.set(xlabel=None, ylabel=None)

# The y axis is hidden because each bar carries its own count label.
ax.tick_params(labelleft=False, left=False, labelsize=12)

for bars in ax.containers:
    ax.bar_label(bars, label_type='edge', fontsize=12)

In [None]:
# How often each team played in a final.
fig, ax = plt.subplots(figsize=(28, 4))
ax.set_title("Finalist per Country")

# A finalist is any team listed as Winner or Second; champions are listed
# first, then runners-up, one entry per final played.
countries = [*hist_worldcup['Winner'], *hist_worldcup['Second']]

# Most frequent team first.
appearance_order = pd.Series(countries).value_counts().index
sns.countplot(x=countries, order=appearance_order, ax=ax)

# Remove the frame and axis captions.
for side in ax.spines:
    ax.spines[side].set_visible(False)
ax.set(xlabel=None, ylabel=None)

# The y axis is hidden because each bar carries its own count label.
ax.tick_params(labelleft=False, left=False, labelsize=12)

for bars in ax.containers:
    ax.bar_label(bars, label_type='edge', fontsize=12)

### What is the probability of a team winning the championship after reaching the finals?  

In [None]:
# Share of finals each team has won: finals won / finals played.
fig, ax = plt.subplots(figsize=(20, 4))
plt.title("Probability of Winning after Reaching Final")

# A final appearance is a listing as Winner or Second.
winning = hist_worldcup['Winner'].tolist()
finalists = winning + hist_worldcup['Second'].tolist()

# Tally finals played and finals won once per team. Dicts keep insertion
# order, so teams stay in first-appearance order (champions first).
finals_played = {}
for team in finalists:
    finals_played[team] = finals_played.get(team, 0) + 1

finals_won = {}
for team in winning:
    finals_won[team] = finals_won.get(team, 0) + 1

# Exactly one (team, probability) pair per team, highest probability first.
# sorted() is stable, so tied teams keep their first-appearance order.
ranked = sorted(
    ((team, finals_won.get(team, 0) / played) for team, played in finals_played.items()),
    key=lambda pair: pair[1],
    reverse=True,
)
countries, probability = zip(*ranked)

# One bar per team: no duplicate categories are handed to seaborn, so it
# has nothing to aggregate.
sns.barplot(
    x=list(countries),
    y=list(probability),
    ax=ax
)

# disable all the spines
for spine in ax.spines.values():
    spine.set_visible(False)

# disable labels
ax.set_xlabel(None)
ax.set_ylabel(None)

# The y axis is hidden because each bar carries its own value label.
plt.tick_params(labelleft=False, left=False, labelsize=12)

for bars in ax.containers:
    ax.bar_label(bars, label_type='edge', fontsize=12)

### How likely is the host country to reach the finals or semi-finals? 

In [None]:
# Share of tournaments in which the host played the final.
fig, ax = plt.subplots(figsize=(12, 8))
plt.title("Probability of the Host Reaching Final")

# Flag each tournament 1 when 'HostCountry' equals the Winner or the Second
# of the same row, else 0. The comparison is vectorised and aligned on the
# frame's own index, so it does not depend on the index being 0..n-1.
# NOTE(review): only an exact string match counts; confirm how co-hosted
# tournaments are written in 'HostCountry'.
final_places = ['Winner', 'Second']
hist_worldcup['HostFinalist'] = (
    hist_worldcup[final_places]
    .eq(hist_worldcup['HostCountry'], axis=0)
    .any(axis=1)
    .astype('int64')
)

# Fraction of tournaments per flag value (0 / 1).
prob = hist_worldcup['HostFinalist'].value_counts(normalize=True)

# The flag value doubles as an index into the label list.
sns.barplot(
    x=[['Not Host Finalist', 'Host Finalist'][flag] for flag in prob.index],
    y=prob,
    ax=ax
)

# disable all the spines
for spine in ax.spines.values():
    spine.set_visible(False)

# disable labels
ax.set_xlabel(None)
ax.set_ylabel(None)

# The y axis is hidden because each bar carries its own value label.
plt.tick_params(labelleft=False, left=False, labelsize=12)

for bars in ax.containers:
    ax.bar_label(bars, label_type='edge', fontsize=12)

In [None]:
# Share of tournaments in which the host finished in the top four.
fig, ax = plt.subplots(figsize=(12, 8))
plt.title("Probability of the Host Reaching Semi Final")

# Flag each tournament 1 when 'HostCountry' equals the Winner, Second, Third
# or Fourth of the same row, else 0. The comparison is vectorised and
# aligned on the frame's own index, so it does not depend on the index
# being 0..n-1.
# NOTE(review): only an exact string match counts; confirm how co-hosted
# tournaments are written in 'HostCountry'.
top_four_places = ['Winner', 'Second', 'Third', 'Fourth']
hist_worldcup['HostSemiFinalist'] = (
    hist_worldcup[top_four_places]
    .eq(hist_worldcup['HostCountry'], axis=0)
    .any(axis=1)
    .astype('int64')
)

# Fraction of tournaments per flag value (0 / 1).
prob = hist_worldcup['HostSemiFinalist'].value_counts(normalize=True)

# The flag value doubles as an index into the label list.
sns.barplot(
    x=[['Not Host Semi Finalist', 'Host Semi Finalist'][flag] for flag in prob.index],
    y=prob,
    ax=ax
)

# disable all the spines
for spine in ax.spines.values():
    spine.set_visible(False)

# disable labels
ax.set_xlabel(None)
ax.set_ylabel(None)

# The y axis is hidden because each bar carries its own value label.
plt.tick_params(labelleft=False, left=False, labelsize=12)

for bars in ax.containers:
    ax.bar_label(bars, label_type='edge', fontsize=12)

## Analysis for Matches

In [None]:
# Load the per-match table from the working directory and show it as the
# cell output.
matches_csv = 'WorldCupMatches.csv'
matches = pd.read_csv(matches_csv)
matches

In [None]:
# Every match in which 'China PR' is listed as either the home or the away team.
team_columns = ['Home Team Name', 'Away Team Name']
matches[matches[team_columns].eq('China PR').any(axis=1)]

### Preprocessing

In [84]:
# Clean the match table in one pass:
#   1. every cell equal to 'Germany FR' becomes 'Germany',
#   2. both goal columns are cast to int,
#   3. a 'Result' column is added, formatted "<home goals> - <away goals>".
matches = (
    matches
    .replace('Germany FR', 'Germany')
    .astype({'Home Team Goals': int, 'Away Team Goals': int})
    .assign(
        Result=lambda played: (
            played['Home Team Goals'].astype(str)
            + ' - '
            + played['Away Team Goals'].astype(str)
        )
    )
)

### Attendance Analysis

In [None]:
# Five matches with the highest 'Attendance'. The "Home vs Away" label is
# added with assign(), so the frame is built in a single step instead of
# writing a new column into the slice returned by head().
top5_attendance = (
    matches
    .sort_values(by='Attendance', ascending=False)
    .head(5)
    .assign(VS=lambda top: top['Home Team Name'] + ' vs ' + top['Away Team Name'])
)

# Explicit figure/axes so every call below targets this plot.
fig, ax = plt.subplots(figsize=(12, 10))

# Horizontal bars: one per match label, length = attendance.
# NOTE(review): seaborn groups rows by the 'VS' label, so the caption index
# below only lines up with the bars while every label is unique.
sns.barplot(
    y='VS',
    x='Attendance',
    data=top5_attendance,
    ax=ax
)

ax.set_title("Top 5 Matches with Highest Attendance")

sns.despine(ax=ax, right=True, top=True)

ax.set_ylabel('Teams')
ax.set_xlabel('Attendance')

# One three-line caption per match, in the same order as the bars.
captions = (
    'Stadium: ' + top5_attendance['Stadium'] + '\n' +
    'Date: ' + top5_attendance['Datetime'] + '\n' +
    'Attendance: ' + top5_attendance['Attendance'].astype(str)
)

# Coordinates are in data units: x=2000 sits just right of the axis origin
# (inside the bar), and row + 0.1 is slightly below the bar's centre line.
for row, caption in enumerate(captions):
    ax.text(2000, row + 0.1, caption, color='white', fontsize=14)

plt.show()

The results might be somewhat surprising. The top 5 matches with the highest attendance include 4 matches from the 1950 Brazil World Cup, which highlights the passion of Brazilians for football and reaffirms Brazil's title as the football kingdom.

Furthermore, we see that the top 4 matches all took place at the "Maracanã - Estádio Jornalista Mário Filho" stadium, commonly known as the Maracanã Stadium, located in Rio de Janeiro, Brazil. It is one of the most famous football stadiums in the world and has hosted 8 World Cup finals. Before its renovation, the Maracanã Stadium could accommodate 200,000 spectators. However, due to FIFA's requirements to ensure the safety of the audience, the renovated stadium can now only hold 82,000 spectators. Despite this, it still holds a significant place in the hearts of football fans worldwide.

### Goals Analysis

In [None]:
# Match-level helper columns. They are written onto `matches` itself
# because later cells read 'VS' and 'Total Goals' from it.
matches['Total Goals'] = matches['Home Team Goals'] + matches['Away Team Goals']
matches['VS'] = matches['Home Team Name'] + ' vs ' + matches['Away Team Name']

# Ten highest-scoring matches. The goal columns become strings and 'Result'
# is rebuilt as "<home> - <away>". All of this happens in one chain, so
# nothing is assigned into the slice returned by head().
top10_goals = (
    matches
    .sort_values(by='Total Goals', ascending=False)
    .head(10)
    .astype({'Home Team Goals': str, 'Away Team Goals': str})
    .assign(Result=lambda top: top['Home Team Goals'] + ' - ' + top['Away Team Goals'])
)

# Explicit figure/axes so every call below targets this plot.
fig, ax = plt.subplots(figsize=(12, 10))

# Horizontal bars: one per match label, length = total goals.
# NOTE(review): seaborn groups rows by the 'VS' label, so the caption index
# below only lines up with the bars while every label is unique.
sns.barplot(
    y='VS',
    x='Total Goals',
    data=top10_goals,
    ax=ax
)

ax.set_title("Top 10 Matches with Highest Total Goals")

sns.despine(ax=ax, right=True, top=True)

ax.set_ylabel('Teams')
ax.set_xlabel('Total Goals')

# One two-line caption per match, in the same order as the bars.
captions = (
    'Stadium: ' + top10_goals['Stadium'] + ', ' +
    'Date: ' + top10_goals['Datetime'] + '\n' +
    'Total Goals: ' + top10_goals['Total Goals'].astype(str)
)

# Coordinates are in data units: x=1 is one goal along the bar, and
# row + 0.2 is slightly below the bar's centre line.
for row, caption in enumerate(captions):
    ax.text(1, row + 0.2, caption, color='white', fontsize=14)

plt.show()

In [None]:
# Absolute goal margin of every match, stored on `matches` itself.
matches['Difference'] = (matches['Home Team Goals'] - matches['Away Team Goals']).abs()

# Ten most one-sided matches. The 'VS' label is built here rather than
# inherited from an earlier cell, the goal columns become strings, and
# 'Result' is rebuilt as "<home> - <away>". All of this happens in one
# chain, so nothing is assigned into the slice returned by head().
top10_difference = (
    matches
    .sort_values(by='Difference', ascending=False)
    .head(10)
    .assign(VS=lambda top: top['Home Team Name'] + ' vs ' + top['Away Team Name'])
    .astype({'Home Team Goals': str, 'Away Team Goals': str})
    .assign(Result=lambda top: top['Home Team Goals'] + ' - ' + top['Away Team Goals'])
)

# Explicit figure/axes so every call below targets this plot.
fig, ax = plt.subplots(figsize=(12, 10))

# Horizontal bars: one per match label, length = goal difference.
# NOTE(review): seaborn groups rows by the 'VS' label, so the caption index
# below only lines up with the bars while every label is unique.
sns.barplot(
    y='VS',
    x='Difference',
    data=top10_difference,
    ax=ax
)

ax.set_title("Top 10 Matches with Highest Goal Difference")

sns.despine(ax=ax, right=True, top=True)

ax.set_ylabel('Teams')
ax.set_xlabel('Goal Difference')

# One two-line caption per match, in the same order as the bars.
captions = (
    'Stadium: ' + top10_difference['Stadium'] + ', ' +
    'Date: ' + top10_difference['Datetime'] + '\n' +
    'Goal Difference: ' + top10_difference['Difference'].astype(str)
)

# Coordinates are in data units: x=1 is one goal along the bar, and
# row + 0.2 is slightly below the bar's centre line.
for row, caption in enumerate(captions):
    ax.text(1, row + 0.2, caption, color='white', fontsize=14)

plt.show()

In [None]:
# Goals scored per team, split by whether the team was listed as the home
# or the away side. Each groupby scans the match table once.
home_goals = matches.groupby('Home Team Name')['Home Team Goals'].sum()
away_goals = matches.groupby('Away Team Name')['Away Team Goals'].sum()

# The outer join keeps teams that only ever appear on one side; the side
# they never played on counts as 0 goals.
df = (
    pd.concat(
        [home_goals.rename('Home Goals'), away_goals.rename('Away Goals')],
        axis=1,
    )
    .fillna(0)
    .astype('int64')
    .rename_axis('Country')
    .reset_index()
)

df['Total Goals'] = df['Home Goals'] + df['Away Goals']

# Kept for any later cell that expects the list of team names.
list_countries = df['Country'].tolist()

# Ten highest-scoring teams, shown as the cell output.
most_goals = df.sort_values(by='Total Goals', ascending=False).head(10)
most_goals

In [None]:
# Grouped bars per team: home, away and total goals from `most_goals`.
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title("Top 10 Countries with Most Goals")

goal_columns = ['Home Goals', 'Away Goals', 'Total Goals']
most_goals.plot(kind='bar', x='Country', y=goal_columns, ax=ax)

# Remove the frame and axis captions.
for side in ax.spines:
    ax.spines[side].set_visible(False)
ax.set(xlabel=None, ylabel=None)

# The y axis is hidden because each bar carries its own value label.
ax.tick_params(labelleft=False, left=False, labelsize=12)

# One container per plotted column; label every bar with its height.
for bars in ax.containers:
    ax.bar_label(bars, label_type='edge', fontsize=12)

plt.show()

In [None]:
# Every team listed as Winner or Second, de-duplicated in first-appearance
# order. dict.fromkeys keeps that order, so the table below has the same
# row order on every run (a set would not guarantee that).
finalista = list(dict.fromkeys(
    hist_worldcup['Winner'].tolist() + hist_worldcup['Second'].tolist()
))

# Goals conceded = goals scored by the opponent, so the team-name column
# and the goal column come from opposite sides.
goals_conceded_home = matches.groupby('Home Team Name')['Away Team Goals'].sum()
goals_conceded_away = matches.groupby('Away Team Name')['Home Team Goals'].sum()

# Number of matches played on each side.
match1 = matches['Home Team Name'].value_counts()
match2 = matches['Away Team Name'].value_counts()

# reindex() lines every tally up with `finalista`; a finalist with no rows
# on one side gets 0 there.
df = pd.DataFrame({
    'Country': finalista,
    'Goals Conceded Home': goals_conceded_home.reindex(finalista, fill_value=0).to_numpy(),
    'Goals Conceded Away': goals_conceded_away.reindex(finalista, fill_value=0).to_numpy(),
    'Matches Home': match1.reindex(finalista, fill_value=0).to_numpy(),
    'Matches Away': match2.reindex(finalista, fill_value=0).to_numpy()
})

df['Total Goals Conceded'] = df['Goals Conceded Home'] + df['Goals Conceded Away']
df['Total Matches'] = df['Matches Home'] + df['Matches Away']
# Average goals conceded per match played; lower is better.
df['Goal Match Rate'] = df['Total Goals Conceded'] / df['Total Matches']

# Ten finalists with the lowest rate, shown as the cell output.
goals_conceded = df.sort_values(by='Goal Match Rate', ascending=True).head(10)
goals_conceded

In [None]:
# Two panels built from `goals_conceded`: the conceded-goals-per-match rate
# on the left, the raw conceded-goal totals on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))
rate_ax, totals_ax = ax

plt.suptitle("Top 10 Countries with Best Goal Match Rate")

goals_conceded.plot.bar(
    x='Country',
    y=['Goals Conceded Home', 'Goals Conceded Away', 'Total Goals Conceded'],
    ax=totals_ax
)

goals_conceded.plot.bar(
    x='Country',
    y='Goal Match Rate',
    ax=rate_ax
)

# Style both panels the same way. tick_params is called on each axes
# explicitly; plt.tick_params would only reach whichever axes is current.
for panel in (rate_ax, totals_ax):
    # disable all the spines
    for spine in panel.spines.values():
        spine.set_visible(False)

    # disable labels
    panel.set_xlabel(None)
    panel.set_ylabel(None)

    # The y axis is hidden because each bar carries its own value label.
    panel.tick_params(labelleft=False, left=False, labelsize=12)

# The rate is a ratio, so print it with two decimals to keep labels short.
for bars in rate_ax.containers:
    rate_ax.bar_label(bars, fmt='%.2f', label_type='edge', fontsize=12)
for bars in totals_ax.containers:
    totals_ax.bar_label(bars, label_type='edge', fontsize=12)

plt.show()