In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
# Plot setup: default font for all text drawn by matplotlib.
# The settings go through rcParams, so they apply to figures created afterwards.
font = dict(family='DejaVu Sans', weight='normal', size=20)

plt.rc('font', **font)

# Summary

In [None]:
# Read the World Cup summary table; the relative path is resolved against
# the current working directory.
hist_worldcup = pd.read_csv(filepath_or_buffer='WorldCupsSummary.csv')

# last expression of the cell -> rendered as the cell output
hist_worldcup

The table above summarizes every World Cup tournament in the dataset.

## Preprocessing

In [None]:
# Rename every 'Germany FR' cell (in any column) to 'Germany' so that both
# spellings are counted as one team in the analysis below.
hist_worldcup = hist_worldcup.replace({'Germany FR': 'Germany'})

# show the cleaned table
hist_worldcup

## Analysis

This section addresses the following questions:

- Trends in live attendance over the years  
- Trends in the number of participating teams over the years  
- Trends in the number of goals scored over the years  
- Which national team has won the championship the most in history?  
- Analysis of the continents of champion teams  
- Which national teams frequently reach the finals or semi-finals?  
- What is the probability of a team winning the championship after reaching the finals?  
- How likely is the host country to reach the finals or semi-finals?  

### Trends in live attendance over the years

In [None]:
# Scatter plot: one point per tournament, Year on x and Attendance on y.
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title("Attendance per cup")

hist_worldcup.plot.scatter(x='Year', y='Attendance', s=100, ax=ax)

# hide the frame around the plotting area
for side in ax.spines:
    ax.spines[side].set_visible(False)

# no axis labels
ax.set(xlabel=None, ylabel=None)

# grid lines on, small tick labels
ax.grid(visible=True)
ax.tick_params(axis='both', which='major', labelsize=12)

# x ticks every 5 years, y ticks every 500,000
year_ticks = np.arange(1930, 2021, 5)
attendance_ticks = np.arange(0, 4000000, 500000)
ax.set_xticks(year_ticks)
ax.set_yticks(attendance_ticks)

# plain tick labels (no scientific notation / offset)
ax.ticklabel_format(style='plain')

# keep the tick labels but do not draw the tick marks
ax.tick_params(bottom=False, left=False)

### Trends in the number of participating teams over the years  

In [None]:
# Scatter plot: one point per tournament, Year on x and QualifiedTeams on y.
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title("Teams per cup")

hist_worldcup.plot.scatter(x='Year', y='QualifiedTeams', s=100, ax=ax)

# hide the frame around the plotting area
for _, spine in ax.spines.items():
    spine.set_visible(False)

# no axis labels
ax.set(xlabel=None, ylabel=None)

# grid lines on, small tick labels
ax.grid(visible=True)
ax.tick_params(axis='both', which='major', labelsize=12)

# x ticks every 5 years, y ticks every 2 teams
ax.set_xticks(np.arange(1930, 2021, 5))
ax.set_yticks(np.arange(0, 40, 2))

# plain tick labels (no scientific notation / offset)
ax.ticklabel_format(style='plain')

# keep the tick labels but do not draw the tick marks
ax.tick_params(bottom=False, left=False)

### Trends in the number of goals scored over the years  

In [None]:
# Scatter plot: one point per tournament, Year on x and GoalsScored on y.
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title("Goals per cup")

hist_worldcup.plot.scatter(x='Year', y='GoalsScored', s=100, ax=ax)

# hide the frame around the plotting area
for side in ax.spines:
    ax.spines[side].set_visible(False)

# no axis labels
ax.set(xlabel=None, ylabel=None)

# grid lines on, small tick labels
ax.grid(visible=True)
ax.tick_params(axis='both', which='major', labelsize=12)

# x ticks every 5 years, y ticks every 10 goals
year_ticks = np.arange(1930, 2021, 5)
goal_ticks = np.arange(0, 200, 10)
ax.set_xticks(year_ticks)
ax.set_yticks(goal_ticks)

# plain tick labels (no scientific notation / offset)
ax.ticklabel_format(style='plain')

# keep the tick labels but do not draw the tick marks
ax.tick_params(bottom=False, left=False)

### Which national team has won the championship the most in history? 

In [None]:
# Bar chart: number of rows in which each country appears as Winner.
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title("Champion Number per Country")

# bars ordered by descending count (value_counts sorts that way)
winner_order = hist_worldcup['Winner'].value_counts().index
sns.countplot(data=hist_worldcup, x='Winner', order=winner_order, ax=ax)

# hide the frame around the plotting area
for _, spine in ax.spines.items():
    spine.set_visible(False)

# no axis labels
ax.set(xlabel=None, ylabel=None)

# the y axis is hidden entirely; counts are written on the bars instead
ax.tick_params(labelleft=False, left=False, labelsize=14)

for bars in ax.containers:
    ax.bar_label(bars, label_type='edge', fontsize=14)

### Analysis of the continents of champion teams  

In [None]:
# Bar chart: number of rows per value of the WinnerContinent column.
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title("Champion Number per Continent")

# bars ordered by descending count (value_counts sorts that way)
continent_order = hist_worldcup['WinnerContinent'].value_counts().index
sns.countplot(
    data=hist_worldcup,
    x='WinnerContinent',
    order=continent_order,
    ax=ax,
)

# hide the frame around the plotting area
for side in ax.spines:
    ax.spines[side].set_visible(False)

# no axis labels
ax.set(xlabel=None, ylabel=None)

# the y axis is hidden entirely; counts are written on the bars instead
ax.tick_params(labelleft=False, left=False, labelsize=14)

for bars in ax.containers:
    ax.bar_label(bars, label_type='edge', fontsize=14)

### Which national teams frequently reach the finals or semi-finals?  


In [None]:
# Bar chart: how many times each country appears among the top four.
fig, ax = plt.subplots(figsize=(28, 4))
ax.set_title("Semi Finalist per Country")

# semi-finalists = Winner + Second + Third + Fourth
# one list entry per appearance, columns concatenated in that order
countries = [
    team
    for placing in ('Winner', 'Second', 'Third', 'Fourth')
    for team in hist_worldcup[placing]
]

# bars ordered by descending number of appearances
appearance_order = pd.Series(countries).value_counts().index
sns.countplot(x=countries, order=appearance_order, ax=ax)

# hide the frame around the plotting area
for _, spine in ax.spines.items():
    spine.set_visible(False)

# no axis labels
ax.set(xlabel=None, ylabel=None)

# the y axis is hidden entirely; counts are written on the bars instead
ax.tick_params(labelleft=False, left=False, labelsize=12)

for bars in ax.containers:
    ax.bar_label(bars, label_type='edge', fontsize=12)

In [None]:
# Bar chart: how many times each country appears as Winner or Second.
fig, ax = plt.subplots(figsize=(28, 4))
ax.set_title("Finalist per Country")

# finalists = Winner + Second
# one list entry per appearance, winners first, then runners-up
countries = [*hist_worldcup['Winner'], *hist_worldcup['Second']]

# bars ordered by descending number of appearances
appearance_order = pd.Series(countries).value_counts().index
sns.countplot(x=countries, order=appearance_order, ax=ax)

# hide the frame around the plotting area
for side in ax.spines:
    ax.spines[side].set_visible(False)

# no axis labels
ax.set(xlabel=None, ylabel=None)

# the y axis is hidden entirely; counts are written on the bars instead
ax.tick_params(labelleft=False, left=False, labelsize=12)

for bars in ax.containers:
    ax.bar_label(bars, label_type='edge', fontsize=12)

### What is the probability of a team winning the championship after reaching the finals?  

In [None]:
# Bar chart: for every country that appears as Winner or Second, the share of
# those appearances in which it is the Winner.
fig, ax = plt.subplots(figsize=(20, 4))
ax.set_title("Probability of Winning after Reaching Final")

# finalists = Winner + Second (one entry per appearance; missing names ignored)
finalists = pd.concat(
    [hist_worldcup['Winner'], hist_worldcup['Second']],
    ignore_index=True,
).dropna()

finals_played = finalists.value_counts()
finals_won = hist_worldcup['Winner'].value_counts()

# Compute the ratio once per country instead of once per appearance, so each
# country is passed to the plot exactly once (no duplicate categories that
# would have to be averaged away by seaborn).
win_rate = (
    finals_won
    .reindex(finals_played.index, fill_value=0)  # 0 wins for countries that only lost
    .div(finals_played)
    .reindex(finalists.unique())                 # first-appearance order, used for ties
    .sort_values(ascending=False, kind='stable')  # highest probability first
)

countries = win_rate.index.tolist()
probability = win_rate.tolist()

sns.barplot(
    x=countries,
    y=probability,
    ax=ax
)

# disable all the spines
for spine in ax.spines.values():
    spine.set_visible(False)

# disable labels
ax.set_xlabel(None)
ax.set_ylabel(None)

# the y axis is hidden entirely; probabilities are written on the bars instead
ax.tick_params(labelleft=False, left=False, labelsize=12)

# two decimals keep the bar labels readable (e.g. 0.67 rather than 0.666667)
for bars in ax.containers:
    ax.bar_label(bars, fmt='%.2f', label_type='edge', fontsize=12)

### How likely is the host country to reach the finals or semi-finals? 

In [None]:
# Bar chart: share of tournaments in which the host country is / is not one of
# the two finalists.
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title("Probability of the Host Reaching Final")

# for each cup, check if the host is in the finalist
# = Winner + Second
#
# The columns are compared row by row against HostCountry in one vectorised
# step; unlike a `for i in range(len(...))` loop with `series[i]`, this does
# not depend on the frame having a default 0..n-1 index.
# NOTE(review): this is an exact string match - a host stored under a combined
# or differently spelled name would never match a team name; confirm against
# the data.
hist_worldcup['HostFinalist'] = (
    hist_worldcup[['Winner', 'Second']]
    .eq(hist_worldcup['HostCountry'], axis=0)
    .any(axis=1)
    .astype(int)
)

# proportion of cups per flag value (0 = host not in final, 1 = host in final)
prob = hist_worldcup['HostFinalist'].value_counts(normalize=True)

# label each proportion by its flag; plain values are passed for the heights so
# labels and heights are paired purely by position
flag_labels = ['Not Host Finalist', 'Host Finalist']
sns.barplot(
    x=[flag_labels[flag] for flag in prob.index],
    y=prob.to_numpy(),
    ax=ax
)

# disable all the spines
for spine in ax.spines.values():
    spine.set_visible(False)

# disable labels
ax.set_xlabel(None)
ax.set_ylabel(None)

# the y axis is hidden entirely; proportions are written on the bars instead
ax.tick_params(labelleft=False, left=False, labelsize=12)

for bars in ax.containers:
    ax.bar_label(bars, label_type='edge', fontsize=12)

In [None]:
# Bar chart: share of tournaments in which the host country is / is not one of
# the top four teams.
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title("Probability of the Host Reaching Semi Final")

# for each cup, check if the host is in the semi-finalist
# = Winner + Second + Third + Fourth
#
# The columns are compared row by row against HostCountry in one vectorised
# step; unlike a `for i in range(len(...))` loop with `series[i]`, this does
# not depend on the frame having a default 0..n-1 index.
# NOTE(review): this is an exact string match - a host stored under a combined
# or differently spelled name would never match a team name; confirm against
# the data.
hist_worldcup['HostSemiFinalist'] = (
    hist_worldcup[['Winner', 'Second', 'Third', 'Fourth']]
    .eq(hist_worldcup['HostCountry'], axis=0)
    .any(axis=1)
    .astype(int)
)

# proportion of cups per flag value (0 = host not in top four, 1 = host in top four)
prob = hist_worldcup['HostSemiFinalist'].value_counts(normalize=True)

# label each proportion by its flag; plain values are passed for the heights so
# labels and heights are paired purely by position
flag_labels = ['Not Host Semi Finalist', 'Host Semi Finalist']
sns.barplot(
    x=[flag_labels[flag] for flag in prob.index],
    y=prob.to_numpy(),
    ax=ax
)

# disable all the spines
for spine in ax.spines.values():
    spine.set_visible(False)

# disable labels
ax.set_xlabel(None)
ax.set_ylabel(None)

# the y axis is hidden entirely; proportions are written on the bars instead
ax.tick_params(labelleft=False, left=False, labelsize=12)

for bars in ax.containers:
    ax.bar_label(bars, label_type='edge', fontsize=12)