In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from nssstats.plots import std_plot
from nssstats.plots import iqr_plot
from nssstats.plots import quadrant_plot, half_plot
from ipywidgets import interact, FloatSlider
import warnings
import plotly.express as px 
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly import graph_objects as go
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode,iplot
#from dash import Dash, dcc, html, Input, Output
from IPython.core.interactiveshell import InteractiveShell
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

In [None]:
# Load the three CSV files into DataFrames.
# NOTE(review): the first path previously had no ".csv" extension, unlike the
# other two — confirm the file on disk is named "WorldCups.csv".
wc = pd.read_csv("WorldCups.csv")
wc_player = pd.read_csv("WorldCupPlayers.csv")
wc_matches = pd.read_csv("WorldCupMatches.csv")

In [None]:
# Preview the first three rows of the tournaments frame.
wc.head(n=3)

In [None]:
# Preview the first three rows of the players frame.
wc_player.head(n=3)

In [None]:
# Preview the first three rows of the matches frame.
wc_matches.head(n=3)

How many null values do we have in each DataFrame?

In [None]:
# Missing-value count per column (isna is the canonical spelling of isnull).
wc.isna().sum()

In [None]:
# Missing-value count per column (isna is the canonical spelling of isnull).
wc_player.isna().sum()

In [None]:
# Missing-value count per column (isna is the canonical spelling of isnull).
wc_matches.isna().sum()

Let's drop every row that contains a null value.

In [None]:
# Rebind each frame to a copy without any row that has a missing value.
wc, wc_player, wc_matches = wc.dropna(), wc_player.dropna(), wc_matches.dropna()

# Let's First Explore the World Cup Dataframe

In [None]:
# Print the column/dtype/non-null summary. `info` is a method: without the call
# parentheses the cell only shows the bound-method repr.
wc.info()

In [None]:
# Show the dtype pandas assigned to each column of wc.
wc.dtypes

Let's add a column to the data about the Champion's continent.

In [None]:

# Derive the champion's continent from the Winner column rather than from a
# hand-typed positional list: a fixed-length list breaks (length mismatch) or
# silently mislabels rows whenever the number or order of rows changes.
# The mapping reproduces the values of the list it replaces.
# NOTE(review): a winner missing from this mapping yields NaN — extend it if
# the data contains other champions or spellings.
continent_by_champion = {
    'Uruguay': 'South America',
    'Brazil': 'South America',
    'Argentina': 'South America',
    'Italy': 'Europe',
    'Germany FR': 'Europe',
    'Germany': 'Europe',
    'England': 'Europe',
    'France': 'Europe',
    'Spain': 'Europe',
}
wc['Champion_Continent'] = wc['Winner'].str.strip().map(continent_by_champion)



Let's add a column to see if the host country won the world cup or not.

In [None]:
# Flag tournaments won by the host country.
# 'Germany FR' is only merged into 'Germany' in a later cell, so both sides are
# normalised here; otherwise a host recorded as 'Germany' would never match a
# winner recorded as 'Germany FR'.
wc['Host_Winner'] = (
    wc['Country'].replace('Germany FR', 'Germany')
    == wc['Winner'].replace('Germany FR', 'Germany')
)

Let's add a column to see if the host country makes the Final Four.

In [None]:
# Flag tournaments in which the host finished in one of the top four places.
final_four_cols = ['Winner', 'Runners-Up', 'Third', 'Fourth']
# 'Germany FR' is only merged into 'Germany' in a later cell; normalise first so
# the comparison does not miss a host because of the naming difference.
names_normalised = wc[final_four_cols + ['Country']].replace('Germany FR', 'Germany')
# `axis` is passed by keyword: pandas 2.x no longer accepts it positionally for `any`.
wc['Host_Final_Four'] = (
    names_normalised[final_four_cols]
    .eq(names_normalised['Country'], axis=0)
    .any(axis=1)
)

Let's combine "Germany FR" and "Germany" under the single name "Germany".

In [None]:
# Rename every 'Germany FR' cell to 'Germany' across the whole frame.
wc = wc.replace(to_replace='Germany FR', value='Germany')

In [None]:
# Annotated correlation heatmap over the numeric columns of wc.
sns.heatmap(data=wc.corr(numeric_only=True), cmap="coolwarm", annot=True)

In [None]:
# NOTE(review): plotPerColumnDistribution is neither defined nor imported in the
# visible part of this notebook — confirm it exists before this cell so that
# Restart & Run All does not fail with a NameError.
plotPerColumnDistribution(wc, 10, 5)

In [None]:
# Switch seaborn to the "ticks" style, then draw the pairwise-relationship grid for wc.
sns.set(color_codes=True, style="ticks")

sns.pairplot(data=wc);

In [None]:
# Scatter of the number of qualified teams (x) against tournament year (y).
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title('Number of Countries in the World Cup', size=20, weight='bold')
wc.plot.scatter(x='QualifiedTeams', y='Year', ax=ax, zorder=2, s=100)

# Strip the frame and axis labels; keep only the grid and tick labels.
ax.spines[['right', 'top', 'left', 'bottom']].set_visible(False)
ax.set_ylabel(None)
ax.set_xlabel(None)
ax.grid(visible=True)
ax.tick_params(axis='both', which='major', labelsize=15)

# One y tick per tournament year. The years come from `wc`; the previous name
# `hist_worldcup` is not defined in this notebook and raised a NameError.
ax.set_yticks(wc['Year'].tolist())
ax.set_xticks([0, 16, 24, 32, 48])
ax.tick_params(bottom=False, left=False)

In [None]:
# Bar colours handed to seaborn's countplot.
palette = ['coral', 'orange', 'orange', 'yellow', 'firebrick', 'coral', 'coral', 'firebrick']
fig, ax = plt.subplots(figsize=(16, 8))

# Count how many titles each winner holds.
ax.set_title('World Cup Champions', size=20, weight='bold')
sns.countplot(x=wc['Winner'], palette=palette, edgecolor=".2", linewidth=2.5)

# Remove the frame and axis labels; the bar labels carry the counts instead.
ax.spines[['right', 'top', 'left', 'bottom']].set_visible(False)
ax.set_xlabel(None)
ax.set_ylabel(None)
ax.tick_params(labelleft=False, left=False, labelsize=14)

# Write each bar's height above it.
for bar_group in ax.containers:
    ax.bar_label(bar_group, fontsize=15);

In [None]:
# Podium finishes per country. The columns are read from `wc`; the previous
# name `world_cups` is not defined in this notebook and raised a NameError.
gold = wc["Winner"]
silver = wc["Runners-Up"]
bronze = wc["Third"]

# Build the table from a dict of Series so the three counts are aligned on the
# country index under explicit column names. Joining three separate
# value_counts() frames fails on pandas 2.x, where every such Series is named
# "count" and the join reports overlapping columns.
podium_count = pd.DataFrame({
    'WINNER': gold.value_counts(),
    'SECOND': silver.value_counts(),
    'THIRD': bronze.value_counts(),
})
# Countries absent from a podium step get 0 rather than NaN.
podium_count = podium_count.fillna(0).astype('int64')
podium_count = podium_count.sort_values(by=['WINNER', 'SECOND', 'THIRD'], ascending=False)

podium_count.plot(y=['WINNER', 'SECOND', 'THIRD'], kind="bar",
                  color=['gold', 'silver', 'brown'], figsize=(15, 6), fontsize=14,
                  width=0.8, align='center')
plt.xlabel('Countries')
plt.ylabel('Number of podium')
plt.title('Number of podium by country')

Which Continent Has Won The Most World Cups?

In [None]:
# Pre-compute labels and sizes for the pie chart; the count is taken once and reused.
continent_counts = wc['Champion_Continent'].value_counts()
index = continent_counts.index.tolist()
value = continent_counts.values.tolist()

In [None]:
palette = ['firebrick', 'yellow']

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))

# Left panel: number of titles per continent.
sns.countplot(ax=ax[0], x=wc['Champion_Continent'], palette=palette, linewidth=2.5, edgecolor=".2")
ax[0].set_title('World Cup by Continent', size=20, weight='bold')
ax[0].spines[['right', 'top', 'left', 'bottom']].set_visible(False)
ax[0].set_ylabel(None)
ax[0].set_xlabel(None)
ax[0].tick_params(labelleft=False, left=False, labelsize=14)

for i in ax[0].containers:
    ax[0].bar_label(i, fontsize=15);

# Right panel: share of titles per continent. The pie is drawn on ax[1]
# explicitly instead of through pyplot's implicit "current axes", so it lands
# in the right panel regardless of which axes was touched last.
ax[1].pie(value, labels=index, autopct='%.0f%%', colors=['yellow', 'firebrick'],
          wedgeprops={"edgecolor": "0", 'linewidth': 2.5, 'antialiased': True},
          startangle=90, textprops={'fontsize': 20})
ax[1].set_title('World Cup winning percentage by Continent', size=20, weight='bold');

Does The Host Nation Win The World Cup?

In [None]:
# Labels and sizes for the host-winner pie chart; the count is taken once and reused.
host_winner_counts = wc['Host_Winner'].value_counts()
index1 = host_winner_counts.index.tolist()
value1 = host_winner_counts.values.tolist()

In [None]:
palette = ['firebrick', 'yellow']

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))

# Left panel: how often the host did / did not win the tournament.
sns.countplot(ax=ax[0], x=wc['Host_Winner'], palette=palette, linewidth=2.5, edgecolor=".2")
# The plotted column is Host_Winner (host == champion), so the title says
# "Champion" — the earlier "Finalist" wording did not match the data.
ax[0].set_title('Host Nation Champion?', size=20, weight='bold')
ax[0].spines[['right', 'top', 'left', 'bottom']].set_visible(False)
ax[0].set_ylabel(None)
ax[0].set_xlabel(None)
ax[0].tick_params(labelleft=False, left=False, labelsize=14)
for i in ax[0].containers:
    ax[0].bar_label(i, fontsize=15);

# Right panel: the same split as percentages, drawn on ax[1] explicitly rather
# than through pyplot's implicit current axes.
ax[1].pie(value1, labels=index1, autopct='%.0f%%', colors=['firebrick', 'yellow'],
          wedgeprops={"edgecolor": "0", 'linewidth': 2.5, 'antialiased': True},
          startangle=90, textprops={'fontsize': 20})
ax[1].set_title('Percentage', size=20, weight='bold');

Does The Host Nation Reach The Semi-Finals (Final Four)?

In [None]:
# Labels and sizes for the final-four pie chart. The column lives on `wc`; the
# previous name `hist_worldcup` is not defined in this notebook (NameError).
host_final_four_counts = wc['Host_Final_Four'].value_counts()
index2 = host_final_four_counts.index.tolist()
value2 = host_final_four_counts.values.tolist()

In [None]:
palette = ['firebrick', 'yellow']

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))

# Left panel: how often the host did / did not finish in the top four.
sns.countplot(ax=ax[0], x=wc['Host_Final_Four'], palette=palette, linewidth=2.5, edgecolor=".2")
ax[0].set_title('Host in Final Four', size=20, weight='bold')
ax[0].spines[['right', 'top', 'left', 'bottom']].set_visible(False)
ax[0].set_ylabel(None)
ax[0].set_xlabel(None)
ax[0].tick_params(labelleft=False, left=False, labelsize=14)
for i in ax[0].containers:
    ax[0].bar_label(i, fontsize=15);

# Right panel: the same split as percentages, drawn on ax[1] explicitly rather
# than through pyplot's implicit current axes.
ax[1].pie(value2, labels=index2, autopct='%.0f%%', colors=['yellow', 'firebrick'],
          wedgeprops={"edgecolor": "0", 'linewidth': 2.5, 'antialiased': True},
          startangle=90, textprops={'fontsize': 20})
ax[1].set_title('Percentage', size=20, weight='bold');

Comparing Attendance, Qualified Teams, Matches Played, and Goals Scored By Year.

In [None]:
# 2x2 grid of per-tournament bar charts: attendance, teams, matches and goals.
plt.figure(figsize=(22, 12))
sns.set_style("whitegrid")

# Top-left: attendance.
plt.subplot(2, 2, 1)
g1 = sns.barplot(data=wc, x="Year", y="Attendance", palette="Blues")
g1.set_title("ATTENDANCE PER CUP", fontsize=14)

# Top-right: qualified teams.
plt.subplot(2, 2, 2)
g2 = sns.barplot(data=wc, x="Year", y="QualifiedTeams", palette="Blues")
g2.set_title("NUMBER OF TEAMS PER CUP", fontsize=14)

# Bottom-left: matches played (g2 is reused for the remaining panels, as before).
plt.subplot(2, 2, 3)
g2 = sns.barplot(data=wc, x="Year", y="MatchesPlayed", palette="Blues")
g2.set_title("NUMBER OF MATCHS PER CUP", fontsize=14)

# Bottom-right: goals scored.
plt.subplot(2, 2, 4)
g2 = sns.barplot(data=wc, x="Year", y="GoalsScored", palette="Blues")
g2.set_title("NUMBER OF GOALS PER CUP", fontsize=14)

# Leave room between the panels for titles and tick labels.
plt.subplots_adjust(top=0.9, wspace=0.2, hspace=0.4)

plt.show()

# Let's Explore the World Cup Matches Dataframe

In [None]:
# Print the column/dtype/non-null summary. `info` is a method: without the call
# parentheses the cell only shows the bound-method repr.
wc_matches.info()

In [None]:
# Show the dtype pandas assigned to each column of wc_matches.
wc_matches.dtypes

In [None]:
# Annotated correlation heatmap over the numeric columns of wc_matches.
sns.heatmap(data=wc_matches.corr(numeric_only=True), cmap="coolwarm", annot=True)

In [None]:
# NOTE(review): plotPerColumnDistribution is neither defined nor imported in the
# visible part of this notebook — confirm it exists before this cell so that
# Restart & Run All does not fail with a NameError.
plotPerColumnDistribution(wc_matches, 10, 5)

In [None]:
# Switch seaborn to the "ticks" style, then draw the pairwise-relationship grid for wc_matches.
sns.set(color_codes=True, style="ticks")

sns.pairplot(data=wc_matches);

Let's look at which country scored the most Goals in World Cup History and distinguish how many goals were scored when the team was home vs away.

In [None]:
# Cast the goal and attendance columns of the matches frame to integers.
# Attendance is cast from wc_matches itself: it was previously assigned from
# wc['Attendance'], which copied tournament-level values (aligned on the row
# index) over the per-match figures.
int_columns = [
    'Home Team Goals',
    'Away Team Goals',
    'Half-time Home Goals',
    'Half-time Away Goals',
    'Attendance',
]
wc_matches = wc_matches.astype({column: 'int64' for column in int_columns})
wc_matches.dtypes

In [None]:
# Parse the Datetime column into pandas datetimes, then re-check the dtypes.
wc_matches['Datetime'] = pd.to_datetime(wc_matches['Datetime'])
wc_matches.dtypes

In [None]:
# Goals per team, split by whether the team was listed as home or away.
goals_by_home_team = wc_matches[['Home Team Goals', 'Home Team Initials']].groupby('Home Team Initials')
goals_by_away_team = wc_matches[['Away Team Goals', 'Away Team Initials']].groupby('Away Team Initials')

# Align both totals on the team initials before plotting. Drawing two
# independently sorted top-25 bar charts on one axis pairs bars by rank, not by
# team, so the second series ends up under the wrong tick labels and nothing is
# actually stacked.
goals_by_team = pd.concat(
    [
        goals_by_home_team.sum()['Home Team Goals'],
        goals_by_away_team.sum()['Away Team Goals'],
    ],
    axis=1,
).fillna(0)  # a team that only ever appears on one side gets 0 for the other

# Keep the 25 teams with the most goals overall (home + away).
ranking = goals_by_team.sum(axis=1).sort_values(ascending=False).index
top_scorers = goals_by_team.loc[ranking].head(25)

ax = top_scorers.plot(kind='bar', stacked=True, figsize=(10, 8), color=['C0', 'red'])
plt.title('Total Goals Scored by Team')
plt.xlabel('Team Initials')
plt.ylabel('Number of Goals');

Is it getting harder to score goals each World Cup?

In [None]:
# Goals per match = home goals plus away goals; preview the result.
wc_matches['Total Goals'] = wc_matches['Home Team Goals'].add(wc_matches['Away Team Goals'])
wc_matches.head(n=3)

In [None]:
# Sum the goals of all matches within each tournament year and plot the trend.
grouped_by_year = wc_matches[['Total Goals', 'Year']].groupby(by='Year')
goals_per_year = grouped_by_year['Total Goals'].sum()

goals_per_year.plot(figsize=(10, 7), color='red')
plt.title('Total Goals Scored', fontsize=18)
plt.xlabel('World Cup')
plt.ylabel('Number of Goals');

In [None]:
# Matches per tournament = number of rows per Year in the matches frame.
# Counting wc['Year'] instead counts rows of the tournaments table (one per
# cup), so it cannot give the number of games and distorts the later
# goals-per-game ratio.
games_per_year = wc_matches['Year'].value_counts()
games_per_year.sort_index() \
.plot(color = 'red', figsize = (10,7));
plt.title('Total Games Played', fontsize = 18)
plt.xlabel('World Cup')
plt.ylabel('Number of Games');

In [None]:
# Divide yearly goals by yearly game counts (aligned on the year index) and plot the ratio.
goals_per_game_per_year = goals_per_year.div(games_per_year)

goals_per_game_per_year.plot(color='Teal', figsize=(10, 7))
plt.title('Average Goals per Game by Year', fontsize=20)
plt.xlabel('World Cup')
plt.ylabel('Average Number of Goals per Game');

What time of the day are most goals scored?

In [None]:
# Kick-off time rounded to the nearest whole hour.
wc_matches['Hour Played'] = round(wc_matches['Datetime'].dt.minute/60 + wc_matches['Datetime'].dt.hour)
# Group the matches frame: 'Hour Played' and 'Total Goals' are columns of
# wc_matches, not of wc, so grouping wc raised a KeyError.
grouped_by_time_played = wc_matches.groupby('Hour Played')
# Mean goals per match in each hour slot (equivalent to sum / count).
avg_goal_per_hour = grouped_by_time_played['Total Goals'].mean()
avg_goal_per_hour.plot(kind = 'bar', figsize = (18,9), color = 'purple');
plt.title('Goals Scored at Times of Day', fontsize=20)
plt.xlabel('Hour of the day', fontsize= 14)
# The bars are per-match averages, not totals, so the axis label says so.
plt.ylabel('Average Goals per Match', fontsize = 14);

Does the Home Team have an advantage?

In [None]:
# NOTE(review): plot_univariate_countplot is neither defined nor imported in the
# visible part of this notebook, and the call does not say which DataFrame it
# reads "Stage" from — confirm the helper exists before this cell.
plot_univariate_countplot("Stage")