# **Project Name: World Cup 2023 Data Analysis**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the tournament data set into `df`, the frame every later cell reads from.
# NOTE(review): this is a hard-coded absolute path (looks like a Colab upload
# location — confirm); the notebook only runs where the CSV exists at this path.
# NOTE(review): no date parsing is requested here, so any date column is loaded
# as plain text.
df = pd.read_csv('/content/CWC23_all_innings.csv')

In [None]:
# Preview the first rows to check that the columns loaded as expected.
df.head()

In [None]:
# Preview the last rows to check that the end of the file loaded cleanly.
df.tail()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

# ***1. Team Performance Analysis:***

**- Explore team-wise performance metrics.**

In [None]:
# Team-wise totals: runs scored and wickets taken.
#
# The 'runs' column means "runs scored" on batting rows but "runs conceded" on
# bowling rows (the bowling-average cell relies on the latter), so each total
# is computed from the matching kind of row only. Summing 'runs' over every
# row would add runs conceded to runs scored.
batting_rows = df[df['bat_or_bowl'] == 'bat']
bowling_rows = df[df['bat_or_bowl'] == 'bowl']

runs_scored = batting_rows.groupby('team')['runs'].sum().to_frame('runs')
wickets_taken = bowling_rows.groupby('team')['wkts'].sum().rename('wkts')

# Left join keeps every team that batted; a team with no bowling rows gets NaN wickets.
team_performance = runs_scored.join(wickets_taken).reset_index()

# Visualize top-performing teams (highest run totals first).
top_teams = team_performance.nlargest(10, 'runs')
sns.barplot(x='runs', y='team', data=top_teams, palette='viridis')
plt.title('Performance of  Teams - Runs Scored')
plt.xlabel('Total Runs Scored')
plt.ylabel('Team')
plt.show()


New Zealand scored the most runs and Afghanistan the fewest in the whole tournament. Teams such as New Zealand, Australia and South Africa were the high-scoring sides, while Sri Lanka, Bangladesh, the Netherlands and Afghanistan were among the low-scoring teams.

In [None]:
# Mean runs per batting row for every team, charted from lowest to highest.
teams = df['team'].unique()

is_batting = df['bat_or_bowl'] == 'bat'
team_avg_runs_data = (
    df[is_batting]
    .groupby('team')['runs']
    .mean()
    .reset_index()
    .sort_values(by='runs')
)

plt.figure(figsize=(12, 6))
ax = sns.barplot(data=team_avg_runs_data, x='team', y='runs', palette='muted')
ax.set_title('Team-wise Average Runs Scored by Players (Increasing Order)')
ax.set_xlabel('Team')
ax.set_ylabel('Average Runs')
plt.show()

# Same figures as a text table, highest average first.
print(team_avg_runs_data.sort_values(by='runs', ascending=False))

This graph shows the batting average of each team, i.e. the mean number of runs scored by that team's batters. It is a good way to evaluate a team's batting performance. The table shows that India leads, followed by NZ and SA, while BAN, ENG and NED sit at the bottom of the table.

In [None]:
# Bowling average per team (runs conceded per wicket), charted from lowest to highest.
teams = df['team'].unique()

bowling_rows = df[df['bat_or_bowl'] == 'bowl']
team_bowling_avg_data = (
    bowling_rows
    .groupby('team')[['runs', 'wkts']]
    .sum()
    .reset_index()
)

# Runs conceded divided by wickets taken.
team_bowling_avg_data['bowling_avg'] = team_bowling_avg_data['runs'].div(
    team_bowling_avg_data['wkts']
)

# Lowest (best) bowling average first.
team_bowling_avg_data = team_bowling_avg_data.sort_values(by='bowling_avg')

plt.figure(figsize=(12, 6))
ax = sns.barplot(data=team_bowling_avg_data, x='team', y='bowling_avg', palette='muted')
ax.set_title('Team-wise Bowling Performance by Bowling Average (Increasing Order)')
ax.set_xlabel('Team')
ax.set_ylabel('Bowling Average (Runs Conceded per Wicket)')
plt.show()

print(team_bowling_avg_data.sort_values(by='bowling_avg', ascending=True))


This chart shows the bowling average of each team. The bowling average is the number of runs conceded per wicket taken, and it is a good way to evaluate a team's bowling performance. The table and plot show that India leads with an average of just 20, followed by South Africa and Australia, while Sri Lanka is at the bottom of the table with a bowling average of 48.

In [None]:
# Sum of the 'wkts' column per team, largest total first.
total_wickets_by_team = (
    df.groupby('team')['wkts']
    .sum()
    .reset_index()
    .sort_values(by='wkts', ascending=False)
)

plt.figure(figsize=(10, 6))
ax = sns.barplot(data=total_wickets_by_team, x='wkts', y='team', palette='muted')
ax.set_title('Total Wickets Taken by Each Team')
ax.set_xlabel('Total Wickets Taken')
ax.set_ylabel('Team')

# Tilt the x tick labels and tighten the layout before rendering.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

plt.show()
print(total_wickets_by_team)

This graph compares the wickets taken by each team in the tournament, which is a way to check the wicket-taking ability of a team's bowlers. It shows that India took the most wickets (94), while Sri Lanka took the fewest (50).

In [None]:
# Mean of the 'econ' column per team, lowest economy first.
avg_economy_by_team = (
    df.groupby('team')['econ']
    .mean()
    .reset_index()
    .sort_values(by='econ', ascending=True)
)

# Bar chart of the per-team mean economy.
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=avg_economy_by_team, x='team', y='econ', palette='viridis')
ax.set_title('Average Economy of Each Team\'s Bowlers')
ax.set_xlabel('Team')
ax.set_ylabel('Average Economy')

# Tilt the team labels so they do not overlap.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

plt.show()
print(avg_economy_by_team)

This graph compares the economy of each team's bowlers. Economy is the average number of runs a bowler concedes per over, so it reflects bowling quality. India has the lowest economy, while the Sri Lankan bowlers were the most expensive; the rest of the teams are plotted above. So the Indian bowlers performed best in the tournament compared with the bowlers of the other teams, followed by Australia and South Africa.


In [None]:


# Mean wickets-per-ball for each team's bowlers, lowest first.
#
# Only bowling rows are used, matching the bowling-average and wicket-taker
# cells, so that values on batting rows cannot leak into a bowling metric.
# NOTE(review): confirm whether 'wicketball_prob' is populated on batting rows;
# if it is empty there, this filter does not change the result.
bowling_rows = df[df['bat_or_bowl'] == 'bowl']
avg_wicketbal_by_team = (
    bowling_rows.groupby('team')['wicketball_prob']
    .mean()
    .reset_index()
    .sort_values('wicketball_prob', ascending=True)
)

# Visualize average wickets/ball for each team
plt.figure(figsize=(10, 6))
sns.barplot(x='team', y='wicketball_prob', data=avg_wicketbal_by_team, palette='viridis')
plt.title('Average Wickets/Ball for Each Team')
plt.xlabel('Team')
plt.ylabel('Average Wickets/Ball')

# Adjust the x-axis labels for better readability
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

plt.show()
print(avg_wicketbal_by_team)


This graph compares the average wickets per ball, which shows the wicket-taking ability of each team's bowlers. Looking at the average wickets taken per ball, we observe that the Afghanistan bowlers are the worst at taking wickets, while the Sri Lankan bowlers, although expensive, are very good at taking wickets, with a high wickets-per-ball average, followed by England, the Netherlands and Australia.

In [None]:
# Sum of the 'mdns' column per team, largest total first.
total_maiden_overs_by_team = (
    df.groupby('team')['mdns']
    .sum()
    .reset_index()
    .sort_values(by='mdns', ascending=False)
)

# Bar chart of maiden overs per team.
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=total_maiden_overs_by_team, x='team', y='mdns', palette='viridis')
ax.set_title('Total Maiden Overs Bowled by Each Team')
ax.set_xlabel('Team')
ax.set_ylabel('Total Maiden Overs')

# Tilt the team labels so they do not overlap.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

plt.show()
print(total_maiden_overs_by_team)

This graph compares the number of maiden overs bowled by each team's bowlers. The Pakistani bowlers bowled the fewest maiden overs, while the Indian bowlers bowled the most, more than 20. So the Indian bowlers are very good at bowling maiden overs.

In [None]:
# Total fours hit by each team, largest total first.
# (A stray `df.head()` used to open this cell; it was not the last expression,
# so it displayed nothing and has been dropped.)
total_4s_by_team = (
    df.groupby('team')['4s']
    .sum()
    .reset_index()
    .sort_values('4s', ascending=False)
)

# Visualize total 4s for each team
plt.figure(figsize=(10, 6))
sns.barplot(x='team', y='4s', data=total_4s_by_team, palette='viridis')
plt.title('Total 4s Hit by Each Team')
plt.xlabel('Team')
plt.ylabel('Total 4s')

# Adjust the x-axis labels for better readability
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

plt.show()
print(total_4s_by_team)


In [None]:
# Sum of the '6s' column per team, smallest total first; the table is printed before the chart.
total_6s_by_team = (
    df.groupby('team')['6s']
    .sum()
    .reset_index()
    .sort_values(by='6s', ascending=True)
)
print(total_6s_by_team)

# Bar chart of sixes per team.
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=total_6s_by_team, x='team', y='6s', palette='viridis')
ax.set_title('Total 6s Hit by Each Team')
ax.set_xlabel('Team')
ax.set_ylabel('Total 6s')

# Tilt the team labels so they do not overlap.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

plt.show()

This graph compares the sixes hit by each team, which shows the teams' power-hitting ability. South Africa hit the most sixes, followed by Australia and India, while the Netherlands hit the fewest. It shows that NZ, IND, AUS and SA have good power-hitters.

## **Top performing teams**

So New Zealand, Australia, India and South Africa are among the top-performing teams, based on their runs scored, wickets taken and the economy of their bowlers.


*   Australia


*   India

*   South Africa


*   New Zealand



# **2. Player Performance Analysis:**
- **Evaluate individual player statistics for both batting and bowling.**
- **Identify leading run-scorers and wicket-takers.**
- **Assess the impact of players on their team's performance.**

In [None]:
# Per-player totals, kept separate for batting rows and bowling rows.
player_stats = (
    df.groupby(['team', 'player', 'bat_or_bowl'])
    .agg(
        runs=('runs', 'sum'),
        wkts=('wkts', 'sum'),
        sr=('sr', 'mean'),  # mean of the 'sr' values
    )
    .reset_index()
)

# Show the first few rows of the summary.
player_stats.head()


This table shows the individual statistics of each player: the number of runs, the strike rate while scoring, and the number of wickets taken.
It also indicates whether each row refers to the player's batting or bowling.

In [None]:
# Per-player totals, kept separate for batting rows and bowling rows.
player_stats = df.groupby(['team', 'player', 'bat_or_bowl']).agg({
    'runs': 'sum',
    'wkts': 'sum',
    'sr': 'mean'  # mean of the 'sr' values
}).reset_index()


# Top ten run-scorers (batting rows) and top ten wicket-takers (bowling rows).
leading_scorers = player_stats[player_stats['bat_or_bowl'] == 'bat'].nlargest(10, 'runs')
leading_wicket_takers = player_stats[player_stats['bat_or_bowl'] == 'bowl'].nlargest(10, 'wkts')

# Visualize leading run-scorers.
# dodge=False: every player belongs to a single team, so splitting each row
# into one slot per hue level would only leave thin, off-centre bars.
plt.figure(figsize=(10, 6))
sns.barplot(x='runs', y='player', data=leading_scorers, hue='team', dodge=False, palette='muted')
plt.title('Leading Run-scorers')
plt.xlabel('Total Runs Scored')
plt.ylabel('Player')
plt.legend(title='Team')
plt.show()

# Visualize leading wicket-takers (same reasoning for dodge=False).
plt.figure(figsize=(10, 6))
sns.barplot(x='wkts', y='player', data=leading_wicket_takers, hue='team', dodge=False, palette='muted')
plt.title('Leading Wicket-takers')
plt.xlabel('Total Wickets Taken')
plt.ylabel('Player')
plt.legend(title='Team')
plt.show()
print(leading_scorers)
print("-----------------------------------------------------------------")
print(leading_wicket_takers)


The above plots show the leading run-scorers and leading wicket-takers of World Cup 2023. Virat Kohli leads the batting chart, followed by Q de Kock of South Africa and R Ravindra of New Zealand, while Mohammed Shami leads the bowling chart, followed by Adam Zampa of Australia and D Madushanka of Sri Lanka. The rest of the list is shown above.

# **3. Opposition and Ground Analysis:**
- **Investigate how teams and players perform against different oppositions.**
- **Examine performance variations across different playing grounds.**
- **Identify if there are specific teams or players that excel in certain conditions.**

In [None]:
# One chart per team: mean runs on batting rows against each opposition.
teams = df['team'].unique()
batting_rows = df[df['bat_or_bowl'] == 'bat']

for team in teams:
    team_data = (
        batting_rows[batting_rows['team'] == team]
        .groupby('opposition')['runs']
        .mean()
        .reset_index()
        .sort_values(by='runs')
    )

    plt.figure(figsize=(10, 5))
    ax = sns.barplot(data=team_data, x='runs', y='opposition', palette='muted')
    ax.set_title(f'{team} Performance Against Different Oppositions - Average Runs')
    ax.set_xlabel('Average Runs')
    ax.set_ylabel('Opposition')
    plt.show()
    print(team_data)


The above bar charts show the batting performance of each team against different oppositions in terms of the average runs scored by each batter. By observing these graphs we can draw conclusions about how different teams performed against different oppositions.

Let's observe the performance of team India against different oppositions in terms of runs scored:

> - The average runs scored by India's players was lowest against England, followed by Australia.

> - The average runs scored by India's players was highest against Afghanistan and the Netherlands.

> - The rest of the performances are shown in the plot.

> - Similarly, we can look up the performance of any team against any opposition.

In [None]:
# One chart per batsman: total runs against each opposition.
batting_rows = df[df['bat_or_bowl'] == 'bat']
batsmen = batting_rows['player'].unique()

for batsman in batsmen:
    batsman_data = (
        batting_rows[batting_rows['player'] == batsman]
        .groupby('opposition')['runs']
        .sum()
        .reset_index()
        .sort_values(by='runs')
    )

    plt.figure(figsize=(10, 5))
    ax = sns.barplot(data=batsman_data, x='runs', y='opposition', palette='muted')
    ax.set_title(f'{batsman} Performance Against Different Oppositions - Total Runs')
    ax.set_xlabel('Total Runs')
    ax.set_ylabel('Opposition')
    plt.show()


The above plots show the performance of each batsman against different oppositions in terms of the total runs scored by the player against that opposition. By observing the plot for a particular batsman, we can conclude against which teams he scores well and against which teams he struggles to score.
If we look at the performance of Mohammad Rizwan, the legendary wicket-keeper batsman from Pakistan:

*   He struggled against Afghanistan, Bangladesh, South Africa and England.
*   He scored good runs against Sri Lanka, the Netherlands and India.



In [None]:
# One chart per team: mean economy on bowling rows against each opposition.
# Relies on `teams` already being defined by an earlier cell.
bowling_rows = df[df['bat_or_bowl'] == 'bowl']

for team in teams:
    team_data = (
        bowling_rows[bowling_rows['team'] == team]
        .groupby('opposition')[['runs', 'econ']]
        .mean()
        .reset_index()
        .sort_values(by='econ')
    )

    plt.figure(figsize=(10, 5))
    ax = sns.barplot(data=team_data, x='econ', y='opposition', palette='muted')
    ax.set_title(f'{team} Performance Against Different Oppositions - Average Economy')
    ax.set_xlabel('Average Economy')
    ax.set_ylabel('Opposition')
    plt.show()
    print(team_data)


The above charts present each team's bowling performance against different oppositions in terms of average economy rate. They compare the bowling of each team against the different oppositions, so we can conclude which team bowled well against which opponent.

So if we observe team India:

- Their bowling economy was low against Sri Lanka, South Africa and England, which is a sign of good bowling.

- Their bowling economy was high against Afghanistan, New Zealand, Bangladesh and the Netherlands, which is a sign of poor bowling.

- We can observe the performance of the rest of the teams in the same way by looking at the respective charts.

In [None]:
# One chart per bowler: mean economy against each opposition.
bowling_rows = df[df['bat_or_bowl'] == 'bowl']
bowlers = bowling_rows['player'].unique()

for bowler in bowlers:
    bowler_data = (
        bowling_rows[bowling_rows['player'] == bowler]
        .groupby('opposition')['econ']
        .mean()
        .reset_index()
        .sort_values(by='econ')
    )

    plt.figure(figsize=(10, 5))
    ax = sns.barplot(data=bowler_data, x='econ', y='opposition', palette='muted')
    ax.set_title(f'{bowler} Performance Against Different Oppositions - Average Economy')
    ax.set_xlabel('Average Economy')
    ax.set_ylabel('Opposition')
    plt.show()


The above plots show the bowling economy of each bowler against different oppositions. By observing them we can assess the performance of each bowler against each opposition.
If we observe the performance of Shaheen Shah Afridi from Pakistan:

- His economy was low against Bangladesh, South Africa and the Netherlands, which is a sign of a good bowling performance.

- His economy was high against England, Sri Lanka and New Zealand, which is a sign of a poor bowling performance.

- We can observe the performance of the rest of the players in the same way by looking at their economy rates against different oppositions.

In [None]:
# One chart per team: mean runs on batting rows at each ground.
teams = df['team'].unique()
batting_rows = df[df['bat_or_bowl'] == 'bat']

for team in teams:
    team_runs_data = (
        batting_rows[batting_rows['team'] == team]
        .groupby('ground')['runs']
        .mean()
        .reset_index()
        .sort_values(by='runs')
    )

    plt.figure(figsize=(10, 5))
    ax = sns.barplot(data=team_runs_data, x='runs', y='ground', palette='muted')
    ax.set_title(f"{team}'s Runs Variation Across Different Playing Grounds")
    ax.set_xlabel('Average Runs')
    ax.set_ylabel('Ground')
    plt.show()
    print(team_runs_data)


The above plots show the average runs scored by the players of each team across different playing grounds. By observing these graphs we can draw conclusions about the batting performance of teams at different locations.

If we observe the batting performance of team India at different locations, we can conclude that:

- Indian batters scored at a good average at Bengaluru, Delhi and Pune, with averages of 65, 64 and 51 respectively.

- Indian batters had their worst averages at Lucknow, Chennai and Dharamsala.

- In the same way we can observe the batting performance of other teams at different locations and conclude which grounds suit which team. It is a good analysis technique.

# **4. Temporal Analysis:**
- **Study performance trends over time, considering start dates and overs played.**
- **Identify any temporal patterns or changes in team and player performance.**

In [None]:
# One line chart per batsman: mean runs by match start date.
#
# 'start_date' is read from the CSV as text, so it is converted to real
# datetimes on a copy. Otherwise the dates would be grouped and plotted as
# strings: sorted alphabetically and spaced evenly regardless of the gap
# between matches.
# NOTE(review): confirm the date format in the CSV; pass format=... or
# dayfirst=True to pd.to_datetime if it is ambiguous.
batting_rows = df[df['bat_or_bowl'] == 'bat'].copy()
batting_rows['start_date'] = pd.to_datetime(batting_rows['start_date'])
batsmen = batting_rows['player'].unique()

for batsman in batsmen:
    batsman_data = (
        batting_rows[batting_rows['player'] == batsman]
        .groupby('start_date')
        .agg({'runs': 'mean'})
        .reset_index()
    )

    plt.figure(figsize=(12, 6))
    # marker='o' keeps a player with a single innings visible (a one-point line draws nothing).
    plt.plot(batsman_data['start_date'], batsman_data['runs'], marker='o', label='Average Runs')
    plt.title(f'{batsman} Batting Performance Trends Over Time (Player-wise)')
    plt.xlabel('Start Date')
    plt.ylabel('Average Runs')
    plt.xticks(rotation=45, ha='right')
    plt.legend()
    plt.tight_layout()
    plt.show()


The above line plots show the batting performance of each batsman over time. By observing these plots we can see how each player's performance changed with time.

In [None]:
# One line chart per team: mean runs on batting rows by match start date.
#
# 'start_date' is read from the CSV as text, so it is converted to real
# datetimes on a copy. Otherwise the dates would be grouped and plotted as
# strings: sorted alphabetically and spaced evenly regardless of the gap
# between matches.
# NOTE(review): confirm the date format in the CSV; pass format=... or
# dayfirst=True to pd.to_datetime if it is ambiguous.
teams = df['team'].unique()
batting_rows = df[df['bat_or_bowl'] == 'bat'].copy()
batting_rows['start_date'] = pd.to_datetime(batting_rows['start_date'])

for team in teams:
    team_data = (
        batting_rows[batting_rows['team'] == team]
        .groupby('start_date')
        .agg({'runs': 'mean'})
        .reset_index()
    )

    plt.figure(figsize=(12, 6))
    plt.plot(team_data['start_date'], team_data['runs'], marker='o', label='Average Runs')
    plt.title(f'{team} Batting Performance Trends Over Time (Team-wise)')
    plt.xlabel('Start Date')
    plt.ylabel('Average Runs')
    plt.xticks(rotation=45, ha='right')
    plt.legend()
    plt.tight_layout()
    plt.show()
    print(team_data)


The above line plots show the batting performance of each team over time. By observing these plots we can see how the teams' performances changed with time.

In [None]:
# One line chart per bowler: mean economy and total wickets by match start date.
#
# 'start_date' is read from the CSV as text, so it is converted to real
# datetimes on a copy. Otherwise the dates would be grouped and plotted as
# strings: sorted alphabetically and spaced evenly regardless of the gap
# between matches.
# NOTE(review): confirm the date format in the CSV; pass format=... or
# dayfirst=True to pd.to_datetime if it is ambiguous.
bowling_rows = df[df['bat_or_bowl'] == 'bowl'].copy()
bowling_rows['start_date'] = pd.to_datetime(bowling_rows['start_date'])
bowlers = bowling_rows['player'].unique()

for bowler in bowlers:
    bowler_data = (
        bowling_rows[bowling_rows['player'] == bowler]
        .groupby('start_date')
        .agg({'econ': 'mean', 'wkts': 'sum'})
        .reset_index()
    )

    plt.figure(figsize=(12, 6))
    # marker='o' keeps a bowler with a single innings visible (a one-point line draws nothing).
    plt.plot(bowler_data['start_date'], bowler_data['econ'], marker='o', label='Average Economy')
    plt.plot(bowler_data['start_date'], bowler_data['wkts'], marker='o', label='Total Wickets Taken')
    plt.title(f'{bowler} Bowling Performance Trends Over Time (Player-wise)')
    plt.xlabel('Start Date')
    plt.ylabel('Average Economy / Total Wickets Taken')
    plt.xticks(rotation=45, ha='right')
    plt.legend()
    plt.tight_layout()
    plt.show()


The above line plots show the bowling performance of each player over time in terms of wickets taken and average economy.
By observing them we can see how each player's bowling performance changed over time.

In [None]:
# One line chart per team: mean economy and bowling average by match start date.
# Relies on `teams` already being defined by an earlier cell.
#
# 'start_date' is read from the CSV as text, so it is converted to real
# datetimes on a copy. Otherwise the dates would be grouped and plotted as
# strings: sorted alphabetically and spaced evenly regardless of the gap
# between matches.
# NOTE(review): confirm the date format in the CSV; pass format=... or
# dayfirst=True to pd.to_datetime if it is ambiguous.
bowling_rows = df[df['bat_or_bowl'] == 'bowl'].copy()
bowling_rows['start_date'] = pd.to_datetime(bowling_rows['start_date'])

for team in teams:
    team_data = (
        bowling_rows[bowling_rows['team'] == team]
        .groupby('start_date')
        .agg({'econ': 'mean', 'runs': 'sum', 'wkts': 'sum'})
        .reset_index()
    )

    # Bowling average = runs conceded per wicket. A date with zero wickets has
    # no defined average, so it becomes NaN (a gap in the line) instead of inf.
    team_data['bowling_avg'] = team_data['runs'] / team_data['wkts'].replace(0, np.nan)

    plt.figure(figsize=(12, 6))
    plt.plot(team_data['start_date'], team_data['econ'], marker='o', label='Average Economy')
    plt.plot(team_data['start_date'], team_data['bowling_avg'], marker='o', label='Bowling Average')
    plt.title(f'{team} Bowling Performance Trends Over Time (Team-wise)')
    plt.xlabel('Start Date')
    plt.ylabel('Average Economy / Bowling Average')
    plt.xticks(rotation=45, ha='right')
    plt.legend()
    plt.tight_layout()
    plt.show()
    print(team_data)


The above line plots show the bowling performance of each team over time in terms of average economy and bowling average.
By observing them we can see how the teams' bowling performances changed over time.