In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the ODI batting records from the CSV hosted on GitHub.
odi = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/Indupuvi/Chakradbi/main/ODI%20data.csv',
)

# Exploratory Data Analysis

In [4]:
# Dimensions of the raw frame as (rows, columns).
odi.shape

(2500, 15)

In [5]:
# Preview the first 20 rows of the raw data.
odi.head(20)

Unnamed: 0.1,Unnamed: 0,Player,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,Unnamed: 13
0,0,SR Tendulkar (INDIA),1989-2012,463,452,41,18426,200*,44.83,21367,86.23,49,96,20,
1,1,KC Sangakkara (Asia/ICC/SL),2000-2015,404,380,41,14234,169,41.98,18048,78.86,25,93,15,
2,2,RT Ponting (AUS/ICC),1995-2012,375,365,39,13704,164,42.03,17046,80.39,30,82,20,
3,3,ST Jayasuriya (Asia/SL),1989-2011,445,433,18,13430,189,32.36,14725,91.2,28,68,34,
4,4,DPMD Jayawardene (Asia/SL),1998-2015,448,418,39,12650,144,33.37,16020,78.96,19,77,28,
5,5,Inzamam-ul-Haq (Asia/PAK),1991-2007,378,350,53,11739,137*,39.52,15812,74.24,10,83,20,
6,6,V Kohli (INDIA),2008-2019,242,233,39,11609,183,59.84,12445,93.28,43,55,13,
7,7,JH Kallis (Afr/ICC/SA),1996-2014,328,314,53,11579,139,44.36,15885,72.89,17,86,17,
8,8,SC Ganguly (Asia/INDIA),1992-2007,311,300,23,11363,183,41.02,15416,73.7,22,72,16,
9,9,R Dravid (Asia/ICC/INDIA),1996-2011,344,318,40,10889,153,39.16,15284,71.24,12,83,13,


In [None]:
# Column dtypes and non-null counts of the raw frame.
odi.info()

In [None]:
# Summary statistics of the raw frame.
odi.describe()

In [None]:
# Missing data: which columns contain any nulls, and how many nulls each column has.
null_mask = odi.isnull()
odi.columns[null_mask.any()], null_mask.sum()

In [None]:
# Drop the 'Unnamed: 0' and 'Unnamed: 13' columns.
odi = odi.drop(columns=['Unnamed: 0', 'Unnamed: 13'])
odi.head()

In [None]:
# Split "Name (REGION)" at the first "(" into the player name and the raw region text.
# The name is stripped because the text before "(" ends with a space.
name_and_region = odi['Player'].str.split("(", n=1, expand=True)
odi['Player'] = name_and_region[0].str.strip()
odi['Region'] = name_and_region[1]
odi.head()

In [None]:
# Strip the trailing ")" from the region text.
# The vectorised .str accessor passes missing values through unchanged, whereas
# calling x.rstrip(...) on each element via map raises on None/NaN.
odi['Region'] = odi['Region'].str.rstrip(')')
odi.head(2)

In [None]:
# Break the "Span" career range at its first "-" into Start and End (both stay strings).
span_parts = odi['Span'].str.split("-", n=1, expand=True)
odi['Start'], odi['End'] = span_parts[0], span_parts[1]
odi.head(2)

In [None]:
# Peel the first "/"-separated token off Region into Region1; Region keeps the remainder.
region_parts = odi['Region'].str.split("/", n=1, expand=True)
odi['Region1'], odi['Region'] = region_parts[0], region_parts[1]
odi.head(2)

In [None]:
# Peel the next "/"-separated token off the remaining Region text into Region2;
# Region keeps whatever is left after it.
region_parts = odi['Region'].str.split("/", n=1, expand=True)
odi['Region2'], odi['Region'] = region_parts[0], region_parts[1]
odi.head(2)

In [None]:
# Distinct values in each of the three split region columns.
for column in ("Region1", "Region2", "Region"):
    print(f"{column} unique values are: ", odi[column].unique())

In [None]:
def new_co(odi):
    """Pick the country code for one player row from its split region fields.

    The row carries up to three "/"-separated region tokens in the order
    Region1, Region2, Region. A token counts as a country code when it is
    all upper-case and is not the literal "ICC".

    Parameters
    ----------
    odi : mapping-like row (e.g. a pandas Series from ``DataFrame.apply(axis=1)``)
        Must support ``odi['Region1']``, ``odi['Region2']`` and ``odi['Region']``.

    Returns
    -------
    object
        * ``"NA"`` when Region1 is missing (None, NaN or any non-string).
        * Region1 when it is upper-case and not "ICC"; Region2 when Region1 is "ICC".
        * Otherwise Region when it is an upper-case string other than "ICC".
        * Otherwise Region2, returned as stored (it may itself be missing).
    """
    row = odi  # the parameter is a single row, not the whole DataFrame

    # Missing fields can be None or NaN after str.split(expand=True); only real
    # strings are safe to call .isupper() on.
    region1 = row['Region1'] if isinstance(row['Region1'], str) else None
    region = row['Region'] if isinstance(row['Region'], str) else None
    region2 = row['Region2']

    if region1 is None:
        return "NA"

    if region1.isupper():
        return region2 if region1 == 'ICC' else region1

    # Compare by value: `is not "ICC"` tests object identity and is unreliable
    # for strings built at runtime.
    if region is not None and region.isupper() and region != 'ICC':
        return region

    # Region is missing, not a country code, or "ICC": fall back to Region2.
    return region2

In [None]:
# Derive each row's Final Region by applying new_co row by row, then show the frame.
final_region = odi.apply(new_co, axis=1)
odi['Final Region'] = final_region
odi

In [None]:
# Distinct values of the newly derived Final Region column.
odi["Final Region"].unique()

In [None]:
# Count the rows whose Final Region is null.
missing_final_region = odi["Final Region"].isnull().sum()
print(missing_final_region)

In [None]:
# Remove the intermediate Region1, Region2 and Region columns; Final Region is kept.
odi = odi.drop(columns=["Region1", "Region2", "Region"])
odi.head()

In [None]:
# Distinct Final Region values, re-checked after dropping the intermediate region columns.
odi["Final Region"].unique()

In [None]:
# Drop the rows with index labels 2148 and 949.
# NOTE(review): the notebook does not say why these two rows are removed, and the
# labels are tied to this exact CSV — confirm the reason and prefer a value-based filter.
odi = odi.drop(index=[2148, 949])
odi

In [None]:
# Distinct Final Region values after dropping rows 2148 and 949.
odi["Final Region"].unique()

In [None]:
# Keep only rows that have a Final Region.
# NOTE(review): the original comment says this removes East African Region players —
# confirm against the data that those are the only rows with a null Final Region.
# The explicit .copy() makes the result an independent frame, so the column
# assignments in later cells do not write into a slice of the previous frame.
odi = odi[odi['Final Region'].notna()].copy()

In [None]:
# Distinct Final Region values after removing rows with a null Final Region.
odi["Final Region"].unique()

In [None]:
# Number of rows still lacking a Final Region. It is printed because only the last
# expression of a cell is displayed; a bare filter expression here would be discarded.
print(odi["Final Region"].isnull().sum())

# Per-column count of missing values across the whole frame.
odi.isna().sum()

In [None]:
# Remove the "*" suffix from HS values (e.g. "200*" becomes "200").
# regex=False forces a literal match: "*" on its own is not a valid regular
# expression, and the default for `regex` differs between pandas versions.

odi["HS"] = odi["HS"].str.replace('*', '', regex=False)
odi.tail(50)

In [None]:
# Replace every cell whose value is exactly '-' with 0.

odi = odi.replace('-', 0)
odi.tail(20)

In [None]:
# Inspect the current dtypes of Runs, Ave, HS and SR before converting them.
odi.info()

In [None]:
# Cast Runs, Ave, HS and SR to float.
float_columns = ['Runs', 'Ave', 'HS', 'SR']
odi[float_columns] = odi[float_columns].astype('float')

In [None]:
# Re-check the dtypes after the float conversion.
odi.info()

In [None]:
# Per-region column means, showing the 10 regions with the highest mean matches played (Mat).
# numeric_only=True skips the text columns (Player, Span, Start, End); without it
# mean() raises a TypeError on them in pandas 2.x.
odi.groupby('Final Region').mean(numeric_only=True).sort_values('Mat').tail(10)

# Data Visualization

In [None]:
# Highest individual score (HS) recorded in each region.
# The frame is reduced to one value per region first: plotting every player row
# stacks all of a region's bars on the same x position, where only the tallest shows.
# sort=False keeps the regions in order of first appearance, as the raw plot did.
hs_by_region = odi.groupby('Final Region', sort=False)['HS'].max()

plt.figure(figsize=(20,15))
plt.bar(hs_by_region.index, hs_by_region.values, color='cyan', width=0.5, edgecolor='black')
plt.xlabel('Final Region')
plt.ylabel('Highest score (HS)')
plt.title('Highest individual ODI score by region')
plt.show()

In [None]:
# Pairwise relationships between the numeric columns, coloured by Final Region.
# pairplot is a figure-level function that builds its own figure, so no plt.figure()
# call precedes it — that would only emit an extra empty figure.
sns.pairplot(odi, hue='Final Region', palette='Set1')

In [None]:
# Box plot of highest score (HS) for each Final Region.
sns.catplot(
    data=odi,
    x='Final Region',
    y='HS',
    kind='box',
    palette='Set3',
    height=10,
    aspect=2,
)

In [None]:
# Kernel density estimate of highest score (HS), one curve per Final Region.
sns.displot(
    odi,
    x='HS',
    hue='Final Region',
    kind='kde',
    palette='Set1',
    height=10,
    aspect=2,
)

In [None]:
import plotly.express as px

# Animated scatter of HS against Player, one animation frame per career End year;
# marker size follows Runs and colour follows Final Region.
px.scatter(
    odi,
    x="HS",
    y="Player",
    animation_frame="End",
    animation_group="Final Region",
    size="Runs",
    color="Final Region",
    hover_name="Final Region",
    log_x=False,
    size_max=200,
    range_x=[10,250],
    range_y=[0,90],
    height=1000,
)

In [None]:
import plotly.express as px

# Animated scatter of HS against Player, one animation frame per career Start year;
# marker size follows Runs and colour follows Final Region.
px.scatter(
    odi,
    x="HS",
    y="Player",
    animation_frame="Start",
    animation_group="Final Region",
    size="Runs",
    color="Final Region",
    hover_name="Final Region",
    log_x=False,
    size_max=200,
    range_x=[10,250],
    range_y=[0,90],
    height=1000,
)