# Imports

In [None]:
import pandas as pd
import numpy as np

from pandas.plotting import scatter_matrix

import matplotlib.pyplot as plt
from pylab import rcParams

In [None]:
%matplotlib inline
# Default size (width, height) for the matplotlib figures drawn in this notebook
rcParams['figure.figsize'] = 5, 4

In [None]:
import seaborn as sb
# Apply seaborn's 'whitegrid' theme to the plots drawn after this point
sb.set_style('whitegrid')

# Getting Dataframe

In [None]:
# Absolute path to the engineered ball-by-ball CSV on the author's machine;
# it has to be changed before the notebook can run anywhere else.
path = r'C:\Users\Mohit\Desktop\ML-Cricket-Integration-Project\Data\Engineered_Balls_Data.csv'
balls_info = pd.read_csv(path)

# Data Filtering

In [None]:
# Preview the first rows of the freshly loaded data
balls_info.head()

In [None]:
# 'Unnamed: 0' is a column (not a row) that carries no useful information,
# so it is removed before any further processing.
# presumably it is the row index written out when the CSV was saved - TODO confirm
balls_info = balls_info.drop('Unnamed: 0', axis = 1)
balls_info.head()

In [None]:
# Column dtypes and non-null counts, used to spot columns with missing values
balls_info.info()

In [None]:
# Want to remove the data where bowling style is null (in this case, the speed will be -1).
# Note: dropna() without arguments removes every row that has a null in ANY column,
# not only rows where 'Bowling Style' is null.
balls_info.dropna(inplace = True)

In [None]:
# Sanity check: per-column null counts, which should all be zero after the dropna above
balls_info.isnull().sum()

In [None]:
# Number of rows whose Speed holds the -1 "not found" placeholder
int((balls_info['Speed'] == -1).sum())

In [None]:
# Drop every row whose speed was not found (stored as -1)
speed_not_found = balls_info['Speed'] == -1
balls_info.drop(index = balls_info.index[speed_not_found], inplace = True)

In [None]:
# Re-count the -1 placeholders to confirm the drop above removed them all
int((balls_info['Speed'] == -1).sum())

In [None]:
# Summary statistics: the minimum Speed is 0, which is not a plausible delivery speed
balls_info.describe()

In [None]:
# Drop the rows that recorded a speed of exactly 0
speed_is_zero = balls_info['Speed'] == 0
balls_info.drop(index = balls_info.index[speed_is_zero], inplace = True)

In [None]:
# The minimum Speed is now 1, which is still not a plausible delivery speed
balls_info.describe()

In [None]:
# Drop the rows that recorded a speed of exactly 1 (the implausible minimum seen above)
speed_is_one = balls_info['Speed'] == 1
balls_info.drop(index = balls_info.index[speed_is_one], inplace = True)

In [None]:
# Final check of the summary statistics after removing the -1, 0 and 1 speed rows
balls_info.describe()

In [None]:
# The row drops above left gaps in the row labels, so renumber them 0..n-1.
# The later pd.concat(..., axis = 1) calls align on these labels, which is why
# they must match the default index of the freshly built dummy frames.
# reset_index(drop = True) is the built-in way to do this: it discards the old
# labels rather than keeping them as a new column.
balls_info.reset_index(drop = True, inplace = True)

# Data Exploration

In [None]:
# List the columns that are available for exploration
balls_info.columns

In [None]:
# Percentage of rows whose Speed is the -1 "not available" placeholder.
# NOTE(review): the Speed == -1 rows were already dropped in the filtering
# section, so when the notebook is run top to bottom this evaluates to 0.
na_speed = int((balls_info['Speed'] == -1).sum())
na_speed / len(balls_info) * 100

In [None]:
# Build the wickets dataframe: only the deliveries on which a wicket fell
# (Out == 1), renumbered from 0. reset_index(drop = True) returns a new frame
# with a fresh 0..n-1 index, so no index is assigned onto a filtered result
# and balls_info itself is left untouched.
wicket_info = balls_info[balls_info['Out'] == 1].reset_index(drop = True)
wicket_info

In [None]:
# Percentage of wicket deliveries whose Speed is the -1 "not available" placeholder.
# NOTE(review): wicket_info is derived from balls_info after the Speed == -1
# rows were dropped, so when run top to bottom this evaluates to 0.
wicket_na_speed = int((wicket_info['Speed'] == -1).sum())
wicket_na_speed / len(wicket_info) * 100

In [None]:
# Is there a specific line on which Virat Kohli gets out a lot?
# Plots the distribution of 'Line' over the wicket deliveries only.
# Note: this cell uses sb.displot while the Length and Speed cells use sb.histplot.
line = wicket_info['Line']
sb.displot(line)

In [None]:
# Is there a specific length on which Virat Kohli gets out a lot?
# Histogram of 'Length' over the wicket deliveries only.
length = wicket_info['Length']
sb.histplot(length)

In [None]:
# Do faster balls get Virat Kohli out more often?
# Histogram of 'Speed' over the wicket deliveries only; it shows how the
# dismissals are spread across speeds, not the dismissal rate at each speed.
speed = wicket_info['Speed']
sb.histplot(speed)

# Logistic Regression

In [None]:
import sklearn
from sklearn import preprocessing

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Re-check dtypes and non-null counts of the cleaned frame before preparing it for the model
balls_info.info()

## Checking the Conditions for Logistic Regression

### Checking that Target Variable is Binary

In [None]:
# Bar chart of how many rows fall into each value of the target column 'Out'
sb.countplot(x = 'Out', data = balls_info, palette = 'hls')

### Checking and Taking Care of Missing Values

In [None]:
# Per-column null counts; all are expected to be zero after the filtering section
balls_info.isnull().sum()

In [None]:
# Summary statistics of the numeric columns as a final plausibility check
balls_info.describe()

### Converting Categorical Variables to Dummy Indicators

In [None]:
# 'Format' has a small, fixed set of categories and is not ordinal,
# so One Hot Encoding is appropriate here
balls_info['Format'].unique()

In [None]:
# There are too many distinct values, so One Hot Encoding is avoided here
# because it would increase memory usage and computation time
balls_info['Game Location'].unique()

In [None]:
# One Hot Encoding is fine here: there are not too many categories
# and the data is not ordinal
balls_info['Line'].unique()

In [None]:
# One Hot Encoding works here as well: the data is not ordinal and there are only 4 categories
balls_info['Length'].unique()

In [None]:
# Too many distinct values, so One Hot Encoding is avoided for this column
balls_info['Location'].unique()

In [None]:
# Again too many distinct values for One Hot Encoding to be useful
balls_info['Bowling Style'].unique()

In [None]:
# Only a few distinct values, but they could be ordinal, so One Hot Encoding is not used
balls_info['Bounce'].unique()

In [None]:
# Both encoders are shared by all the cells below and are re-fitted for every
# column, so label_encoder.classes_ always describes the most recently encoded column.
label_encoder = LabelEncoder()
binary_encoder = OneHotEncoder(categories = 'auto')

In [None]:
# One-hot encode 'Format': LabelEncoder maps each format to an integer code,
# then OneHotEncoder expands the codes into one indicator column per code.
format_categorical = balls_info['Format']
format_encoded = label_encoder.fit_transform(format_categorical)

# OneHotEncoder expects a 2-D input, hence the added column axis
format_1hot = binary_encoder.fit_transform(format_encoded[:, np.newaxis])
format_1hot_matrix = format_1hot.toarray()

# assumes exactly three formats whose encoded order is ODI, T20, Test -
# TODO confirm against label_encoder.classes_
format_column_names = ['ODI', 'T20', 'Test']
format_df = pd.DataFrame(format_1hot_matrix, columns = format_column_names)
format_df.head()

In [None]:
# Label-encode 'Game Location': every distinct location becomes one integer code.
# The first ten codes are displayed as a quick check.
game_location_categorical = balls_info['Game Location']
game_location_encoded = label_encoder.fit_transform(game_location_categorical)
game_location_encoded[0:10]

In [None]:
# Position i in this array is the Game Location that was encoded as the integer i
label_encoder.classes_

In [None]:
# One-hot encode 'Line': integer codes first, then one indicator column per code.
line_categorical = balls_info['Line']
line_encoded = label_encoder.fit_transform(line_categorical)

# OneHotEncoder expects a 2-D input, hence the added column axis
line_1hot = binary_encoder.fit_transform(line_encoded[:, np.newaxis])
line_1hot_matrix = line_1hot.toarray()

# assumes exactly these six lines, in this encoded order -
# TODO confirm against label_encoder.classes_
line_column_names = [
    'Line: Leg Stump',
    'Line: Legside',
    'Line: Middle Stump',
    'Line: Off Stump',
    'Line: Offside',
    'Line: Wide Outside Off',
]
line_df = pd.DataFrame(line_1hot_matrix, columns = line_column_names)
line_df.head()

In [None]:
# One-hot encode 'Length': integer codes first, then one indicator column per code.
length_categorical = balls_info['Length']
length_encoded = label_encoder.fit_transform(length_categorical)

# OneHotEncoder expects a 2-D input, hence the added column axis
length_1hot = binary_encoder.fit_transform(length_encoded[:, np.newaxis])
length_1hot_matrix = length_1hot.toarray()

# assumes exactly these four lengths, in this encoded order -
# TODO confirm against label_encoder.classes_
length_column_names = [
    'Length: Back of a Length',
    'Length: Full',
    'Length: Good',
    'Length: Short',
]
length_df = pd.DataFrame(length_1hot_matrix, columns = length_column_names)
length_df.head()

In [None]:
# Label-encode 'Location': every distinct location becomes one integer code.
# NOTE(review): location_encoded is not added back to balls_info in the
# reassembly cell further down - confirm whether that omission is intended.
location_categorical = balls_info['Location']
location_encoded = label_encoder.fit_transform(location_categorical)
location_encoded[0:10]

In [None]:
# Position i in this array is the Location that was encoded as the integer i
label_encoder.classes_

In [None]:
# Label-encode 'Bowling Style': every distinct style becomes one integer code.
# The first ten codes are displayed as a quick check.
bowling_style_categorical = balls_info['Bowling Style']
bowling_style_encoded = label_encoder.fit_transform(bowling_style_categorical)
bowling_style_encoded[0:10]

In [None]:
# Position i in this array is the Bowling Style that was encoded as the integer i
label_encoder.classes_

In [None]:
# Label-encode 'Bounce': every distinct bounce value becomes one integer code.
# The first ten codes are displayed as a quick check.
bounce_categorical = balls_info['Bounce']
bounce_encoded = label_encoder.fit_transform(bounce_categorical)
bounce_encoded[0:10]

In [None]:
# Position i in this array is the Bounce value that was encoded as the integer i
label_encoder.classes_

In [None]:
# Keep the Speed values in a plain list so the column can be dropped and later
# re-added at the desired position in balls_info.
# This rebinds the name `speed` that the exploration section used for the wicket speeds.
speed = balls_info['Speed'].to_list()
speed[0:10]

In [None]:
# Keep the Out values in a plain list so the column can be dropped and later
# re-added as the last column of balls_info
out = balls_info['Out'].to_list()
out[0:10]

In [None]:
# Remove every column that is rebuilt or re-positioned in the next cell:
# the raw categorical columns (their encoded versions were computed above)
# plus Speed and Out, which are dropped only so that the column order can be
# chosen when they are added back.
# A single drop call is used so that a missing column name raises before
# anything has been removed, instead of leaving the frame half-modified.
balls_info.drop(
    columns = [
        'Format',
        'Game Location',
        'Line',
        'Length',
        'Location',
        'Bowling Style',
        'Bounce',
        'Speed',
        'Out',
    ],
    inplace = True,
)

In [None]:
# Re-assemble balls_info so the columns end up in this order: format dummies,
# Game Location, Bowling Style, Speed, length dummies, line dummies, Bounce, Out.
# NOTE(review): location_encoded is computed earlier but is not added here -
# confirm that leaving 'Location' out of the features is intended.
encoded_columns = pd.DataFrame(
    {
        'Game Location': game_location_encoded,
        'Bowling Style': bowling_style_encoded,
        'Speed': speed,
    },
    index = balls_info.index,
)
feature_parts = [balls_info, format_df, encoded_columns, length_df, line_df]
balls_info = pd.concat(feature_parts, axis = 1)
balls_info['Bounce'] = bounce_encoded
balls_info['Out'] = out
balls_info

### Checking for Independence Between Features

In [None]:
# Heatmap of the pairwise correlations between all columns, used to look for dependent features
sb.heatmap(balls_info.corr())

In [None]:
# Reading the heatmap:
# - ODI and Test are highly correlated, but both are dummies of the same original feature, so this is expected
# - Line: Middle Stump and Line: Offside are moderately correlated, again dummies of the same original feature
# - Length: Full and Length: Good are highly correlated, again dummies of the same original feature
# - Bounce and Length: Short are correlated; Bounce is the less important of the two, so it is removed
balls_info.drop(columns = ['Bounce'], inplace = True)

### Checking if the Dataset Size is Sufficient

In [None]:
# Rule of thumb used here: logistic regression needs at least 50 rows per predictive feature
balls_info.head() # 16 features (the target column 'Out' is not counted)

In [None]:
# 16 features * 50 = 800 rows are needed; compare with the entry count reported below
balls_info.info()

# Saving the Prepared Dataframe as CSV for ML

In [None]:
# Write the prepared frame to the current working directory.
# index = True is the pandas default, spelled out here: the row index is saved
# too, so the file starts with an extra unnamed column.
balls_info.to_csv('Prepared_for_ML_Data.csv', index = True)