In [None]:
!pip install pandas scikit-learn



Import Necessary Libraries


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
import pickle

Load and Clean Data

In [None]:
# Load both raw CSV files from the working directory.
# The previous version of this cell downloaded matches.csv through a pre-signed
# cloud-storage URL. Such links embed a signature and an expiry window, so they
# stop working shortly after being generated and should not be committed.
# Download matches.csv next to this notebook (alongside deliveries.csv) instead.
matches = pd.read_csv('matches.csv')
deliveries = pd.read_csv('deliveries.csv')

# Clean data
# Rows without a recorded winner are removed from matches.
matches = matches.dropna(subset=['winner'])
# Missing entries in these delivery columns become the literal string 'None'.
# Reassigning (instead of inplace=True) keeps the cell safe to re-run.
deliveries = deliveries.fillna(
    {'extras_type': 'None', 'dismissal_kind': 'None', 'fielder': 'None'}
)

Feature Engineering

In [None]:
# Per-batter totals over every delivery row: the sum of batsman_runs and the
# number of rows recorded for that batter (stored as balls_faced).
player_stats = (
    deliveries.groupby('batter')
    .agg({'batsman_runs': 'sum', 'ball': 'count'})
    .rename(columns={'batsman_runs': 'total_runs', 'ball': 'balls_faced'})
)

# Attach the stats of each match's player_of_match. Any stat columns left over
# from an earlier run of this cell are dropped first; otherwise a second run
# would create total_runs_x / total_runs_y and break later column selection.
# NOTE(review): player_of_match looks like information that is only known once
# a match has finished — confirm these features are usable at prediction time.
stat_columns = list(player_stats.columns)
matches = matches.drop(columns=stat_columns, errors='ignore').merge(
    player_stats,
    left_on='player_of_match',
    right_index=True,
    how='left',  # keep every match, even when the player has no batting rows
)

Prepare Features and Labels

In [None]:
# Model inputs: three categorical columns (one-hot encoded below) followed by
# the two numeric player-stat columns. The target is the match winner.
categorical_cols = ['team1', 'team2', 'venue']
numeric_cols = ['total_runs', 'balls_faced']

labels = matches['winner']
features = pd.get_dummies(
    matches[categorical_cols + numeric_cols],
    columns=categorical_cols,
)

# Remember the encoded column names and their order before any later
# transformation replaces the DataFrame.
model_columns = list(features.columns)

Handle Missing Values

In [None]:
# Replace missing feature values with the median of their column.
# NOTE(review): the imputer is fitted on all rows, before the train/test split,
# and it is not saved alongside the model — confirm that is intended.
imputer = SimpleImputer(strategy='median')
imputer.fit(features)
features = imputer.transform(features)

Split Data

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

Train Model

In [None]:
# Train a random forest on the training split.
# random_state is fixed (to the same seed used for the train/test split) so
# that a fresh Restart & Run All rebuilds the same forest; without it each
# run produces a different model.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

Save Model and Columns

In [None]:
# Pickle the fitted model and the list of encoded feature column names into
# two separate files in the working directory.
for path, obj in (('model.pkl', model), ('model_columns.pkl', model_columns)):
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh)

Number of Data Rows Considered

In [None]:
# Report how many rows each DataFrame currently holds. For matches this is the
# count after rows with a missing winner were dropped in the load/clean cell.
matches_count, deliveries_count = len(matches), len(deliveries)

print(
    f"Number of rows in matches.csv: {matches_count}",
    f"Number of rows in deliveries.csv: {deliveries_count}",
    sep="\n",
)

Number of rows in matches.csv: 1090
Number of rows in deliveries.csv: 260920
