In [7]:
import json
import pandas as pd
import numpy as np
import glob
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Step 1: Load JSON files and preprocess data
file_paths = glob.glob("*.json")  # Adjust if necessary
data_list = []

for file_path in file_paths:
    with open(file_path) as file:
        data = json.load(file)

    # Process each JSON and extract key features for each player
    for inning in data["innings"]:
        for over_data in inning["overs"]:
            for delivery in over_data["deliveries"]:
                batter = delivery["batter"]
                bowler = delivery["bowler"]

                # Collect batting stats
                runs = delivery["runs"]["batter"]
                total_runs = delivery["runs"]["total"]
                is_boundary = 1 if runs == 4 else 0
                is_six = 1 if runs == 6 else 0
                wicket_info = delivery.get("wicket", None)

                # Collect bowling stats
                bowler_runs_conceded = total_runs
                is_wicket = 1 if wicket_info else 0
                wicket_type = wicket_info["kind"] if wicket_info else None

                data_list.append({
                    "batter": batter,
                    "bowler": bowler,
                    "runs_scored": runs,
                    "total_runs": total_runs,
                    "is_boundary": is_boundary,
                    "is_six": is_six,
                    "is_wicket": is_wicket,
                    "wicket_type": wicket_type,
                    "bowler_runs_conceded": bowler_runs_conceded,
                    "over": over_data["over"],
                })

# Convert to DataFrame
df = pd.DataFrame(data_list)
df

Unnamed: 0,batter,bowler,runs_scored,total_runs,is_boundary,is_six,is_wicket,wicket_type,bowler_runs_conceded,over
0,SC Ganguly,P Kumar,0,1,0,0,0,,1,0
1,BB McCullum,P Kumar,0,0,0,0,0,,0,0
2,BB McCullum,P Kumar,0,1,0,0,0,,1,0
3,BB McCullum,P Kumar,0,0,0,0,0,,0,0
4,BB McCullum,P Kumar,0,0,0,0,0,,0,0
...,...,...,...,...,...,...,...,...,...,...
19353,SK Raina,P Kumar,0,0,0,0,0,,0,19
19354,SK Raina,P Kumar,1,1,0,0,0,,1,19
19355,M Ntini,P Kumar,0,0,0,0,0,,0,19
19356,M Ntini,P Kumar,0,0,0,0,0,,0,19


In [8]:
# Aggregate features by player
player_stats = df.groupby("batter").agg(
    total_runs_scored=('runs_scored', 'sum'),
    boundaries=('is_boundary', 'sum'),
    sixes=('is_six', 'sum'),
    balls_faced=('runs_scored', 'count')
).reset_index()

bowler_stats = df.groupby("bowler").agg(
    runs_conceded=('bowler_runs_conceded', 'sum'),
    wickets_taken=('is_wicket', 'sum'),
    balls_bowled=('bowler_runs_conceded', 'count')
).reset_index()

# Merge into a single DataFrame
player_stats_df = player_stats.merge(bowler_stats, left_on="batter", right_on="bowler", how="outer").fillna(0)

# Calculate additional metrics like Strike Rate, Economy Rate
player_stats_df['strike_rate'] = player_stats_df.apply(lambda x: (x['total_runs_scored'] / x['balls_faced']) * 100 if x['balls_faced'] > 0 else 0, axis=1)
player_stats_df['economy_rate'] = player_stats_df.apply(lambda x: (x['runs_conceded'] / (x['balls_bowled'] / 6)) if x['balls_bowled'] > 0 else 0, axis=1)

player_stats_df

Unnamed: 0,batter,total_runs_scored,boundaries,sixes,balls_faced,bowler,runs_conceded,wickets_taken,balls_bowled,strike_rate,economy_rate
0,A Chopra,53.0,7.0,0.0,75.0,0,0.0,0.0,0.0,70.666667,0.000000
1,A Flintoff,62.0,5.0,2.0,57.0,A Flintoff,106.0,0.0,66.0,108.771930,9.636364
2,A Kumble,21.0,2.0,0.0,30.0,A Kumble,468.0,0.0,400.0,70.000000,7.020000
3,A Mishra,48.0,4.0,1.0,54.0,A Mishra,251.0,0.0,217.0,88.888889,6.940092
4,A Mukund,0.0,0.0,0.0,1.0,0,0.0,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
197,YV Takawale,70.0,7.0,2.0,69.0,0,0.0,0.0,0.0,101.449275,0.000000
198,Yashpal Singh,28.0,3.0,0.0,26.0,0,0.0,0.0,0.0,107.692308,0.000000
199,Younis Khan,3.0,0.0,0.0,7.0,0,0.0,0.0,0.0,42.857143,0.000000
200,Yuvraj Singh,445.0,34.0,28.0,308.0,Yuvraj Singh,144.0,0.0,111.0,144.480519,7.783784


In [3]:
# Selecting features and target
features = ["total_runs_scored", "boundaries", "sixes", "balls_faced", "wickets_taken", "strike_rate", "economy_rate"]
target = "total_runs_scored"  # Adjust based on what you want to predict, e.g., "wickets_taken"

X = player_stats_df[features].dropna()
y = player_stats_df[target].dropna()

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training with Random Forest
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Model evaluation
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


Mean Squared Error: 126.19600487804873


In [6]:
def predict_player_performance(json_file, model):
    with open('392231.json') as file:
        data = json.load(file)

    # Process JSON data and extract features
    runs_scored = 0
    boundaries = 0
    sixes = 0
    balls_faced = 0
    wickets_taken = 0
    balls_bowled = 0
    runs_conceded = 0

    for inning in data["innings"]:
        for over_data in inning["overs"]:
            for delivery in over_data["deliveries"]:
                runs = delivery["runs"]["batter"]
                batter = delivery["batter"]
                bowler = delivery["bowler"]

                runs_scored += runs
                balls_faced += 1
                if runs == 4:
                    boundaries += 1
                if runs == 6:
                    sixes += 1

                if "wicket" in delivery:
                    wickets_taken += 1
                runs_conceded += delivery["runs"]["total"]
                balls_bowled += 1

    # Calculate derived features
    strike_rate = (runs_scored / balls_faced * 100) if balls_faced > 0 else 0
    economy_rate = (runs_conceded / (balls_bowled / 6)) if balls_bowled > 0 else 0

    # Prepare feature array
    features = np.array([[runs_scored, boundaries, sixes, balls_faced, wickets_taken, strike_rate, economy_rate]])

    # Predict
    predicted_performance = model.predict(features)
    return predicted_performance

# Example usage
predicted_score = predict_player_performance("sample_match.json", model)
print(f"Predicted Player Performance Score: {predicted_score}")


Predicted Player Performance Score: [342.06]




In [21]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

# Assuming `player_stats_df` is already created with data from multiple matches

# Step 1: Add a column for match count in `player_stats_df`
# Calculate the number of matches each player has played
match_counts = player_stats_df.groupby('batter').size().reset_index(name='matches_played')

# Step 2: Aggregate performance metrics and include the match count
aggregated_stats_df = player_stats_df.groupby('batter').agg({
    'total_runs_scored': 'sum',
    'balls_faced': 'sum',
    'boundaries': 'sum',
    'sixes': 'sum',
    'wickets_taken': 'sum',
    'economy_rate': 'mean'
}).reset_index()

# Merge with match_counts to get the number of matches played for each player
aggregated_stats_df = pd.merge(aggregated_stats_df, match_counts, on='batter', how='left')

# Calculate average statistics by dividing the sums by the number of matches played
aggregated_stats_df['avg_runs_scored'] = aggregated_stats_df['total_runs_scored'] / aggregated_stats_df['matches_played']
aggregated_stats_df['avg_balls_faced'] = aggregated_stats_df['balls_faced'] / aggregated_stats_df['matches_played']
aggregated_stats_df['avg_boundaries'] = aggregated_stats_df['boundaries'] / aggregated_stats_df['matches_played']
aggregated_stats_df['avg_sixes'] = aggregated_stats_df['sixes'] / aggregated_stats_df['matches_played']

# Drop columns that won’t be used as features
aggregated_stats_df.drop(columns=['total_runs_scored', 'balls_faced', 'boundaries', 'sixes'], inplace=True)

# Define features and target variables
X = aggregated_stats_df.drop(columns=['batter', 'matches_played'])  # Drop non-numeric columns
y = aggregated_stats_df[['avg_runs_scored', 'avg_balls_faced', 'avg_boundaries', 'avg_sixes']]

# Step 3: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Define and train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 5: Predict and evaluate on the test set
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Step 6: Make predictions for the next match for all players based on historical averages
next_match_predictions = model.predict(X)
aggregated_stats_df['predicted_runs_scored'] = next_match_predictions[:, 0]
aggregated_stats_df['predicted_balls_faced'] = next_match_predictions[:, 1]
aggregated_stats_df['predicted_boundaries'] = next_match_predictions[:, 2]
aggregated_stats_df['predicted_sixes'] = next_match_predictions[:, 3]

# Step 7: Save the predictions to a CSV file
aggregated_stats_df[['batter', 'matches_played', 'avg_runs_scored', 'avg_balls_faced', 'avg_boundaries', 'avg_sixes',
                     'predicted_runs_scored', 'predicted_balls_faced', 'predicted_boundaries', 'predicted_sixes']].to_csv('player_performance_predictions.csv', index=False)

print("Predictions saved to 'player_performance_predictions.csv'")


Mean Squared Error: 32.036100657894735
Predictions saved to 'player_performance_predictions.csv'
