## ***IPL AUCTION SALARY PREDICTOR***

#### ***Importing necessary libraries***

In [1]:
import pandas as pd
import numpy as np
import re
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

#### ***Making a helper function to Parse Salary***

In [2]:
def parse_price(price_str):
    """Convert a raw auction price into crores of rupees.

    Accepted inputs:
      * strings mentioning "crore" (e.g. "10.75 crore", "₹2 Crore"): the
        numeric part is returned unchanged, since it is already in crores;
      * strings made of digits and commas, optionally prefixed with "₹"
        (e.g. "2,00,00,000"): read as rupees and converted to crores;
      * ints / floats (including NumPy scalars): read as rupees and
        converted to crores.

    Rupee amounts are divided by 1e7 (one crore) and rounded to 2 decimals.

    Returns:
        float: the price in crores, or ``np.nan`` when the value cannot be
        parsed (unrecognised text, empty string, ``None``, booleans, other
        types). The function never returns ``None`` and never raises for
        bad input, so it is safe to use with ``Series.apply`` followed by
        ``dropna``.
    """
    try:
        if isinstance(price_str, str):
            # Drop the currency symbol up front so it cannot defeat either branch.
            text = price_str.lower().replace('₹', '').strip()
            if 'crore' in text:
                # Remove the unit (plural first) and thousands separators.
                number = text.replace('crores', '').replace('crore', '').replace(',', '').strip()
                return float(number)
            if re.match(r'^[\d,]+$', text):
                return round(int(text.replace(',', '')) / 1e7, 2)
        elif isinstance(price_str, (int, float, np.integer, np.floating)) and not isinstance(price_str, bool):
            # bool is a subclass of int but is never a meaningful price.
            return round(float(price_str) / 1e7, 2)
    except (ValueError, TypeError, OverflowError):
        # Malformed numeric text: fall through to the NaN sentinel below.
        pass
    # Single exit for every unparseable input.
    return np.nan

#### ***Loading Auction Data***

In [3]:
# Load the auction sheet, derive a normalised player key and a price in crores,
# discard rows whose price could not be parsed, and keep only the three columns
# needed for joining and as the regression target.
auction_df = (
    pd.read_csv("ipl_auction_data_2018_2025.csv")
    .assign(**{
        # Lower-cased, whitespace-trimmed name used as the join key.
        'Player': lambda frame: frame['Player Name'].astype(str).str.strip().str.lower(),
        'Price(Cr)': lambda frame: frame['Price'].apply(parse_price),
    })
    .dropna(subset=['Price(Cr)'])
    .loc[:, ['Player', 'Year', 'Price(Cr)']]
)

#### ***Loop over the 2018-2025 performance stat files and append each season's batting and bowling data to their respective lists.***

In [4]:
# Columns to keep from each season's batting file. Names that are absent from a
# given file are skipped rather than raising.
# NOTE(review): 'BattingAveragesss' looks like a typo of 'BattingAverage'; it is
# kept as-is because the CSV schema is not visible here - confirm against the files.
bat_cols = [
    'Player', 'Year', 'Matches', 'Innings', 'Extras', 'TotalRuns', 'Balls',
    'Dotballs', 'StrikeRate', 'DBPercent', 'DBFreq', 'BdryFreq', 'BdryPercent', 'RPSS', 'ScoringBalls',
    'Ones', 'Twos', 'Threes', 'Fours', 'Sixes', 'Outs', 'NotOuts', 'BattingAveragesss', 'FiftyPlusRuns',
    'Centuries', 'DoubleCenturies', 'HighestScore', 'BattingAverage', 'Catches', 'Stumpings']

# Columns to keep from each season's bowling file (same skip-if-missing rule).
bowl_cols = [
    'Player', 'Year', 'Matches', 'Innings', 'LegalBallsBowled',
    'TotalRunsConceded', 'DotBallsBowled', 'DotBallPercent', 'ScoringBallsBowled', 'BowlingAverage',
    'StrikeRate', 'BowlingSR', 'BoundaryPercentage', 'BoundaryFrequency', 'EconomyRate', 'OversBowled',
    'Ones', 'Twos', 'Threes', 'Fours', 'Sixes', 'Wides', 'NoBalls', 'Byes', 'LegBye', 'Wickets',
    'InningsRuns', 'InningsWickets', 'MatchRuns', 'MatchWickets', 'Maidens', 'MaidenWickets',
    'FourWickets', 'FiveWickets', 'TenWickets']

batting_data = []
bowling_data = []

for year in range(2018, 2026):
    # Read both files for the season before touching either frame.
    bat = pd.read_csv(f"top_run_scorers_{year}.csv")
    bowl = pd.read_csv(f"most_wickets_{year}.csv")

    # Normalised player key plus the season tag, used later as join keys.
    bat = bat.assign(
        Player=bat['StrikerName'].astype(str).str.strip().str.lower(),
        Year=year,
    )
    bowl = bowl.assign(
        Player=bowl['BowlerName'].astype(str).str.strip().str.lower(),
        Year=year,
    )

    # Keep the whitelisted columns that this season's file actually has,
    # in whitelist order.
    bat_present = [name for name in bat_cols if name in bat.columns]
    bowl_present = [name for name in bowl_cols if name in bowl.columns]

    batting_data.append(bat.loc[:, bat_present].copy())
    bowling_data.append(bowl.loc[:, bowl_present].copy())

# Stack the per-season frames into one batting table and one bowling table.
batting_df = pd.concat(batting_data, ignore_index=True)
bowling_df = pd.concat(bowling_data, ignore_index=True)

#### ***Combine and merge the performance data of all years***

In [5]:
# Outer-join batting and bowling rows per (Player, Year) so that a player who
# appears in only one of the two tables is still kept; stat names shared by both
# tables get the '_bat' / '_bowl' suffixes.
merged_stats = pd.merge(
    batting_df,
    bowling_df,
    on=['Player', 'Year'],
    how='outer',
    suffixes=('_bat', '_bowl'),
)
# '-' is treated as a missing-value placeholder.
merged_stats = merged_stats.replace('-', pd.NA)

# Every column other than the join keys is coerced to numeric; unparseable
# entries become NaN.
stat_columns = [name for name in merged_stats.columns if name not in ('Player', 'Year')]
numeric_stats = {
    name: pd.to_numeric(merged_stats[name], errors='coerce')
    for name in stat_columns
}

# Remaining gaps (including the side a player is absent from) are filled with 0.
performance_df = merged_stats.assign(**numeric_stats).fillna(0)

#### ***Merge all Performance data with Auction data***

In [6]:
# Keep only (Player, Year) pairs present in both the performance table and the
# auction table; unmatched rows on either side are dropped by the inner join.
# NOTE(review): this pairs a season's stats with the auction price of the same
# Year, whereas the prediction step applies the model to 2025 stats and labels
# the output as 2026 salaries - confirm this pairing is intended.
model_data = performance_df.merge(auction_df, on=['Player', 'Year'], how='inner')

#### ***Train on 2018-2024 data and set aside 2025 performance data for predicting 2026 salaries***

In [7]:
# Rows from seasons before 2025 (with a known auction price) form the training
# pool; the 2025 performance rows are set aside for the later prediction step.
is_training_season = model_data['Year'] < 2025
train_data = model_data.loc[is_training_season]
predict_data = performance_df.loc[performance_df['Year'] == 2025].copy()

# Features are every column except the identifiers and the target.
non_feature_columns = ['Player', 'Year', 'Price(Cr)']
X = train_data.drop(non_feature_columns, axis=1)
y = train_data['Price(Cr)']

# Random 80/20 row-wise hold-out with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=150, random_state=42).fit(X_train, y_train)

# Score the model on the held-out rows.
y_pred = model.predict(X_val)
validation_mae = mean_absolute_error(y_val, y_pred)
validation_r2 = r2_score(y_val, y_pred)
print("Validation MAE:", round(validation_mae, 3))
print("Validation R² Score:", round(validation_r2, 3))

Validation MAE: 3.083
Validation R² Score: 0.24


#### ***Predict 2026 salaries***

In [8]:
# Defensive re-clean of the 2025 rows: '-' -> missing, stats -> numeric, gaps -> 0.
# performance_df already went through the same steps when it was built, so this
# pass only matters if predict_data was changed in between.
feature_names = [name for name in predict_data.columns if name not in ('Player', 'Year')]
predict_data = predict_data.replace('-', pd.NA)
predict_data = predict_data.assign(**{
    name: pd.to_numeric(predict_data[name], errors='coerce')
    for name in feature_names
})
predict_data = predict_data.fillna(0)

# Same feature layout as used for training: everything but the identifiers.
X_2026 = predict_data.drop(['Player', 'Year'], axis=1)
predictions = model.predict(X_2026)

# One row per player with the predicted price (crores, 2 decimals), highest first.
result_2026 = (
    predict_data[['Player']]
    .assign(Predicted_Salary_2026_Cr=predictions.round(2))
    .sort_values(by='Predicted_Salary_2026_Cr', ascending=False)
)

#### ***Saving Output as csv file***

In [11]:
# Write the ranked predictions to disk without the DataFrame index, then confirm.
output_csv = "predicted_ipl_auction_salaries_2026.csv"
result_2026.to_csv(output_csv, index=False)
print("Saved: predicted_ipl_auction_salaries_2026.csv")

Saved: predicted_ipl_auction_salaries_2026.csv
