In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import rankdata
import json
import os
from tqdm import tqdm
from collections import defaultdict

In [103]:
countries = ['England', 'Australia', 'Afghanistan', 'India', 'New Zealand', 'South Africa', 'West Indies', 'Bangladesh']
top_matches = []
scorecards = []

In [104]:
players = pd.read_csv('players_data_with_all_info.csv')
players = players[players['gender'] == 'm']
players = players[players['country_name'].isin(countries)]
players = players[['firstname', 'lastname', 'fullname', 'battingstyle', 'bowlingstyle', 'position', 'country_name']]
players.drop_duplicates(subset='fullname', inplace=True, keep='last')
players.head()

Unnamed: 0,firstname,lastname,fullname,battingstyle,bowlingstyle,position,country_name
20,Tim,Paine,Tim Paine,right-hand-bat,right-arm-fast-medium,Wicketkeeper,Australia
21,Ashton,Agar,Ashton Agar,left-hand-bat,slow-left-arm-orthodox,Bowler,Australia
22,Brendan,Doggett,Brendan Doggett,right-hand-bat,right-arm-fast-medium,Bowler,Australia
23,Aaron,Finch,Aaron Finch,right-hand-bat,slow-left-arm-orthodox,Batsman,Australia
24,Travis,Head,Travis Head,left-hand-bat,right-arm-offbreak,Middle Order Batter,Australia


In [105]:
folder_path = 't20s_male_json'
json_files = os.listdir(folder_path)

In [106]:
def process_match_data(data):
    player_stats = defaultdict(lambda: {
        'team': '',
        'player_name': '',
        'runs_scored': 0,
        'wickets_taken': 0,
        'balls_faced': 0,
        'runs_conceded': 0,
        'balls_bowled': 0,
        'fours_hit': 0,
        'sixes_hit': 0,
        'dot_balls_bowled': 0,
        'death_overs_runs_conceded': 0,
        'death_overs_balls_bowled': 0,
        'out': 0
    })
    
    for innings_idx, innings in enumerate(data['innings']):
        team_name = innings['team']

        for over_idx, over_data in enumerate(innings['overs']):
            over_num = over_data['over']
            is_death_over = over_num >= 16 and over_num <= 20
            
            for delivery in over_data['deliveries']:
                batter = delivery['batter']
                bowler = delivery['bowler']
                non_striker = delivery['non_striker']
                runs = delivery.get('runs', {})
                batter_runs = runs.get('batter', 0)
                extras = runs.get('extras', 0)
                total_runs = runs.get('total', 0)
                player_stats[bowler]['team'] = get_opposing_team(team_name, data)
                player_stats[bowler]['player_name'] = bowler
                
                if extras == 0:
                    player_stats[bowler]['balls_bowled'] += 1
                    if is_death_over:
                        player_stats[bowler]['death_overs_balls_bowled'] += 1
                
                player_stats[bowler]['runs_conceded'] += total_runs
                if is_death_over:
                    player_stats[bowler]['death_overs_runs_conceded'] += total_runs
                
                player_stats[batter]['team'] = team_name
                player_stats[batter]['player_name'] = batter
                player_stats[batter]['balls_faced'] += 1
                player_stats[batter]['runs_scored'] += batter_runs
                player_stats[non_striker]['team'] = team_name
                player_stats[non_striker]['player_name'] = non_striker
                
                if total_runs == 0:
                    player_stats[bowler]['dot_balls_bowled'] += 1
                if batter_runs == 4:
                    player_stats[batter]['fours_hit'] += 1
                elif batter_runs == 6:
                    player_stats[batter]['sixes_hit'] += 1
                if 'wickets' in delivery:
                    for wicket in delivery['wickets']:
                        player_out = wicket['player_out']
                        player_stats[bowler]['wickets_taken'] += 1
                        player_stats[player_out]['out'] = 1
    df = pd.DataFrame(player_stats.values())
    return df

def get_opposing_team(batting_team, data):
    all_teams = [innings['team'] for innings in data['innings']]
    for team in all_teams:
        if team != batting_team:
            return team
    return "Unknown Team"

In [107]:
for match_file in tqdm(json_files):
    if '.json' in match_file:
        with open(os.path.join(folder_path, match_file), 'r') as f:
            content = f.read()
            data = json.loads(content)
            if set(data['info']['teams']).issubset(countries):
                top_matches.append(match_file) 
                scorecard = process_match_data(data)
                date = data['info']['dates'][0]
                year = int(date.split('-')[0])
                venue = data['info']['venue']
                scorecard['date'] = date
                scorecard['year'] = year
                scorecard['boundary_runs'] = 4 * scorecard['fours_hit'].fillna(0) + 6 * scorecard['sixes_hit'].fillna(0)
                scorecard['boundaries'] = scorecard['fours_hit'].fillna(0) + scorecard['sixes_hit'].fillna(0)
                scorecard['venue'] = venue
                scorecards.append(scorecard)

100%|██████████| 2604/2604 [00:02<00:00, 1300.87it/s]


In [108]:
standard_venues = {
    'M Chinnaswamy Stadium': 'M. Chinnaswamy Stadium',
    'M.Chinnaswamy Stadium': 'M. Chinnaswamy Stadium',
    'R Premadasa Stadium': 'R. Premadasa Stadium',
    'R.Premadasa Stadium': 'R. Premadasa Stadium',
    'Darren Sammy National Cricket Stadium': 'Daren Sammy National Cricket Stadium',
    'Daren Sammy National Cricket Stadium': 'Daren Sammy National Cricket Stadium',
    'Feroz Shah Kotla': 'Arun Jaitley Stadium'
}

In [109]:
def clean_venue(venue):
    if venue in standard_venues.keys():
        return standard_venues[venue]
    else:
        return venue

In [110]:
df = pd.concat(scorecards, ignore_index=True)
df['venue'] = df['venue'].str.split(',').str[0]
df['venue'] = df['venue'].apply(clean_venue)
df.drop(columns=['fours_hit', 'sixes_hit'], inplace=True)

In [111]:
stats_players = df['player_name'].unique().tolist()
player_names = players['fullname'].unique().tolist()
mappings_df = pd.read_csv('mappings.csv')
keys = mappings_df['short-name'].tolist()
values = mappings_df['common-name'].tolist()

In [112]:
mappings = {}

for key, val in zip(keys, values):
    mappings[key] = val
    
stats_players = [name for name in stats_players if name not in mappings.keys()]
player_names = [name for name in player_names if name not in mappings.values()]

In [113]:
change = [player for player in mappings.values() if player not in players['fullname'].tolist()]
change_keys = [k for k,v in mappings.items() if v in change]

df = df[~df['player_name'].isin(change_keys)]

In [114]:
def replace_name(name):
    return mappings[name]

In [115]:
df['player_name'] = df['player_name'].apply(replace_name)
df = df.merge(
    players[['fullname', 'battingstyle', 'bowlingstyle', 'position']],
    left_on='player_name',
    right_on='fullname',
    how='left'
)
df.drop(columns=['fullname'], inplace=True)

In [116]:
batting_metrics = ['runs_scored', 'balls_faced', 'out', 'boundary_runs', 'boundaries']
bowling_metrics = ['wickets_taken', 'runs_conceded', 'balls_bowled', 'dot_balls_bowled', 'death_overs_runs_conceded', 'death_overs_balls_bowled']

venues = df['venue'].unique()

venue_metrics = {}

for venue in venues:
    venue_metrics[venue] = {}
    for metric in batting_metrics:
        venue_metrics[venue][metric] = []
    for metric in bowling_metrics:
        venue_metrics[venue][metric] = []

In [117]:
for _, row in tqdm(df.iterrows()):
    if row['position'] in ['Bowler', 'Allrounder', 'Batting Allrounder','Bowling Allrounder']:
        for metric in bowling_metrics:
            if np.isnan(row[metric]):
                continue
            else:
                venue_metrics[row['venue']][metric].append(row[metric])
    elif row['position'] != 'Bowler':
        for metric in batting_metrics:
            if np.isnan(row[metric]):
                continue
            else:
                venue_metrics[row['venue']][metric].append(row[metric])

9614it [00:00, 23938.34it/s]


In [118]:
venue_averages = {}

for venue in tqdm(venue_metrics.keys()):
    avg_dict = {}
    total_runs = sum(venue_metrics[venue]['runs_scored'])
    balls_faced = sum(venue_metrics[venue]['balls_faced'])
    got_out = sum(venue_metrics[venue]['out'])
    total_wickets = sum(venue_metrics[venue]['wickets_taken'])
    balls_bowled = sum(venue_metrics[venue]['balls_bowled'])
    death_balls = sum(venue_metrics[venue]['death_overs_balls_bowled'])
    dot_balls = sum(venue_metrics[venue]['dot_balls_bowled'])
    runs_conceded = sum(venue_metrics[venue]['runs_conceded'])
    death_runs = sum(venue_metrics[venue]['death_overs_runs_conceded'])
    boundary_runs = sum(venue_metrics[venue]['boundary_runs'])
    avg_dict['average'] = total_runs / got_out
    avg_dict['balls_faced'] = balls_faced / len(venue_metrics[venue]['balls_faced'])
    avg_dict['boundary_runs'] = boundary_runs / len(venue_metrics[venue]['boundary_runs'])
    avg_dict['boundaries'] = sum(venue_metrics[venue]['boundaries']) / len(venue_metrics[venue]['boundaries'])
    avg_dict['wickets_taken'] = sum(venue_metrics[venue]['wickets_taken']) / len(venue_metrics[venue]['wickets_taken'])
    avg_dict['runs_conceded'] = runs_conceded / len(venue_metrics[venue]['runs_conceded'])
    avg_dict['balls_bowled'] = balls_bowled / len(venue_metrics[venue]['balls_bowled'])
    avg_dict['dot_balls_bowled'] = dot_balls / len(venue_metrics[venue]['dot_balls_bowled'])
    avg_dict['death_overs_runs_conceded'] = death_runs / len(venue_metrics[venue]['death_overs_runs_conceded'])
    avg_dict['death_overs_balls_bowled'] = death_balls / len(venue_metrics[venue]['death_overs_balls_bowled'])
    venue_averages[venue] = avg_dict

100%|██████████| 89/89 [00:00<?, ?it/s]


In [119]:
venues_df = pd.DataFrame.from_dict(venue_averages, orient='index')
venues_df = venues_df.reset_index().rename(columns={'index': 'venue'})
venues_df.head()

Unnamed: 0,venue,average,balls_faced,boundary_runs,boundaries,wickets_taken,runs_conceded,balls_bowled,dot_balls_bowled,death_overs_runs_conceded,death_overs_balls_bowled
0,McLean Park,25.333333,15.928571,12.952381,2.738095,0.984375,22.40625,15.40625,5.6875,4.0,2.09375
1,Bay Oval,29.442623,17.432432,15.756757,3.364865,0.960396,22.70297,15.653465,5.970297,4.346535,2.455446
2,Eden Park,27.259259,16.772152,14.898734,3.088608,1.080952,25.047619,16.280952,5.842857,3.809524,2.495238
3,The Rose Bowl,26.220588,17.48,14.453333,3.24,1.017391,24.53913,17.113043,5.930435,4.686957,2.904348
4,County Ground,32.0,19.607143,21.357143,4.464286,1.111111,31.972222,19.194444,6.083333,6.861111,3.611111


In [120]:
for idx, row in df.iterrows():
    venue = row['venue']
    venue_stats = venue_averages[venue]
    df.loc[idx, 'runs_scored'] /= venue_stats['average']
    for col in ['balls_faced', 'boundary_runs', 'boundaries', 'wickets_taken', 'balls_bowled', 'runs_conceded', 'dot_balls_bowled', 'death_overs_runs_conceded', 'death_overs_balls_bowled']:
        df.loc[idx, col] /= venue_stats[col]

  df.loc[idx, col] /= venue_stats[col]
  df.loc[idx, col] /= venue_stats[col]
  df.loc[idx, col] /= venue_stats[col]
  df.loc[idx, col] /= venue_stats[col]
  df.loc[idx, col] /= venue_stats[col]
  df.loc[idx, col] /= venue_stats[col]
  df.loc[idx, 'runs_scored'] /= venue_stats['average']
  df.loc[idx, col] /= venue_stats[col]
  df.loc[idx, col] /= venue_stats[col]
  df.loc[idx, col] /= venue_stats[col]


In [123]:
years = df['year'].unique().tolist()
years = [year for year in years if year in range(2013, 2023)]
years.sort()

In [125]:
yearwise = []
for year in years:
    year_df = df[df['year'] == year]
    year_df.drop(columns=['team', 'date', 'year'], inplace=True)
    grouped = year_df.groupby(by='player_name').sum()
    grouped['year'] = year
    grouped['strike_rate'] = (grouped['runs_scored'] / grouped['balls_faced']).round(2) * 100
    grouped['economy'] = ((grouped['runs_conceded'] / grouped['balls_bowled']) * 6).round(2)
    grouped['dot_ball_pct'] = (grouped['dot_balls_bowled'] / grouped['balls_bowled']).round(2) * 100
    grouped['bowling_strike_rate'] = (grouped['balls_bowled'] / grouped['wickets_taken']).round(2)
    grouped['bowling_average'] = (grouped['runs_conceded'] / grouped['wickets_taken']).round(2)
    grouped['death_economy'] = ((grouped['death_overs_runs_conceded'] / grouped['death_overs_balls_bowled']) * 6).round(2)
    grouped['average'] = grouped.apply(lambda x: x['runs_scored'] if x['out'] == 0 else round((x['runs_scored'] / x['out']), 2), axis=1)
    grouped['boundary_pct'] = (grouped['boundary_runs'] / grouped['runs_scored']) * 100
    grouped['balls_per_boundary'] = grouped['balls_faced'] / grouped['boundaries']
    yearwise.append(grouped[['year', 'strike_rate', 'economy', 'dot_ball_pct', 'bowling_strike_rate', 'death_economy', 'average', 
                             'boundary_pct', 'balls_per_boundary', 'runs_scored', 'wickets_taken', 'bowling_average', 'balls_bowled', 'balls_faced']])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df.drop(columns=['team', 'date', 'year'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df.drop(columns=['team', 'date', 'year'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df.drop(columns=['team', 'date', 'year'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-v

In [126]:
yearwise = [df.reset_index() for df in yearwise]
sum_cols = ['runs_scored', 'wickets_taken', 'balls_bowled', 'balls_faced']

In [127]:
def combine_dataframes_with_weighted_avg(dataframes, years, sum_cols):
    df_dict = {year: df.copy() for year, df in zip(years, dataframes)}
    
    all_players = set()
    for df in dataframes:
        all_players.update(df['player_name'].tolist())
    
    all_columns = set()
    for df in dataframes:
        all_columns.update(df.columns)
    
    data_columns = [col for col in all_columns if col not in ['player_name', 'year']]
    
    result_data = []
    for player in all_players:
        player_data = {'player_name': player}
        player_years = []
        for year in years:
            df = df_dict[year]
            if player in df['player_name'].values:
                player_years.append(year)
        player_years.sort(reverse=True)
        
        if len(player_years) == 1:
            weights = [1.0]
        elif len(player_years) == 2:
            weights = [0.9, 0.1]
        elif len(player_years) == 3:
            weights = [0.9, 0.09, 0.01]
        else:
            weights = [0.9 * (0.1 ** i) for i in range(len(player_years))]
            weights = [w / sum(weights) for w in weights]
        
        for column in data_columns:
            weighted_sum = 0
            total_weight = 0
            for i, year in enumerate(player_years):
                df = df_dict[year]
                player_row = df[df['player_name'] == player]
                
                if not player_row.empty and column in player_row.columns:
                    value = player_row[column].values[0]
                    if not (pd.isna(value) or value == float('inf') or value == float('-inf')):
                        if column not in sum_cols:
                            weighted_sum += value * weights[i]
                        else:
                            weighted_sum += value
                        total_weight += weights[i]
            if total_weight > 0:
                player_data[column] = weighted_sum / total_weight
            else:
                player_data[column] = np.nan   
        result_data.append(player_data)
    
    result_df = pd.DataFrame(result_data)
    return result_df

In [128]:
yearwise_combined = []
for i in range(len(years)):
    yearwise_combined.append(combine_dataframes_with_weighted_avg(yearwise[:i+1], years[:i+1], sum_cols))

In [129]:
for i in range(len(yearwise_combined)):
    yearwise_combined[i] = yearwise_combined[i].merge(
    df.drop_duplicates(subset='player_name')[['player_name', 'battingstyle', 'bowlingstyle', 'position']],
    left_on='player_name',
    right_on='player_name',
    how='left'
)

In [130]:
def calculate_percentiles(df, lower_is_better_stats, higher_is_better_stats):
    percentile_df = df.copy()
    for stat in lower_is_better_stats:
        if stat in df.columns:
            percentile_df[stat] = df[stat].rank(pct=True, ascending=False) * 100
    
    for stat in higher_is_better_stats:
        if stat in df.columns:
            percentile_df[stat] = df[stat].rank(pct=True) * 100
    return percentile_df

In [131]:
positions = df['position'].unique().tolist()
positions

['Bowler',
 'Batsman',
 'Allrounder',
 'Middle Order Batter',
 'Wicketkeeper',
 'Top Order Batter',
 'Batting Allrounder',
 'Bowling Allrounder']

In [132]:
batting = ['runs_scored', 'average', 'boundary_pct', 'strike_rate', 'balls_per_boundary']
bowling = ['wickets_taken', 'economy', 'dot_ball_pct', 'bowling_strike_rate', 'death_economy']

lower_batting = ['balls_per_boundary']
lower_bowling = ['economy', 'bowling_strike_rate', 'death_economy']

higher_batting = ['runs_scored', 'average', 'boundary_pct', 'strike_rate']
higher_bowling = ['wickets_taken', 'dot_ball_pct']

In [133]:
percentiles = {}

for i in range(len(years)):
    temp_dict = {}
    for pos in positions:
        if pos == 'Bowler':
            lower = lower_bowling
            higher = higher_bowling
        elif pos in ['Batsman', 'Top Order Batter', 'Middle Order Batter', 'Wicketkeeper']:
            lower = lower_batting
            higher = higher_batting
        else:
            lower = lower_bowling + lower_batting
            higher = higher_bowling + higher_batting
        temp_df = yearwise_combined[i]
        temp_df = temp_df[temp_df['position'] == pos]
        personal = temp_df[['player_name', 'battingstyle', 'bowlingstyle', 'position']]
        temp_df = temp_df.drop(columns=['player_name', 'position', 'battingstyle', 'bowlingstyle'])
        temp_df = temp_df[lower + higher]
        percentile = calculate_percentiles(temp_df, lower, higher)
        for col in personal.keys():
            percentile[col] = personal[col]
        temp_dict[pos] = percentile
    percentiles[years[i]] = temp_dict

In [134]:
yearwise_percentiles = []

for key in percentiles.keys():
    year_percentiles = percentiles[key]
    temp_df = year_percentiles['Allrounder']
    for pos in ['Bowler', 'Batsman', 'Middle Order Batter', 'Wicketkeeper', 'Top Order Batter', 'Batting Allrounder', 'Bowling Allrounder']:
        temp_df = pd.concat([temp_df, year_percentiles[pos]])
    temp_df['Year'] = key
    yearwise_percentiles.append(temp_df)

In [135]:
mappings = {v : k for k, v in mappings.items()}

for i in range(len(year_percentiles)):
    yearwise_percentiles[i]['player_name'] = yearwise_percentiles[i]['player_name'].apply(replace_name)

In [136]:
final_mappings = pd.read_excel('name-mappings/players_with_roles.xlsx')
final_mappings.drop_duplicates(subset='statistics[1]', inplace=True)
auction_mappings = {}

for _, row in final_mappings.iterrows():
    auction_mappings[row['statistics[1]']] = row['auction_name']

In [137]:
def final_replace(name):
    if name in auction_mappings.keys():
        return auction_mappings[name]
    else:
        return name

In [138]:
for i in range(len(year_percentiles)):
    yearwise_percentiles[i]['player_name'] = yearwise_percentiles[i]['player_name'].apply(final_replace)

In [139]:
stats = pd.concat(yearwise_percentiles, axis=0)

In [140]:
auction_df = pd.read_csv('IPLPlayerAuctionData.csv')
auction_df.head()

Unnamed: 0,Player,Role,Amount,Team,Year,Player Origin
0,Aaron Finch,Batsman,40000000,Sunrisers Hyderabad,2014.0,Overseas
1,Aaron Finch,Batsman,32000000,Mumbai Indians,2015.0,Overseas
2,Aaron Finch,Batsman,10000000,Gujarat Lions,2016.0,Overseas
3,Aaron Finch,Batsman,62000000,Kings XI Punjab,2018.0,Overseas
4,Aaron Finch,Batsman,44000000,Royal Challengers Bangalore,2020.0,Overseas


In [141]:
stats = stats.merge(
    auction_df.drop(columns=['Role', 'Team']), 
    left_on=['player_name', 'Year'],
    right_on=['Player', 'Year'],
    how='right'
)

In [None]:
stats.to_csv('auction_stats_data.csv')

In [143]:
stats.drop(columns=['player_name', 'Player'], inplace=True)

In [160]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, root_mean_squared_error

In [145]:
X = stats.drop(columns=['Amount'])
Y = stats['Amount']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

oe = OrdinalEncoder()

cols = ['battingstyle', 'bowlingstyle', 'position', 'Player Origin']
x_train[cols] = oe.fit_transform(x_train[cols])
x_test[cols]  = oe.transform(x_test[cols])

In [169]:
x_train.head()

Unnamed: 0,economy,bowling_strike_rate,death_economy,balls_per_boundary,wickets_taken,dot_ball_pct,runs_scored,average,boundary_pct,strike_rate,battingstyle,bowlingstyle,position,Year,Player Origin
86,,,,,,,,,,,,,,2018.0,0.0
645,,,,,,,,,,,,,,2016.0,1.0
713,,,,,,,,,,,,,,2022.0,0.0
3,,,,84.90566,,,96.491228,87.719298,67.857143,92.857143,1.0,8.0,1.0,2018.0,1.0
68,,,,,,,,,,,,,,2020.0,0.0


In [170]:
x_train.columns

Index(['economy', 'bowling_strike_rate', 'death_economy', 'balls_per_boundary',
       'wickets_taken', 'dot_ball_pct', 'runs_scored', 'average',
       'boundary_pct', 'strike_rate', 'battingstyle', 'bowlingstyle',
       'position', 'Year', 'Player Origin'],
      dtype='object')

In [166]:
rfr = RandomForestRegressor(max_depth=6, max_features='sqrt', n_estimators=200)
rfr.fit(x_train, y_train)
train_preds = rfr.predict(x_train)
print(f"Train R2 score : {r2_score(y_train, train_preds)}")
print(f"Train RSME is : {root_mean_squared_error(y_train, train_preds)}")
preds = rfr.predict(x_test)
r2_score(y_test, preds)
print(f"Test R2 score : {r2_score(y_test, preds)}")
print(f"Test RSME is : {root_mean_squared_error(y_test, preds)}")

Train R2 score : 0.47513643293032337
Train RSME is : 20036065.356033783
Test R2 score : 0.20351278150478658
Test RSME is : 26119446.162619554


In [163]:
from xgboost import XGBRegressor

reg = XGBRegressor(max_depth=5, n_estimators=200)
reg.fit(x_train, y_train)
train_preds = reg.predict(x_train)
print(f"Train R2 score : {r2_score(y_train, train_preds)}")
print(f"Train RSME is : {root_mean_squared_error(y_train, train_preds)}")
preds = reg.predict(x_test)
r2_score(y_test, preds)
print(f"Test R2 score : {r2_score(y_test, preds)}")
print(f"Test RSME is : {root_mean_squared_error(y_test, preds)}")

Train R2 score : 0.6871816515922546
Train RSME is : 15468043.0
Test R2 score : 0.028395235538482666
Test RSME is : 28848242.0


In [164]:
from lightgbm import LGBMRegressor

reg = LGBMRegressor(max_depth=10)
reg.fit(x_train, y_train)
train_preds = reg.predict(x_train)
print(f"Train R2 score : {r2_score(y_train, train_preds)}")
print(f"Train RSME is : {root_mean_squared_error(y_train, train_preds)}")
preds = reg.predict(x_test)
r2_score(y_test, preds)
print(f"Test R2 score : {r2_score(y_test, preds)}")
print(f"Test RSME is : {root_mean_squared_error(y_test, preds)}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001095 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 582
[LightGBM] [Info] Number of data points in the train set: 776, number of used features: 15
[LightGBM] [Info] Start training from score 20941688.144330
Train R2 score : 0.603518319710576
Train RSME is : 17414090.15170034
Test R2 score : 0.13659674596852178
Test RSME is : 27194520.084608745


In [168]:
import joblib
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('encoder', oe),
    ('model', rfr)
])
joblib.dump(pipeline, 'model_pipeline.pkl')

['model_pipeline.pkl']