### Import Libraries

In [1]:
import pandas as pd
import warnings
from sklearn.preprocessing import OrdinalEncoder
warnings.filterwarnings(action='ignore')


### Reading data

In [18]:
df = pd.read_csv("ipl_Data//matches.csv")

In [19]:
df.head(5)

Unnamed: 0,id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2
0,335982,2007/08,Bangalore,2008-04-18,League,BB McCullum,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,,Asad Rauf,RE Koertzen
1,335983,2007/08,Chandigarh,2008-04-19,League,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,runs,33.0,241.0,20.0,N,,MR Benson,SL Shastri
2,335984,2007/08,Delhi,2008-04-19,League,MF Maharoof,Feroz Shah Kotla,Delhi Daredevils,Rajasthan Royals,Rajasthan Royals,bat,Delhi Daredevils,wickets,9.0,130.0,20.0,N,,Aleem Dar,GA Pratapkumar
3,335985,2007/08,Mumbai,2008-04-20,League,MV Boucher,Wankhede Stadium,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,bat,Royal Challengers Bangalore,wickets,5.0,166.0,20.0,N,,SJ Davis,DJ Harper
4,335986,2007/08,Kolkata,2008-04-20,League,DJ Hussey,Eden Gardens,Kolkata Knight Riders,Deccan Chargers,Deccan Chargers,bat,Kolkata Knight Riders,wickets,5.0,111.0,20.0,N,,BF Bowden,K Hariharan


### Dropping uncessary data

In [20]:
df.drop(columns=['umpire2','umpire1','method','target_overs','super_over','target_overs','result_margin','result','player_of_match','match_type','id'],axis=1,inplace=True)

In [21]:
df.head(5)

Unnamed: 0,season,city,date,venue,team1,team2,toss_winner,toss_decision,winner,target_runs
0,2007/08,Bangalore,2008-04-18,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,223.0
1,2007/08,Chandigarh,2008-04-19,"Punjab Cricket Association Stadium, Mohali",Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,241.0
2,2007/08,Delhi,2008-04-19,Feroz Shah Kotla,Delhi Daredevils,Rajasthan Royals,Rajasthan Royals,bat,Delhi Daredevils,130.0
3,2007/08,Mumbai,2008-04-20,Wankhede Stadium,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,bat,Royal Challengers Bangalore,166.0
4,2007/08,Kolkata,2008-04-20,Eden Gardens,Kolkata Knight Riders,Deccan Chargers,Deccan Chargers,bat,Kolkata Knight Riders,111.0


### Analyzing redundant data and removing it

In [23]:
df['team1'].unique()

array(['Royal Challengers Bangalore', 'Kings XI Punjab',
       'Delhi Daredevils', 'Mumbai Indians', 'Kolkata Knight Riders',
       'Rajasthan Royals', 'Deccan Chargers', 'Chennai Super Kings',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Gujarat Lions', 'Rising Pune Supergiants',
       'Rising Pune Supergiant', 'Delhi Capitals', 'Punjab Kings',
       'Lucknow Super Giants', 'Gujarat Titans',
       'Royal Challengers Bengaluru'], dtype=object)

In [24]:
df["team2"].unique()

array(['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
       'Royal Challengers Bangalore', 'Deccan Chargers',
       'Kings XI Punjab', 'Delhi Daredevils', 'Mumbai Indians',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Rising Pune Supergiants', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Delhi Capitals', 'Punjab Kings',
       'Gujarat Titans', 'Lucknow Super Giants',
       'Royal Challengers Bengaluru'], dtype=object)

In [25]:
df['winner'].unique()

array(['Kolkata Knight Riders', 'Chennai Super Kings', 'Delhi Daredevils',
       'Royal Challengers Bangalore', 'Rajasthan Royals',
       'Kings XI Punjab', 'Deccan Chargers', 'Mumbai Indians',
       'Pune Warriors', 'Kochi Tuskers Kerala', nan,
       'Sunrisers Hyderabad', 'Rising Pune Supergiants', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Delhi Capitals', 'Punjab Kings',
       'Gujarat Titans', 'Lucknow Super Giants',
       'Royal Challengers Bengaluru'], dtype=object)

In [26]:
df['toss_winner'].unique()

array(['Royal Challengers Bangalore', 'Chennai Super Kings',
       'Rajasthan Royals', 'Mumbai Indians', 'Deccan Chargers',
       'Kings XI Punjab', 'Kolkata Knight Riders', 'Delhi Daredevils',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Gujarat Lions', 'Rising Pune Supergiants',
       'Rising Pune Supergiant', 'Delhi Capitals', 'Punjab Kings',
       'Gujarat Titans', 'Lucknow Super Giants',
       'Royal Challengers Bengaluru'], dtype=object)

In [27]:
team_mapping = {'Royal Challengers Bengaluru':'Royal Challengers Bangalore','Deccan Chargers':'Sunrisers Hyderabad','Kings XI Punjab':'Punjab Kings','Delhi Daredevils':'Delhi Capitals','Gujarat Lions':'Gujarat Titans'}

In [28]:
team_mapping

{'Royal Challengers Bengaluru': 'Royal Challengers Bangalore',
 'Deccan Chargers': 'Sunrisers Hyderabad',
 'Kings XI Punjab': 'Punjab Kings',
 'Delhi Daredevils': 'Delhi Capitals',
 'Gujarat Lions': 'Gujarat Titans'}

In [29]:
cols_to_update = ['team1','team2','winner','toss_winner']

In [30]:
cols_to_update

['team1', 'team2', 'winner', 'toss_winner']

### function correct the name

In [31]:
def replace_duplicates(df,cols,arr:dict):
    for col in cols:
        df[col] = df[col].replace(arr)

In [33]:
replace_duplicates(df,cols_to_update,team_mapping)

In [49]:
df.shape

(1006, 10)

In [50]:
df['toss_winner'].unique()

array(['Royal Challengers Bangalore', 'Chennai Super Kings',
       'Rajasthan Royals', 'Mumbai Indians', 'Sunrisers Hyderabad',
       'Punjab Kings', 'Kolkata Knight Riders', 'Delhi Capitals',
       'Gujarat Titans', 'Lucknow Super Giants'], dtype=object)

### Removing discontnued teams 

In [48]:
df = df[~df['winner'].str.contains('Rising Pune Supergiant|Rising Pune Supergiants|Kochi Tuskers Kerala|Pune Warriors', na=False)]

### Removing rows with no result

In [53]:
df[df['winner'].isna()]

Unnamed: 0,season,city,date,venue,team1,team2,toss_winner,toss_decision,winner,target_runs
485,2015,Bangalore,2015-04-29,M Chinnaswamy Stadium,Royal Challengers Bangalore,Rajasthan Royals,Rajasthan Royals,field,,
511,2015,Bangalore,2015-05-17,M Chinnaswamy Stadium,Royal Challengers Bangalore,Delhi Capitals,Royal Challengers Bangalore,field,,188.0
744,2019,Bengaluru,2019-04-30,M.Chinnaswamy Stadium,Royal Challengers Bangalore,Rajasthan Royals,Rajasthan Royals,field,,63.0
994,2023,Lucknow,2023-05-03,Bharat Ratna Shri Atal Bihari Vajpayee Ekana C...,Lucknow Super Giants,Chennai Super Kings,Chennai Super Kings,field,,


In [54]:
df = df.dropna(subset=['winner'])

In [55]:
df[df['winner'].isna()]

Unnamed: 0,season,city,date,venue,team1,team2,toss_winner,toss_decision,winner,target_runs


In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1002 entries, 0 to 1094
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   season         1002 non-null   object 
 1   city           951 non-null    object 
 2   date           1002 non-null   object 
 3   venue          1002 non-null   object 
 4   team1          1002 non-null   object 
 5   team2          1002 non-null   object 
 6   toss_winner    1002 non-null   object 
 7   toss_decision  1002 non-null   object 
 8   winner         1002 non-null   object 
 9   target_runs    1002 non-null   float64
dtypes: float64(1), object(9)
memory usage: 86.1+ KB


In [67]:
df.drop(columns=['venue'],axis=1,inplace=True)

In [68]:
df.head()

Unnamed: 0,season,city,date,team1,team2,toss_winner,toss_decision,winner,target_runs
0,2007/08,Bangalore,2008-04-18,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,223.0
1,2007/08,Chandigarh,2008-04-19,Punjab Kings,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,241.0
2,2007/08,Delhi,2008-04-19,Delhi Capitals,Rajasthan Royals,Rajasthan Royals,bat,Delhi Capitals,130.0
3,2007/08,Mumbai,2008-04-20,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,bat,Royal Challengers Bangalore,166.0
4,2007/08,Kolkata,2008-04-20,Kolkata Knight Riders,Sunrisers Hyderabad,Sunrisers Hyderabad,bat,Kolkata Knight Riders,111.0


### Checking null values

In [69]:
df[df['city'].isna()]

Unnamed: 0,season,city,date,team1,team2,toss_winner,toss_decision,winner,target_runs
399,2014,,2014-04-17,Delhi Capitals,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Royal Challengers Bangalore,146.0
402,2014,,2014-04-19,Royal Challengers Bangalore,Mumbai Indians,Royal Challengers Bangalore,field,Royal Challengers Bangalore,116.0
403,2014,,2014-04-19,Kolkata Knight Riders,Delhi Capitals,Kolkata Knight Riders,bat,Delhi Capitals,167.0
404,2014,,2014-04-20,Rajasthan Royals,Punjab Kings,Punjab Kings,field,Punjab Kings,192.0
406,2014,,2014-04-22,Punjab Kings,Sunrisers Hyderabad,Sunrisers Hyderabad,field,Punjab Kings,194.0
407,2014,,2014-04-23,Rajasthan Royals,Chennai Super Kings,Rajasthan Royals,field,Chennai Super Kings,141.0
408,2014,,2014-04-24,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,151.0
409,2014,,2014-04-25,Sunrisers Hyderabad,Delhi Capitals,Sunrisers Hyderabad,bat,Sunrisers Hyderabad,185.0
410,2014,,2014-04-25,Chennai Super Kings,Mumbai Indians,Mumbai Indians,bat,Chennai Super Kings,142.0
413,2014,,2014-04-27,Delhi Capitals,Mumbai Indians,Mumbai Indians,bat,Delhi Capitals,126.0


#### Null values are present only in city columns for 20/21 season. That competion was help in Dubai due to Covid-19

In [70]:
df['city'] = df['city'].fillna('Dubai')

In [71]:
df['city'].unique()

array(['Bangalore', 'Chandigarh', 'Delhi', 'Mumbai', 'Kolkata', 'Jaipur',
       'Hyderabad', 'Chennai', 'Cape Town', 'Port Elizabeth', 'Durban',
       'Centurion', 'East London', 'Johannesburg', 'Kimberley',
       'Bloemfontein', 'Ahmedabad', 'Cuttack', 'Nagpur', 'Dharamsala',
       'Visakhapatnam', 'Pune', 'Raipur', 'Ranchi', 'Abu Dhabi', 'Dubai',
       'Rajkot', 'Kanpur', 'Bengaluru', 'Indore', 'Sharjah',
       'Navi Mumbai', 'Lucknow', 'Guwahati', 'Mohali'], dtype=object)

In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1002 entries, 0 to 1094
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   season         1002 non-null   object 
 1   city           1002 non-null   object 
 2   date           1002 non-null   object 
 3   team1          1002 non-null   object 
 4   team2          1002 non-null   object 
 5   toss_winner    1002 non-null   object 
 6   toss_decision  1002 non-null   object 
 7   winner         1002 non-null   object 
 8   target_runs    1002 non-null   float64
dtypes: float64(1), object(8)
memory usage: 78.3+ KB


### Saving cleaned data

In [77]:
df.to_csv("ipl_Data//ipl_data.csv",index=False)

### Encoding columns with same classes

In [79]:
cols_with_same_class = ['team1','team2','winner','toss_winner']

In [80]:
df_combined = pd.concat([df[col] for col in cols_with_same_class],axis=0)

In [81]:
df_combined

0       Royal Challengers Bangalore
1                      Punjab Kings
2                    Delhi Capitals
3                    Mumbai Indians
4             Kolkata Knight Riders
                   ...             
1090                   Punjab Kings
1091            Sunrisers Hyderabad
1092               Rajasthan Royals
1093               Rajasthan Royals
1094            Sunrisers Hyderabad
Length: 4008, dtype: object

In [83]:
unique_classes = pd.Series(df_combined.unique())

In [84]:
unique_classes

0    Royal Challengers Bangalore
1                   Punjab Kings
2                 Delhi Capitals
3                 Mumbai Indians
4          Kolkata Knight Riders
5               Rajasthan Royals
6            Sunrisers Hyderabad
7            Chennai Super Kings
8                 Gujarat Titans
9           Lucknow Super Giants
dtype: object

In [92]:
unique_classes = unique_classes.values.reshape(-1,1)

In [93]:
encoder = OrdinalEncoder()

In [94]:
encoder.fit(unique_classes)

In [96]:
for col in cols_with_same_class:
    df[col+'_enc'] = encoder.transform(df[[col]])

In [98]:
df_encoded = df.drop(columns=['team1','team2','toss_winner','winner'],axis=1)

In [105]:
df_encoded['toss_winner_enc'].nunique()

10

In [107]:
df_encoded.head()

Unnamed: 0,season,city,date,toss_decision,target_runs,team1_enc,team2_enc,winner_enc,toss_winner_enc
0,2007/08,Bangalore,2008-04-18,field,223.0,8.0,3.0,3.0,8.0
1,2007/08,Chandigarh,2008-04-19,bat,241.0,6.0,0.0,0.0,0.0
2,2007/08,Delhi,2008-04-19,bat,130.0,1.0,7.0,1.0,7.0
3,2007/08,Mumbai,2008-04-20,bat,166.0,5.0,8.0,8.0,5.0
4,2007/08,Kolkata,2008-04-20,bat,111.0,3.0,9.0,3.0,9.0


### Converting date to year month and day

In [108]:
df_encoded['date']=pd.to_datetime(df_encoded['date'])

In [109]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1002 entries, 0 to 1094
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   season           1002 non-null   object        
 1   city             1002 non-null   object        
 2   date             1002 non-null   datetime64[ns]
 3   toss_decision    1002 non-null   object        
 4   target_runs      1002 non-null   float64       
 5   team1_enc        1002 non-null   float64       
 6   team2_enc        1002 non-null   float64       
 7   winner_enc       1002 non-null   float64       
 8   toss_winner_enc  1002 non-null   float64       
dtypes: datetime64[ns](1), float64(5), object(3)
memory usage: 78.3+ KB


In [110]:
df_encoded['year'] = df_encoded['date'].dt.year
df_encoded['month'] = df_encoded['date'].dt.month
df_encoded['day'] = df_encoded['date'].dt.day

In [115]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1002 entries, 0 to 1094
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   season           1002 non-null   object        
 1   city             1002 non-null   object        
 2   date             1002 non-null   datetime64[ns]
 3   toss_decision    1002 non-null   object        
 4   target_runs      1002 non-null   float64       
 5   team1_enc        1002 non-null   float64       
 6   team2_enc        1002 non-null   float64       
 7   winner_enc       1002 non-null   float64       
 8   toss_winner_enc  1002 non-null   float64       
 9   year             1002 non-null   int32         
 10  month            1002 non-null   int32         
 11  day              1002 non-null   int32         
dtypes: datetime64[ns](1), float64(5), int32(3), object(3)
memory usage: 90.0+ KB


In [117]:
df_encoded.drop(columns=['season','date'],axis=1,inplace=True)

In [118]:
df_encoded

Unnamed: 0,city,toss_decision,target_runs,team1_enc,team2_enc,winner_enc,toss_winner_enc,year,month,day
0,Bangalore,field,223.0,8.0,3.0,3.0,8.0,2008,4,18
1,Chandigarh,bat,241.0,6.0,0.0,0.0,0.0,2008,4,19
2,Delhi,bat,130.0,1.0,7.0,1.0,7.0,2008,4,19
3,Mumbai,bat,166.0,5.0,8.0,8.0,5.0,2008,4,20
4,Kolkata,bat,111.0,3.0,9.0,3.0,9.0,2008,4,20
...,...,...,...,...,...,...,...,...,...,...
1090,Hyderabad,bat,215.0,6.0,9.0,9.0,6.0,2024,5,19
1091,Ahmedabad,bat,160.0,9.0,3.0,3.0,9.0,2024,5,21
1092,Ahmedabad,field,173.0,8.0,7.0,7.0,7.0,2024,5,22
1093,Chennai,field,176.0,9.0,7.0,9.0,7.0,2024,5,24


### Saving encoded data

In [129]:
df_encoded.to_csv("ipl_Data//ipl_data_enc.csv",index=False)

In [3]:
df_encoded = pd.read_csv("ipl_Data//ipl_data_enc.csv")

### Train and Test Split for time series data

In [4]:
train_data = df_encoded[df_encoded['year']<2024]

In [5]:
train_data['year'].unique()

array([2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018,
       2019, 2020, 2021, 2022, 2023])

In [19]:
test_data = df_encoded[df_encoded['year']>2023]

In [20]:
test_data['year'].value_counts()

year
2024    71
Name: count, dtype: int64

In [6]:
cat_col = [col for col in df_encoded.columns if df_encoded[col].dtype=='O']
num_col = ["target_runs"]

In [7]:
cat_col

['city', 'toss_decision']

In [8]:
num_col

['target_runs']

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [5]:
import pickle
import os

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [12]:
num_pipeline = Pipeline(
    steps=[
        ("scaler",StandardScaler())
    ]
)

In [15]:
cat_pipeline = Pipeline(
    steps=[
        ("ohe",OneHotEncoder(handle_unknown="ignore",sparse_output=False))
    ]
)

In [16]:
preprocessor = ColumnTransformer(
    transformers = [
        ("num_pipeline",num_pipeline,num_col),
        ("cat_pipeline",cat_pipeline,cat_col)
    ]
)

In [17]:
TARGET_COLUMN='winner_enc'

In [21]:
input_data_train = train_data.drop(columns=[TARGET_COLUMN],axis=1)
output_data_train = train_data[TARGET_COLUMN]
input_data_test = test_data.drop(columns=[TARGET_COLUMN],axis=1)
output_data_test = test_data[TARGET_COLUMN]

In [22]:
input_data_train.shape,output_data_train.shape,input_data_test.shape,output_data_test.shape

((931, 9), (931,), (71, 9), (71,))

In [23]:
input_data_train_arr = preprocessor.fit_transform(input_data_train)
input_data_test_arr = preprocessor.transform(input_data_test)

In [24]:
input_data_test_arr.shape

(71, 37)

In [25]:
def save_object(file_path,obj):
    dir_path = os.path.dirname(file_path)
    os.makedirs(dir_path,exist_ok=True)
        
    with open(file_path,"wb") as file_obj:
        pickle.dump(obj,file_obj)

In [26]:
save_object("Artifacts/preprocessor.pkl",preprocessor)

In [4]:
import numpy as np

In [28]:
def save_as_numpy_array(filepath:str,arr:np.array):

    os.makedirs(os.path.dirname(filepath),exist_ok=True)
    with open(filepath,"wb") as file:
        np.save(file,arr)


In [6]:
def load_object(filepath)-> object:

    with open(filepath,"rb") as file_obj:
        return pickle.load(file_obj)


def load_numpy_array(filepath)->np.array:
    
    with open(filepath,"rb") as file_obj:
        return np.load(file_obj)


In [30]:
print(input_data_train_arr.shape)
print(np.array(output_data_train).shape)


(931, 37)
(931,)


In [31]:
output_data_train = np.array(output_data_train).reshape(-1, 1)
output_data_test = np.array(output_data_test).reshape(-1, 1)



In [32]:
print(input_data_train_arr.shape)
print(np.array(output_data_train).shape)


(931, 37)
(931, 1)


In [199]:
print(input_data_train_arr.shape)
print(np.array(output_data_train).shape)


(931, 37)
(931, 1)


In [41]:
input_data_train_arr.shape

(931, 37)

In [33]:
train_arr = np.c_[input_data_train_arr,output_data_train]
test_arr = np.c_[input_data_test_arr,output_data_test]

In [40]:
train_arr.shape

(931, 38)

In [34]:
save_as_numpy_array("Artifacts/train_arr.npy",train_arr)
save_as_numpy_array("Artifacts/test_arr.npy",test_arr)

In [13]:
train_arr = load_numpy_array("Artifacts/train_arr.npy")
test_arr = load_numpy_array("Artifacts/test_arr.npy")

In [8]:
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [9]:
from sklearn.metrics import f1_score,precision_score,recall_score

In [10]:
models = {
    "Random_Forest":RandomForestClassifier(verbose=1),
    "Decision_Tree":DecisionTreeClassifier(),
    "AdaBoostClassifier":AdaBoostClassifier(),
    "GradientBoostingClassifier":GradientBoostingClassifier(verbose=1),
    "KNeighborsClassifier":KNeighborsClassifier(),
    "XGBClassifier":XGBClassifier(verbose=1)
}

In [14]:
X_train = train_arr[:,:-1]
y_train = train_arr[:,-1]
X_test = test_arr[:,:-1]
y_test = test_arr[:,-1]

In [15]:
X_train.shape,y_train.shape,X_test.shape,y_test.shape

((931, 37), (931,), (71, 37), (71,))

In [62]:
for model in models.keys():
    model_name = model
    model_obj = models[model]
    model_obj.fit(X_train,y_train)
    y_pred = model_obj.predict(X_test)
    print(f"f1 score macro for model {model_name} is {f1_score(y_pred,y_test,average='macro')}")
    print(f"f1 score for weighted model {model_name} is {f1_score(y_pred,y_test,average='weighted')}")
    print(f"recall score macro for model {model_name} is {recall_score(y_pred,y_test,average='macro')}")
    print(f"recall score for weighted model {model_name} is {recall_score(y_pred,y_test,average='weighted')}")
    print(f"precision score macro for model {model_name} is {precision_score(y_pred,y_test,average='macro')}")
    print(f"precision score weighted for model {model_name} is {precision_score(y_pred,y_test,average='weighted')}")
    

f1 score macro for model Random_Forest is 0.2529610740137056
f1 score for weighted model Random_Forest is 0.2519254898795299
recall score macro for model Random_Forest is 0.28787878787878785
recall score for weighted model Random_Forest is 0.2535211267605634
precision score macro for model Random_Forest is 0.2561976911976912
precision score weighted for model Random_Forest is 0.27898197264394453
f1 score macro for model Decision_Tree is 0.23061473614105193
f1 score for weighted model Decision_Tree is 0.21586467646512122
recall score macro for model Decision_Tree is 0.2772222222222222
recall score for weighted model Decision_Tree is 0.22535211267605634
precision score macro for model Decision_Tree is 0.2219119769119769
precision score weighted for model Decision_Tree is 0.2376796130317257
f1 score macro for model AdaBoostClassifier is 0.4204934685197843
f1 score for weighted model AdaBoostClassifier is 0.4778093066640138
recall score macro for model AdaBoostClassifier is 0.4366269841269

In [18]:
param_grids = {
    "Random_Forest": {
        'n_estimators': [100, 150,200,250,300],
        'max_depth': [10, 15,20,25,30],
        'min_samples_split': [2, 5,3,6]
    },
    "Decision_Tree": {
        'max_depth': [10, 15, 20,30, None],
        'criterion': ['gini', 'entropy']
    },
    "AdaBoostClassifier": {
        'n_estimators': [50, 75,100,150,200],
        'learning_rate': [0.01, 0.1,0.7,1.0,1.5]
    },
    "GradientBoostingClassifier": {
        'n_estimators': [150,250, 200],
        'learning_rate': [0.05,0.01,0.07],
        'max_depth': [3,4]
    },
    "KNeighborsClassifier": {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance']
    },
    "XGBClassifier": {
        'n_estimators': [50,100,150,200,250,300],
        'max_depth': [3, 5,6,4],
        'learning_rate': [0.1, 0.2,0.01,0.07,0.3,0.4]
    }
}


In [19]:
best_models = {}

for name in models:
    print(f"\n🔍 Tuning {name}...")
    model = models[name]
    params = param_grids[name]

    grid_search = GridSearchCV(model, params, cv=5, scoring='f1_weighted', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    best_models[name] = best_model

    y_pred = best_model.predict(X_test)
    score = f1_score(y_test, y_pred, average='weighted')

    print(f"✅ Best F1 Score for {name}: {score:.4f}")
    print(f"🏆 Best Params: {grid_search.best_params_}")


🔍 Tuning Random_Forest...


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.8s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.0s


✅ Best F1 Score for Random_Forest: 0.4941
🏆 Best Params: {'max_depth': 10, 'min_samples_split': 6, 'n_estimators': 250}

🔍 Tuning Decision_Tree...
✅ Best F1 Score for Decision_Tree: 0.2457
🏆 Best Params: {'criterion': 'gini', 'max_depth': 10}

🔍 Tuning AdaBoostClassifier...
✅ Best F1 Score for AdaBoostClassifier: 0.4569
🏆 Best Params: {'learning_rate': 1.0, 'n_estimators': 100}

🔍 Tuning GradientBoostingClassifier...
      Iter       Train Loss   Remaining Time 
         1           2.1900           12.48s
         2           2.1757           11.02s
         3           2.1626           10.19s
         4           2.1504           10.32s
         5           2.1389           10.84s
         6           2.1280           10.54s
         7           2.1176           10.15s
         8           2.1078            9.87s
         9           2.0983            9.69s
        10           2.0893            9.56s
        20           2.0158            8.26s
        30           1.9629           