## Importing libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from Feature_modifications import create_seasons , poly_degree2
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score , make_scorer
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold , cross_val_score

#MODELS
from xgboost import XGBClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from MultiLR_compatible import MultiLogisticReg

#FOR FEATURE SELECTION
from sklearn.feature_selection import RFECV,RFE
from sklearn.tree import DecisionTreeClassifier

#FOR IMBALANCED DATASET
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as pipe_imb

import joblib
import warnings
warnings.filterwarnings('ignore')

### Importing the training and test sets. 

After importing each data set, a new column called season is created using the Feature_modifications python file

In [None]:
# Load the raw training and test tables from the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Add the engineered "season" column to both frames.
# NOTE(review): later cells drop a "months" column that is not created in this
# notebook, so create_seasons presumably adds it as a side effect - confirm.
train["season"] = create_seasons(train)
test["season"] = create_seasons(test)


Another new column is created, called "Locations" in both the training and test set

In [2]:
# Group the training coordinates into spatial clusters with DBSCAN.
# min_samples=1 means every point is assigned to a cluster (no noise label).
epsilon = 1.5
db = DBSCAN(eps=epsilon, min_samples=1).fit(train[["lat", "lon"]].to_numpy())
cluster_labels = db.labels_
num_clusters = len(np.unique(cluster_labels))

# Store the cluster id of every training row; the same ids are reused for the test set.
train["Locations"] = cluster_labels

# For each cluster, the distinct (lat, lon) pairs that belong to it.
loc_dic = {
    label: np.unique(train.loc[train["Locations"] == label, ["lat", "lon"]], axis=0)
    for label in np.unique(cluster_labels)
}

# Apply to the TEST SET: a test row gets the cluster of the training point
# with exactly the same coordinates.
# NOTE(review): test rows whose (lat, lon) never occur in train keep NaN here.
for label, coords in loc_dic.items():
    for lat, lon in coords:
        same_point = (test["lat"] == lat) & (test["lon"] == lon)
        test.loc[same_point, "Locations"] = label

In [3]:
# Shared evaluation protocol for all experiments: 10 repetitions of stratified
# 10-fold cross-validation. The seed is fixed so that re-running the notebook
# produces the same splits (and therefore comparable mean scores).
kfold = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=0)

## Experiments: modifications on the original data set

In this section we will apply each of the 4 models to the 6 different data sets that are described in the report

### Dataset 1

In [4]:
# Dataset 1: remove the identifier, raw coordinates, time stamp, DBSCAN cluster id
# and month columns; everything else (plus "season") is used as a feature.
train1 = train.drop(labels=["SNo","lat","lon","time","Locations","months"],axis=1)
test1 = test.drop(labels=["SNo","lat","lon","time","Locations","months"],axis=1)

train_features = train1.drop(labels=["Label"],axis=1)
train_label = train1["Label"]

# Select the numerical columns by name rather than by position: the previous
# `iloc[:, :-1]` only worked as long as "season" happened to be the last column.
categorical_ix = ["season"]
numerical_ix = train_features.drop(columns=categorical_ix).columns

# One-hot encode the categorical column(s) and standardise the numerical ones.
t = [('cat', OneHotEncoder(handle_unknown = "ignore"), categorical_ix), ('num', StandardScaler(), numerical_ix)]

col_transform = ColumnTransformer(transformers=t)

### <font color= blue> MultiLogisticReg

In [6]:
# Dataset 1 - logistic regression. Preprocessing and model live in one pipeline,
# so the encoder/scaler are fitted on the training part of every CV split.
pipeline = Pipeline([
    ("prep", col_transform),
    ("estimator", MultiLogisticReg()),
])

# Mean cross-validated score (estimator's default scorer) over all kfold splits.
mean = cross_val_score(pipeline, train_features, train_label, cv=kfold).mean()
mean

0.7753529937444145

In [9]:
# Fit on the full training set and predict the competition test set.
pipeline.fit(train_features, train_label)
pred = pipeline.predict(test1)

# Build the submission from the SNo column of the test frame that is already in
# memory (same rows and order as test.csv) instead of re-reading the CSV.
sub = test[["SNo"]].copy()
sub["Label"] = pred
sub.to_csv("PREDICTIONS/Submission_LG1.csv", index=False)

# Persist the fitted pipeline.
filename = "MODELS/plain_LG1.sav"
joblib.dump(pipeline, filename)

['MODELS/plain_LG1.sav']

### <font color= blue> RandomForestClassifier

In [11]:
# Dataset 1 - random forest with default hyper-parameters behind the shared preprocessing.
pipeline = Pipeline([
    ("prep", col_transform),
    ("estimator", RandomForestClassifier()),
])

# Mean cross-validated score over all kfold splits.
mean = cross_val_score(pipeline, train_features, train_label, cv=kfold).mean()
mean

0.8623324396782841

In [12]:
# Fit on the full training set and predict the competition test set.
pipeline.fit(train_features, train_label)
pred = pipeline.predict(test1)

# Build the submission from the SNo column of the test frame that is already in
# memory (same rows and order as test.csv) instead of re-reading the CSV.
sub = test[["SNo"]].copy()
sub["Label"] = pred
sub.to_csv("PREDICTIONS/Submission_RF1.csv", index=False)

# Persist the fitted pipeline.
filename = "MODELS/plain_RF1.sav"
joblib.dump(pipeline, filename)

['MODELS/plain_RF1.sav']

### <font color= blue> XGBClassifier

In [13]:
# Dataset 1 - XGBoost with default hyper-parameters behind the shared preprocessing.
pipeline = Pipeline([
    ("prep", col_transform),
    ("estimator", XGBClassifier()),
])

# Mean cross-validated score over all kfold splits.
mean = cross_val_score(pipeline, train_features, train_label, cv=kfold).mean()
mean

0.8798815907059874

In [14]:
# Fit on the full training set and predict the competition test set.
pipeline.fit(train_features, train_label)
pred = pipeline.predict(test1)

# Build the submission from the SNo column of the test frame that is already in
# memory (same rows and order as test.csv) instead of re-reading the CSV.
sub = test[["SNo"]].copy()
sub["Label"] = pred
sub.to_csv("PREDICTIONS/Submission_XG1.csv", index=False)

# Persist the fitted pipeline.
filename = "MODELS/plain_XG1.sav"
joblib.dump(pipeline, filename)

['MODELS/plain_XG1.sav']

### <font color= blue> HistGradientBoostingClassifier

In [15]:
# Dataset 1 - histogram gradient boosting with default hyper-parameters.
pipeline = Pipeline([
    ("prep", col_transform),
    ("estimator", HistGradientBoostingClassifier()),
])

# Mean cross-validated score over all kfold splits.
mean = cross_val_score(pipeline, train_features, train_label, cv=kfold).mean()
mean

0.8856300268096514

In [16]:
# Fit on the full training set and predict the competition test set.
pipeline.fit(train_features, train_label)
pred = pipeline.predict(test1)

# Build the submission from the SNo column of the test frame that is already in
# memory (same rows and order as test.csv) instead of re-reading the CSV.
sub = test[["SNo"]].copy()
sub["Label"] = pred
sub.to_csv("PREDICTIONS/Submission_HISTGB1.csv", index=False)

# Persist the fitted pipeline.
filename = "MODELS/plain_HistGB1.sav"
joblib.dump(pipeline, filename)

['MODELS/plain_HistGB1.sav']

### Dataset 2

In [12]:
# Dataset 2: same as dataset 1 but the raw coordinates (lat, lon) are kept as
# numerical features; identifier, time stamp, cluster id and month are removed.
train2 = train.drop(labels=["SNo","time","Locations","months"],axis=1)
test2 = test.drop(labels=["SNo","time","Locations","months"],axis=1)

train_features = train2.drop(labels=["Label"],axis=1)
train_label = train2["Label"]

# Select the numerical columns by name rather than by position: the previous
# `iloc[:, :-1]` only worked as long as "season" happened to be the last column.
categorical_ix = ["season"]
numerical_ix = train_features.drop(columns=categorical_ix).columns

# One-hot encode the categorical column(s) and standardise the numerical ones.
t = [('cat', OneHotEncoder(handle_unknown = "ignore"), categorical_ix), ('num', StandardScaler(), numerical_ix)]

col_transform = ColumnTransformer(transformers=t)

### <font color= blue> MultiLogisticReg

In [13]:
# Dataset 2 - logistic regression behind the dataset-2 column transformer.
pipeline = Pipeline([
    ("prep", col_transform),
    ("estimator", MultiLogisticReg()),
])

# Mean cross-validated score (estimator's default scorer) over all kfold splits.
mean = cross_val_score(pipeline, train_features, train_label, cv=kfold).mean()
mean

0.7796805183199286

In [14]:
# Fit on the full training set and predict the competition test set.
pipeline.fit(train_features, train_label)
pred = pipeline.predict(test2)

# Build the submission from the SNo column of the test frame that is already in
# memory (same rows and order as test.csv) instead of re-reading the CSV.
sub = test[["SNo"]].copy()
sub["Label"] = pred
sub.to_csv("PREDICTIONS/Submission_LG2.csv", index=False)

# Persist the fitted pipeline.
filename = "MODELS/plain_LG2.sav"
joblib.dump(pipeline, filename)

['MODELS/plain_LG2.sav']

### <font color= blue> RandomForestClassifier

In [20]:
# Dataset 2 - random forest with default hyper-parameters.
pipeline = Pipeline([
    ("prep", col_transform),
    ("estimator", RandomForestClassifier()),
])

# Mean cross-validated score over all kfold splits.
mean = cross_val_score(pipeline, train_features, train_label, cv=kfold).mean()
mean

0.8623123324396783

In [21]:
# Fit on the full training set and predict the competition test set.
pipeline.fit(train_features, train_label)
pred = pipeline.predict(test2)

# Build the submission from the SNo column of the test frame that is already in
# memory (same rows and order as test.csv) instead of re-reading the CSV.
sub = test[["SNo"]].copy()
sub["Label"] = pred
sub.to_csv("PREDICTIONS/Submission_RF2.csv", index=False)

# Persist the fitted pipeline.
filename = "MODELS/plain_RF2.sav"
joblib.dump(pipeline, filename)

['MODELS/plain_RF2.sav']

### <font color= blue> XGBClassifier

In [22]:
# Dataset 2 - XGBoost with default hyper-parameters.
pipeline = Pipeline([
    ("prep", col_transform),
    ("estimator", XGBClassifier()),
])

# Mean cross-validated score over all kfold splits.
mean = cross_val_score(pipeline, train_features, train_label, cv=kfold).mean()
mean

0.8797140303842719

In [23]:
# Fit on the full training set and predict the competition test set.
pipeline.fit(train_features, train_label)
pred = pipeline.predict(test2)

# Build the submission from the SNo column of the test frame that is already in
# memory (same rows and order as test.csv) instead of re-reading the CSV.
sub = test[["SNo"]].copy()
sub["Label"] = pred
sub.to_csv("PREDICTIONS/Submission_XG2.csv", index=False)

# Persist the fitted pipeline.
filename = "MODELS/plain_XG2.sav"
joblib.dump(pipeline, filename)

['MODELS/plain_XG2.sav']

### <font color= blue> HistGradientBoostingClassifier

In [24]:
# Dataset 2 - histogram gradient boosting with default hyper-parameters.
pipeline = Pipeline([
    ("prep", col_transform),
    ("estimator", HistGradientBoostingClassifier()),
])

# Mean cross-validated score over all kfold splits.
mean = cross_val_score(pipeline, train_features, train_label, cv=kfold).mean()
mean

0.8863270777479891

In [25]:
# Fit on the full training set and predict the competition test set.
pipeline.fit(train_features, train_label)
pred = pipeline.predict(test2)

# Build the submission from the SNo column of the test frame that is already in
# memory (same rows and order as test.csv) instead of re-reading the CSV.
sub = test[["SNo"]].copy()
sub["Label"] = pred
sub.to_csv("PREDICTIONS/Submission_HISTGB2.csv", index=False)

# Persist the fitted pipeline.
filename = "MODELS/plain_HistGB2.sav"
joblib.dump(pipeline, filename)

['MODELS/plain_HistGB2.sav']

### Dataset 3

In [15]:
# Dataset 3: keep coordinates and the DBSCAN cluster id ("Locations");
# only identifier, time stamp and month are removed. Independent copies are
# taken because these frames are modified by later cells.
train3 = train.drop(columns=["SNo", "time", "months"]).copy()
test3 = test.drop(columns=["SNo", "time", "months"]).copy()

train_features = train3.drop(columns=["Label"])
train_label = train3["Label"]

# Both "Locations" and "season" are treated as categorical; the rest is numerical.
categorical_ix = ["Locations", "season"]
numerical_ix = train_features.drop(columns=categorical_ix).columns

# One-hot encode the categorical columns and standardise the numerical ones.
t = [
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_ix),
    ("num", StandardScaler(), numerical_ix),
]
col_transform = ColumnTransformer(t)

### <font color= blue> MultiLogisticReg

In [16]:
# Dataset 3 - logistic regression behind the dataset-3 column transformer.
pipeline = Pipeline([
    ("prep", col_transform),
    ("estimator", MultiLogisticReg()),
])

# Mean cross-validated score (estimator's default scorer) over all kfold splits.
mean = cross_val_score(pipeline, train_features, train_label, cv=kfold).mean()
mean

0.7759763181411976

In [17]:
# Fit on the full training set and predict the competition test set.
pipeline.fit(train_features, train_label)
pred = pipeline.predict(test3)

# Build the submission from the SNo column of the test frame that is already in
# memory (same rows and order as test.csv) instead of re-reading the CSV.
sub = test[["SNo"]].copy()
sub["Label"] = pred
sub.to_csv("PREDICTIONS/Submission_LG3.csv", index=False)

# Persist the fitted pipeline.
filename = "MODELS/plain_LG3.sav"
joblib.dump(pipeline, filename)

['MODELS/plain_LG3.sav']

### <font color= blue> RandomForestClassifier

In [29]:
# Dataset 3 - random forest with default hyper-parameters.
pipeline = Pipeline([
    ("prep", col_transform),
    ("estimator", RandomForestClassifier()),
])

# Mean cross-validated score over all kfold splits.
mean = cross_val_score(pipeline, train_features, train_label, cv=kfold).mean()
mean

0.8618655049151026

In [30]:
# Fit on the full training set and predict the competition test set.
pipeline.fit(train_features, train_label)
pred = pipeline.predict(test3)

# Build the submission from the SNo column of the test frame that is already in
# memory (same rows and order as test.csv) instead of re-reading the CSV.
sub = test[["SNo"]].copy()
sub["Label"] = pred
sub.to_csv("PREDICTIONS/Submission_RF3.csv", index=False)

# Persist the fitted pipeline.
filename = "MODELS/plain_RF3.sav"
joblib.dump(pipeline, filename)

['MODELS/plain_RF3.sav']

### <font color= blue> XGBClassifier

In [31]:
# Dataset 3 - XGBoost with default hyper-parameters.
pipeline = Pipeline([
    ("prep", col_transform),
    ("estimator", XGBClassifier()),
])

# Mean cross-validated score over all kfold splits.
mean = cross_val_score(pipeline, train_features, train_label, cv=kfold).mean()
mean

0.8799374441465595

In [32]:
# Fit on the full training set and predict the competition test set.
pipeline.fit(train_features, train_label)
pred = pipeline.predict(test3)

# Build the submission from the SNo column of the test frame that is already in
# memory (same rows and order as test.csv) instead of re-reading the CSV.
sub = test[["SNo"]].copy()
sub["Label"] = pred
sub.to_csv("PREDICTIONS/Submission_XG3.csv", index=False)

# Persist the fitted pipeline.
filename = "MODELS/plain_XG3.sav"
joblib.dump(pipeline, filename)

['MODELS/plain_XG3.sav']

### <font color= blue> HistGradientBoostingClassifier

In [33]:
# Dataset 3 - histogram gradient boosting with default hyper-parameters.
pipeline = Pipeline([
    ("prep", col_transform),
    ("estimator", HistGradientBoostingClassifier()),
])

# Mean cross-validated score over all kfold splits.
mean = cross_val_score(pipeline, train_features, train_label, cv=kfold).mean()
mean

0.886168453976765

In [34]:
# Fit on the full training set and predict the competition test set.
pipeline.fit(train_features, train_label)
pred = pipeline.predict(test3)

# Build the submission from the SNo column of the test frame that is already in
# memory (same rows and order as test.csv) instead of re-reading the CSV.
sub = test[["SNo"]].copy()
sub["Label"] = pred
sub.to_csv("PREDICTIONS/Submission_HISTGB3.csv", index=False)

# Persist the fitted pipeline.
filename = "MODELS/plain_HistGB3.sav"
joblib.dump(pipeline, filename)

['MODELS/plain_HistGB3.sav']

### Dataset 4

In [18]:
# Dataset 4: recursive feature elimination (one feature per step, 10-fold CV,
# accuracy) driven by a decision tree with a fixed seed.
selector = RFECV(DecisionTreeClassifier(random_state=0), step=1, cv=10, scoring="accuracy")

# One-hot encode the categorical columns with a single encoder fitted on the
# training set. Calling pd.get_dummies separately on train and test does not
# guarantee matching columns (a category missing from one frame, or the float
# "Locations" values of the test frame, yield different column names), which
# would break any later indexing of the test frame with the selected columns.
encoder = OneHotEncoder(handle_unknown="ignore")
train_encoded = pd.DataFrame(
    encoder.fit_transform(train_features[categorical_ix]).toarray(),
    columns=encoder.get_feature_names_out(categorical_ix),
    index=train_features.index,
)
test_encoded = pd.DataFrame(
    encoder.transform(test3[categorical_ix]).toarray(),
    columns=encoder.get_feature_names_out(categorical_ix),
    index=test3.index,
)

# Standardise the numerical columns with statistics learnt on the training set.
# NOTE: train_features and test3 are overwritten in place with the scaled values,
# so every later cell sees the scaled numerical columns.
st = StandardScaler()
train_features[numerical_ix] = st.fit_transform(train_features[numerical_ix])
test3[numerical_ix] = st.transform(test3[numerical_ix])

# Final design matrices: scaled numerical columns followed by the dummy columns.
train_features_transf=train_features[numerical_ix].merge(train_encoded , right_index=True , left_index=True)
test_features_transf=test3[numerical_ix].merge(test_encoded , right_index=True , left_index=True)

In [19]:
# Run the cross-validated recursive feature elimination on the scaled + one-hot
# encoded training frame; `fit` returns the fitted selector.
selected_features = selector.fit(train_features_transf, train_label)

In [20]:
# Ranking of every input column; columns kept by the selector have rank 1.
selected_features.ranking_

array([ 1,  8,  1,  1,  1,  1,  1,  4,  1,  1,  1,  1,  5,  1,  1,  2,  1,
        3, 10, 15, 17, 14,  9, 13, 16, 12, 11,  7,  6])

In [21]:
# Number of columns kept by the selector.
selected_features.n_features_

13

In [22]:
# Names of the columns kept by the selector; used to subset the design matrices below.
sel_columns=selected_features.get_feature_names_out() #name of selected columns

In [23]:
# Display the selected column names.
sel_columns

array(['lat', 'TMQ', 'U850', 'V850', 'UBOT', 'VBOT', 'PS', 'PSL', 'T200',
       'T500', 'TS', 'TREFHT', 'Z200'], dtype=object)

### <font color= blue> MultiLogisticReg

In [24]:
# Dataset 4 - logistic regression on the RFECV-selected, already transformed columns.
mean = cross_val_score(
    MultiLogisticReg(),
    train_features_transf[sel_columns],
    train_label,
    cv=kfold,
).mean()
mean

0.783505361930295

In [25]:
# Fit on the selected training columns and predict the competition test set.
model = MultiLogisticReg()
model.fit(train_features_transf[sel_columns], train_label)
pred = model.predict(test_features_transf[sel_columns])

# Build the submission from the SNo column of the test frame that is already in
# memory (same rows and order as test.csv) instead of re-reading the CSV.
sub = test[["SNo"]].copy()
sub["Label"] = pred
sub.to_csv("PREDICTIONS/Submission_LG4.csv", index=False)

# Persist the fitted model.
filename = "MODELS/plain_LG4.sav"
joblib.dump(model, filename)

['MODELS/plain_LG4.sav']

### <font color= blue> RandomForestClassifier

In [43]:
# Dataset 4 - random forest on the RFECV-selected, already transformed columns.
mean = cross_val_score(
    RandomForestClassifier(),
    train_features_transf[sel_columns],
    train_label,
    cv=kfold,
).mean()
mean

0.8618342269883825

In [44]:
# Fit on the selected training columns and predict the competition test set.
model = RandomForestClassifier()
model.fit(train_features_transf[sel_columns], train_label)
pred = model.predict(test_features_transf[sel_columns])

# Build the submission from the SNo column of the test frame that is already in
# memory (same rows and order as test.csv) instead of re-reading the CSV.
sub = test[["SNo"]].copy()
sub["Label"] = pred
sub.to_csv("PREDICTIONS/Submission_RF4.csv", index=False)

# Persist the fitted model.
filename = "MODELS/plain_RF4.sav"
joblib.dump(model, filename)

['MODELS/plain_RF4.sav']

### <font color= blue> XGBClassifier

In [45]:
# Dataset 4 - XGBoost on the RFECV-selected, already transformed columns.
mean = cross_val_score(
    XGBClassifier(),
    train_features_transf[sel_columns],
    train_label,
    cv=kfold,
).mean()
mean

0.8809428060768542

In [46]:

# Fit on the selected training columns and predict the competition test set.
model = XGBClassifier()
model.fit(train_features_transf[sel_columns], train_label)
pred = model.predict(test_features_transf[sel_columns])

# Build the submission from the SNo column of the test frame that is already in
# memory (same rows and order as test.csv) instead of re-reading the CSV.
sub = test[["SNo"]].copy()
sub["Label"] = pred
sub.to_csv("PREDICTIONS/Submission_XG4.csv", index=False)

# Persist the fitted model.
filename = "MODELS/plain_XG4.sav"
joblib.dump(model, filename)

['MODELS/plain_XG4.sav']

### <font color= blue> HistGradientBoostingClassifier

In [47]:
# Dataset 4 - histogram gradient boosting on the RFECV-selected, already transformed columns.
mean = cross_val_score(
    HistGradientBoostingClassifier(),
    train_features_transf[sel_columns],
    train_label,
    cv=kfold,
).mean()
mean

0.8867739052725647

In [48]:
# Fit on the selected training columns and predict the competition test set.
model = HistGradientBoostingClassifier()
model.fit(train_features_transf[sel_columns], train_label)
pred = model.predict(test_features_transf[sel_columns])

# Build the submission from the SNo column of the test frame that is already in
# memory (same rows and order as test.csv) instead of re-reading the CSV.
sub = test[["SNo"]].copy()
sub["Label"] = pred
sub.to_csv("PREDICTIONS/Submission_HISTGB4.csv", index=False)

# Persist the fitted model.
filename = "MODELS/plain_HistGB4.sav"
joblib.dump(model, filename)

['MODELS/plain_HistGB4.sav']

### Dataset 5

In [26]:
# Dataset 5: degree-2 feature expansion of the RFECV-selected columns.
# The columns are taken from the transformed design matrices, i.e. the frames
# `sel_columns` was derived from. Indexing `train_features` / `test3` directly
# raises a KeyError as soon as a one-hot (dummy) column is among the selected
# features; for purely numerical selections both give the same scaled values.
train_features_poly = poly_degree2(train_features_transf[sel_columns], sel_columns)
test_features_poly = poly_degree2(test_features_transf[sel_columns], sel_columns)

### <font color= blue> MultiLogisticReg

In [27]:
# Dataset 5 - logistic regression on the expanded features, re-standardised inside the pipeline.
pipeline = Pipeline([
    ("prep", StandardScaler()),
    ("estimator", MultiLogisticReg()),
])

# Mean cross-validated score (estimator's default scorer) over all kfold splits.
mean = cross_val_score(pipeline, train_features_poly, train_label, cv=kfold).mean()
mean

0.7372319034852548

In [28]:
# Fit on the full training set and predict the competition test set.
pipeline.fit(train_features_poly, train_label)
pred = pipeline.predict(test_features_poly)

# Build the submission from the SNo column of the test frame that is already in
# memory (same rows and order as test.csv) instead of re-reading the CSV.
sub = test[["SNo"]].copy()
sub["Label"] = pred
sub.to_csv("PREDICTIONS/Submission_LG5.csv", index=False)

# Persist the fitted pipeline.
filename = "MODELS/plain_LG5.sav"
joblib.dump(pipeline, filename)

['MODELS/plain_LG5.sav']

### <font color= blue> RandomForestClassifier

In [25]:
# Dataset 5 - random forest on the expanded features, re-standardised inside the pipeline.
pipeline = Pipeline([
    ("prep", StandardScaler()),
    ("estimator", RandomForestClassifier()),
])

# Mean cross-validated score over all kfold splits.
mean = cross_val_score(pipeline, train_features_poly, train_label, cv=kfold).mean()
mean

0.8618833780160857

In [26]:
# Fit on the full training set and predict the competition test set.
pipeline.fit(train_features_poly, train_label)
pred = pipeline.predict(test_features_poly)

# Build the submission from the SNo column of the test frame that is already in
# memory (same rows and order as test.csv) instead of re-reading the CSV.
sub = test[["SNo"]].copy()
sub["Label"] = pred
sub.to_csv("PREDICTIONS/Submission_RF5.csv", index=False)

# Persist the fitted pipeline.
filename = "MODELS/plain_RF5.sav"
joblib.dump(pipeline, filename)

['MODELS/plain_RF5.sav']

### <font color= blue> XGBClassifier

In [28]:
# Dataset 5 - XGBoost on the expanded features, re-standardised inside the pipeline.
pipeline = Pipeline([
    ("prep", StandardScaler()),
    ("estimator", XGBClassifier()),
])

# Mean cross-validated score over all kfold splits.
mean = cross_val_score(pipeline, train_features_poly, train_label, cv=kfold).mean()
mean

0.8777747989276139

In [29]:
# Fit on the full training set and predict the competition test set.
pipeline.fit(train_features_poly, train_label)
pred = pipeline.predict(test_features_poly)

# Build the submission from the SNo column of the test frame that is already in
# memory (same rows and order as test.csv) instead of re-reading the CSV.
sub = test[["SNo"]].copy()
sub["Label"] = pred
sub.to_csv("PREDICTIONS/Submission_XG5.csv", index=False)

# Persist the fitted pipeline.
filename = "MODELS/plain_XG5.sav"
joblib.dump(pipeline, filename)

['MODELS/plain_XG5.sav']

### <font color= blue> HistGradientBoostingClassifier

In [30]:
# Dataset 5 - histogram gradient boosting on the expanded features.
pipeline = Pipeline([
    ("prep", StandardScaler()),
    ("estimator", HistGradientBoostingClassifier()),
])

# Mean cross-validated score over all kfold splits.
mean = cross_val_score(pipeline, train_features_poly, train_label, cv=kfold).mean()
mean

0.8856456657730115

In [31]:
# Fit on the full training set and predict the competition test set.
pipeline.fit(train_features_poly, train_label)
pred = pipeline.predict(test_features_poly)

# Build the submission from the SNo column of the test frame that is already in
# memory (same rows and order as test.csv) instead of re-reading the CSV.
sub = test[["SNo"]].copy()
sub["Label"] = pred
sub.to_csv("PREDICTIONS/Submission_HISTGB5.csv", index=False)

# Persist the fitted pipeline.
filename = "MODELS/plain_HistGB5.sav"
joblib.dump(pipeline, filename)

['MODELS/plain_HistGB5.sav']

### Dataset 6

### <font color= blue> MultiLogisticReg

In [29]:
# Dataset 6 - logistic regression with SMOTE oversampling. The imblearn pipeline
# is required because a resampling step sits between the scaler and the model.
pipeline = pipe_imb([
    ("prep", StandardScaler()),
    ("balance", SMOTE()),
    ("estimator", MultiLogisticReg()),
])

# Mean cross-validated score (estimator's default scorer) over all kfold splits.
mean = cross_val_score(pipeline, train_features_poly, train_label, cv=kfold).mean()
mean

0.5110790884718497

In [30]:
# Fit on the full training set and predict the competition test set.
pipeline.fit(train_features_poly, train_label)
pred = pipeline.predict(test_features_poly)

# Build the submission from the SNo column of the test frame that is already in
# memory (same rows and order as test.csv) instead of re-reading the CSV.
sub = test[["SNo"]].copy()
sub["Label"] = pred
sub.to_csv("PREDICTIONS/Submission_LG6.csv", index=False)

# Persist the fitted pipeline.
filename = "MODELS/plain_LG6.sav"
joblib.dump(pipeline, filename)

['MODELS/plain_LG6.sav']

### <font color= blue> RandomForestClassifier

In [16]:
# Dataset 6 - random forest with SMOTE oversampling between scaling and the model.
pipeline = pipe_imb([
    ("prep", StandardScaler()),
    ("balance", SMOTE()),
    ("estimator", RandomForestClassifier()),
])

# Mean cross-validated score over all kfold splits.
mean = cross_val_score(pipeline, train_features_poly, train_label, cv=kfold).mean()
mean

0.8489320822162647

In [17]:
# Fit on the full training set and predict the competition test set.
pipeline.fit(train_features_poly, train_label)
pred = pipeline.predict(test_features_poly)

# Build the submission from the SNo column of the test frame that is already in
# memory (same rows and order as test.csv) instead of re-reading the CSV.
sub = test[["SNo"]].copy()
sub["Label"] = pred
sub.to_csv("PREDICTIONS/Submission_RF6.csv", index=False)

# Persist the fitted pipeline.
filename = "MODELS/plain_RF6.sav"
joblib.dump(pipeline, filename)

['MODELS/plain_RF6.sav']

### <font color= blue> XGBClassifier

In [18]:
# Dataset 6 - XGBoost with SMOTE oversampling between scaling and the model.
pipeline = pipe_imb([
    ("prep", StandardScaler()),
    ("balance", SMOTE()),
    ("estimator", XGBClassifier()),
])

# Mean cross-validated score over all kfold splits.
mean = cross_val_score(pipeline, train_features_poly, train_label, cv=kfold).mean()
mean

0.8685835567470954

In [19]:
# Fit on the full training set and predict the competition test set.
pipeline.fit(train_features_poly, train_label)
pred = pipeline.predict(test_features_poly)

# Build the submission from the SNo column of the test frame that is already in
# memory (same rows and order as test.csv) instead of re-reading the CSV.
sub = test[["SNo"]].copy()
sub["Label"] = pred
sub.to_csv("PREDICTIONS/Submission_XG6.csv", index=False)

# Persist the fitted pipeline.
filename = "MODELS/plain_XG6.sav"
joblib.dump(pipeline, filename)

['MODELS/plain_XG6.sav']

### <font color= blue> HistGradientBoostingClassifier

In [20]:
# Dataset 6 - histogram gradient boosting with SMOTE oversampling.
pipeline = pipe_imb([
    ("prep", StandardScaler()),
    ("balance", SMOTE()),
    ("estimator", HistGradientBoostingClassifier()),
])

# Mean cross-validated score over all kfold splits.
mean = cross_val_score(pipeline, train_features_poly, train_label, cv=kfold).mean()
mean

0.8730764075067023

In [21]:
# Fit on the full training set and predict the competition test set.
pipeline.fit(train_features_poly, train_label)
pred = pipeline.predict(test_features_poly)

# Build the submission from the SNo column of the test frame that is already in
# memory (same rows and order as test.csv) instead of re-reading the CSV.
sub = test[["SNo"]].copy()
sub["Label"] = pred
sub.to_csv("PREDICTIONS/Submission_HISTGB6.csv", index=False)

# Persist the fitted pipeline.
filename = "MODELS/plain_HistGB6.sav"
joblib.dump(pipeline, filename)

['MODELS/plain_HistGB6.sav']

# Hyperparameter tuning for Histogram Gradient Boosting

In [31]:
# Exhaustive grid search over the HistGradientBoostingClassifier settings below,
# evaluated on the RFECV-selected columns of the transformed training frame.
param_grid = {
    "learning_rate": [0.1, 0.01, 0.5, 0.05],
    "max_iter": [100, 150, 200],
    "l2_regularization": [0, 0.001, 0.01],
    "max_bins": [225, 255],
}

# Lighter CV scheme than the one used for the experiments (5 folds x 3 repeats);
# the seed is fixed so the search sees the same splits on every run.
kfold2 = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=0)

scoring = make_scorer(accuracy_score)

grid_search = GridSearchCV(estimator=HistGradientBoostingClassifier(),
                           param_grid=param_grid, cv=kfold2, scoring=scoring)

grid_search.fit(train_features_transf[sel_columns], train_label)

In [32]:
# Best mean cross-validated accuracy found by the HistGB grid search.
grid_search.best_score_

0.8918156091748586

In [52]:
# Hyper-parameter combination that achieved the best score.
grid_search.best_params_

{'l2_regularization': 0,
 'learning_rate': 0.05,
 'max_bins': 225,
 'max_iter': 100}

In [None]:
# Predict the test set through the grid search object (it delegates to the
# estimator refitted with the best parameters).
pred = grid_search.predict(test_features_transf[sel_columns])

# Submission frame: SNo from the in-memory test set plus the predicted label.
sub = test[["SNo"]].assign(Label=pred)
sub.to_csv("PREDICTIONS/Submission_HISTGB_hyper.csv", index=False)

# Persist only the refitted best estimator, not the whole search object.
filename = "MODELS/hyper_HistGB.sav"
joblib.dump(grid_search.best_estimator_, filename)

# Hyperparameter tuning for Logistic Regression

In [34]:
# Grid of MultiLogisticReg settings to explore: regularisation strength,
# step size and number of optimisation steps.
param_grid = {
    "reg": [0.01, 0.05, 0.1, 0.5],
    "stepsize": [0.01, 0.05, 0.1, 0.5],
    "n_steps": [1000, 1500],
}

scoring = make_scorer(accuracy_score)

# Same CV scheme (kfold2) and metric as the HistGB search.
grid_search2 = GridSearchCV(
    MultiLogisticReg(),
    param_grid,
    scoring=scoring,
    cv=kfold2,
)

grid_search2.fit(train_features_transf[sel_columns], train_label)

In [35]:
# Best mean cross-validated accuracy found by the logistic regression grid search.
grid_search2.best_score_

0.8104259755734285

In [36]:
# Hyper-parameter combination that achieved the best score.
grid_search2.best_params_

{'n_steps': 1000, 'reg': 0.01, 'stepsize': 0.5}

In [92]:
# Predict the test set with the tuned logistic regression (grid_search2
# delegates to the estimator refitted with the best parameters).
pred = grid_search2.predict(test_features_transf[sel_columns])
sub = test[["SNo"]].copy()
sub["Label"] = pred
sub.to_csv("PREDICTIONS/Submission_LG_hyper.csv", index=False)

# Persist the tuned logistic regression. The previous version dumped
# `grid_search.best_estimator_`, i.e. the HistGB model, under the LG file name.
filename = "MODELS/hyper_LG.sav"
joblib.dump(grid_search2.best_estimator_, filename)

['MODELS/hyper_LG.sav']

## Resources

* https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
* https://machinelearningmastery.com/gradient-boosting-with-scikit-learn-xgboost-lightgbm-and-catboost/