# Gradient Boosting and GridSearch


In [32]:


from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score, f1_score, recall_score

def input_mlflow(cv_r_v,increment_number,X_train, y_train,pipeline,run_name,experiment_id,X_test,y_test):
    

    for wi in range(len(cv_r_v)):
        
        #====================skema===================
        cols_spec = []
        data_map = {
                'int64': 'integer',
                'float64': 'double',
                'bool': 'boolean',
                'str': 'string',
                'object': 'string',
                "date": 'datetime'
            }

        for name, dtype in X_train.dtypes.to_dict().items():
            cols_spec.append(ColSpec(name=name, type=data_map[str(dtype)]))
        input_schema = Schema(inputs=cols_spec)
        output_schema = Schema([ColSpec(name="label", type="string")])
        #parameter = ParamSpec(name="model_name", dtype="string", default="model1")
        #param_schema = ParamSchema(params=[parameter])
        model_signature = ModelSignature(inputs=input_schema, outputs=output_schema)#, params=param_schema)
        #print("MODEL SIGNATURE")
        #print(model_signature.to_dict())

        model_signature = infer_signature(X_train, y_train)#, params={"model_name": "model1"})
        #print("MODEL SIGNATURE")
        #print(model_signature.to_dict())

        
        
        
        
        #====================buat run baru===================
        run_name_with_increment = f"{run_name}__{increment_number}"
            # Membuka run MLflow
        with mlflow.start_run(run_name=run_name_with_increment, experiment_id=experiment_id) as run:
            # Mendapatkan run_id
            run_id = mlflow.active_run().info.run_id
        increment_number=increment_number+1
        print("run_id:", run_id)

        
        
        #====================buat parameter model dan metric===================

        # Definisikan grid parameter untuk dicari
        param_grid = cv_r_v[wi]
        
        #====================model===================

        # Inisialisasi GridSearchCV
        grid_search = GridSearchCV(pipeline, param_grid, cv=5,  scoring='accuracy')

        # Lakukan pencarian grid
        grid_search.fit(X_train, y_train)
        
            
        #====================parameter===================
        pipe=grid_search.best_estimator_
        
        
        first_step_name = list(pipe.named_steps.keys())[0:len(pipe)]
        

        for i in range(len(pipe)):
            # Mendapatkan parameter dari langkah 'sca'
            nama=first_step_name[i]
            sca_params = pipe.get_params()[nama]

                # Membuka run MLflow
            with mlflow.start_run( experiment_id=experiment_id,run_id=run_id) as run:

                # Log parameter secara otomatis menggunakan loop
                for param_name, param_value in sca_params.get_params().items():
                    param_name=nama+'__'+param_name
                    #print(param_name,param_value)
                    mlflow.log_param(param_name, param_value)
                    
                    
                #====================metric===================
                #matric
                hasil_test=grid_search.predict(X_test)
 
                
                
                
                                # Menghitung akurasi
                accuracy = accuracy_score(y_test, hasil_test)

                # Menghitung F1 score
                f1 = f1_score(y_test, hasil_test,average='macro')

                # Menghitung recall
                recall = recall_score(y_test, hasil_test,average='macro')

                
                
                                # Menghitung precision
                precision = precision_score(y_test, hasil_test,average='macro')


   
                

                # log model 
                mlflow.sklearn.log_model(sk_model=grid_search, artifact_path="grid_search__"+str(increment_number-1),signature=model_signature)

                metrics = {
                    'mean_test_score':pd.DataFrame(grid_search.cv_results_)['mean_test_score'].values[0],
                    "accuracy": accuracy,
                    "f1": f1,
                    "recall": recall,
                    "precision": precision,
         
                }

                mlflow.log_metrics(metrics)
                
                
        mlflow.end_run()
    print('selesai')

        
    return increment_number


 

def ambil_best(grid_search,n):
    cv_r=grid_search.copy()
    cv_r_v=cv_r.sort_values(['rank_test_score'])
    cv_r_v=cv_r_v.head(n)['params'].values

    for wi in range(len(cv_r_v)):

        for i in list(cv_r_v[wi]):

            cv_r_v[wi][i]=[cv_r_v[wi][i]]


    return cv_r_v


In [33]:
import mlflow
import mlflow.sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from mlflow.models.signature import infer_signature
from mlflow.types.schema import ColSpec, Schema
import pandas as pd

def input_mlflow_kecil(cv_r_v, increment_number, X_train, y_train, pipeline, run_name, experiment_id, X_test, y_test):
    

    for wi in range(len(cv_r_v)):
        
        
        # Define data schema
        data_map = {
            'int64': 'integer',
            'float64': 'double',
            'bool': 'boolean',
            'str': 'string',
            'object': 'string',
            'datetime64': 'datetime'
        }
        
        cols_spec = [ColSpec(name=name, type=data_map[str(dtype)]) for name, dtype in X_train.dtypes.items()]
        input_schema = Schema(inputs=cols_spec)
        
        output_schema = Schema([ColSpec(name="label", type="string")])
       
        model_signature = infer_signature(X_train.sample(n=100), y_train.sample(n=100))
       
        run_name_with_increment = f"{run_name}__{increment_number}"
        
        
        with mlflow.start_run(run_name=run_name_with_increment, experiment_id=experiment_id) as run:
            run_id = run.info.run_id
            increment_number += 1
            print(f" {increment_number}  --> run_id:", run_id)
            
            # Define the grid parameters
            param_grid = cv_r_v[wi]
            
            # Initialize and fit GridSearchCV
            grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
            grid_search.fit(X_train, y_train)
           
            
            # Log best parameters for each step in the pipeline
            best_pipeline = grid_search.best_estimator_
            for step_name, step in best_pipeline.named_steps.items():
                step_params = step.get_params()
                for param_name, param_value in step_params.items():
                    full_param_name = f"{step_name}__{param_name}"
                    mlflow.log_param(full_param_name, param_value)
            
            # Predict on test set and calculate metrics
          
            predictions = grid_search.predict(X_test)
            accuracy = accuracy_score(y_test, predictions)
            f1 = f1_score(y_test, predictions, average='macro')
            recall = recall_score(y_test, predictions, average='macro')
            precision = precision_score(y_test, predictions, average='macro')
            
            # Log model
            mlflow.sklearn.log_model(sk_model=best_pipeline, artifact_path=f"model_{increment_number-1}", signature=model_signature)
            
            # Log metrics
            metrics = {
                'mean_test_score': grid_search.best_score_,
                'accuracy': accuracy,
                'f1': f1,
                'recall': recall,
                'precision': precision,
            }
            mlflow.log_metrics(metrics)
           
            
        mlflow.end_run()
    
    print('Finished logging all runs')
    
    return increment_number

# Example usage
# Assuming you have your data (X_train, y_train, X_test, y_test) and pipeline ready
# cv_r_v = [{'param1': [values], 'param2': [values]}, ...]
# increment_number = input_mlflow(cv_r_v, increment_number, X_train, y_train, pipeline, run_name, experiment_id, X_test, y_test)




Attribute Information:

1. cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s
2. cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s
3. cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r, pink=p,purple=u,red=e,white=w,yellow=y
4. bruises?: bruises=t,no=f
5. odor: almond=a,anise=l,creosote=c,fishy=y,foul=f, musty=m,none=n,pungent=p,spicy=s
6. gill-attachment: attached=a,descending=d,free=f,notched=n
7. gill-spacing: close=c,crowded=w,distant=d
8. gill-size: broad=b,narrow=n
9. gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e, white=w,yellow=y
10. stalk-shape: enlarging=e,tapering=t
11. stalk-root: bulbous=b,club=c,cup=u,equal=e, rhizomorphs=z,rooted=r,missing=?
12. stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
13. stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
14. stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o, pink=p,red=e,white=w,yellow=y
15. stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o, pink=p,red=e,white=w,yellow=y
16. veil-type: partial=p,universal=u
17. veil-color: brown=n,orange=o,white=w,yellow=y
18. ring-number: none=n,one=o,two=t
19. ring-type: cobwebby=c,evanescent=e,flaring=f,large=l, none=n,pendant=p,sheathing=s,zone=z
20. spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r, orange=o,purple=u,white=w,yellow=y
21. population: abundant=a,clustered=c,numerous=n, scattered=s,several=v,solitary=y
22. habitat: grasses=g,leaves=l,meadows=m,paths=p, urban=u,waste=w,woods=d

## Imports

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score, f1_score, recall_score
# import libraries

import scipy.stats as stats




import statsmodels.api as sm




from scipy.stats import shapiro,normaltest,kstest,jarque_bera
import pingouin as pg
from pingouin import kruskal




from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import mlflow



from mlflow.models.signature import ModelSignature
from mlflow.models.signature import infer_signature
from mlflow.types.schema import Schema
from mlflow.types.schema import ParamSchema
from mlflow.types.schema import ParamSpec
from mlflow.types.schema import ColSpec
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import pandas as pd
from typing import Tuple


from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score


import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector as selector
from scipy.stats import chi2_contingency, fisher_exact

In [4]:
df = pd.read_csv("data/data_csv/data_bersih.csv")

In [43]:
len(df)

8124

In [5]:
df.head()

Unnamed: 0,class,cap-shape,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,n,t,p,f,c,n,k,e,e,s,s,w,w,o,p,k,s,u
1,e,x,y,t,a,f,c,b,k,e,c,s,s,w,w,o,p,n,n,g
2,e,b,w,t,l,f,c,b,n,e,c,s,s,w,w,o,p,n,n,m
3,p,x,w,t,p,f,c,n,n,e,e,s,s,w,w,o,p,k,s,u
4,e,x,g,f,n,f,w,b,k,t,e,s,s,w,w,o,e,n,a,g


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-color                 8124 non-null   object
 3   bruises                   8124 non-null   object
 4   odor                      8124 non-null   object
 5   gill-attachment           8124 non-null   object
 6   gill-spacing              8124 non-null   object
 7   gill-size                 8124 non-null   object
 8   gill-color                8124 non-null   object
 9   stalk-shape               8124 non-null   object
 10  stalk-root                8124 non-null   object
 11  stalk-surface-above-ring  8124 non-null   object
 12  stalk-surface-below-ring  8124 non-null   object
 13  stalk-color-above-ring    8124 non-null   object
 14  stalk-color-below-ring  

In [7]:
df.describe().T

Unnamed: 0,count,unique,top,freq
class,8124,2,e,4208
cap-shape,8124,6,x,3656
cap-color,8124,10,n,2284
bruises,8124,2,f,4748
odor,8124,9,n,3528
gill-attachment,8124,2,f,7914
gill-spacing,8124,2,c,6812
gill-size,8124,2,b,5612
gill-color,8124,12,b,1728
stalk-shape,8124,2,t,4608


## Data Prep

In [17]:
X = df.drop('class',axis=1)
y = df['class']

print(y.value_counts())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=11)

y_test.value_counts()

e    4208
p    3916
Name: class, dtype: int64


e    859
p    766
Name: class, dtype: int64

In [18]:
from sklearn.ensemble import GradientBoostingClassifier

In [19]:
categorical_features = selector(dtype_include='object')



categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first', handle_unknown='error'))
])


preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)
    ]
)


pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('gbc', GradientBoostingClassifier())])


In [20]:
pipeline 

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(drop='first'))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x00000164925EEA00>)])),
                ('gbc', GradientBoostingClassifier())])

In [21]:
param_grid = {'gbc__learning_rate':[0.001,0.1,0.01] ,
'gbc__tol':[0.001,0.1,0.01],
'gbc__n_estimators':[64,100,128,150,90,70],
'gbc__criterion':['friedman_mse', 'squared_error'],
'gbc__max_depth':[2,3,4],
'gbc__ccp_alpha':[0.001,0.1,0.01],
'gbc__random_state':[101]}

In [22]:
grid_search_rfc = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search_rfc .fit(X_train, y_train)


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         Pipeline(steps=[('onehot',
                                                                                          OneHotEncoder(drop='first'))]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x00000164925EEA00>)])),
                                       ('gbc', GradientBoostingClassifier())]),
             param_grid={'gbc__ccp_alpha': [0.001, 0.1, 0.01],
                         'gbc__criterion': ['friedman_mse', 'squared_error'],
                         'gbc__learning_rate': [0.001, 0.1, 0.01],
                         'gbc__max_depth': [2, 3, 4],
                         'gbc__n_estimators': [64, 100, 128, 150, 90, 70],
                         'gbc__random_st

In [23]:
df_1 = pd.DataFrame(grid_search_rfc.cv_results_)
df_1.drop_duplicates(subset=['mean_test_score', 'std_test_score', 'rank_test_score'], inplace=True)
df_1 = df_1.sort_values(by=['mean_test_score', 'std_test_score'], ascending=[False, True])
df_1.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gbc__ccp_alpha,param_gbc__criterion,param_gbc__learning_rate,param_gbc__max_depth,param_gbc__n_estimators,param_gbc__random_state,param_gbc__tol,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
156,0.616325,0.025459,0.008989,0.005252,0.001,friedman_mse,0.01,4,90,101,0.001,"{'gbc__ccp_alpha': 0.001, 'gbc__criterion': 'f...",0.999231,0.996154,0.998462,0.998462,0.996151,0.997692,0.001288,1
147,0.661114,0.039916,0.008196,0.006365,0.001,friedman_mse,0.01,4,100,101,0.001,"{'gbc__ccp_alpha': 0.001, 'gbc__criterion': 'f...",0.999231,0.996154,0.997692,0.998462,0.996151,0.997538,0.001231,7
90,0.403594,0.012301,0.013352,0.005017,0.001,friedman_mse,0.1,4,64,101,0.001,"{'gbc__ccp_alpha': 0.001, 'gbc__criterion': 'f...",0.997692,0.999231,0.993846,0.997692,0.996151,0.996922,0.001821,13
144,0.440739,0.014596,0.0104,0.005908,0.001,friedman_mse,0.01,4,64,101,0.001,"{'gbc__ccp_alpha': 0.001, 'gbc__criterion': 'f...",0.999231,0.99,0.998462,0.998462,0.996921,0.996615,0.003392,49
159,0.474496,0.047194,0.010153,0.007282,0.001,friedman_mse,0.01,4,70,101,0.001,"{'gbc__ccp_alpha': 0.001, 'gbc__criterion': 'f...",0.999231,0.99,0.998462,0.998462,0.993072,0.995845,0.003661,55


In [24]:


data_ambil=ambil_best(df_1.head(10),10)


In [25]:
data_ambil

array([{'gbc__ccp_alpha': [0.001], 'gbc__criterion': ['friedman_mse'], 'gbc__learning_rate': [0.01], 'gbc__max_depth': [4], 'gbc__n_estimators': [90], 'gbc__random_state': [101], 'gbc__tol': [0.001]},
       {'gbc__ccp_alpha': [0.001], 'gbc__criterion': ['friedman_mse'], 'gbc__learning_rate': [0.01], 'gbc__max_depth': [4], 'gbc__n_estimators': [100], 'gbc__random_state': [101], 'gbc__tol': [0.001]},
       {'gbc__ccp_alpha': [0.001], 'gbc__criterion': ['friedman_mse'], 'gbc__learning_rate': [0.1], 'gbc__max_depth': [4], 'gbc__n_estimators': [64], 'gbc__random_state': [101], 'gbc__tol': [0.001]},
       {'gbc__ccp_alpha': [0.001], 'gbc__criterion': ['friedman_mse'], 'gbc__learning_rate': [0.01], 'gbc__max_depth': [4], 'gbc__n_estimators': [64], 'gbc__random_state': [101], 'gbc__tol': [0.001]},
       {'gbc__ccp_alpha': [0.001], 'gbc__criterion': ['friedman_mse'], 'gbc__learning_rate': [0.01], 'gbc__max_depth': [4], 'gbc__n_estimators': [70], 'gbc__random_state': [101], 'gbc__tol': [0.00

In [39]:
if __name__ == "__main__":

    experiment_id = mlflow.create_experiment(
        name="project_mush_GradientBoostingClassifier_new",
        tags={"env": "dev", "version": "1.0.0"},
    )

    print(experiment_id)
    

mlflow.end_run()


826357787638646353


In [40]:
run_name = "log___"
increment_number = 1


In [41]:
data_ambil[:6]

array([{'gbc__ccp_alpha': [0.001], 'gbc__criterion': ['friedman_mse'], 'gbc__learning_rate': [0.01], 'gbc__max_depth': [4], 'gbc__n_estimators': [90], 'gbc__random_state': [101], 'gbc__tol': [0.001]},
       {'gbc__ccp_alpha': [0.001], 'gbc__criterion': ['friedman_mse'], 'gbc__learning_rate': [0.01], 'gbc__max_depth': [4], 'gbc__n_estimators': [100], 'gbc__random_state': [101], 'gbc__tol': [0.001]},
       {'gbc__ccp_alpha': [0.001], 'gbc__criterion': ['friedman_mse'], 'gbc__learning_rate': [0.1], 'gbc__max_depth': [4], 'gbc__n_estimators': [64], 'gbc__random_state': [101], 'gbc__tol': [0.001]},
       {'gbc__ccp_alpha': [0.001], 'gbc__criterion': ['friedman_mse'], 'gbc__learning_rate': [0.01], 'gbc__max_depth': [4], 'gbc__n_estimators': [64], 'gbc__random_state': [101], 'gbc__tol': [0.001]},
       {'gbc__ccp_alpha': [0.001], 'gbc__criterion': ['friedman_mse'], 'gbc__learning_rate': [0.01], 'gbc__max_depth': [4], 'gbc__n_estimators': [70], 'gbc__random_state': [101], 'gbc__tol': [0.00

In [42]:
increment_number_small=input_mlflow(data_ambil,increment_number,X_train, y_train,pipeline,run_name,experiment_id,X_test,y_test)

run_id: c309b63b8a9542bb89d53d7d342d7863
run_id: d3c592caaa234123a16c2616715186c1
run_id: e010283eeac747d7a4591861fe7f7a0b
run_id: 1b4a82f7784a41c394ee3e7d4fd06806
run_id: 8bc7e6c4fa9a4730b521d294153a3202
run_id: 94142850a01d4104920082de121c163c
run_id: dd4f9523b39647368e8568472c5b5b9c
run_id: 6c843cae0fd4430084caa31aa90c8756
run_id: 5ce1fcd2ff364430b486421ea107c53a
run_id: 5056c0b7e18e4b7392be2576c42020f4
selesai


In [110]:
df_simpan=pd.concat([X_test,y_test],axis=1)

df_simpan.to_csv('pakai.csv',index=False)

In [124]:
df_simpan

Unnamed: 0,cap-shape,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,ring-number,ring-type,spore-print-color,population,habitat,class
4242,x,y,f,f,f,c,b,p,e,b,k,k,n,b,o,l,h,y,d,p
2057,x,n,t,n,f,c,b,p,t,b,s,s,w,g,o,p,n,y,d,e
4016,x,y,f,f,f,c,b,g,e,b,k,k,n,p,o,l,h,v,p,p
7246,b,n,f,n,a,c,b,o,e,?,s,s,o,o,o,p,o,c,l,e
7746,x,w,f,n,f,w,b,p,e,?,k,k,w,w,t,p,w,n,g,e
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6795,x,e,f,s,f,c,n,b,t,?,k,k,w,w,o,e,w,v,l,p
6016,f,n,f,y,f,c,n,b,t,?,s,k,p,p,o,e,w,v,p,p
7131,k,e,f,y,f,c,n,b,t,?,s,k,w,w,o,e,w,v,d,p
5339,f,y,f,f,f,c,b,p,e,b,k,k,p,b,o,l,h,y,p,p
