In [1]:
import streamlit as st
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
import joblib

In [2]:
# Load the cleaned penguins table from the working directory and preview
# the first five rows.
df = pd.read_csv(filepath_or_buffer="penguins_cleaned.csv")
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181,3750,male
1,Adelie,Torgersen,39.5,17.4,186,3800,female
2,Adelie,Torgersen,40.3,18.0,195,3250,female
3,Adelie,Torgersen,36.7,19.3,193,3450,female
4,Adelie,Torgersen,39.3,20.6,190,3650,male


In [3]:
# Count the rows per species label to see how balanced the target classes are.
df["species"].value_counts()

species
Adelie       146
Gentoo       119
Chinstrap     68
Name: count, dtype: int64

In [4]:
# Integer class codes for the target column. The resulting codes are what the
# classifier is trained on, so its predictions use the same 0/1/2 encoding.
mapper = {
    "Adelie": 0,
    "Gentoo": 1,
    "Chinstrap": 2,
}

# Encode only while the column still holds species names. Re-running this cell
# on an already-encoded column would look up the integers in a string-keyed
# dict and silently turn every target value into NaN.
if not pd.api.types.is_integer_dtype(df["species"]):
    encoded_species = df["species"].map(mapper)

    # Series.map yields NaN for any value missing from the dict; fail loudly
    # instead of letting NaN targets flow into the train/test split.
    unmapped_rows = encoded_species.isna()
    if unmapped_rows.any():
        unknown_labels = sorted(df.loc[unmapped_rows, "species"].astype(str).unique())
        raise ValueError(f"Unmapped species labels: {unknown_labels}")

    df["species"] = encoded_species.astype("int64")

df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,Torgersen,39.1,18.7,181,3750,male
1,0,Torgersen,39.5,17.4,186,3800,female
2,0,Torgersen,40.3,18.0,195,3250,female
3,0,Torgersen,36.7,19.3,193,3450,female
4,0,Torgersen,39.3,20.6,190,3650,male


In [5]:
# Summarise column dtypes and non-null counts. At this point in the run order
# "species" has already been replaced by its integer codes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333 entries, 0 to 332
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            333 non-null    int64  
 1   island             333 non-null    object 
 2   bill_length_mm     333 non-null    float64
 3   bill_depth_mm      333 non-null    float64
 4   flipper_length_mm  333 non-null    int64  
 5   body_mass_g        333 non-null    int64  
 6   sex                333 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 18.3+ KB


In [6]:
# Split the frame into the target (species code) and the feature columns
# (everything except the target).
y = df["species"]
X = df.drop(columns="species")

In [7]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [8]:
# Feature columns grouped by the preprocessing they receive.
categorical_features = ["island", "sex"]
numeric_features = [
    "bill_length_mm",
    "bill_depth_mm",
    "flipper_length_mm",
    "body_mass_g",
]

# One-hot encode the categorical columns. handle_unknown="error" means a
# category not seen during fit raises at transform time.
# NOTE(review): this step is named "scaler" although it wraps an encoder; the
# name is kept because step names are part of the pipeline's parameter paths.
categorical_pipeline = Pipeline(
    steps=[("scaler", OneHotEncoder(handle_unknown="error"))]
)

# Standardise the numeric measurements.
numeric_pipeline = Pipeline(steps=[("scaler", StandardScaler())])

# Route each column group through its own sub-pipeline.
transformer = ColumnTransformer(
    transformers=[
        ("categorical", categorical_pipeline, categorical_features),
        ("num", numeric_pipeline, numeric_features),
    ]
)

In [9]:
# Full model: preprocessing followed by a 70-tree gradient boosting
# classifier, seeded so that training is reproducible.
mlpipe = Pipeline(
    steps=[
        ("transformer", transformer),
        ("GRB", GradientBoostingClassifier(n_estimators=70, random_state=0)),
    ]
)

In [10]:
# Display the assembled pipeline (fit is called in a later cell).
mlpipe

In [11]:
# Fit the preprocessing steps and the classifier on the training split only.
mlpipe.fit(X_train,y_train)

In [12]:
# Predicted species codes (0/1/2, as defined by `mapper`) for the held-out rows.
y_hat = mlpipe.predict(X_test)

In [13]:
# Uncomment to save the fitted pipeline to disk with joblib:
#joblib.dump(mlpipe,"GradientBoostingClassifier.joblib")

In [23]:
# Per-class probabilities for the held-out rows, one row per sample. Columns
# follow the order of mlpipe.classes_, i.e. the sorted species codes
# 0 (Adelie), 1 (Gentoo), 2 (Chinstrap) from `mapper`.
# NOTE(review): the execution count jumps from 13 to 23 here; confirm the
# notebook still reproduces this output under Restart Kernel -> Run All.
mlpipe.predict_proba(X_test)

array([[9.99741654e-01, 1.35789412e-04, 1.22556821e-04],
       [9.99776312e-01, 1.35794120e-04, 8.78938019e-05],
       [3.23781617e-04, 4.88651131e-05, 9.99627353e-01],
       [9.99921007e-01, 4.22393154e-05, 3.67537731e-05],
       [9.99229428e-01, 3.77556579e-04, 3.93015253e-04],
       [9.99842691e-01, 6.19165643e-05, 9.53924648e-05],
       [9.59097136e-05, 9.99871701e-01, 3.23897300e-05],
       [1.55524778e-04, 5.39687729e-05, 9.99790506e-01],
       [9.59097136e-05, 9.99871701e-01, 3.23897300e-05],
       [9.59097136e-05, 9.99871701e-01, 3.23897300e-05],
       [1.10720132e-04, 5.89486517e-05, 9.99830331e-01],
       [9.99871819e-01, 6.80300081e-05, 6.01506164e-05],
       [8.34150893e-01, 3.47049873e-04, 1.65502057e-01],
       [8.90012338e-05, 9.99878609e-01, 3.23899538e-05],
       [9.99888024e-01, 6.79726860e-05, 4.40030143e-05],
       [8.32395220e-01, 3.46322727e-04, 1.67258457e-01],
       [8.90012338e-05, 9.99878609e-01, 3.23899538e-05],
       [9.99838027e-01, 9.83749