In [9]:
# import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import confusion_matrix,classification_report,plot_confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


In [6]:
# Read the penguin measurements CSV from the working directory and preview
# the first five rows.
df = pd.read_csv(filepath_or_buffer="penguins_size.csv")
df.head(n=5)

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [12]:
# Number of missing values in each column.
df.isna().sum(axis=0)

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

In [15]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how='any')

In [16]:
# Re-count missing values per column to confirm none remain.
df.isna().sum(axis=0)

species              0
island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [20]:
# Distinct values present in the `sex` column.
pd.unique(df['sex'])

array(['MALE', 'FEMALE', '.'], dtype=object)

In [21]:
# Distinct values present in the `island` column.
pd.unique(df['island'])

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

In [22]:
# Keep only rows whose `sex` is not the '.' placeholder value.
df = df.loc[df['sex'].ne('.')]

In [47]:
# Separate predictors from the target. `X` and `y` were not defined by any
# earlier cell, so this cell failed with NameError on a fresh kernel.
# The target is `species` (the class labels shown in the classification
# reports); every other column of `df` is used as a predictor.
X = df.drop(columns='species')
y = df['species']

# Hold out 15% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=101)


# Model training

## DecisionTreeClassifier (grid search)

In [58]:

# Dummy-encode the categorical predictors (first level of each dropped), then
# split with test_size=0.15 and random_state=101.
X_dum = pd.get_dummies(X, drop_first=True)
X_train_dump, X_test_dump, y_train_dump, y_test_dump = train_test_split(
    X_dum, y, test_size=0.15, random_state=101
)

# Candidate hyper-parameters for the decision tree. `random_state` is a
# single-value entry so that every candidate tree is seeded identically.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': list(range(3, 7)),
    'max_leaf_nodes': list(range(3, 14)),
    'random_state': [101],
}

# Exhaustive search over the grid with 5-fold cross-validation on the
# training portion only.
dt_classifier = DecisionTreeClassifier()
grid_search_ONLY = GridSearchCV(estimator=dt_classifier, param_grid=param_grid, cv=5)
grid_search_ONLY.fit(X_train_dump, y_train_dump)

# Report the winning combination and its mean cross-validated score.
print("\nBest parameters found: ", grid_search_ONLY.best_params_)
print("\nBest score found: ", grid_search_ONLY.best_score_)

# Evaluate the search's best estimator on the held-out test rows.
Ypred = grid_search_ONLY.predict(X_test_dump)
print()
print(classification_report(y_test_dump, Ypred))


Best parameters found:  {'criterion': 'gini', 'max_depth': 5, 'max_leaf_nodes': 9, 'random_state': 101, 'splitter': 'random'}

Best score found:  0.9823934837092733

              precision    recall  f1-score   support

      Adelie       0.96      0.96      0.96        24
   Chinstrap       0.92      0.92      0.92        12
      Gentoo       1.00      1.00      1.00        14

    accuracy                           0.96        50
   macro avg       0.96      0.96      0.96        50
weighted avg       0.96      0.96      0.96        50



In [59]:
# Display the fitted search object (estimator and parameter grid).
grid_search_ONLY

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [3, 4, 5, 6],
                         'max_leaf_nodes': [3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                            13],
                         'random_state': [101],
                         'splitter': ['best', 'random']})

## DecisionTreeClassifier (Pipeline and grid search)

In [51]:

# Column names grouped by dtype.
# NOTE(review): these two indexes are computed from `df` rather than `X` and
# are not used anywhere else in this cell; the ColumnTransformer below picks
# its columns by dtype through `selector` instead.
numeric_features = df.select_dtypes(include='float64').columns
categorical_features = df.select_dtypes(include='object').columns

# Numeric columns are standardised; object columns are one-hot encoded, with
# one level dropped for two-category features only.
numeric_transformer = Pipeline([('scaler', StandardScaler())])
categorical_transformer = Pipeline(
    [('onehot', OneHotEncoder(drop='if_binary', handle_unknown='error'))]
)

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, selector(dtype_include='float64')),
    ('cat', categorical_transformer, selector(dtype_include='object')),
])

# Preprocessing and the tree live in one pipeline, so the preprocessing is
# re-fitted inside each cross-validation fold.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('decisiontreeclassifier', DecisionTreeClassifier()),
])

# Hyper-parameter grid; the `decisiontreeclassifier__` prefix routes each
# entry to the pipeline step of that name.
param_grid = {
    'decisiontreeclassifier__criterion': ['gini', 'entropy'],
    'decisiontreeclassifier__splitter': ['best', 'random'],
    'decisiontreeclassifier__max_depth': list(range(3, 7)),
    'decisiontreeclassifier__max_leaf_nodes': list(range(3, 14)),
    'decisiontreeclassifier__random_state': [101],
}

# 5-fold cross-validated search over the whole pipeline.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated score.
print("\nBest parameters found: ", grid_search.best_params_)
print("\nBest score found: ", grid_search.best_score_)

# Evaluate the search's best pipeline on the held-out test rows.
Ypred = grid_search.predict(X_test)
print()
print(classification_report(y_test, Ypred))


Best parameters found:  {'decisiontreeclassifier__criterion': 'entropy', 'decisiontreeclassifier__max_depth': 6, 'decisiontreeclassifier__max_leaf_nodes': 13, 'decisiontreeclassifier__random_state': 101, 'decisiontreeclassifier__splitter': 'random'}

Best score found:  0.9859022556390977

              precision    recall  f1-score   support

      Adelie       0.96      0.96      0.96        24
   Chinstrap       0.92      1.00      0.96        12
      Gentoo       1.00      0.93      0.96        14

    accuracy                           0.96        50
   macro avg       0.96      0.96      0.96        50
weighted avg       0.96      0.96      0.96        50



In [56]:
# Display the fitted search object wrapping the preprocessing + tree pipeline.
grid_search

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('scaler',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x000002B9C031D8E0>),
                                                                        ('cat',
                                                                         Pipeline(steps=[('onehot',
                                                                                          OneHotEncoder(drop='if_binary'))]),
                                                                         <sklearn.compose._column_transformer.make_column_se...9C031D820>)])),
                      

## SVC (Pipeline and grid search)

In [37]:
# Chain the shared preprocessor with a support-vector classifier.
# probability=True makes SVC expose predict_proba, which is required if the
# voting ensemble is ever switched to soft voting.
pipeline_svm = Pipeline(steps=[('preprocessor', preprocessor),
                               ('svc', SVC(probability=True))])

# Hyper-parameter grid; the `svc__` prefix routes each entry to the `svc` step.
param_grid_svm = {
    'svc__C': [0.1, 1, 10, 100],
    'svc__kernel': ['linear', 'rbf', 'poly'],
    'svc__gamma': ['scale', 'auto']
}

# 5-fold cross-validated search over the whole pipeline, fitted on the
# training rows only.
cv_model_svm = GridSearchCV(estimator=pipeline_svm, param_grid=param_grid_svm, cv=5)
cv_model_svm.fit(X_train, y_train)

# Winning hyper-parameter combination.
best_params_svm = cv_model_svm.best_params_
print("Best Parameters for SVM:", best_params_svm)

# Pipeline refitted on the full training set with the winning parameters.
best_model_svm = cv_model_svm.best_estimator_

# Accuracy of the best pipeline on the held-out test rows.
accuracy_svm = best_model_svm.score(X_test, y_test)
print("Accuracy for SVM:", accuracy_svm)

# Per-class precision / recall / F1 on the same test rows.
Ypred = cv_model_svm.predict(X_test)
print()
print(classification_report(y_test, Ypred))

Best Parameters for SVM: {'svc__C': 0.1, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
Accuracy for SVM: 1.0

              precision    recall  f1-score   support

      Adelie       1.00      1.00      1.00        24
   Chinstrap       1.00      1.00      1.00        12
      Gentoo       1.00      1.00      1.00        14

    accuracy                           1.00        50
   macro avg       1.00      1.00      1.00        50
weighted avg       1.00      1.00      1.00        50



In [54]:
# Display the best SVM pipeline selected by the grid search.
best_model_svm

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000002B9BE579160>),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(drop='if_binary'))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000002B9BDAB8D90>)])),
                ('svc', SVC(C=0.1, kernel='linear', probability=True))])

## VotingClassifier (Pipeline)


In [52]:
# Majority-vote ensemble of the two tuned pipelines (SVM and decision tree).
# VotingClassifier fits clones of the estimators it is given, so the pipelines
# held by the grid-search objects are not modified by the fit below.
voting_classifier = VotingClassifier(
    estimators=[
        ('svm', cv_model_svm.best_estimator_),
        ('decision_tree', grid_search.best_estimator_),
    ],
    voting='hard',  # Change 'hard' to 'soft' for soft voting
)

# Fit the ensemble on the training rows.
voting_classifier.fit(X_train, y_train)

# Accuracy and per-class metrics on the held-out test rows.
accuracy_voting = voting_classifier.score(X_test, y_test)
print("Accuracy for Voting Classifier:", accuracy_voting)
Ypred = voting_classifier.predict(X_test)
print()
print(classification_report(y_test, Ypred))

Accuracy for Voting Classifier: 0.98

              precision    recall  f1-score   support

      Adelie       0.96      1.00      0.98        24
   Chinstrap       1.00      1.00      1.00        12
      Gentoo       1.00      0.93      0.96        14

    accuracy                           0.98        50
   macro avg       0.99      0.98      0.98        50
weighted avg       0.98      0.98      0.98        50



In [53]:
# Display the fitted sub-estimators held by the voting ensemble.
voting_classifier.estimators_

[Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('scaler',
                                                                    StandardScaler())]),
                                                   <sklearn.compose._column_transformer.make_column_selector object at 0x000002B9C0281DF0>),
                                                  ('cat',
                                                   Pipeline(steps=[('onehot',
                                                                    OneHotEncoder(drop='if_binary'))]),
                                                   <sklearn.compose._column_transformer.make_column_selector object at 0x000002B9C0281EB0>)])),
                 ('svc', SVC(C=0.1, kernel='linear', probability=True))]),
 Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num',
                                                 