In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [2]:
# Location of the penguins CSV inside the mounted Google Drive.
caminho_arquivo = '/content/drive/My Drive/Cursos/TIC/penguins.csv'

# Mount Google Drive so the path above is readable from this Colab runtime.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the dataset from the path defined in the previous cell.
df = pd.read_csv(filepath_or_buffer=caminho_arquivo)

In [4]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [5]:
# Print the dtype pandas inferred for every column.
column_dtypes = df.dtypes
print(column_dtypes)

species               object
island                object
bill_length_mm       float64
bill_depth_mm        float64
flipper_length_mm    float64
body_mass_g          float64
sex                   object
year                   int64
dtype: object


In [6]:
# Count missing values per column (isna is the canonical name of isnull).
df.isna().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
year                  0
dtype: int64

O dataset contém linhas com valores ausentes: 11 na coluna sex e 2 em cada uma das quatro medidas numéricas. Portanto, iremos realizar a limpeza desse dataframe removendo as linhas que contêm algum campo ausente.

In [7]:
# Drop every row that has at least one missing value; df is modified in place,
# and the surviving rows keep their original index labels.
df.dropna(axis=0, how='any', inplace=True)

In [8]:
# Re-count missing values per column to confirm none are left after the drop.
df.isna().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
year                 0
dtype: int64

In [9]:
# Preview the first five remaining rows; the index is not renumbered by dropna,
# so gaps in the row labels are expected.
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,2007


Fazendo Label Encoding para a variável resposta com o objetivo de transformar cada classe categórica em uma representação numérica e única.

In [10]:
# Encode the species labels as integer class codes for the classifiers.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['species'])

In [11]:
# List the column names of the cleaned DataFrame.
df.columns

Index(['species', 'island', 'bill_length_mm', 'bill_depth_mm',
       'flipper_length_mm', 'body_mass_g', 'sex', 'year'],
      dtype='object')

Normalização das features numéricas independentes utilizando a classe MinMaxScaler do scikit-learn.

In [12]:
# Numeric measurement columns to be rescaled to the [0, 1] range.
numeric_features = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]
X_numeric_features = df[numeric_features]

# NOTE(review): the scaler is fitted on every row of df, i.e. before the
# train/test split and the cross-validation folds are made, so the min/max
# statistics also come from rows that are later used for evaluation.
scaler = MinMaxScaler()
X_numeric_features_scaled = scaler.fit(X_numeric_features).transform(X_numeric_features)

# Wrap the scaled array in a DataFrame (fresh default index) to keep column names.
X_numeric_features_scaled_df = pd.DataFrame(
    data=X_numeric_features_scaled,
    columns=numeric_features,
)
X_numeric_features_scaled_df

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,0.254545,0.666667,0.152542,0.291667
1,0.269091,0.511905,0.237288,0.305556
2,0.298182,0.583333,0.389831,0.152778
3,0.167273,0.738095,0.355932,0.208333
4,0.261818,0.892857,0.305085,0.263889
...,...,...,...,...
328,0.861818,0.797619,0.593220,0.361111
329,0.414545,0.595238,0.508475,0.194444
330,0.636364,0.607143,0.355932,0.298611
331,0.680000,0.702381,0.644068,0.388889


Codificação das features categóricas utilizando o OneHotEncoder e eliminação da primeira coluna resultante desse processo de codificação para evitar a armadilha das variáveis dummy.

In [13]:
# One-hot encode the categorical predictors. drop='first' removes the first
# category of each feature, so a feature with k categories yields k-1 columns.
categorical_features = ['island', 'sex']
X_categorical_features = df[categorical_features]

# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and the
# old name was later removed, so neither spelling works on every release.
# Keeping the encoder's default (sparse) output and densifying it explicitly
# works regardless of the installed version.
encoder = OneHotEncoder(drop='first')
X_categorical_features_encoded = encoder.fit_transform(X_categorical_features).toarray()

# Wrap the dense array in a DataFrame labelled with the generated dummy names.
X_categorical_features_encoded_df = pd.DataFrame(
    X_categorical_features_encoded,
    columns=encoder.get_feature_names_out(categorical_features),
)
X_categorical_features_encoded_df



Unnamed: 0,island_Dream,island_Torgersen,sex_male
0,0.0,1.0,1.0
1,0.0,1.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,1.0,1.0
...,...,...,...
328,1.0,0.0,1.0
329,1.0,0.0,0.0
330,1.0,0.0,1.0
331,1.0,0.0,1.0


Concatenação dos DataFrames com as features categóricas que foram codificadas para numérico e com as features independentes que foram normalizadas utilizando MinMaxScaler.

In [14]:
# Place the encoded categorical columns and the scaled numeric columns side by
# side; both frames were built without an explicit index, so rows line up by
# position.
feature_frames = [X_categorical_features_encoded_df, X_numeric_features_scaled_df]
X_final = pd.concat(objs=feature_frames, axis='columns')
X_final

Unnamed: 0,island_Dream,island_Torgersen,sex_male,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,0.0,1.0,1.0,0.254545,0.666667,0.152542,0.291667
1,0.0,1.0,0.0,0.269091,0.511905,0.237288,0.305556
2,0.0,1.0,0.0,0.298182,0.583333,0.389831,0.152778
3,0.0,1.0,0.0,0.167273,0.738095,0.355932,0.208333
4,0.0,1.0,1.0,0.261818,0.892857,0.305085,0.263889
...,...,...,...,...,...,...,...
328,1.0,0.0,1.0,0.861818,0.797619,0.593220,0.361111
329,1.0,0.0,0.0,0.414545,0.595238,0.508475,0.194444
330,1.0,0.0,1.0,0.636364,0.607143,0.355932,0.298611
331,1.0,0.0,1.0,0.680000,0.702381,0.644068,0.388889


Codificação da variável resposta (dependente).

In [15]:
# Encode the target again (same steps as the earlier encoding cell) and show
# the resulting integer codes.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['species'])
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

Divisão dos dados entre treinamento e teste.

In [16]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y,
    test_size=0.2,
    random_state=42,
)

Treinamento do modelo com Random Forest.

In [17]:
# Fit a random forest on the training split.
# A fixed random_state makes the forest's internal randomness repeatable, so
# the hold-out and cross-validation scores computed from model_rf do not change
# from one run of the notebook to the next. The seed matches the one already
# used for the train/test split and the KFold object.
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train, y_train)

In [18]:
# Predict the species codes for the held-out rows and display them.
# (The `_rt` suffix is kept because the accuracy cell reads this exact name.)
y_pred_model_rt = model_rf.predict(X=X_test)
y_pred_model_rt

array([0, 1, 0, 2, 0, 1, 1, 2, 2, 2, 0, 0, 1, 0, 1, 0, 0, 2, 0, 1, 0, 0,
       1, 2, 0, 0, 2, 1, 2, 1, 2, 1, 0, 0, 1, 1, 2, 2, 0, 0, 0, 0, 2, 2,
       0, 0, 1, 0, 0, 1, 0, 2, 2, 0, 0, 2, 0, 0, 2, 2, 1, 1, 1, 0, 0, 1,
       0])

In [19]:
# Share of held-out rows whose predicted species matches the true one.
rf_test_accuracy = accuracy_score(y_test, y_pred_model_rt)
print(f'Random Forest Accuracy: {rf_test_accuracy}')

Random Forest Accuracy: 1.0


Treinamento do modelo com KNN

In [20]:
# Fit a k-nearest-neighbours classifier (library defaults) on the training split.
model_knn = KNeighborsClassifier()
model_knn.fit(X=X_train, y=y_train)

In [21]:
# Predict the species codes for the held-out rows with KNN and display them.
y_pred_model_knn = model_knn.predict(X=X_test)
y_pred_model_knn

array([0, 1, 0, 2, 0, 1, 1, 2, 2, 2, 0, 0, 1, 0, 1, 0, 0, 2, 0, 1, 0, 0,
       1, 2, 0, 0, 2, 1, 2, 1, 2, 1, 0, 0, 1, 1, 2, 2, 0, 0, 0, 0, 2, 2,
       0, 0, 1, 0, 0, 1, 0, 2, 2, 0, 0, 2, 0, 0, 2, 2, 1, 1, 1, 0, 0, 1,
       0])

In [22]:
# Accuracy of the KNN predictions on the held-out rows. The label previously
# read "Random Forest" even though the score is computed from y_pred_model_knn.
print(f'Acurácia do KNN: {accuracy_score(y_test, y_pred_model_knn)}')

Acurácia do Random Forest: 1.0


Utilizando a técnica de Validação Cruzada com k-Fold Cross-Validation

Criando o objeto KFold com 5 folds

In [23]:
# Five shuffled folds with a fixed seed, reused by the cross-validation cells.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

Calculando a acurácia utilizando validação cruzada para o Random Forest

In [24]:
# Cross-validated accuracy of the random forest over the folds defined by kf,
# reported as mean +/- standard deviation across folds.
scores_rf = cross_val_score(estimator=model_rf, X=X_final, y=y, scoring='accuracy', cv=kf)
rf_acc_mean, rf_acc_std = scores_rf.mean(), scores_rf.std()
print(f'Random Forest Acurácia (Validação Cruzada): {rf_acc_mean:.3f} +/- {rf_acc_std:.3f}')

Random Forest Acurácia (Validação Cruzada): 0.988 +/- 0.011


In [25]:
# Macro-averaged metrics evaluated with the same folds; each metric triggers
# its own cross-validation run and overwrites scores_rf.
scoring_metrics = ['precision_macro', 'recall_macro', 'f1_macro']
for metric in scoring_metrics:
    scores_rf = cross_val_score(estimator=model_rf, X=X_final, y=y, scoring=metric, cv=kf)
    rf_mean, rf_std = scores_rf.mean(), scores_rf.std()
    print(f'Random Forest {metric.capitalize()}: {rf_mean:.3f} +/- {rf_std:.3f}')

Random Forest Precision_macro: 0.991 +/- 0.008
Random Forest Recall_macro: 0.984 +/- 0.016
Random Forest F1_macro: 0.984 +/- 0.016


Calculando a acurácia utilizando validação cruzada para o KNN

In [26]:
# Cross-validated accuracy of KNN over the folds defined by kf, reported as
# mean +/- standard deviation across folds.
scores_knn = cross_val_score(estimator=model_knn, X=X_final, y=y, scoring='accuracy', cv=kf)
knn_acc_mean, knn_acc_std = scores_knn.mean(), scores_knn.std()
print(f'KNN Acurácia (Validação Cruzada): {knn_acc_mean:.3f} +/- {knn_acc_std:.3f}')

KNN Acurácia (Validação Cruzada): 1.000 +/- 0.000


In [27]:
# Same macro-averaged metrics for KNN; relies on scoring_metrics defined in the
# Random Forest metrics cell, and overwrites scores_knn on every iteration.
for metric in scoring_metrics:
    scores_knn = cross_val_score(estimator=model_knn, X=X_final, y=y, scoring=metric, cv=kf)
    knn_mean, knn_std = scores_knn.mean(), scores_knn.std()
    print(f'KNN {metric.capitalize()}: {knn_mean:.3f} +/- {knn_std:.3f}')

KNN Precision_macro: 1.000 +/- 0.000
KNN Recall_macro: 1.000 +/- 0.000
KNN F1_macro: 1.000 +/- 0.000


Em uma primeira análise, o resultado de ambos os modelos desenvolvidos é muito promissor. O modelo Random Forest obteve acurácia na validação cruzada de 0.988 +/- 0.011, enquanto o modelo KNN obteve 1.000 +/- 0.000. É importante observar que o dataset é simples e pequeno, o que ajuda a explicar os resultados obtidos. A validação cruzada foi utilizada para ajudar a compreender os resultados obtidos do accuracy_score e para analisar com mais precisão e robustez a performance dos modelos, a fim de detectar um possível overfitting.
