In [4410]:
import os
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.metrics import precision_score, recall_score, classification_report
import numpy as np

In [4380]:
FOLDER = os.path.join('C:\\Users', 'Hugo Martins', 'downloads', 'penguins_size.csv')
data = pd.read_csv(FOLDER)
data.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [4381]:
data.shape

(344, 7)

In [4382]:
data.describe(include='all')

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
count,344,344,342.0,342.0,342.0,342.0,334
unique,3,3,,,,,3
top,Adelie,Biscoe,,,,,MALE
freq,152,168,,,,,168
mean,,,43.92193,17.15117,200.915205,4201.754386,
std,,,5.459584,1.974793,14.061714,801.954536,
min,,,32.1,13.1,172.0,2700.0,
25%,,,39.225,15.6,190.0,3550.0,
50%,,,44.45,17.3,197.0,4050.0,
75%,,,48.5,18.7,213.0,4750.0,


In [4383]:
data.isnull().sum()

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

In [4384]:
data[data.sex == '.']

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
336,Gentoo,Biscoe,44.5,15.7,217.0,4875.0,.


In [4385]:
# data.iloc[[3,339],:]
# Acá yo retiré las filas totalmente faltantes
data = data.drop([3,339], axis=0)
# Acá yo cambié el punto por FEMALE porqué los valores eran 
# parecidos con otros datos
data['sex'].replace('.', np.nan, inplace=True)

In [4386]:
data[data.sex.isnull()]

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
8,Adelie,Torgersen,34.1,18.1,193.0,3475.0,
9,Adelie,Torgersen,42.0,20.2,190.0,4250.0,
10,Adelie,Torgersen,37.8,17.1,186.0,3300.0,
11,Adelie,Torgersen,37.8,17.3,180.0,3700.0,
47,Adelie,Dream,37.5,18.9,179.0,2975.0,
246,Gentoo,Biscoe,44.5,14.3,216.0,4100.0,
286,Gentoo,Biscoe,46.2,14.4,214.0,4650.0,
324,Gentoo,Biscoe,47.3,13.8,216.0,4725.0,
336,Gentoo,Biscoe,44.5,15.7,217.0,4875.0,


In [4387]:
# Yo eligió los sexos faltantes como datos de testing
nulls = data[data['sex'].isnull()].index.to_list()
data_test_c = [data.loc[nulls], data.sample(frac=0.1)]
data_test = pd.concat(data_test_c)
data_train  = data.drop(data_test.index,axis=0)

In [4388]:
# Cambio los datos categoricos para datos numericos
cols_cat = data.select_dtypes(include=['object']).columns.to_list()
cols_cat.remove('sex')
oc = OrdinalEncoder()

train_encode = oc.fit_transform(data_train[cols_cat])
test_encode = oc.transform(data_test[cols_cat])

In [4389]:
# Cambio el rango de los datos numericos
cols_num = data.select_dtypes(include=['int', 'float']).columns.to_list()

minmax = MinMaxScaler()

train_scaled = minmax.fit_transform(data_train[cols_num])
test_scaled = minmax.transform(data_test[cols_num])

In [4390]:
data_train_f = pd.DataFrame(np.concatenate((train_encode,train_scaled), axis=1))
data_test_f = pd.DataFrame(np.concatenate((test_encode, test_scaled), axis=1))

In [4391]:
gbc = GradientBoostingClassifier()
gbc.fit(data_train_f, data_train['sex'])

In [4392]:
gbc_pred = gbc.predict(data_test_f)
gbc_eval = gbc.predict(data_train_f)

In [4404]:
# Cambio los datos faltantes para los datos que fueran predictos
for i in gbc_pred:
    data['sex'].fillna(i, inplace=True)

In [4405]:
data.reset_index(inplace=True, drop=True)
data.head()


Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
4,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE


In [4409]:
data.to_csv('data_clean.csv')

In [4396]:
analise = pd.DataFrame({'valores':data_test['sex'], 'previsión':gbc_pred.squeeze()})
analise.dropna(inplace=True)

In [4412]:
print(classification_report(analise['valores'], analise['previsión']))

              precision    recall  f1-score   support

      FEMALE       0.94      0.94      0.94        17
        MALE       0.93      0.93      0.93        15

    accuracy                           0.94        32
   macro avg       0.94      0.94      0.94        32
weighted avg       0.94      0.94      0.94        32

