In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

# Análise Exploratória

## Carregar e visualizar as primeiras linhas dos dados

In [2]:
df = pd.read_csv("../../datasets/raw/gender_classification.csv")
df.head()

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
0,1,11.8,6.1,1,0,1,1,Male
1,0,14.0,5.4,0,0,1,0,Female
2,0,11.8,6.3,1,1,1,1,Male
3,0,14.4,6.1,0,1,1,1,Male
4,1,13.5,5.9,0,0,0,0,Female


## Verificar tipos das colunas

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   long_hair                  5001 non-null   int64  
 1   forehead_width_cm          5001 non-null   float64
 2   forehead_height_cm         5001 non-null   float64
 3   nose_wide                  5001 non-null   int64  
 4   nose_long                  5001 non-null   int64  
 5   lips_thin                  5001 non-null   int64  
 6   distance_nose_to_lip_long  5001 non-null   int64  
 7   gender                     5001 non-null   object 
dtypes: float64(2), int64(5), object(1)
memory usage: 312.7+ KB


## Verificar valores de máximo e minimo

In [4]:
df.describe()

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long
count,5001.0,5001.0,5001.0,5001.0,5001.0,5001.0,5001.0
mean,0.869626,13.181484,5.946311,0.493901,0.507898,0.493101,0.4989
std,0.336748,1.107128,0.541268,0.500013,0.499988,0.500002,0.500049
min,0.0,11.4,5.1,0.0,0.0,0.0,0.0
25%,1.0,12.2,5.5,0.0,0.0,0.0,0.0
50%,1.0,13.1,5.9,0.0,1.0,0.0,0.0
75%,1.0,14.0,6.4,1.0,1.0,1.0,1.0
max,1.0,15.5,7.1,1.0,1.0,1.0,1.0


## Fazer downcasting das colunas

In [5]:
def downcasting (df):
    df_processed = df.copy(deep=True)
    
    for column in df_processed.columns.tolist():
        if df_processed[column].dtype == 'float64':
            df_processed[column] = df_processed[column].astype("float16")
        elif df_processed[column].dtype == 'int64':
            df_processed[column] = df_processed[column].astype("int8")
            
    return df_processed
            
df_processed = downcasting(df)
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   long_hair                  5001 non-null   int8   
 1   forehead_width_cm          5001 non-null   float16
 2   forehead_height_cm         5001 non-null   float16
 3   nose_wide                  5001 non-null   int8   
 4   nose_long                  5001 non-null   int8   
 5   lips_thin                  5001 non-null   int8   
 6   distance_nose_to_lip_long  5001 non-null   int8   
 7   gender                     5001 non-null   object 
dtypes: float16(2), int8(5), object(1)
memory usage: 83.1+ KB


## Função para criar os gráficos de bar

In [6]:
def create_bar(df, column_to_group, column_to_rename):
    df_grouped = df.groupby(by=[column_to_group, 'gender'], as_index=False).count()
    df_grouped = df_grouped.iloc[::, :3:].rename(columns={column_to_rename: 'count'})
    
    fig = px.bar(df_grouped, x=column_to_group, y="count", color='gender', barmode='group', 
                 title=f"Count of {column_to_group} per gender", width=800, text_auto=True)
    fig.show()

## Gráficos de barra count feature per gender

In [7]:
create_bar(df_processed, 'long_hair', 'forehead_width_cm')

In [8]:
create_bar(df_processed, 'nose_wide', 'long_hair')

In [9]:
create_bar(df_processed, 'nose_long', 'long_hair')

In [10]:
create_bar(df_processed, 'lips_thin', 'long_hair')

In [11]:
create_bar(df_processed, 'distance_nose_to_lip_long', 'long_hair')

## Boxplot features per Gender

In [12]:
fig = px.box(df_processed, x="gender", y="forehead_width_cm", title="Forehead width cm per gender", color='gender')
fig.show()

In [13]:
fig = px.box(df_processed, x="gender", y="forehead_height_cm", title="Forehead height cm per gender", color='gender')
fig.show()