In [6]:
# EDA
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

In [7]:
# Load the apple-quality CSV (path relative to the notebook) into a DataFrame
df_frutas = pd.read_csv(filepath_or_buffer='./datasets/apple_quality.csv')

In [15]:
# 2. Initial cleaning
# Normalise the column labels in two explicit steps:
# first trim surrounding whitespace, then lowercase the result.
nomes_sem_espacos = df_frutas.columns.str.strip()
df_frutas.columns = nomes_sem_espacos.str.lower()

In [16]:
# Preview the first ten rows of the DataFrame
df_frutas.head(n=10)

Unnamed: 0,size,weight,sweetness,crunchiness,juiciness,ripeness,acidity,quality
0,-3.970049,-2.512336,5.34633,-1.012009,1.8449,0.32984,-0.49159,1
1,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.86753,-0.722809,1
2,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636,0
3,-0.657196,-2.271627,1.324874,-0.097875,3.63797,-3.413761,0.790723,1
4,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984,1
5,-3.4254,-1.409082,-1.913511,-0.555775,-3.853071,1.914616,-2.981523,0
6,1.331606,1.635956,0.875974,-1.677798,3.106344,-1.847417,2.414171,1
7,-1.995462,-0.428958,1.530644,-0.742972,0.158834,0.974438,-1.470125,1
8,-3.867632,-3.734514,0.986429,-1.207655,2.292873,4.080921,-4.871905,0
9,-0.727983,-0.44282,-4.092223,0.597513,0.393714,1.620857,2.185608,0


In [17]:
# Count missing values per column and print them under a header line
contagem_nulos = df_frutas.isna().sum()
print("Valores nulos por coluna:", contagem_nulos, sep="\n")

Valores nulos por coluna:
size           0
weight         0
sweetness      0
crunchiness    0
juiciness      0
ripeness       0
acidity        0
quality        0
dtype: int64


In [19]:
# Example treatment for missing data: discard every row that contains
# at least one null value (filling them in would be the alternative).
df_frutas = df_frutas.dropna(axis=0, how='any')

In [9]:
# Summarise the DataFrame structure (columns, non-null counts, dtypes);
# buf=None keeps the default sink, i.e. standard output.
df_frutas.info(buf=None)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   A_id         4000 non-null   int64  
 1   Size         4000 non-null   float64
 2   Weight       4000 non-null   float64
 3   Sweetness    4000 non-null   float64
 4   Crunchiness  4000 non-null   float64
 5   Juiciness    4000 non-null   float64
 6   Ripeness     4000 non-null   float64
 7   Acidity      4000 non-null   float64
 8   Quality      4000 non-null   object 
dtypes: float64(7), int64(1), object(1)
memory usage: 281.4+ KB


In [10]:
# Encode the target variable as binary: 1 when the label is 'good', 0 otherwise.
# The column is addressed as 'quality' (lowercase) because the column names are
# stripped and lowercased right after loading; the capitalised 'Quality' raises
# KeyError on a top-to-bottom run.
# The dtype guard keeps the cell idempotent: re-running it on the already
# encoded column would compare integers with 'good' and silently set every
# label to 0.
if not pd.api.types.is_numeric_dtype(df_frutas['quality']):
    df_frutas['quality'] = (df_frutas['quality'] == 'good').astype(int)

In [11]:
# Remove the identifier column, since it has no predictive power.
# The name is lowercase ('a_id') to match the normalised column names; the
# capitalised 'A_id' raises KeyError on a top-to-bottom run.
# Reassigning (instead of inplace=True) with errors='ignore' makes the cell
# safe to re-run once the column is already gone. `axis=1` was redundant
# alongside `columns=` and is dropped.
df_frutas = df_frutas.drop(columns=['a_id'], errors='ignore')

In [12]:
# Box plot: quality x weight
# Is there a difference in weight between good and bad apples? A box plot per
# quality class makes the two distributions directly comparable.
# Column names are lowercase to match the normalised DataFrame columns; the
# capitalised names are no longer present after that step.
px.box(df_frutas, x='quality', y='weight', color='quality')

In [13]:
# Correlation matrix: pairwise Pearson correlation between all columns
# (Pearson is the pandas default, spelled out here for the reader).
corr_matrix = df_frutas.corr(method='pearson')
corr_matrix

Unnamed: 0,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
Size,1.0,-0.170702,-0.32468,0.169868,-0.018892,-0.134773,0.196218,0.244007
Weight,-0.170702,1.0,-0.154246,-0.095882,-0.092263,-0.243824,0.016414,0.001421
Sweetness,-0.32468,-0.154246,1.0,-0.037552,0.095882,-0.2738,0.085999,0.250998
Crunchiness,0.169868,-0.095882,-0.037552,1.0,-0.259607,-0.201982,0.069943,-0.012376
Juiciness,-0.018892,-0.092263,0.095882,-0.259607,1.0,-0.097144,0.248714,0.260223
Ripeness,-0.134773,-0.243824,-0.2738,-0.201982,-0.097144,1.0,-0.202669,-0.264315
Acidity,0.196218,0.016414,0.085999,0.069943,0.248714,-0.202669,1.0,-0.007697
Quality,0.244007,0.001421,0.250998,-0.012376,0.260223,-0.264315,-0.007697,1.0


In [14]:
# Render the correlation matrix as an annotated heatmap.
# The same coefficient values feed both the colour scale (z) and the
# two-decimal labels printed inside each cell (text).
valores_corr = corr_matrix.values
rotulos_x = corr_matrix.columns
rotulos_y = corr_matrix.index

mapa_calor = go.Heatmap(
    x=rotulos_x,
    y=rotulos_y,
    z=np.array(valores_corr),
    text=valores_corr,
    texttemplate='%{text:.2f}',
    colorscale=px.colors.diverging.RdBu,
    # Fix the colour range to the full correlation interval [-1, 1]
    zmin=-1,
    zmax=1,
)

# Build the figure directly from the single heatmap trace and display it
fig = go.Figure(data=[mapa_calor])
fig.show()