# Mini Projeto Clamed - Análise de Dados em Saúde
## Etapa 1 - Entendimento do Projeto e Importação dos Dados

In [2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

plt.style.use('ggplot')

output_path = r"c:\Users\lucas\Desktop\mini_projeto"

df = pd.read_csv('healthcare_dataset.csv')

print("\nPrimeiras 5 linhas do dataset:")
print(df.head())

print("\nTipos de dados e valores não-nulos:")
df.info()

print("\nDimensões do dataset:")
print(df.shape)



Primeiras 5 linhas do dataset:
            Name  Age  Gender Blood Type Medical Condition Date of Admission  \
0  Bobby JacksOn   30    Male         B-            Cancer        2024-01-31   
1   LesLie TErRy   62    Male         A+           Obesity        2019-08-20   
2    DaNnY sMitH   76  Female         A-           Obesity        2022-09-22   
3   andrEw waTtS   28  Female         O+          Diabetes        2020-11-18   
4  adrIENNE bEll   43  Female        AB+            Cancer        2022-09-19   

             Doctor                    Hospital Insurance Provider  \
0     Matthew Smith             Sons and Miller         Blue Cross   
1   Samantha Davies                     Kim Inc           Medicare   
2  Tiffany Mitchell                    Cook PLC              Aetna   
3       Kevin Wells  Hernandez Rogers and Vang,           Medicare   
4    Kathleen Hanna                 White-White              Aetna   

   Billing Amount  Room Number Admission Type Discharge Date   Med

# Limpeza e Preparação dos Dados

- **Tratamento de valores ausentes (NA, NaN) .**
- **Remoção de registros duplicados**
- **Padronizacão de textos em colunas categóricas**


In [4]:
# Tratamento de valores ausentes
print("Valores ausentes antes do tratamento:")
print(df.isnull().sum())

for col in df.select_dtypes(include=['object']).columns:
    df[col].fillna('Desconhecido', inplace=True)

for col in df.select_dtypes(include=['int64', 'float64']).columns:
    df[col].fillna(df[col].median(), inplace=True)

print("\nValores ausentes após o tratamento:")
print(df.isnull().sum())

# Remoção de duplicados
registros_duplicados = df.duplicated().sum()
df_clean = df.drop_duplicates().copy()
print(f"\nRegistros duplicados removidos: {registros_duplicados}")
print(f"Novo tamanho do dataset limpo: {df_clean.shape}")


#Padronização de texto
df_clean['Gender'] = df_clean['Gender'].str.strip().str.title()
df_clean['Blood Type'] = df_clean['Blood Type'].str.strip().str.upper()
df_clean['Medical Condition'] = df_clean['Medical Condition'].str.strip().str.title()

print("\nValores únicos após padronização:")
print("Gender:", df_clean['Gender'].unique())
print("Blood Type:", df_clean['Blood Type'].unique())
print("Medical Condition:", df_clean['Medical Condition'].unique())


Valores ausentes antes do tratamento:
Name                  0
Age                   0
Gender                0
Blood Type            0
Medical Condition     0
Date of Admission     0
Doctor                0
Hospital              0
Insurance Provider    0
Billing Amount        0
Room Number           0
Admission Type        0
Discharge Date        0
Medication            0
Test Results          0
dtype: int64

Valores ausentes após o tratamento:
Name                  0
Age                   0
Gender                0
Blood Type            0
Medical Condition     0
Date of Admission     0
Doctor                0
Hospital              0
Insurance Provider    0
Billing Amount        0
Room Number           0
Admission Type        0
Discharge Date        0
Medication            0
Test Results          0
dtype: int64

Registros duplicados removidos: 534
Novo tamanho do dataset limpo: (54966, 15)

Valores únicos após padronização:
Gender: ['Male' 'Female']
Blood Type: ['B-' 'A+' 'A-' 'O+' 'AB+'

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Desconhecido', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values a

# Remoção de Outliers

- **Identificação e remoção de outliers usando método IQR (Intervalo Interquartil).**
- **Salvo dataset limpo para próximas análises**

In [None]:
# Remoção de outliers com IQR
def remove_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    outliers_count = len(df[(df[column] < lower_bound) | (df[column] > upper_bound)])
    print(f"Coluna '{column}': {outliers_count} outliers removidos.")
    
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

numeric_cols = ['Age', 'Billing Amount', 'Room Number']
for col in numeric_cols:
    df_clean = remove_outliers_iqr(df_clean, col)

df_clean.to_csv(os.path.join(output_path, 'healthcare_dataset_clean.csv'), index=False)
print("\nDataset limpo salvo como 'healthcare_dataset_clean.csv'")
