In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from src.config import DADOS_ORIGINAIS
from src.graficos import PALETTE, SCATTER_ALPHA

# Apply seaborn's theme with the "bright" colour palette to the plots below.
sns.set_theme(palette="bright")

In [None]:
# Load the raw dataset from the zip-compressed CSV located at DADOS_ORIGINAIS.
df = pd.read_csv(DADOS_ORIGINAIS, compression="zip")

# Preview the first rows.
df.head()

In [None]:
# Preview the last rows.
df.tail()

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Summary statistics of the non-numeric columns.
df.describe(exclude="number")

In [None]:
# Pairwise scatter plots with KDE curves on the diagonal; the scatter points
# use the project-wide SCATTER_ALPHA transparency.
sns.pairplot(df, diag_kind="kde", plot_kws={"alpha": SCATTER_ALPHA})

In [None]:
# Skewness of each numeric column.
df.select_dtypes("number").skew()

In [None]:
# Kurtosis of each numeric column (pandas uses Fisher's definition, where a
# normal distribution scores 0).
df.select_dtypes("number").kurtosis()

In [None]:
# Rows that repeat an earlier row in every column.
linhas_duplicadas = df.duplicated()
df.loc[linhas_duplicadas]

In [None]:
# Rows with at least one missing value.
df.loc[df.isna().any(axis="columns")]

In [None]:
# Summary statistics restricted to the rows with at least one missing value.
df.loc[df.isna().any(axis="columns")].describe()

In [None]:
# Number of rows per `ocean_proximity` value.
df["ocean_proximity"].value_counts()

In [None]:
# Box plot (with the mean marked) of every numeric column, one per axis.
# Selecting the numeric columns explicitly keeps non-numeric columns out of the
# grid instead of relying on `zip` running out of axes before reaching them.
colunas_numericas = df.select_dtypes("number").columns

fig, axs = plt.subplots(3, 3, figsize=(10, 5))

for ax, coluna in zip(axs.flatten(), colunas_numericas):
    sns.boxplot(data=df, x=coluna, ax=ax, showmeans=True)

plt.tight_layout()

plt.show()

In [None]:
# Correlation between the numeric columns, computed once and reused.
correlacoes = df.select_dtypes("number").corr()

# Boolean mask hiding the diagonal and the upper triangle. It is built from
# ones rather than from the correlation values themselves, so a coefficient
# that happens to be exactly 0 is still hidden.
matriz = np.triu(np.ones_like(correlacoes, dtype=bool))

fig, ax = plt.subplots()

sns.heatmap(
    correlacoes,
    mask=matriz,
    annot=True,
    fmt=".2f",
    ax=ax,
    cmap=PALETTE,
)

plt.show()

Novas variáveis a serem criadas:

- classes (faixas) de `median_income`
- cômodos por domicílio
- pessoas por domicílio
- quartos por cômodo

In [None]:
# Bucket `median_income` into five labelled classes. With pd.cut's default
# right-closed intervals the bins are (0, 1.5], (1.5, 3], (3, 4.5], (4.5, 6]
# and (6, inf].
limites_renda = [0, 1.5, 3, 4.5, 6, np.inf]
classes_renda = [1, 2, 3, 4, 5]

df["median_income_cat"] = pd.cut(df["median_income"], bins=limites_renda, labels=classes_renda)

df.info()

In [None]:
# Number of rows per income class, ordered by class label.
df["median_income_cat"].value_counts().sort_index()

In [None]:
# Bar chart of the number of rows per income class, ordered by class label.
contagem_por_classe = df["median_income_cat"].value_counts().sort_index()
contagem_por_classe.plot.bar()

In [None]:
# Column names currently in the frame.
df.columns

In [None]:
# Ratio features derived from the existing count columns.
df["rooms_per_household"] = df["total_rooms"].div(df["households"])
df["population_per_household"] = df["population"].div(df["households"])
df["bedrooms_per_room"] = df["total_bedrooms"].div(df["total_rooms"])

df.info()

In [None]:
# Summary statistics of the numeric columns, now including the derived ratios.
df.describe()

In [None]:
# Box plots (mean marked) for every numeric column of `df` on a 4x3 grid.
colunas_numericas = df.select_dtypes("number").columns
fig, axs = plt.subplots(nrows=4, ncols=3, figsize=(10, 5))

for ax, coluna in zip(axs.ravel(), colunas_numericas):
    sns.boxplot(data=df, x=coluna, ax=ax, showmeans=True)

fig.tight_layout()
plt.show()

In [None]:
# Correlation between the numeric columns, computed once and reused.
correlacoes = df.select_dtypes("number").corr()

# Boolean mask hiding the diagonal and the upper triangle. It is built from
# ones rather than from the correlation values themselves, so a coefficient
# that happens to be exactly 0 is still hidden.
matriz = np.triu(np.ones_like(correlacoes, dtype=bool))

fig, ax = plt.subplots(figsize=(8, 8))

sns.heatmap(
    correlacoes,
    mask=matriz,
    annot=True,
    fmt=".2f",
    ax=ax,
    cmap=PALETTE,
)

plt.show()

In [None]:
# Rows whose `median_house_value` equals the highest value in the frame.
valor_maximo = df["median_house_value"].max()
df.loc[df["median_house_value"].eq(valor_maximo)]

In [None]:
# Share of rows sitting at the maximum `median_house_value`. The count is
# derived from the data instead of being typed in by hand (the original used a
# hard-coded 965), so it stays correct if the dataset changes.
no_valor_maximo = df["median_house_value"] == df["median_house_value"].max()
no_valor_maximo.sum() / df.shape[0]

In [None]:
# 95th percentile of `median_house_value`.
df["median_house_value"].quantile(0.95)

In [None]:
# Work on an independent copy so `df` itself is not modified by the cleaning steps.
df_clean = df.copy()

df_clean.info()

In [None]:
QUANTIL = 0.99

# Columns trimmed at their upper QUANTIL.
colunas_filtro = [
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "median_house_value",
    "rooms_per_household",
    "population_per_household",
    "bedrooms_per_room",
]

# Per-column cut-off values, computed on the full frame `df`.
limites = df[colunas_filtro].quantile(QUANTIL)

# Keep a row only when every listed column is strictly below its cut-off.
# A missing value compares as False, so rows with NaN in any listed column are
# dropped as well.
mascara = df[colunas_filtro].lt(limites, axis="columns").all(axis="columns")

# Filter `df` itself rather than `df_clean`: the mask is indexed like `df`, so
# the cell gives the same result every time it is run. The explicit copy makes
# `df_clean` an independent frame that is safe to modify afterwards.
df_clean = df.loc[mascara].copy()

df_clean.info()

In [None]:
# Fraction of the rows of `df` that are no longer present in `df_clean`.
1 - len(df_clean) / len(df)

In [None]:
# Summary statistics of the numeric columns after the quantile filter.
df_clean.describe()

In [None]:
# Pairwise scatter plots of the filtered frame, KDE curves on the diagonal;
# the scatter points use the project-wide SCATTER_ALPHA transparency.
sns.pairplot(df_clean, diag_kind="kde", plot_kws={"alpha": SCATTER_ALPHA})

In [None]:
# Box plots (mean marked) for every numeric column of `df_clean` on a 4x3 grid.
colunas_numericas = df_clean.select_dtypes("number").columns
fig, axs = plt.subplots(nrows=4, ncols=3, figsize=(10, 5))

for ax, coluna in zip(axs.ravel(), colunas_numericas):
    sns.boxplot(data=df_clean, x=coluna, ax=ax, showmeans=True)

fig.tight_layout()
plt.show()

In [None]:
# Correlation between the numeric columns of the filtered frame, computed once
# and reused.
correlacoes = df_clean.select_dtypes("number").corr()

# Boolean mask hiding the diagonal and the upper triangle. It is built from
# ones rather than from the correlation values themselves, so a coefficient
# that happens to be exactly 0 is still hidden.
matriz = np.triu(np.ones_like(correlacoes, dtype=bool))

fig, ax = plt.subplots(figsize=(8, 8))

sns.heatmap(
    correlacoes,
    mask=matriz,
    annot=True,
    fmt=".2f",
    ax=ax,
    cmap=PALETTE,
)

plt.show()

In [None]:
# Number of rows per `ocean_proximity` value after the quantile filter.
df_clean["ocean_proximity"].value_counts()

In [None]:
# Drop the "ISLAND" rows. The explicit copy makes `df_clean` an independent
# frame, so later column assignments on it do not trigger pandas'
# SettingWithCopyWarning.
df_clean = df_clean.loc[df_clean["ocean_proximity"] != "ISLAND"].copy()

df_clean["ocean_proximity"].value_counts()

In [None]:
# Store `ocean_proximity` as a pandas categorical column.
df_clean["ocean_proximity"] = df_clean["ocean_proximity"].astype("category")

df_clean.info()

In [None]:
# Numeric columns whose values are all whole numbers. The check is vectorised
# (remainder of the division by 1 equals 0) and works for integer as well as
# float dtypes, so the cell still runs after the columns have been downcast.
# NaN never counts as a whole number.
colunas_valores_inteiros = [
    coluna
    for coluna in df_clean.select_dtypes("number").columns
    if df_clean[coluna].mod(1).eq(0).all()
]

colunas_valores_inteiros

In [None]:
# Numeric columns that are not in `colunas_valores_inteiros`.
colunas_numericas = df_clean.select_dtypes("number").columns
colunas_valores_float = colunas_numericas.difference(colunas_valores_inteiros)

colunas_valores_float

In [None]:
# Shrink each numeric column to the smallest dtype pd.to_numeric selects for it:
# integer downcast for the whole-number columns, float downcast for the rest.
for tipo, colunas in (
    ("integer", colunas_valores_inteiros),
    ("float", colunas_valores_float),
):
    df_clean[colunas] = df_clean[colunas].apply(pd.to_numeric, downcast=tipo)

df_clean.info()

In [None]:
# Summary statistics of the numeric columns after cleaning and downcasting.
df_clean.describe()