In [1]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.offline as pyo
from ydata_profiling import ProfileReport


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html



In [3]:
# Input CSV with the raw car sales records (path is relative to the notebook).
data_path = '../../arquivos/car_prices.csv'

# Columns kept for the profiling report and for the plots.
colunas_analise = ['year', 'make', 'state', 'color', 'mmr', 'sellingprice', 'odometer']

# Read the whole file, then keep only rows with year >= 2000 and the columns
# listed above, in a single .loc selection.
df = pd.read_csv(data_path).loc[
    lambda registros: registros['year'] >= 2000,
    colunas_analise,
]

# Build the profiling report for the filtered frame and write it as HTML.
profile = ProfileReport(df, title="Car Prices Report")
profile.to_file("report.html")

# Number of equal-width MMR slices, and the width of one slice
# (the full MMR range divided by the number of slices).
num_subplots_internos = 4
mmr_maior = df['mmr'].max()
mmr_menor = df['mmr'].min()
intervalo = (mmr_maior - mmr_menor) / num_subplots_internos

# First figure: box plots of the z-score-normalised MMR, overall and per make.
# Work on a copy so `df` keeps the raw MMR values.
df_normalizado = df.copy()

# Values that cannot be parsed as numbers become NaN instead of raising.
df_normalizado['mmr'] = pd.to_numeric(df_normalizado['mmr'], errors='coerce')

# z-score normalisation: subtract the mean, divide by the standard deviation.
media_mmr = df_normalizado['mmr'].mean()
desvio_padrao_mmr = df_normalizado['mmr'].std()
df_normalizado['mmr'] = (df_normalizado['mmr'] - media_mmr) / desvio_padrao_mmr

fig1 = make_subplots(rows=2, cols=1, subplot_titles=("MMR Normalizado Geral", "MMR Normalizado por Marca"))

# Row 1: a single box for the whole data set.
fig1.add_trace(go.Box(y=df_normalizado['mmr'], name='MMR Normalizado'), row=1, col=1)

# Row 2: one box per make. groupby splits the frame in one pass instead of
# re-filtering it for every make; sort=False keeps first-appearance order, and
# rows with a missing make are skipped rather than producing an empty trace.
for marca, mmr_da_marca in df_normalizado.groupby('make', sort=False)['mmr']:
    fig1.add_trace(go.Box(y=mmr_da_marca, name=marca), row=2, col=1)

fig1.update_layout(
    title='Boxplot dos Preços MMR Normalizados',
    showlegend=True,
    height=1600,
    width=1600,
)

# On a subplot figure, layout.xaxis / layout.yaxis address only the first
# subplot, so axis titles are set through the subplot-aware helpers: the
# y-title goes on both rows, the 'Marca' x-title only on the per-make row.
fig1.update_yaxes(title_text='MMR Normalizado')
fig1.update_xaxes(title_text='Marca', row=2, col=1)


# Second figure: MMR histograms per make, one subplot per equal-width MMR slice.
fig2 = make_subplots(
    rows=num_subplots_internos,
    cols=1,
    subplot_titles=[f"Intervalo {i+1}" for i in range(num_subplots_internos)],
)

# The overall bounds do not change inside the loop, so compute them once.
mmr_piso = df['mmr'].min()
mmr_teto = df['mmr'].max()

for i in range(num_subplots_internos):
    mmr_min = mmr_piso + i * intervalo

    if i == num_subplots_internos - 1:
        # The last slice is closed on the right and pinned to the true maximum.
        # With a half-open [min, max) slice the rows at the maximum MMR would
        # fall outside every slice and vanish from the figure.
        mmr_max = mmr_teto
        dentro_do_intervalo = (df['mmr'] >= mmr_min) & (df['mmr'] <= mmr_max)
    else:
        # Inner slices stay half-open so that no row is counted twice.
        mmr_max = mmr_piso + (i + 1) * intervalo
        dentro_do_intervalo = (df['mmr'] >= mmr_min) & (df['mmr'] < mmr_max)

    dados_intervalo = df[dentro_do_intervalo]

    # One histogram trace per make present in this slice. groupby skips rows
    # with a missing make; sort=False keeps first-appearance order.
    for marca, mmr_da_marca in dados_intervalo.groupby('make', sort=False)['mmr']:
        fig2.add_trace(go.Histogram(x=mmr_da_marca, name=marca, opacity=0.7), row=i+1, col=1)

fig2.update_layout(height=1600, width=1600, title_text="Histogram dos Preços MMR por Intervalo")
fig2.update_yaxes(title_text='Número de Veículos', tickfont=dict(size=10))
fig2.update_xaxes(title_text='Preços MMR', tickfont=dict(size=10))

# Render both figures as HTML <div> fragments. Only the first fragment embeds
# the plotly.js bundle; the second reuses it, so the library is not written
# into the output file twice.
html_boxplot = pyo.plot(fig1, include_plotlyjs=True, output_type='div')
html_histogram = pyo.plot(fig2, include_plotlyjs=False, output_type='div')

# Load the profiling report written earlier in this cell.
with open("report.html", "r", encoding='utf-8') as file:
    html_content = file.read()

# Centres the plotly figures horizontally on the page.
css_centralizar = """
<style>
    .plotly-graph-div {
        display: block;
        margin-left: auto;
        margin-right: auto;
    }
</style>
"""

html_graficos = css_centralizar + html_boxplot + html_histogram

# Insert the figures just before the last closing </body> tag so the result is
# still a well-formed document. If the report has no such tag, fall back to
# appending at the end of the file.
antes_do_fecho, fecho_body, resto = html_content.rpartition('</body>')
if fecho_body:
    html_content = antes_do_fecho + html_graficos + fecho_body + resto
else:
    html_content += html_graficos

with open("report_with_plotly.html", "w", encoding='utf-8') as file:
    file.write(html_content)



There was an attempt to calculate the auto correlation, but this failed.
(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'could not convert string to float: 'white'')


Format strings passed to MaskedConstant are ignored, but in future may error or produce different behavior


There was an attempt to generate the Heatmap missing values diagrams, but this failed.
(using `df.profile_report(missing_diagrams={"Heatmap": False}`)
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'could not convert string to float: '--'')

Summarize dataset: 100%|██████████| 51/51 [00:40<00:00,  1.27it/s, Completed]               
Generate report structure: 100%|██████████| 1/1 [00:13<00:00, 13.96s/it]
Render HTML: 100%|██████████| 1/