# Importações

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Leitura dos dados

In [2]:
# Directory that holds the raw CSV inputs, relative to this notebook.
folder_data = '../data'

# clima.csv and consumo.csv have their `date` column parsed to datetime on
# load; clientes.csv is read with pandas' default type inference.
clima = pd.read_csv(
    f'{folder_data}/clima.csv',
    parse_dates=['date'],
)
consumo = pd.read_csv(
    f'{folder_data}/consumo.csv',
    parse_dates=['date'],
)
clientes = pd.read_csv(f'{folder_data}/clientes.csv')

# Dados de Consumo

In [3]:
# Preview the first ten rows of the consumption table.
display(consumo.head(10))

# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called directly; wrapping it in print() would emit a stray "None".
consumo.info()

# Observations from the output:
# - column dtypes are correct (date is datetime64[ns], consumption is float64)
# - no missing values (every column is fully non-null)

Unnamed: 0,client_id,date,consumption_kwh
0,C0000,2023-01-01,18.64
1,C0000,2023-01-02,16.63
2,C0000,2023-01-03,18.11
3,C0000,2023-01-04,18.25
4,C0000,2023-01-05,19.81
5,C0000,2023-01-06,15.87
6,C0000,2023-01-07,20.3
7,C0000,2023-01-08,19.35
8,C0000,2023-01-09,18.3
9,C0000,2023-01-10,13.34


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18000 entries, 0 to 17999
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   client_id        18000 non-null  object        
 1   date             18000 non-null  datetime64[ns]
 2   consumption_kwh  18000 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 422.0+ KB
None


In [4]:
# Summary statistics for the columns of the consumption table.
resumo_consumo = consumo.describe()
display(resumo_consumo)

# Observations from the output:
# - daily data with no future dates (latest date is 2023-06-29)
# - no exorbitant consumption values (range is 2.64 to 27.92 kWh)

Unnamed: 0,date,consumption_kwh
count,18000,18000.0
mean,2023-03-31 11:59:59.999999744,14.811876
min,2023-01-01 00:00:00,2.64
25%,2023-02-14 18:00:00,12.07
50%,2023-03-31 12:00:00,14.89
75%,2023-05-15 06:00:00,17.63
max,2023-06-29 00:00:00,27.92
std,,3.728855


In [5]:
# Length of a complete daily series: every calendar day between the first and
# last date present in the data, both ends included. Deriving it from the data
# (instead of hard-coding 180) ties the check to the actual date span.
dias_esperados = (consumo['date'].max() - consumo['date'].min()).days + 1

# Number of distinct dates recorded for each client.
dias_por_cliente = consumo.groupby('client_id')['date'].nunique()

# Name kept as-is so that any later cell referring to it keeps working.
todos_tem_180_dias = dias_por_cliente.eq(dias_esperados).all()
print(f"Todos os client_id têm {dias_esperados} dias únicos? {todos_tem_180_dias}")

# A True result means every client has one record for each day of the overall
# date span, i.e. there are no gaps in any client's time series.

Todos os client_id têm 180 dias únicos? True


In [6]:
# Boolean mask over rows whose (client_id, date) pair occurs more than once;
# keep=False marks every occurrence rather than only the repeats.
duplicados = consumo.duplicated(
    subset=['client_id', 'date'],
    keep=False,
)

# True when at least one row was flagged by the mask.
tem_duplicados = duplicados.any()
print(f"Há valores de dias duplicados para algum client_id? {tem_duplicados}")

Há valores de dias duplicados para algum client_id? False


In [7]:
# Histogram of the `consumption_kwh` column (up to 50 bins) with a marginal
# box plot. update_layout() returns the figure, so the layout tweaks are
# chained directly onto the histogram call.
fig = px.histogram(
    data_frame=consumo,
    x='consumption_kwh',
    nbins=50,
    opacity=0.75,
    marginal='box',
    labels={'consumption_kwh': 'Consumo (kWh)'},
    title='Distribuição do consumo de energia (kWh)',
).update_layout(
    template='simple_white',
    bargap=0.05,
    xaxis_title='Consumo (kWh)',
    yaxis_title='Frequência',
)

fig.show()