In [23]:
# 26) CARREGAMENTO E INSPEÇÃO INICIAL

import os
import pandas as pd

# Dataset location: a local CSV is preferred, the UCI repository is the fallback.
local_path = "energydata_complete.csv"
uci_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv"

if os.path.exists(local_path):
    df = pd.read_csv(local_path)
else:
    df = pd.read_csv(uci_url)

# Parse the date column (when present) and use it as the time index.
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'])
    df = df.set_index('date')

print("=== [26] INFO DO DATAFRAME ===")
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("\n=== [26] ESTATÍSTICAS DESCRITIVAS ===")
print(df.describe())


=== [26] INFO DO DATAFRAME ===
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 19735 entries, 2016-01-11 17:00:00 to 2016-05-27 18:00:00
Data columns (total 28 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Appliances   19735 non-null  int64  
 1   lights       19735 non-null  int64  
 2   T1           19735 non-null  float64
 3   RH_1         19735 non-null  float64
 4   T2           19735 non-null  float64
 5   RH_2         19735 non-null  float64
 6   T3           19735 non-null  float64
 7   RH_3         19735 non-null  float64
 8   T4           19735 non-null  float64
 9   RH_4         19735 non-null  float64
 10  T5           19735 non-null  float64
 11  RH_5         19735 non-null  float64
 12  T6           19735 non-null  float64
 13  RH_6         19735 non-null  float64
 14  T7           19735 non-null  float64
 15  RH_7         19735 non-null  float64
 16  T8           19735 non-null  float64
 17  RH_8         19735 non

In [None]:
# 27) DISTRIBUIÇÃO DO CONSUMO (Appliances)

import matplotlib.pyplot as plt

# Histogram of the target column to show how consumption values are distributed.
plt.figure()
df['Appliances'].hist(bins=50, edgecolor='black')
plt.title("[27] Distribuição do Consumo - Appliances (Histograma)")
plt.xlabel("Consumo (Wh)")
plt.ylabel("Frequência")
plt.show()

plt.figure()
# With a datetime index we plot a time window; otherwise we fall back to the
# first N samples in sequential order.
if isinstance(df.index, pd.DatetimeIndex):
    # Select the first 48 hours by timestamp instead of by row count. The
    # recorded df.info() output shows 19735 rows between 2016-01-11 and
    # 2016-05-27 (one row every 10 minutes), so the former `48*60` rows covered
    # roughly 20 days rather than 48 hours. A timestamp mask is independent of
    # the sampling rate and naturally handles series shorter than the window.
    janela = pd.Timedelta(hours=48)
    inicio = df.index.min()
    recorte = df.loc[df.index < inicio + janela, 'Appliances']
    recorte.plot()
    plt.title("[27] Série Temporal - Appliances (recorte inicial)")
    plt.xlabel("Tempo")
else:
    df['Appliances'].iloc[:500].plot()
    plt.title("[27] Série Temporal - Appliances (primeiros 500 pontos)")
    plt.xlabel("Amostras")
plt.ylabel("Consumo (Wh)")
plt.show()



In [None]:
# 28) CORRELAÇÕES COM VARIÁVEIS AMBIENTAIS

# Candidate predictors: the numbered T*/RH_* pairs (1-9), in that interleaved
# order, followed by the remaining columns listed explicitly.
env_vars = [nome for i in range(1, 10) for nome in (f'T{i}', f'RH_{i}')]
env_vars += ['T_out', 'RH_out', 'Press_mm_hg', 'Windspeed', 'Visibility', 'Tdewpoint']

# Keep only the predictors that are actually present in the frame.
cols_existentes = list(filter(lambda c: c in df.columns, env_vars))

# Pairwise correlation matrix of the target plus the available predictors.
corr_df = df.loc[:, ['Appliances', *cols_existentes]].corr()

print("\n=== [28] CORRELAÇÕES COM 'Appliances' (ordem decrescente) ===")
corr_com_alvo = corr_df['Appliances']
print(corr_com_alvo.sort_values(ascending=False))


In [None]:
# 29) NORMALIZAÇÃO (MIN-MAX SCALING)

from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Scaling is applied to the numeric columns only.
num_df = df.select_dtypes(include=[np.number])

# Fit the scaler on those columns and rebuild a frame with the original
# column labels and index.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(num_df)
df_scaled = pd.DataFrame(
    data=scaled_values,
    index=num_df.index,
    columns=num_df.columns,
)

print("\n=== [29] EXEMPLO DE DADOS NORMALIZADOS (head) ===")
primeiras_linhas = df_scaled.head()
print(primeiras_linhas)


In [None]:
# 30) PCA (2 COMPONENTES)

from sklearn.decomposition import PCA

# Project every scaled column except the target (dropped when present) onto
# two principal components.
pca = PCA(n_components=2, random_state=42)
entrada_pca = df_scaled.drop(columns=['Appliances'], errors='ignore')
pca_result = pca.fit_transform(entrada_pca)

print("\n=== [30] VARIÂNCIA EXPLICADA POR PC ===")
razao_variancia = pca.explained_variance_ratio_
print("PC1 e PC2:", razao_variancia,
      " | Soma:", razao_variancia.sum())

# Scatter of the two component scores; small translucent markers keep the
# dense cloud readable.
plt.figure()
pc1, pc2 = pca_result[:, 0], pca_result[:, 1]
plt.scatter(pc1, pc2, s=5, alpha=0.2)
plt.title("[30] PCA - 2 Componentes Principais")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()


In [6]:
# ============================================================
# 31) REGRESSÃO LINEAR MÚLTIPLA
# ============================================================
# A) Modele Appliances em função das variáveis ambientais.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

import os

# Load the dataset. A local copy is preferred and the UCI URL is the fallback
# (the same local -> UCI strategy the other loading cells use), so the cell
# also runs offline and does not download the CSV again on every execution.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv"
local_path = "energydata_complete.csv"
fonte = local_path if os.path.exists(local_path) else url
# Parse `date` and use it as the index so `df` stays time-indexed after this
# cell; the features selected below never include `date`, so the fit and the
# metrics are unaffected.
df = pd.read_csv(fonte, parse_dates=['date'], index_col='date')

# Environmental features
env_vars = [
    'T1','RH_1','T2','RH_2','T3','RH_3','T4','RH_4','T5','RH_5',
    'T6','RH_6','T7','RH_7','T8','RH_8','T9','RH_9',
    'T_out','RH_out','Press_mm_hg','Windspeed','Visibility','Tdewpoint'
]
X = df[[c for c in env_vars if c in df.columns]]
y = df['Appliances']

# Train/test split (70/30); shuffle=False keeps the rows in file order.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

# Linear regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions on the held-out 30%
y_pred = lr.predict(X_test)

# Evaluation
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("=== Regressão Linear Múltipla ===")
print(f"R²   : {r2:.3f}")
print(f"MAE  : {mae:.3f}")
print(f"RMSE : {rmse:.3f}")



=== [31] Regressão Linear Múltipla ===
R²   : 0.089
MAE  : 55.034
RMSE : 87.422


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
# 31
# B) Avalie o R² e erro médio

import os

# Load the dataset: local copy first, UCI URL as fallback (the same
# local -> UCI strategy the other loading cells use), so the cell also runs
# offline and avoids downloading the CSV again on every execution.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv"
local_path = "energydata_complete.csv"
fonte = local_path if os.path.exists(local_path) else url
df = pd.read_csv(fonte, parse_dates=['date'], index_col='date')

# Dependent variable
y = df['Appliances']

# Independent (environmental) variables
X = df[['T1','RH_1','T2','RH_2','T3','RH_3','T4','RH_4','T5','RH_5',
        'T6','RH_6','T7','RH_7','T8','RH_8','T9','RH_9',
        'T_out','RH_out','Press_mm_hg','Windspeed','Visibility','Tdewpoint']]

# Train/test split (70/30, not shuffled because this is a time series)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

# Model
model = LinearRegression()
model.fit(X_train, y_train)

# Prediction
y_pred = model.predict(X_test)

# Evaluation
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Erro Absoluto Médio (MAE): {mae:.3f}")
print(f"Coeficiente de Determinação (R²): {r2:.3f}")


Erro Absoluto Médio (MAE): 55.034
Coeficiente de Determinação (R²): 0.089


In [8]:
# ================== CÉLULA DE SETUP COMUM ==================
import os, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

# Dataset location: the local CSV takes priority, the UCI URL is the fallback.
local_path = "energydata_complete.csv"
uci_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv"
if os.path.exists(local_path):
    df = pd.read_csv(local_path)
else:
    df = pd.read_csv(uci_url)

# Convert `date` to datetime, make it the index and sort chronologically.
df = (
    df.assign(date=pd.to_datetime(df['date']))
      .set_index('date')
      .sort_index()
)

# Target column and environmental feature matrix.
y = df['Appliances']
env_vars = [nome for i in range(1, 10) for nome in (f'T{i}', f'RH_{i}')]
env_vars += ['T_out', 'RH_out', 'Press_mm_hg', 'Windspeed', 'Visibility', 'Tdewpoint']
X = df.loc[:, env_vars].copy()

# Chronological 70/30 split (no shuffling): rows before `cut` are the training
# set, the remaining rows are the test set.
cut = int(0.7*len(df))
X_train, y_train = X.iloc[:cut], y.iloc[:cut]
X_test, y_test = X.iloc[cut:], y.iloc[cut:]

# RMSE that works on every scikit-learn version
def safe_rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    The value is computed as the square root of ``mean_squared_error``. This
    form does not rely on the ``squared=False`` keyword, which older
    scikit-learn releases do not accept and recent releases deprecate, so no
    exception handling or version check is needed. For the 1-D targets used in
    this notebook the result is the same as ``squared=False``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    float
        Square root of the mean squared error.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


In [11]:
# 32
# A) Treine um modelo de Random Forest para prever Appliances.

from sklearn.ensemble import RandomForestRegressor

# Forest settings: 120 trees (kept small to speed up training), no depth
# limit, n_jobs=-1 for parallel fitting, fixed seed for repeatability.
rf_params = dict(n_estimators=120, max_depth=None, n_jobs=-1, random_state=42)
rf_reg = RandomForestRegressor(**rf_params)
rf_reg.fit(X_train, y_train)
y_pred_rf = rf_reg.predict(X_test)

# Hold-out metrics for the forest.
# NOTE(review): the recorded run reports a negative R² on this chronological
# hold-out — worth checking for a shift between the train and test periods.
rmse_rf = safe_rmse(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

metricas_rf = (rmse_rf, mae_rf, r2_rf)
print("RF Regressor -> RMSE: %.3f | MAE: %.3f | R²: %.3f" % metricas_rf)


[32-a] RF Regressor -> RMSE: 190.366 | MAE: 159.331 | R²: -3.321


In [12]:
# 32-b) Baseline Linear Regression para comparação
from sklearn.linear_model import LinearRegression

# Fit the linear baseline on the same training split used for the forest.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred_lin = lin_reg.predict(X_test)

# Same three hold-out metrics, so both models can be compared directly.
rmse_lin = safe_rmse(y_test, y_pred_lin)
mae_lin = mean_absolute_error(y_test, y_pred_lin)
r2_lin = r2_score(y_test, y_pred_lin)

metricas_lin = (rmse_lin, mae_lin, r2_lin)
print("Linear Regression -> RMSE: %.3f | MAE: %.3f | R²: %.3f" % metricas_lin)
# `rmse_rf` comes from the Random Forest cell, which must have been run first.
print("Comparativo RMSE | RF: %.3f  vs  Linear: %.3f" % (rmse_rf, rmse_lin))



[32-b] Linear Regression -> RMSE: 87.422 | MAE: 55.034 | R²: 0.089
Comparativo RMSE | RF: 190.366  vs  Linear: 87.422


In [13]:
# 33-a) Aplique K-Means com 3 a 5 clusters.
from sklearn.cluster import KMeans

# Cluster on the consumption column alone (single-feature K-Means).
X_cluster = df[['Appliances']].copy()
for k in range(3, 6):
    coluna = f'Cluster_k{k}'
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    # The labels are stored on `df` under one column per k, so later cells can
    # read them back by name.
    rotulos = km.fit_predict(X_cluster)
    df[coluna] = rotulos
    tamanhos = df[coluna].value_counts().sort_index().to_dict()
    print(f"[33-a] k={k} -> tamanhos:", tamanhos)

[33-a] k=3 -> tamanhos: {0: 17597, 1: 467, 2: 1671}
[33-a] k=4 -> tamanhos: {0: 1462, 1: 13509, 2: 4379, 3: 385}
[33-a] k=5 -> tamanhos: {0: 12304, 1: 745, 2: 5293, 3: 1137, 4: 256}


In [15]:
# 33-b) Interprete os perfis de consumo.
for k in (3, 4, 5):
    # Labels written to `df` by the K-Means cell for this value of k.
    labels = df[f'Cluster_k{k}']
    consumo_por_cluster = df.groupby(labels)['Appliances']
    # Summary statistics per cluster, ordered from lowest to highest mean.
    perfis = (
        consumo_por_cluster
        .agg(['count', 'mean', 'median', 'min', 'max'])
        .sort_values('mean')
        .rename_axis(f'Cluster_k{k}')
    )
    print(f"\nPerfis (k={k}) ordenados por média:")
    print(perfis)
# Reading: clusters with lower means = low consumption; higher means = high.



Perfis (k=3) ordenados por média:
            count        mean  median  min   max
Cluster_k3                                      
0           17597   67.209752    60.0   10   170
2            1671  287.701975   280.0  180   420
1             467  566.531049   540.0  430  1080

Perfis (k=4) ordenados por média:
            count        mean  median  min   max
Cluster_k4                                      
1           13509   54.061737    50.0   10    80
2            4379  116.200046   110.0   90   210
0            1462  314.876881   310.0  220   450
3             385  593.506494   580.0  460  1080

Perfis (k=5) ordenados por média:
            count        mean  median  min   max
Cluster_k5                                      
0           12304   51.521456    50.0   10    70
2            5293  103.678443   100.0   80   170
3            1137  249.076517   250.0  180   320
1             745  397.865772   390.0  330   520
4             256  647.304688   620.0  530  1080


In [16]:
# 34-a) Crie variável binária: alto (> mediana) vs baixo consumo.
# Threshold: the median of the full `y` series (train and test rows together).
median_val = y.median()
# 1 = consumption strictly above the median ("high"), 0 = otherwise ("low").
y_bin = y.gt(median_val).astype(int)
# Reuse the chronological cut point from the setup cell.
ybin_train = y_bin.iloc[:cut]
ybin_test = y_bin.iloc[cut:]
dist_treino = ybin_train.value_counts().to_dict()
dist_teste = ybin_test.value_counts().to_dict()
print("Mediana:", median_val, "| Dist. treino:", dist_treino, "| teste:", dist_teste)


[34-a] Mediana: 60.0 | Dist. treino: {0: 7697, 1: 6117} | teste: {0: 3047, 1: 2874}


In [22]:
# 34-b) Treine Logistic Regression e Random Forest Classifier.
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# --- Binary target (high = strictly above the median) ---
median_val = y.median()
y_bin = y.gt(median_val).astype(int)
ybin_train = y_bin.iloc[:cut]
ybin_test = y_bin.iloc[cut:]

# --- Logistic Regression: standardised features and a generous iteration cap ---
log_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(solver="lbfgs", max_iter=4000)),
])

# ConvergenceWarning is muted only for the duration of the fit.
# NOTE(review): this also hides a genuine failure to converge — confirm the
# solver finishes within max_iter before relying on the silence.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=ConvergenceWarning)
    log_pipeline.fit(X_train, ybin_train)

yp_log = log_pipeline.predict(X_test)

# --- Random Forest Classifier on the unscaled features ---
rf_clf = RandomForestClassifier(n_estimators=150, random_state=42, n_jobs=-1)
rf_clf.fit(X_train, ybin_train)
yp_rf = rf_clf.predict(X_test)

# --- Report the same three metrics for each model, in the same order ---
nomes_classes = ["baixo(0)", "alto(1)"]
for cabecalho, predito in (
    ("=== Logistic Regression (scaled) ===", yp_log),
    ("\n=== Random Forest Classifier ===", yp_rf),
):
    print(cabecalho)
    print("Accuracy:", f"{accuracy_score(ybin_test, predito):.3f}")
    print("Matriz de confusão:\n", confusion_matrix(ybin_test, predito))
    print(classification_report(ybin_test, predito, target_names=nomes_classes))



=== Logistic Regression (scaled) ===
Accuracy: 0.686
Matriz de confusão:
 [[2194  853]
 [1006 1868]]
              precision    recall  f1-score   support

    baixo(0)       0.69      0.72      0.70      3047
     alto(1)       0.69      0.65      0.67      2874

    accuracy                           0.69      5921
   macro avg       0.69      0.69      0.69      5921
weighted avg       0.69      0.69      0.69      5921


=== Random Forest Classifier ===
Accuracy: 0.595
Matriz de confusão:
 [[ 852 2195]
 [ 203 2671]]
              precision    recall  f1-score   support

    baixo(0)       0.81      0.28      0.42      3047
     alto(1)       0.55      0.93      0.69      2874

    accuracy                           0.60      5921
   macro avg       0.68      0.60      0.55      5921
weighted avg       0.68      0.60      0.55      5921



In [19]:
# 35-a) Matriz de confusão e métricas (accuracy, precision, recall, F1).
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score

def eval_cls(nome, y_true, y_pred):
    """Print the confusion matrix and headline metrics for a binary classifier.

    Precision, recall and F1 are reported for class 1; the confusion matrix and
    the classification report are laid out with the label order [0, 1].

    Parameters
    ----------
    nome : str
        Model name shown in the printed header.
    y_true : array-like
        True binary labels.
    y_pred : array-like
        Predicted binary labels.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    classes = [0, 1]
    classe_positiva = dict(pos_label=1)

    cm = confusion_matrix(y_true, y_pred, labels=classes)
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, **classe_positiva)
    rec = recall_score(y_true, y_pred, **classe_positiva)
    f1 = f1_score(y_true, y_pred, **classe_positiva)

    print(f"\n[35-a] {nome} -> Acc: {acc:.3f} | Precision(1): {prec:.3f} | Recall(1): {rec:.3f} | F1(1): {f1:.3f}")
    print("[35-a] Matriz de Confusão [linhas=verdadeiro, colunas=predito] (0/1):\n", cm)
    relatorio = classification_report(
        y_true, y_pred, labels=classes, target_names=['baixo(0)', 'alto(1)']
    )
    print("[35-a] Classification Report:\n", relatorio)

# Evaluate both classifiers against the same held-out binary labels.
for rotulo_modelo, predicoes in (
    ("LogisticRegression", yp_log),
    ("RandomForestClassifier", yp_rf),
):
    eval_cls(rotulo_modelo, ybin_test, predicoes)



[35-a] LogisticRegression -> Acc: 0.711 | Precision(1): 0.699 | Recall(1): 0.709 | F1(1): 0.704
[35-a] Matriz de Confusão [linhas=verdadeiro, colunas=predito] (0/1):
 [[2171  876]
 [ 835 2039]]
[35-a] Classification Report:
               precision    recall  f1-score   support

    baixo(0)       0.72      0.71      0.72      3047
     alto(1)       0.70      0.71      0.70      2874

    accuracy                           0.71      5921
   macro avg       0.71      0.71      0.71      5921
weighted avg       0.71      0.71      0.71      5921


[35-a] RandomForestClassifier -> Acc: 0.595 | Precision(1): 0.549 | Recall(1): 0.929 | F1(1): 0.690
[35-a] Matriz de Confusão [linhas=verdadeiro, colunas=predito] (0/1):
 [[ 852 2195]
 [ 203 2671]]
[35-a] Classification Report:
               precision    recall  f1-score   support

    baixo(0)       0.81      0.28      0.42      3047
     alto(1)       0.55      0.93      0.69      2874

    accuracy                           0.60      5921

In [20]:
# 35-b) O modelo erra mais para alto ou baixo consumo?
import numpy as np

def error_profile(y_true, y_pred):
    """Return the per-class error rates of a binary classifier.

    Parameters
    ----------
    y_true : array-like
        True binary labels.
    y_pred : array-like
        Predicted binary labels.

    Returns
    -------
    tuple
        ``(err_baixo, err_alto)``: the share of true class-0 samples predicted
        as 1, and the share of true class-1 samples predicted as 0. A rate is
        ``np.nan`` when its class has no samples in ``y_true``.
    """
    matriz = confusion_matrix(y_true, y_pred, labels=[0,1])
    # Rows are the true class, columns the predicted class.
    (tn, fp), (fn, tp) = matriz

    total_baixo = tn + fp
    total_alto = tp + fn

    # low (0) wrongly labelled as high (1)
    if total_baixo > 0:
        err_baixo = fp / total_baixo
    else:
        err_baixo = np.nan

    # high (1) wrongly labelled as low (0)
    if total_alto > 0:
        err_alto = fn / total_alto
    else:
        err_alto = np.nan

    return err_baixo, err_alto

# For each classifier, compare the error rate on low (0) and high (1) samples
# and state which class is misclassified more often.
for nome, yhat in [("LogisticRegression", yp_log), ("RandomForestClassifier", yp_rf)]:
    e0, e1 = error_profile(ybin_test, yhat)
    # error_profile returns NaN for a class with no samples, and any comparison
    # with NaN is False; a plain `e1 > e0 else ...` would therefore blame the
    # low class for both an undefined rate and an exact tie. Handle each case
    # explicitly.
    if np.isnan(e0) or np.isnan(e1):
        veredito = "INDEFINIDO (classe ausente em y_true)"
    elif e1 > e0:
        veredito = "ERRA MAIS EM ALTO (1)"
    elif e0 > e1:
        veredito = "ERRO MAIOR EM BAIXO (0)"
    else:
        veredito = "ERRO IGUAL NAS DUAS CLASSES"
    print(f"[35-b] {nome} -> Erro relativo baixo(0): {e0:.3f} | alto(1): {e1:.3f} ->",
          veredito)


[35-b] LogisticRegression -> Erro relativo baixo(0): 0.287 | alto(1): 0.291 -> ERRA MAIS EM ALTO (1)
[35-b] RandomForestClassifier -> Erro relativo baixo(0): 0.720 | alto(1): 0.071 -> ERRO MAIOR EM BAIXO (0)
