In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, accuracy_score

In [10]:
amostra_df = pd.read_csv(f"Tabelas/amostra_series_df.csv")
amostra_df = amostra_df.set_index("Data da Coleta")
amostra_df.head()

Unnamed: 0_level_0,Regiao - Sigla,Estado - Sigla,Municipio,Produto,Valor de Venda,Unidade de Medida,Bandeira,Ano,Mes,Dia
Data da Coleta,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2012-01-02,S,RS,TRAMANDAI,ETANOL,2.499,Litro,IPIRANGA,2012,1,2
2012-01-02,S,RS,NOVO HAMBURGO,DIESEL,2.049,Litro,PETROBRAS DISTRIBUIDORA S.A.,2012,1,2
2012-01-02,SE,SP,TUPA,GASOLINA,2.79,Litro,RAIZEN,2012,1,2
2012-01-02,SE,SP,ARARAQUARA,GASOLINA,2.529,Litro,BRANCA,2012,1,2
2012-01-02,NE,BA,CAMACARI,DIESEL,1.98,Litro,RAIZEN,2012,1,2


In [11]:
from scipy import stats


limite_z_score = 3.0

z_scores = np.abs(stats.zscore(amostra_df.select_dtypes(include=[np.number])))
outliers_indices = np.where(z_scores > limite_z_score)


cleaned_df = amostra_df.drop(amostra_df.index[outliers_indices[0]])

print("Número de outliers identificados:", len(outliers_indices[0]))
print("Tamanho do DataFrame após a remoção:", cleaned_df.shape)

Número de outliers identificados: 26462
Tamanho do DataFrame após a remoção: (1495552, 10)


In [12]:
machine_learning_df = pd.DataFrame()

media_valor = cleaned_df['Valor de Venda'].mean()
desvio_padrao_valor = cleaned_df['Valor de Venda'].std()

def padronizar_valor(valor):
    return (valor - media_valor) / desvio_padrao_valor

machine_learning_df['Valor de Venda'] = cleaned_df['Valor de Venda'].apply(padronizar_valor)

dummy_columns = pd.get_dummies(cleaned_df)

machine_learning_df = pd.concat([machine_learning_df, dummy_columns], axis=1)

column_order = ['Valor de Venda'] + [col for col in machine_learning_df.columns if col != 'Valor de Venda']
machine_learning_df = machine_learning_df[column_order]

In [13]:
machine_learning_df[['Ano', 'Mes', 'Dia']] = cleaned_df[['Ano', 'Mes', 'Dia']]

In [14]:
X = machine_learning_df.drop('Valor de Venda', axis=1)
y = machine_learning_df['Valor de Venda']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
machine_learning_df

Unnamed: 0_level_0,Valor de Venda,Valor de Venda,Ano,Mes,Dia,Regiao - Sigla_CO,Regiao - Sigla_N,Regiao - Sigla_NE,Regiao - Sigla_S,Regiao - Sigla_SE,...,Bandeira_TOBRAS,Bandeira_TORRAO,Bandeira_TOTAL BRASIL,Bandeira_UBERLANDIA,Bandeira_UBP PETRÓLEO,Bandeira_UNI,Bandeira_VIBRA ENERGIA,Bandeira_WALENDOWSKY,Bandeira_WATT,Bandeira_ZEMA
Data da Coleta,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-02,-0.718553,2.499,2012,1,2,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
2012-01-02,-1.254238,2.049,2012,1,2,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
2012-01-02,-0.372144,2.790,2012,1,2,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
2012-01-02,-0.682841,2.529,2012,1,2,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
2012-01-02,-1.336376,1.980,2012,1,2,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-30,2.425318,5.140,2022,12,30,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
2022-12-30,2.246757,4.990,2022,12,30,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
2022-12-30,2.175332,4.930,2022,12,30,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
2022-12-30,2.425318,5.140,2022,12,30,False,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False


In [16]:
model = LinearRegression()

model.fit(X_train, y_train)

In [8]:
# acuracia em porcentagem r2
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = model.score(X_test, y_test)

print(f"Chance de erro (MAE): {mae * 100:.2f} %")
print(f"Chance de Acerto (R2): {r2 * 100:.2f} %")

Chance de erro (MAE): 1336108.87 %
Chance de Acerto (R2): -2030481196393080.50 %
