In [2]:
import requests
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import Lasso
import pickle


In [2]:
def remover_colunas_correlacionadas(df):
    matrix_corr = df.corr().abs()
    remove_columns = set()

    for i_corr in range(len(matrix_corr.columns)):
        for j_corr in range(i_corr + 1, len(matrix_corr.columns)):
            if matrix_corr.iloc[i_corr, j_corr] > 0.87 and matrix_corr.columns[j_corr] != 'close':
                remove_columns.add(matrix_corr.columns[j_corr])

    return df.drop(columns = remove_columns)

def filtra_colunas_zero(df):
    mean = df.describe().loc['mean']
    cols_to_keep = mean[mean != 0].index
    return df[cols_to_keep]

In [3]:
all_stocks = pd.read_csv('data/all_stocks_by_month.csv', sep=',')

FileNotFoundError: [Errno 2] No such file or directory: 'all_stocks_by_month.csv'

In [56]:
all_stocks.head()

Unnamed: 0,date,AccountsPayable,AccountsReceivable,AllowanceForDoubtfulAccountsReceivable,BasicAverageShares,BasicEPS,BeginningCashPosition,CapitalStock,CashAndCashEquivalents,CashCashEquivalentsAndShortTermInvestments,...,ImpairmentOfCapitalAssets,HedgingAssetsCurrent,CapitalLeaseObligations,CurrentCapitalLeaseObligation,LongTermCapitalLeaseObligation,ChangeInInventory,InvestmentsinJointVenturesatCost,ChangeInAccruedExpense,DividendPaidCFO,symbol
0,2022-07-31,19141000.0,0.0,0.0,0.0,0.0,135160000.0,1462259000.0,80000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,2022-08-31,19141000.0,0.0,0.0,0.0,0.0,135160000.0,1462259000.0,80000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,2022-09-30,19141000.0,0.0,0.0,0.0,0.0,135160000.0,1462259000.0,80000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,2022-10-31,19141000.0,0.0,0.0,0.0,0.0,135160000.0,1462259000.0,80000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,2022-11-30,19141000.0,0.0,0.0,0.0,0.0,135160000.0,1462259000.0,80000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [57]:
all_stocks.columns

Index(['date', 'AccountsPayable', 'AccountsReceivable',
       'AllowanceForDoubtfulAccountsReceivable', 'BasicAverageShares',
       'BasicEPS', 'BeginningCashPosition', 'CapitalStock',
       'CashAndCashEquivalents', 'CashCashEquivalentsAndShortTermInvestments',
       ...
       'ImpairmentOfCapitalAssets', 'HedgingAssetsCurrent',
       'CapitalLeaseObligations', 'CurrentCapitalLeaseObligation',
       'LongTermCapitalLeaseObligation', 'ChangeInInventory',
       'InvestmentsinJointVenturesatCost', 'ChangeInAccruedExpense',
       'DividendPaidCFO', 'symbol'],
      dtype='object', length=242)

In [1]:
def buscar_serie(codigo):
    url = f"https://api.bcb.gov.br/dados/serie/bcdata.sgs.{codigo}/dados?formato=json"
    response = requests.get(url)
    if response.status_code == 200:
        dados = response.json()
        df = pd.DataFrame(dados)
        df['data'] = pd.to_datetime(df['data'], dayfirst=True)
        df['valor'] = df['valor'].astype(float)
        return df
    else:
        print(f"Erro ao acessar a API para o código {codigo}: {response.status_code}")
        return None

def add_economic_indicators():
    indices_economicos = {
        "IPCA": 433,
        "INPC": 188,
        "IGPM": 189,
        "PIB": 4380,
        "DESEMPREGO": 24369,
    }

    dados_indices = {}
    for nome, codigo in indices_economicos.items():
        try:
            dados_indices[nome] = buscar_serie(codigo)
            print(f"Dados de {nome} carregados com sucesso!")
        except Exception as e:
            print(f"Erro ao carregar os dados de {nome}: {e}")

    df_final = None
    for nome, df in dados_indices.items():
        df = df.rename(columns={'valor': nome})
        if df_final is None:
            df_final = df
        else:
            df_final = pd.merge(df_final, df, on='data', how='outer')

    print(df_final.head())
    return df_final

In [3]:
df_indices = add_economic_indicators()
df_indices.rename(columns={'data': 'date'}, inplace=True)
df_indices_cleaned = df_indices.dropna(axis=1, how='all')
df_indices_cleaned.set_index('date', inplace=True)
monthly_indices = df_indices_cleaned.resample('ME').mean()
monthly_indices['date'] = pd.to_datetime(monthly_indices.index)
monthly_indices.reset_index(drop=True, inplace=True)

Dados de IPCA carregados com sucesso!
Dados de INPC carregados com sucesso!
Dados de IGPM carregados com sucesso!
Dados de PIB carregados com sucesso!
Dados de DESEMPREGO carregados com sucesso!
        data  IPCA  INPC  IGPM  PIB  DESEMPREGO
0 1979-05-01   NaN  1.76   NaN  NaN         NaN
1 1979-06-01   NaN  3.00   NaN  NaN         NaN
2 1979-07-01   NaN  5.36   NaN  NaN         NaN
3 1979-08-01   NaN  5.79   NaN  NaN         NaN
4 1979-09-01   NaN  6.61   NaN  NaN         NaN


In [5]:
monthly_indices.to_csv('monthly_indices.csv', index=False)

In [61]:
all_stocks['date']

0       2022-07-31
1       2022-08-31
2       2022-09-30
3       2022-10-31
4       2022-11-30
           ...    
6263    2023-08-31
6264    2023-09-30
6265    2023-10-31
6266    2023-11-30
6267    2023-12-31
Name: date, Length: 6268, dtype: object

In [62]:
def merge_indices_stocks(all_stocks_param, df_indices_cleaned_param):
    # Verificar o formato das datas nos DataFrames
    all_stocks_param['date'] = pd.to_datetime(all_stocks_param['date'])
    df_indices_cleaned_param['date'] = pd.to_datetime(df_indices_cleaned_param['date'])

    # Verificar se há datas comuns entre os DataFrames
    common_dates = set(all_stocks_param['date']).intersection(set(df_indices_cleaned_param['date']))
    if not common_dates:
        print("Não há datas comuns entre os DataFrames.")
    else:
        # Realizar o merge dos DataFrames utilizando a coluna de datas
        merged_df = pd.merge(all_stocks_param, df_indices_cleaned_param, on='date', how='inner')

        # Exibir o DataFrame resultante
        return merged_df


all_stocks = merge_indices_stocks(all_stocks, monthly_indices)
all_stocks

Unnamed: 0,date,AccountsPayable,AccountsReceivable,AllowanceForDoubtfulAccountsReceivable,BasicAverageShares,BasicEPS,BeginningCashPosition,CapitalStock,CashAndCashEquivalents,CashCashEquivalentsAndShortTermInvestments,...,ChangeInInventory,InvestmentsinJointVenturesatCost,ChangeInAccruedExpense,DividendPaidCFO,symbol,IPCA,INPC,IGPM,PIB,DESEMPREGO
0,2022-07-31,19141000.0,0.0,0.0,0.0,0.0,135160000.0,1.462259e+09,80000.0,0.0,...,0.0,0.0,0.0,0.0,0,-0.68,-0.60,0.21,869133.2,9.1
1,2022-08-31,19141000.0,0.0,0.0,0.0,0.0,135160000.0,1.462259e+09,80000.0,0.0,...,0.0,0.0,0.0,0.0,0,-0.36,-0.31,-0.70,873862.3,8.9
2,2022-09-30,19141000.0,0.0,0.0,0.0,0.0,135160000.0,1.462259e+09,80000.0,0.0,...,0.0,0.0,0.0,0.0,0,-0.29,-0.32,-0.95,858186.5,8.7
3,2022-10-31,19141000.0,0.0,0.0,0.0,0.0,135160000.0,1.462259e+09,80000.0,0.0,...,0.0,0.0,0.0,0.0,0,0.59,0.47,-0.97,880537.5,8.3
4,2022-11-30,19141000.0,0.0,0.0,0.0,0.0,135160000.0,1.462259e+09,80000.0,0.0,...,0.0,0.0,0.0,0.0,0,0.41,0.38,-0.56,880820.2,8.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6263,2023-08-31,0.0,0.0,0.0,0.0,0.0,377000.0,5.282700e+07,3000.0,0.0,...,0.0,0.0,0.0,0.0,ZIFI11.SA,0.23,0.20,-0.14,932293.6,7.8
6264,2023-09-30,0.0,0.0,0.0,0.0,0.0,377000.0,5.282700e+07,3000.0,0.0,...,0.0,0.0,0.0,0.0,ZIFI11.SA,0.26,0.11,0.37,915855.2,7.7
6265,2023-10-31,0.0,0.0,0.0,0.0,0.0,377000.0,5.282700e+07,3000.0,0.0,...,0.0,0.0,0.0,0.0,ZIFI11.SA,0.24,0.12,0.50,953945.0,7.6
6266,2023-11-30,0.0,0.0,0.0,0.0,0.0,377000.0,5.282700e+07,3000.0,0.0,...,0.0,0.0,0.0,0.0,ZIFI11.SA,0.28,0.10,0.59,961832.6,7.5


In [63]:
df_fil = (all_stocks.pipe(filtra_colunas_zero)
               .pipe(remover_colunas_correlacionadas)
               )
df_fil

Unnamed: 0,date,AccountsPayable,AccountsReceivable,AllowanceForDoubtfulAccountsReceivable,BasicAverageShares,BasicEPS,BeginningCashPosition,CapitalStock,CashAndCashEquivalents,CashCashEquivalentsAndShortTermInvestments,...,InterestReceivedCFO,ImpairmentOfCapitalAssets,HedgingAssetsCurrent,CapitalLeaseObligations,ChangeInInventory,InvestmentsinJointVenturesatCost,ChangeInAccruedExpense,DividendPaidCFO,IPCA,IGPM
0,2022-07-31,19141000.0,0.0,0.0,0.0,0.0,135160000.0,1.462259e+09,80000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.68,0.21
1,2022-08-31,19141000.0,0.0,0.0,0.0,0.0,135160000.0,1.462259e+09,80000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.36,-0.70
2,2022-09-30,19141000.0,0.0,0.0,0.0,0.0,135160000.0,1.462259e+09,80000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.29,-0.95
3,2022-10-31,19141000.0,0.0,0.0,0.0,0.0,135160000.0,1.462259e+09,80000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.59,-0.97
4,2022-11-30,19141000.0,0.0,0.0,0.0,0.0,135160000.0,1.462259e+09,80000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.41,-0.56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6263,2023-08-31,0.0,0.0,0.0,0.0,0.0,377000.0,5.282700e+07,3000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.23,-0.14
6264,2023-09-30,0.0,0.0,0.0,0.0,0.0,377000.0,5.282700e+07,3000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.26,0.37
6265,2023-10-31,0.0,0.0,0.0,0.0,0.0,377000.0,5.282700e+07,3000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.24,0.50
6266,2023-11-30,0.0,0.0,0.0,0.0,0.0,377000.0,5.282700e+07,3000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.28,0.59


In [64]:
columns = df_fil.columns

In [65]:
columns.to_list()

['date',
 'AccountsPayable',
 'AccountsReceivable',
 'AllowanceForDoubtfulAccountsReceivable',
 'BasicAverageShares',
 'BasicEPS',
 'BeginningCashPosition',
 'CapitalStock',
 'CashAndCashEquivalents',
 'CashCashEquivalentsAndShortTermInvestments',
 'CashDividendsPaid',
 'CashFinancial',
 'CashFlowsfromusedinOperatingActivitiesDirect',
 'ChangeInCashSupplementalAsReported',
 'ClassesofCashPayments',
 'ClassesofCashReceiptsfromOperatingActivities',
 'CostOfRevenue',
 'CurrentLiabilities',
 'CurrentProvisions',
 'DilutedNIAvailtoComStockholders',
 'DividendsPayable',
 'EBIT',
 'EndCashPosition',
 'EnterpriseValue',
 'FinancingCashFlow',
 'FreeCashFlow',
 'GeneralAndAdministrativeExpense',
 'GrossAccountsReceivable',
 'GrossProfit',
 'InterestIncome',
 'InterestIncomeNonOperating',
 'InterestReceivedDirect',
 'InvestedCapital',
 'InvestmentProperties',
 'LongTermProvisions',
 'MarketCap',
 'NetInterestIncome',
 'NetNonOperatingInterestIncomeExpense',
 'NonCurrentDeferredRevenue',
 'Operati

In [66]:
df_fil = df_fil.dropna()

In [67]:
df_fil.describe()

Unnamed: 0,date,AccountsPayable,AccountsReceivable,AllowanceForDoubtfulAccountsReceivable,BasicAverageShares,BasicEPS,BeginningCashPosition,CapitalStock,CashAndCashEquivalents,CashCashEquivalentsAndShortTermInvestments,...,InterestReceivedCFO,ImpairmentOfCapitalAssets,HedgingAssetsCurrent,CapitalLeaseObligations,ChangeInInventory,InvestmentsinJointVenturesatCost,ChangeInAccruedExpense,DividendPaidCFO,IPCA,IGPM
count,6246,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,...,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0
mean,2022-10-10 20:39:11.585014528,4577433.0,8691191.0,-487505.0,25250760.0,20.258868,25615180.0,562233200.0,7547121.0,79132200.0,...,27093.5,-376.560999,606934.7,439004.8,50471.98,167331.1,319.884726,-114931.8,0.507707,0.401888
min,2020-06-30 00:00:00,0.0,0.0,-47350000.0,0.0,-112.82,0.0,0.0,0.0,0.0,...,0.0,-336000.0,0.0,0.0,-380000.0,0.0,-22000.0,-59822000.0,-0.68,-1.93
25%,2022-01-31 00:00:00,0.0,0.0,0.0,0.0,0.0,578000.0,75871000.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.24,-0.52
50%,2022-10-31 00:00:00,247000.0,0.0,0.0,758400.0,0.011344,4587000.0,215475000.0,54000.0,443000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.53,0.45
75%,2023-07-31 00:00:00,2541000.0,1525000.0,0.0,4533315.0,8.95,19804000.0,606328000.0,3656000.0,24173000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.83,0.81
max,2024-06-30 00:00:00,182944000.0,292647000.0,0.0,3924500000.0,4587.04,652792000.0,8246646000.0,667979000.0,3305701000.0,...,8722000.0,336000.0,493232000.0,51267000.0,25358000.0,104515000.0,120000.0,0.0,1.62,4.34
std,,16457040.0,33826270.0,3391159.0,236390900.0,214.83698,64693900.0,956391600.0,34897880.0,296239500.0,...,452028.1,27222.213203,15390590.0,4315120.0,1040333.0,4178924.0,5730.159425,2619798.0,0.448882,1.220795


In [68]:
df_fil

Unnamed: 0,date,AccountsPayable,AccountsReceivable,AllowanceForDoubtfulAccountsReceivable,BasicAverageShares,BasicEPS,BeginningCashPosition,CapitalStock,CashAndCashEquivalents,CashCashEquivalentsAndShortTermInvestments,...,InterestReceivedCFO,ImpairmentOfCapitalAssets,HedgingAssetsCurrent,CapitalLeaseObligations,ChangeInInventory,InvestmentsinJointVenturesatCost,ChangeInAccruedExpense,DividendPaidCFO,IPCA,IGPM
0,2022-07-31,19141000.0,0.0,0.0,0.0,0.0,135160000.0,1.462259e+09,80000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.68,0.21
1,2022-08-31,19141000.0,0.0,0.0,0.0,0.0,135160000.0,1.462259e+09,80000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.36,-0.70
2,2022-09-30,19141000.0,0.0,0.0,0.0,0.0,135160000.0,1.462259e+09,80000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.29,-0.95
3,2022-10-31,19141000.0,0.0,0.0,0.0,0.0,135160000.0,1.462259e+09,80000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.59,-0.97
4,2022-11-30,19141000.0,0.0,0.0,0.0,0.0,135160000.0,1.462259e+09,80000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.41,-0.56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6263,2023-08-31,0.0,0.0,0.0,0.0,0.0,377000.0,5.282700e+07,3000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.23,-0.14
6264,2023-09-30,0.0,0.0,0.0,0.0,0.0,377000.0,5.282700e+07,3000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.26,0.37
6265,2023-10-31,0.0,0.0,0.0,0.0,0.0,377000.0,5.282700e+07,3000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.24,0.50
6266,2023-11-30,0.0,0.0,0.0,0.0,0.0,377000.0,5.282700e+07,3000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.28,0.59


In [69]:
info_target = df_fil[['open', 'close']]
info_target

Unnamed: 0,open,close
0,95.704001,95.372001
1,91.571305,91.498695
2,90.848095,90.780000
3,91.103499,91.320000
4,92.478000,92.439000
...,...,...
6263,929.541815,923.659995
6264,898.712856,898.855713
6265,897.997998,899.995996
6266,899.992493,897.992493


In [70]:
df_fil.set_index('date', inplace=True)
df_fil = df_fil.drop(columns=['open', 'close'])

In [72]:
# Supondo que 'df_fil' é o seu DataFrame de características
# E 'target' é a coluna alvo para a análise de investimentos
X = df_fil
y = info_target['close']

# Dividir os dados em conjuntos de treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Criar e treinar o modelo Random Forest
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Obter a importância das características
importances = model.feature_importances_

# Criar um DataFrame com as importâncias das características
feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': importances})

# Ordenar as características pela importância
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

# Selecionar as 15 melhores características
top_15_features = feature_importances.head(15)['Feature'].tolist()

# Exibir as 15 melhores características
print(top_15_features)

['adjclose', 'TotalUnusualItems', 'CostOfRevenue', 'InvestedCapital', 'GrossProfit', 'OrdinarySharesNumber', 'OtherOperatingExpenses', 'FreeCashFlow', 'RetainedEarnings', 'ClassesofCashReceiptsfromOperatingActivities', 'ReceiptsfromCustomers', 'BasicAverageShares', 'TaxEffectOfUnusualItems', 'CashFlowsfromusedinOperatingActivitiesDirect', 'EBIT']


In [73]:
colunas_sugeridas_gpt = [
    "BasicEPS",
    "EBIT",
    "FreeCashFlow",
    "NetIncomeFromContinuingOperations",
    "GrossProfit",
    "CashAndCashEquivalents",
    "MarketCap",
    "EnterpriseValue",
    "TotalLiabilitiesNetMinorityInterest",
    "LongTermDebt",
    "RetainedEarnings",
    "CashFlowsfromusedinOperatingActivitiesDirect",
    "InvestedCapital",
    "AccountsReceivable",
    "dividends",

]

In [74]:
# Salvando
with open("../colunas_extraction/colunas_random_forest_classification.pkl", "wb") as f:
    pickle.dump(top_15_features, f)

# Salvando
with open("../colunas_extraction/colunas_sugeridas_gpt.pkl", "wb") as f:
    pickle.dump(colunas_sugeridas_gpt, f)


In [75]:
# Para ler depois
with open("../colunas_extraction/colunas_random_forest_classification.pkl", "rb") as f:
    feature_RFR = pickle.load(f)
print(feature_RFR)

# Para ler depois
with open("../colunas_extraction/colunas_sugeridas_gpt.pkl", "rb") as f:
    feature_GPT = pickle.load(f)
print(feature_GPT)

['adjclose', 'TotalUnusualItems', 'CostOfRevenue', 'InvestedCapital', 'GrossProfit', 'OrdinarySharesNumber', 'OtherOperatingExpenses', 'FreeCashFlow', 'RetainedEarnings', 'ClassesofCashReceiptsfromOperatingActivities', 'ReceiptsfromCustomers', 'BasicAverageShares', 'TaxEffectOfUnusualItems', 'CashFlowsfromusedinOperatingActivitiesDirect', 'EBIT']
['BasicEPS', 'EBIT', 'FreeCashFlow', 'NetIncomeFromContinuingOperations', 'GrossProfit', 'CashAndCashEquivalents', 'MarketCap', 'EnterpriseValue', 'TotalLiabilitiesNetMinorityInterest', 'LongTermDebt', 'RetainedEarnings', 'CashFlowsfromusedinOperatingActivitiesDirect', 'InvestedCapital', 'AccountsReceivable', 'dividends']


In [76]:
all_stocks_RFR = all_stocks[feature_RFR + ['open', 'close']]
all_stocks_GPT = all_stocks[feature_GPT + ['open', 'close']]

In [77]:
all_stocks_RFR.describe()

Unnamed: 0,adjclose,TotalUnusualItems,CostOfRevenue,InvestedCapital,GrossProfit,OrdinarySharesNumber,OtherOperatingExpenses,FreeCashFlow,RetainedEarnings,ClassesofCashReceiptsfromOperatingActivities,ReceiptsfromCustomers,BasicAverageShares,TaxEffectOfUnusualItems,CashFlowsfromusedinOperatingActivitiesDirect,EBIT,open,close
count,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0
mean,-1138435000.0,12914820.0,5367410.0,346812800.0,21247720.0,22508330.0,3162560.0,25425560.0,43795890.0,37957140.0,15502270.0,25250760.0,1923745.0,26031820.0,26481280.0,1359.882346,1359.876108
std,67562620000.0,78528220.0,24345370.0,691226000.0,54391520.0,103014200.0,14210190.0,106623900.0,169147800.0,98065520.0,41737330.0,236390900.0,9580953.0,77475460.0,63055530.0,13964.254194,13964.255043
min,-4791499000000.0,-275093000.0,-10215000.0,-16139000.0,-59243000.0,0.0,-41763000.0,-982842000.0,-665983000.0,-444000.0,0.0,0.0,-33747380.0,-180655000.0,-75947000.0,0.0,0.0
25%,47.81842,0.0,0.0,0.0,0.0,758400.0,0.0,-785000.0,-3619000.0,0.0,0.0,0.0,0.0,-165000.0,0.0,62.901125,62.810341
50%,65.96507,0.0,0.0,37763000.0,0.0,2443800.0,207500.0,6089000.0,1534000.0,1232000.0,0.0,758400.0,0.0,0.0,0.0,86.702861,86.630952
75%,79.70433,6104000.0,783000.0,290249200.0,12769750.0,10734100.0,1071500.0,36493000.0,31379000.0,32969000.0,252000.0,4533315.0,881025.0,22992000.0,19023000.0,101.551671,101.583613
max,38263240000.0,1224722000.0,292562000.0,5346516000.0,467086000.0,1086780000.0,173687000.0,1103672000.0,1212509000.0,1207763000.0,281130000.0,3924500000.0,179565600.0,1103672000.0,515033000.0,160000.0,160000.0


In [78]:
all_stocks_GPT.describe()

Unnamed: 0,BasicEPS,EBIT,FreeCashFlow,NetIncomeFromContinuingOperations,GrossProfit,CashAndCashEquivalents,MarketCap,EnterpriseValue,TotalLiabilitiesNetMinorityInterest,LongTermDebt,RetainedEarnings,CashFlowsfromusedinOperatingActivitiesDirect,InvestedCapital,AccountsReceivable,dividends,open,close
count,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0,6246.0
mean,20.258868,26481280.0,25425560.0,7966732.0,21247720.0,7547121.0,481020600.0,1623269000.0,84296580.0,3510207.0,43795890.0,26031820.0,346812800.0,8691191.0,0.377946,1359.882346,1359.876108
std,214.83698,63055530.0,106623900.0,37525640.0,54391520.0,34897880.0,3104652000.0,29328160000.0,207139200.0,23436920.0,169147800.0,77475460.0,691226000.0,33826270.0,7.994603,13964.254194,13964.255043
min,-112.82,-75947000.0,-982842000.0,-217375000.0,-59243000.0,0.0,0.0,-674085700.0,-1480000.0,0.0,-665983000.0,-180655000.0,-16139000.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,-785000.0,0.0,0.0,0.0,0.0,0.0,1344000.0,0.0,-3619000.0,-165000.0,0.0,0.0,0.0,62.901125,62.810341
50%,0.011344,0.0,6089000.0,0.0,0.0,54000.0,73800000.0,0.0,5446000.0,0.0,1534000.0,0.0,37763000.0,0.0,0.024317,86.702861,86.630952
75%,8.95,19023000.0,36493000.0,0.0,12769750.0,3656000.0,376252700.0,132328000.0,51633000.0,0.0,31379000.0,22992000.0,290249200.0,1525000.0,0.041666,101.551671,101.583613
max,4587.04,515033000.0,1103672000.0,323547000.0,467086000.0,667979000.0,96996090000.0,666329400000.0,1793707000.0,242769000.0,1212509000.0,1103672000.0,5346516000.0,292647000.0,388.412457,160000.0,160000.0


In [79]:
filtered_stocks_RFR = (all_stocks_RFR.pipe(filtra_colunas_zero)
               .pipe(remover_colunas_correlacionadas)
               )
filtered_stocks_RFR

Unnamed: 0,adjclose,TotalUnusualItems,CostOfRevenue,InvestedCapital,GrossProfit,OrdinarySharesNumber,OtherOperatingExpenses,FreeCashFlow,RetainedEarnings,ClassesofCashReceiptsfromOperatingActivities,ReceiptsfromCustomers,BasicAverageShares,TaxEffectOfUnusualItems,CashFlowsfromusedinOperatingActivitiesDirect,EBIT,open,close
0,70.472206,211894000.0,0.0,0.0,0.0,14723970.0,0.0,135643000.0,1804000.0,155315000.0,0.0,0.0,0.0,135643000.0,0.0,95.704001,95.372001
1,68.338002,211894000.0,0.0,0.0,0.0,14723970.0,0.0,135643000.0,1804000.0,155315000.0,0.0,0.0,0.0,135643000.0,0.0,91.571305,91.498695
2,68.326817,211894000.0,0.0,0.0,0.0,14723970.0,0.0,135643000.0,1804000.0,155315000.0,0.0,0.0,0.0,135643000.0,0.0,90.848095,90.780000
3,68.847863,211894000.0,0.0,0.0,0.0,14723970.0,0.0,135643000.0,1804000.0,155315000.0,0.0,0.0,0.0,135643000.0,0.0,91.103499,91.320000
4,69.878474,211894000.0,0.0,0.0,0.0,14723970.0,0.0,135643000.0,1804000.0,155315000.0,0.0,0.0,0.0,135643000.0,0.0,92.478000,92.439000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6263,923.659995,37506000.0,0.0,0.0,0.0,26710000.0,0.0,-374000.0,0.0,0.0,0.0,0.0,12752040.0,0.0,0.0,929.541815,923.659995
6264,898.855713,37506000.0,0.0,0.0,0.0,26710000.0,0.0,-374000.0,0.0,0.0,0.0,0.0,12752040.0,0.0,0.0,898.712856,898.855713
6265,899.995996,37506000.0,0.0,0.0,0.0,26710000.0,0.0,-374000.0,0.0,0.0,0.0,0.0,12752040.0,0.0,0.0,897.997998,899.995996
6266,897.992493,37506000.0,0.0,0.0,0.0,26710000.0,0.0,-374000.0,0.0,0.0,0.0,0.0,12752040.0,0.0,0.0,899.992493,897.992493


In [80]:
filtered_stocks_GPT = (all_stocks_GPT.pipe(filtra_colunas_zero)
               .pipe(remover_colunas_correlacionadas)
               )
filtered_stocks_GPT

Unnamed: 0,BasicEPS,EBIT,FreeCashFlow,NetIncomeFromContinuingOperations,GrossProfit,CashAndCashEquivalents,MarketCap,EnterpriseValue,TotalLiabilitiesNetMinorityInterest,LongTermDebt,RetainedEarnings,CashFlowsfromusedinOperatingActivitiesDirect,InvestedCapital,AccountsReceivable,dividends,open,close
0,0.0,0.0,135643000.0,0.0,0.0,80000.0,0.0,0.0,100677000.0,0.0,1804000.0,135643000.0,0.0,0.0,0.000000,95.704001,95.372001
1,0.0,0.0,135643000.0,0.0,0.0,80000.0,0.0,0.0,100677000.0,0.0,1804000.0,135643000.0,0.0,0.0,0.043478,91.571305,91.498695
2,0.0,0.0,135643000.0,0.0,0.0,80000.0,0.0,0.0,100677000.0,0.0,1804000.0,135643000.0,0.0,0.0,0.033333,90.848095,90.780000
3,0.0,0.0,135643000.0,0.0,0.0,80000.0,0.0,0.0,100677000.0,0.0,1804000.0,135643000.0,0.0,0.0,0.007500,91.103499,91.320000
4,0.0,0.0,135643000.0,0.0,0.0,80000.0,0.0,0.0,100677000.0,0.0,1804000.0,135643000.0,0.0,0.0,0.012500,92.478000,92.439000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6263,0.0,0.0,-374000.0,36898000.0,0.0,3000.0,0.0,0.0,297000.0,0.0,0.0,0.0,0.0,0.0,0.000000,929.541815,923.659995
6264,0.0,0.0,-374000.0,36898000.0,0.0,3000.0,0.0,0.0,297000.0,0.0,0.0,0.0,0.0,0.0,0.000000,898.712856,898.855713
6265,0.0,0.0,-374000.0,36898000.0,0.0,3000.0,0.0,0.0,297000.0,0.0,0.0,0.0,0.0,0.0,0.000000,897.997998,899.995996
6266,0.0,0.0,-374000.0,36898000.0,0.0,3000.0,0.0,0.0,297000.0,0.0,0.0,0.0,0.0,0.0,0.000000,899.992493,897.992493


In [81]:
filtered_stocks_GPT

Unnamed: 0,BasicEPS,EBIT,FreeCashFlow,NetIncomeFromContinuingOperations,GrossProfit,CashAndCashEquivalents,MarketCap,EnterpriseValue,TotalLiabilitiesNetMinorityInterest,LongTermDebt,RetainedEarnings,CashFlowsfromusedinOperatingActivitiesDirect,InvestedCapital,AccountsReceivable,dividends,open,close
0,0.0,0.0,135643000.0,0.0,0.0,80000.0,0.0,0.0,100677000.0,0.0,1804000.0,135643000.0,0.0,0.0,0.000000,95.704001,95.372001
1,0.0,0.0,135643000.0,0.0,0.0,80000.0,0.0,0.0,100677000.0,0.0,1804000.0,135643000.0,0.0,0.0,0.043478,91.571305,91.498695
2,0.0,0.0,135643000.0,0.0,0.0,80000.0,0.0,0.0,100677000.0,0.0,1804000.0,135643000.0,0.0,0.0,0.033333,90.848095,90.780000
3,0.0,0.0,135643000.0,0.0,0.0,80000.0,0.0,0.0,100677000.0,0.0,1804000.0,135643000.0,0.0,0.0,0.007500,91.103499,91.320000
4,0.0,0.0,135643000.0,0.0,0.0,80000.0,0.0,0.0,100677000.0,0.0,1804000.0,135643000.0,0.0,0.0,0.012500,92.478000,92.439000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6263,0.0,0.0,-374000.0,36898000.0,0.0,3000.0,0.0,0.0,297000.0,0.0,0.0,0.0,0.0,0.0,0.000000,929.541815,923.659995
6264,0.0,0.0,-374000.0,36898000.0,0.0,3000.0,0.0,0.0,297000.0,0.0,0.0,0.0,0.0,0.0,0.000000,898.712856,898.855713
6265,0.0,0.0,-374000.0,36898000.0,0.0,3000.0,0.0,0.0,297000.0,0.0,0.0,0.0,0.0,0.0,0.000000,897.997998,899.995996
6266,0.0,0.0,-374000.0,36898000.0,0.0,3000.0,0.0,0.0,297000.0,0.0,0.0,0.0,0.0,0.0,0.000000,899.992493,897.992493


## Seleção de Características:
Usa Lasso para selecionar características importantes.
Transforma os conjuntos de treino e teste para manter apenas as características selecionadas.
## Validação Cruzada:
Cria um modelo RandomForestRegressor e avalia seu desempenho usando validação cruzada com 5 dobras (cv=5).
Imprime as pontuações de validação cruzada e a média dessas pontuações.
## Treinamento do Modelo:
Treina o modelo RandomForestRegressor com os dados de treino.
## Predição e Avaliação:
Faz previsões no conjunto de teste.
Avalia o modelo usando métricas como MAE (Mean Absolute Error), MSE (Mean Squared Error) e R² (R-squared).
Cria um DataFrame com os valores reais e previstos e exibe as primeiras linhas.

In [85]:
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_score

imputer = SimpleImputer(strategy='mean')  # Replace NaN with the mean value
filtered_stocks_GPT = filtered_stocks_GPT.copy()
filtered_stocks_GPT[['BasicEPS', 'EBIT', 'FreeCashFlow', 'NetIncomeFromContinuingOperations', 'GrossProfit', 'CashAndCashEquivalents', 'MarketCap', 'EnterpriseValue', 'TotalLiabilitiesNetMinorityInterest', 'LongTermDebt', 'RetainedEarnings', 'CashFlowsfromusedinOperatingActivitiesDirect', 'InvestedCapital', 'AccountsReceivable', 'dividends', 'close']] = imputer.fit_transform(filtered_stocks_GPT[['BasicEPS', 'EBIT', 'FreeCashFlow', 'NetIncomeFromContinuingOperations', 'GrossProfit', 'CashAndCashEquivalents', 'MarketCap', 'EnterpriseValue', 'TotalLiabilitiesNetMinorityInterest', 'LongTermDebt', 'RetainedEarnings', 'CashFlowsfromusedinOperatingActivitiesDirect', 'InvestedCapital', 'AccountsReceivable', 'dividends', 'close']])

dados1 = np.array(filtered_stocks_GPT.drop(columns=['open', 'close']))
dados2 = np.array(filtered_stocks_GPT['close'])

# Dividir os dados em conjuntos de treino e teste
X_train, X_test, y_train, y_test = train_test_split(dados1, dados2, test_size=0.2, random_state=42)

# Feature Selection usando Lasso
lasso = Lasso(alpha=0.01, max_iter=10000).fit(X_train, y_train)
model = SelectFromModel(lasso, prefit=True)
X_train = model.transform(X_train)
X_test = model.transform(X_test)

# Criar e treinar o modelo Random Forest com validação cruzada
model_regression = RandomForestRegressor(n_estimators=500, random_state=42)
cross_val_scores = cross_val_score(model_regression, X_train, y_train, cv=5)
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Score: {np.mean(cross_val_scores):.4f}")

# Treinar o modelo com os dados de treino
model_regression.fit(X_train, y_train)

# Fazer previsões no conjunto de teste
y_pred = model_regression.predict(X_test)

# Avaliar o modelo
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Criar um DataFrame com os valores reais e previstos
resultados = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R² Score: {r2:.4f}")

# Exibir os primeiros resultados
print(resultados.head())

Cross-Validation Scores: [0.44066169 0.70342335 0.79594348 0.56616173 0.53018911]
Mean Cross-Validation Score: 0.6073
Mean Absolute Error (MAE): 911.1147
Mean Squared Error (MSE): 50794229.0973
R² Score: 0.6421
       Actual   Predicted
0    9.897273   10.597098
1   74.968948  178.543834
2   92.250233  102.860030
3   81.956956   87.441513
4  112.631905  109.301192
