# In [None]:
import pandas as pd
import numpy as np
import glob
import os
import yfinance as yf

# Portuguese month abbreviations used by the first column of the input CSVs.
_PT_MONTHS = {
    'jan': 1, 'fev': 2, 'mar': 3, 'abr': 4, 'mai': 5, 'jun': 6,
    'jul': 7, 'ago': 8, 'set': 9, 'out': 10, 'nov': 11, 'dez': 12,
}

# Price columns stay empty until the Yahoo download fills them in.
_PRICE_COLUMNS = ['PX_OPEN', 'PX_HIGH', 'PX_LOW', 'PX_LAST', 'PX_VOLUME']


def _parse_month_year(date_str):
    """Turn a 'mmm-yy' label (e.g. 'fev-21') into the first day of that month.

    Returns None for missing or unparseable labels so the caller can drop the
    row. Two-digit years are offset by 2000; years >= 100 are used as given.
    """
    if pd.isna(date_str):
        return None
    try:
        month, year = str(date_str).lower().strip().split('-')
        month = month.strip()
        if month not in _PT_MONTHS:
            # Falling back to January here would fabricate wrong (and
            # duplicated) dates, so treat an unknown month as a parse error.
            raise ValueError(f"unknown month abbreviation '{month}'")
        year_num = int(year)
        if year_num < 100:
            year_num += 2000
        return pd.Timestamp(year_num, _PT_MONTHS[month], 1)
    except (ValueError, TypeError, OverflowError) as e:
        print(f"Error converting date {date_str}: {str(e)}")
        return None


def load_stock_data(file_path, num_days=365):
    """Build a daily price + fundamentals DataFrame for one ticker.

    The ticker is the CSV file name without '.csv'. Fundamentals are read from
    the semicolon-separated file (first 12 lines skipped, ',' as decimal and
    '.' as thousands separator), keyed by the month labels in the first
    column. Daily prices are downloaded from Yahoo Finance for '<TICKER>.SA'
    over the `num_days` days ending yesterday, and the fundamentals are
    carried forward onto those trading days.

    Args:
        file_path: Path to the per-ticker fundamentals CSV.
        num_days: Length, in calendar days, of the Yahoo price window.

    Returns:
        A DataFrame indexed by the Yahoo dates with the PX_* price columns
        followed by the fundamentals columns, or None when the ticker is
        excluded: no valid fundamentals, more than 30% of fundamentals
        missing, or no Yahoo data available.

    Raises:
        Any exception raised while reading or transforming the file is
        printed and re-raised.
    """
    try:
        ticker = os.path.basename(file_path).replace('.csv', '')

        df = pd.read_csv(file_path, sep=';', decimal=',', thousands='.',
                         encoding='utf-8', skiprows=12)

        df = df.replace(['', '%'], np.nan).infer_objects(copy=False)
        # NOTE(review): dropping all-empty columns shifts the positional
        # column lookups below whenever a source column has no data at all.
        df = df.dropna(how='all', axis=0).dropna(how='all', axis=1)

        df['Dates'] = df.iloc[:, 0].apply(_parse_month_year)
        df = df.dropna(subset=['Dates'])
        df.set_index('Dates', inplace=True)
        # Label slicing and forward-filling below need a unique, ascending
        # index; files listed newest-first would otherwise slice to nothing.
        df = df[~df.index.duplicated(keep='last')].sort_index()

        def numeric(position):
            """Return the column at `position` as numbers (bad cells -> NaN)."""
            return pd.to_numeric(df.iloc[:, position], errors='coerce')

        net_income = numeric(18)
        cols = {
            'PX_OPEN': np.nan,
            'PX_HIGH': np.nan,
            'PX_LOW': np.nan,
            'PX_LAST': np.nan,
            'PX_VOLUME': np.nan,

            'PX_TO_BOOK_RATIO': numeric(7),
            'BS_CUR_LIAB': numeric(25),
            'CUR_MKT_CAP': numeric(2),
            'EBITDA': numeric(17),
            'NET_INCOME': net_income,
            'PE_RATIO': numeric(6),
            'EV_EBITDA': numeric(8),
            'DEBT_TO_EBITDA': numeric(30),
            'GROSS_MARGIN': numeric(14),
            'EBIT_MARGIN': numeric(15),
            'LIQ_RATIO': numeric(29),
            # NOTE(review): ROE reads the same column as NET_INCOME, and the
            # five fields after ROA all read column 1 - these look like
            # placeholders; confirm the intended source columns.
            'ROE': numeric(18),
            # A zero denominator would yield +/-inf; treat it as missing.
            'ROA': (net_income / numeric(24)).replace([np.inf, -np.inf], np.nan),
            'ASSET_TURNOVER': numeric(1),
            'CAPEX': numeric(1),
            'FCF': numeric(1),
            'ROIC': numeric(1),
            'NET_DEBT': numeric(1),
        }

        processed_df = pd.DataFrame(cols, index=df.index)
        fundamentals_cols = [c for c in processed_df.columns
                             if c not in _PRICE_COLUMNS]

        # Rows carrying at least one fundamentals value delimit the usable
        # history. An explicit emptiness test is used because min()/max() of
        # "no valid index" yields NaN/NaT rather than None.
        has_data = processed_df[fundamentals_cols].notna().any(axis=1)
        if not has_data.any():
            print(f"Excluindo {ticker}: Nenhum dado válido encontrado")
            return None

        valid_dates = has_data[has_data].index
        first_valid, last_valid = valid_dates[0], valid_dates[-1]
        processed_df = processed_df.loc[first_valid:last_valid]

        # Only fundamentals count as "missing": the price placeholders are
        # NaN by construction and would inflate the percentage.
        fundamentals = processed_df[fundamentals_cols]
        missing_percentage = fundamentals.isna().to_numpy().mean() * 100

        print(f"\nAnálise do ativo {ticker}:")
        print(f"Período de dados fundamentas disponível: {first_valid.strftime('%Y-%m-%d')} até {last_valid.strftime('%Y-%m-%d')}")
        print(f"Total de {(last_valid - first_valid).days / 365.25:.1f} anos de histórico fundamental")
        print(f"Porcentagem de valores faltantes (fundamentais): {missing_percentage:.2f}%")

        if missing_percentage > 30:
            print(f"Excluindo {ticker}: {missing_percentage:.2f}% de valores faltantes")
            return None

        # Price window: `num_days` days ending yesterday.
        end_yahoo = pd.Timestamp.today() - pd.Timedelta(days=1)
        start_yahoo = end_yahoo - pd.Timedelta(days=num_days)

        # Brazilian tickers carry the .SA suffix on Yahoo.
        ticker_yahoo = ticker.upper() + '.SA'

        try:
            yahoo_data = yf.download(ticker_yahoo, start=start_yahoo, end=end_yahoo)
        except Exception as e:
            print(f"Erro ao baixar dados do Yahoo para {ticker}: {e}")
            return None
        if yahoo_data is None or yahoo_data.empty:
            print(f"Não foi possível obter dados do Yahoo para {ticker}. Excluindo.")
            return None

        # Some yfinance releases return (field, ticker) MultiIndex columns
        # even for a single ticker; keep only the field level.
        if isinstance(yahoo_data.columns, pd.MultiIndex):
            yahoo_data = yahoo_data.copy()
            yahoo_data.columns = yahoo_data.columns.get_level_values(0)

        # Fill gaps per column on the monthly grid first, then carry each
        # month's values onto the trading days. An exact-label reindex would
        # drop every month whose first day is not a trading day and ignore
        # observations older than the price window.
        fundamentals = fundamentals.ffill()
        fundamentals = fundamentals.reindex(yahoo_data.index, method='ffill')
        # Trading days before the first observation take the earliest value;
        # a column with no data at all falls back to zero.
        fundamentals = fundamentals.bfill().fillna(0)

        # Restore the original column order (prices first) and add prices.
        processed_df = fundamentals.reindex(
            columns=[*_PRICE_COLUMNS, *fundamentals_cols])
        processed_df['PX_OPEN'] = yahoo_data['Open']
        processed_df['PX_HIGH'] = yahoo_data['High']
        processed_df['PX_LOW'] = yahoo_data['Low']
        processed_df['PX_LAST'] = yahoo_data['Close']
        processed_df['PX_VOLUME'] = yahoo_data['Volume']

        # Fundamentals are fully filled at this point, so any NaN left is a
        # gap in the downloaded prices; report it and leave it visible.
        na_count = processed_df.isna().sum().sum()
        if na_count > 0:
            print(f"Warning: Found {na_count} NA values in {file_path}")

        return processed_df

    except Exception as e:
        print(f"Error processing file {file_path}: {str(e)}")
        raise

def create_merged_dataset(input_folder='./data/', output_file='fundamentals.csv', num_days=365):
    """Process every CSV in a folder and write one merged, two-header CSV.

    Each '<ticker>.csv' in `input_folder` is passed to load_stock_data(); the
    resulting frames are concatenated side by side under the key
    '<ticker> Index'. The output file starts with two header rows (ticker
    key, then field name), each beginning with 'Dates', followed by the data
    rounded to two decimals. The layout is meant for a load_data() reader
    that is not part of this section.

    Args:
        input_folder: Folder searched (non-recursively) for '*.csv' files.
        output_file: Path of the merged CSV to write.
        num_days: Yahoo price window, in days back from yesterday, forwarded
            to load_stock_data().

    Returns:
        The merged DataFrame (index reset, values rounded to two decimals).

    Raises:
        ValueError: If no input CSV is found or no file could be processed.
    """
    # Sort for a reproducible column order (glob order depends on the file
    # system) and skip the output file in case it sits in the input folder.
    output_path = os.path.abspath(output_file)
    csv_files = sorted(
        path for path in glob.glob(os.path.join(input_folder, '*.csv'))
        if os.path.abspath(path) != output_path
    )
    all_data = {}

    if not csv_files:
        raise ValueError(f"No CSV files found in {input_folder}")

    print(f"Found {len(csv_files)} CSV files")

    skipped_files = 0
    for file in csv_files:
        ticker = os.path.basename(file).replace('.csv', '')
        try:
            print(f"\nProcessando {ticker}...")
            df = load_stock_data(file, num_days=num_days)
        except Exception as e:
            # One bad file must not abort the whole batch.
            print(f"Error processing {ticker} Index: {str(e)}")
            skipped_files += 1
            continue

        if df is None or df.empty:
            print(f"Ativo {ticker} excluído devido a dados insuficientes")
            skipped_files += 1
            continue

        all_data[f"{ticker} Index"] = df
        print(f"Successfully processed {ticker} Index")
        print(f"DataFrame shape: {df.shape}")
        print(f"Date range: {df.index.min()} to {df.index.max()}")

    if not all_data:
        raise ValueError("No data was successfully processed")

    print("\nResumo do processamento:")
    print(f"Total de arquivos encontrados: {len(csv_files)}")
    print(f"Arquivos processados com sucesso: {len(all_data)}")
    print(f"Arquivos excluídos: {skipped_files}")

    print("\nMerging all data...")
    merged_df = pd.concat(all_data, axis=1)

    # Move the date index into the first column, then round the values.
    merged_df.reset_index(inplace=True)
    merged_df = merged_df.round(2)

    # The two header rows are written by hand so the date column is labelled
    # 'Dates' on both levels.
    first_header = ['Dates'] + [col[0] for col in merged_df.columns[1:]]
    second_header = ['Dates'] + [col[1] for col in merged_df.columns[1:]]

    # newline='' keeps the platform line ending written by to_csv from being
    # translated a second time (doubled line endings on Windows).
    with open(output_file, 'w', encoding='utf-8', newline='') as f:
        f.write(','.join(first_header) + os.linesep)
        f.write(','.join(second_header) + os.linesep)
        merged_df.to_csv(f, index=False, header=False)

    print(f"\nProcessed data saved to {output_file}")
    print(f"Total stocks processed: {len(all_data)}")

    return merged_df


# In [None]:
# Then run the pipeline: process the CSVs in ./data/ and write fundamentals.csv,
# passing num_days=365 as the Yahoo price window.
create_merged_dataset(input_folder='./data/', output_file='fundamentals.csv', num_days=365)