In [101]:
import pandas as pd 
import yfinance as yf
import numpy as np
from scipy.stats import kurtosis, skew
import openpyxl


In [102]:
def fetch_yahoo_data(ticker: str, period: str = "1y", interval: str = "1d") -> pd.DataFrame:
    """
    Busca dados históricos de OHLCV para o ticker especificado no Yahoo Finance.

    Parâmetros:
    - ticker: código do ativo (ex: "VALE3.SA" para ações brasileiras)
    - period: período de dados (ex: "180d", "1y", "2y")
    - interval: intervalo de tempo (ex: "1d", "1wk", "1mo")

    Retorna:
    - DataFrame com colunas: Open, High, Low, Close, Adj Close e Volume
    """
    df = yf.download(ticker,
                     period=period,
                     interval=interval,
                     auto_adjust=True)  # ajusta splits/dividends
    return df


In [103]:
weights = {

    "ABEV3.SA": 0.25,
    "B3SA3.SA": 0.25,
    "JNJ": 0.25,
    "V": 0.25


}
tickers = weights.keys()
time = "30y"
risk_threshold = -0.01

In [104]:
def fetch_data(ticker):
    df = fetch_yahoo_data(ticker, period=time, interval="1d")
    df = df.xs(ticker, axis=1, level='Ticker')

    # PnL diário (retorno)
    df[f"pnl_{ticker}"] = df['Close'].pct_change()

    # Volume relativo
    df[f"volrel_{ticker}"] = df['Volume'] / df['Volume'].rolling(window=10).mean()

    # Amplitude média (High - Low)
    df[f"amp_{ticker}"] = (df["High"] - df["Low"]).rolling(window=10).mean()

    # Gaps > 1%
    gaps = (df["Open"].shift(-1) - df["Close"]) / df["Close"]
    df[f"gaps_{ticker}"] = gaps.rolling(window=10).apply(lambda x: (np.abs(x) > 0.01).sum(), raw=True)

    # Manter apenas colunas relevantes
    metric_cols = [col for col in df.columns if col.startswith((
        'pnl_', 
        'volrel_', 
        # 'amp_', 
        # 'gaps_'
        ))]
    df = df.dropna(subset=metric_cols)

    return df[metric_cols]


### Métricas

In [105]:
# pnl acumulado em janela de 5 dias
def calculate_metrics(df):
    # label de classificacao de risco
    df["pnl_5d"] = (
        (1 + df["PnL"])[::-1]
        .rolling(window=5)
        .apply(lambda x: np.prod(x) - 1, raw=True)
    )[::-1]
    df['risk'] = df.apply(lambda x: True if x['pnl_5d'] < risk_threshold else False, axis=1)
    df = df.drop(columns=["pnl_5d"]) 

    # pnl acumulado dos últimos 5 dias
    df["pnl_cum"] = df["PnL"].rolling(window=5).apply(np.prod)

    # volatility
    df['volatility'] = df['PnL'].rolling(window=20).std()

    # skewness
    df['skewness'] = df['PnL'].rolling(window=20).apply(lambda x: skew(x), raw=False)

    # kurtosis
    df["kurtosis"] = df["PnL"].rolling(window=20).apply(lambda x: kurtosis(x), raw=False)

    # sharpe ratio
    df["ret_medio"] = df["PnL"].rolling(window=20).mean()
    df['sharpe'] = df["ret_medio"] / df["volatility"]
    df = df.drop(columns=["ret_medio"])
    return df

In [106]:
portfolio = pd.DataFrame()
for ticker in tickers:
    data = fetch_data(ticker)
    portfolio = pd.concat([portfolio, data], axis=1)
    portfolio = portfolio.dropna()
portfolio
pnl_cols = [col for col in portfolio.columns if col.startswith('pnl_')]
portfolio['PnL'] = portfolio[pnl_cols].apply(lambda x: x.mul(weights[x.name.split('_')[1]]), axis=0).sum(axis=1)
portfolio = portfolio.drop(columns=pnl_cols)
portfolio = calculate_metrics(portfolio) 
portfolio = portfolio.dropna()


volrel_cols = [col for col in portfolio.columns if col.startswith('volrel_')]
for i in range (1,len(volrel_cols)+1):
    portfolio[f'volrel.{i}'] = portfolio[volrel_cols[i-1]]
for i in range(1,8):
    portfolio[f'PnL.{i}'] = portfolio['PnL']
for i in range(1,3):
    portfolio[f'sharpe.{i}'] = portfolio['sharpe']
portfolio = portfolio.reset_index()
portfolio = portfolio.drop(columns=['pnl_cum','PnL','sharpe'] + volrel_cols)
portfolio

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Price,Date,risk,volatility,skewness,kurtosis,volrel.1,volrel.2,volrel.3,volrel.4,PnL.1,PnL.2,PnL.3,PnL.4,PnL.5,PnL.6,PnL.7,sharpe.1,sharpe.2
0,2008-04-30,False,0.020476,1.155309,0.927970,1.932149,2.533253,1.296115,1.663399,0.060159,0.060159,0.060159,0.060159,0.060159,0.060159,0.060159,0.188182,0.188182
1,2008-05-02,False,0.021545,0.899023,0.092024,0.562370,2.907604,1.171816,1.348305,0.034293,0.034293,0.034293,0.034293,0.034293,0.034293,0.034293,0.258390,0.258390
2,2008-05-05,True,0.020489,1.160829,0.930849,0.302640,0.693627,0.969898,0.686597,-0.001037,-0.001037,-0.001037,-0.001037,-0.001037,-0.001037,-0.001037,0.185518,0.185518
3,2008-05-06,True,0.020671,1.200535,0.926770,1.119575,0.641999,0.902889,0.720533,-0.008452,-0.008452,-0.008452,-0.008452,-0.008452,-0.008452,-0.008452,0.157462,0.157462
4,2008-05-07,False,0.020429,1.144365,0.922023,0.637500,0.763918,1.179498,0.899645,0.005041,0.005041,0.005041,0.005041,0.005041,0.005041,0.005041,0.196588,0.196588
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4131,2025-05-30,False,0.008026,0.395807,1.048085,2.274431,2.247632,1.782407,1.571228,0.001385,0.001385,0.001385,0.001385,0.001385,0.001385,0.001385,0.149203,0.149203
4132,2025-06-02,False,0.008087,0.395462,0.962537,0.891376,1.171949,0.777799,0.718158,-0.006044,-0.006044,-0.006044,-0.006044,-0.006044,-0.006044,-0.006044,0.139218,0.139218
4133,2025-06-03,False,0.008183,0.198134,0.716137,1.114340,1.075046,1.009621,0.972503,0.009161,0.009161,0.009161,0.009161,0.009161,0.009161,0.009161,0.217905,0.217905
4134,2025-06-04,False,0.008256,0.245712,0.614005,1.107681,0.803351,0.862706,0.798569,-0.003286,-0.003286,-0.003286,-0.003286,-0.003286,-0.003286,-0.003286,0.194932,0.194932


In [107]:
portfolio.to_parquet('portfolio5.parquet')

In [108]:
portfolios = [pd.read_parquet(f'portfolio{i}.parquet') for i in range(0,6)]
test_portfolio = pd.concat(portfolios)
test_portfolio.to_parquet('test_portfolio.parquet')


In [9]:
trues = portfolio[portfolio['risk'] == True]
falses = portfolio[portfolio['risk'] == False]
portfolio = pd.concat([trues, falses.iloc[:2528]])
portfolio = portfolio.drop(columns=['pnl_cum'])
portfolio.to_parquet('portfolio.parquet')

In [None]:
portfolio = pd.read_parquet('portfolio.parquet')
portfolio

In [11]:
portfolio.to_excel('portfolio.xlsx')

In [None]:
df = pd.read_excel('portfolio6.xlsx')
df.to_parquet('portfolio6.parquet')
df

In [None]:
trues = portfolio[portfolio['risk'] == True]
falses = portfolio[portfolio['risk'] == False]
print(len(trues), len(falses))
