### Overview
This project aims to build a cutting-edge, machine learning-powered framework for asset return forecasting, risk prediction, and portfolio optimization. Combining rigorous quantitative finance techniques with modern data science workflows, the system will leverage market data and advanced ML models to enhance risk-adjusted returns and demonstrate state-of-the-art portfolio management analytics.
___

### Objectives
1) Predict asset returns or volatility using machine learning techniques
2) Engineer informative financial features from historical price, volume, and macroeconomic data
3) Construct and evaluate robust, data-driven portfolio allocation strategies
4) Quantify and visualize portfolio risks using industry-standard metrics (e.g., VaR, Sharpe ratio, drawdown)
5) Provide fully reproducible, commented code and a comprehensive walkthrough of all methodologies.
___

### Workflow
1) Data Collection: Source and preprocess historical market and macroeconomic data for selected assets.
2) Exploratory Data Analysis: Investigate statistical properties, visualize trends, and identify key drivers.
3) Feature Engineering: Extract and construct relevant features (technical indicators, rolling stats, etc.).

##### Machine Learning Modeling:

1) Select regression or classification models (Linear Regression, Random Forest, XGBoost/LSTM, etc.)
2) Tune and train models for forecasting returns/volatility.

##### Backtesting and Evaluation:
1) Simulate predictions and portfolio allocation over out-of-sample test sets.
2) Assess performance using financial metrics and realistic constraints.

##### Portfolio Optimization:
1) Implement optimization routines to construct portfolios that maximize risk-adjusted returns.
2) Integrate ML predictions into allocation strategies.

##### Visualization and Reporting:
1) Generate plots (prediction vs. actual, cumulative returns, feature importance)
2) Summarize key results and business implications

##### Documentation:
Ensure all code sections and methodology are clearly explained for full reproducibility and understanding.


In [None]:
# Import necessary libraries

# Data handling and manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning and modeling
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Advanced models (if needed later)
import xgboost as xgb
import lightgbm as lgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# For feature scaling
from sklearn.preprocessing import StandardScaler

# For saving/loading models and saving folders
import pickle
import joblib as jb
import os 

# For downloading data
import pandas_datareader as pdr


# Create the output folder that later cells write CSVs and figures into;
# exist_ok=True makes re-running this cell harmless.
os.makedirs('Results', exist_ok=True)

In [None]:
# Step 1: gather the relevant data and process it for model training and development

from datetime import datetime

# Define tickers and date range
tickers = ['MSFT','AAPL','SPY','AMZN','TSLA','NVDA']
data_frames = []
start_date = datetime(2010, 1, 1)
# NOTE(review): using today's date means every run downloads a different sample;
# pin end_date if results must be reproducible.
end_date = datetime.today()

for ticker in tickers:
    df = pdr.get_data_stooq(ticker, start=start_date, end=end_date)  # OHLCV data
    if df.empty:
        # Fail here with a clear message instead of with a KeyError in a later cell.
        raise ValueError(f'No price data returned for {ticker}')
    # Put rows in chronological order. Stooq is commonly reported to return
    # newest-first data, and every later shift()/rolling()/ewm()/cumsum() call
    # assumes ascending dates.
    df = df.sort_index()
    # Two-level columns: (ticker, field)
    df.columns = pd.MultiIndex.from_product([[ticker], df.columns])
    data_frames.append(df)

# Join all tickers side by side on the date index, then sort columns by
# (ticker, field) and rows by date so the combined frame is ordered on both axes.
Data = pd.concat(data_frames, axis=1).sort_index(axis=1).sort_index(axis=0)


In [None]:

# Find every (ticker, field) column that has at least one missing value.
null_cols = [column for column in Data.columns if Data[column].isna().any()]

# Fill the gaps with time-weighted interpolation, then forward/backward fill
# whatever interpolation leaves unfilled. (`degree` is not an option of
# method='time', so it is no longer passed.)
# NOTE(review): bfill() copies the first known value backwards in time, so rows
# before a ticker's first observation are filled with later data. Confirm this
# is acceptable for back-testing.
if null_cols:
    Data.loc[:, null_cols] = (
        Data.loc[:, null_cols].interpolate(method='time').ffill().bfill()
    )



In [None]:
# Now performing the EDA and feature engineering to observe the trends and the important features

# Feature Engineering
# Names of the features planned for this study.
# NOTE(review): nothing in the visible code reads this list, and several entries
# (e.g. 'SMA(50)', 'Rolling_Skewness', the calendar fields) are not produced by
# the feature loop that follows. Confirm whether it is still needed.
Feature_list = ['Pct_returns','Log_Returns','Rolling_Mean_returns','Rolling_STD','Rolling_Skewness','Rolling_Kurtosis',
                'SMA(20)','SMA(50)','EMA','MACD','RSI','BOLLINGER_BANDS',
                'Day_of_Week','Month','Year','Week_of_Year','Day_of_Year','Weekend','Holiday',
                'Rolling_Correlations'
                ]


import numpy as np
import pandas as pd

def compute_rsi(series, window=14):
    """Return the Relative Strength Index of ``series``.

    Gains and losses are averaged with a plain rolling mean over ``window``
    observations, so the first ``window`` values of the result are NaN.

    Args:
        series: Price series (pandas Series).
        window: Number of observations in each rolling average.

    Returns:
        A Series of RSI values aligned with ``series``.
    """
    change = series.diff()
    # Split each change into its upward part and the size of its downward part.
    ups = change.clip(lower=0)
    downs = -1 * change.clip(upper=0)
    relative_strength = (
        ups.rolling(window=window).mean() / downs.rolling(window=window).mean()
    )
    return 100 - (100 / (1 + relative_strength))

def compute_macd(series, fast=12, slow=26, signal=9):
    """Return the MACD line, its signal line and the histogram for ``series``.

    Args:
        series: Price series (pandas Series).
        fast: Span of the fast exponential moving average.
        slow: Span of the slow exponential moving average.
        signal: Span of the exponential moving average applied to the MACD line.

    Returns:
        A tuple ``(macd, signal_line, histogram)`` of Series aligned with
        ``series``, where ``histogram`` is ``macd - signal_line``.
    """
    def _ema(values, span):
        # Recursive (adjust=False) exponential moving average.
        return values.ewm(span=span, adjust=False).mean()

    macd_line = _ema(series, fast) - _ema(series, slow)
    trigger = _ema(macd_line, signal)
    return macd_line, trigger, macd_line - trigger

def compute_bollinger_bands(series, window=20, num_std=2):
    """Return the upper and lower Bollinger Bands of ``series``.

    The bands sit ``num_std`` rolling standard deviations above and below the
    rolling mean. Both are NaN until ``window`` observations are available.

    Args:
        series: Price series (pandas Series).
        window: Number of observations in the rolling mean and deviation.
        num_std: Band half-width in standard deviations. Defaults to 2, the
            width that used to be hard-coded.

    Returns:
        A tuple ``(upper_band, lower_band)`` of Series aligned with ``series``.
    """
    rolling = series.rolling(window=window)
    middle = rolling.mean()
    half_width = num_std * rolling.std()
    return middle + half_width, middle - half_width

# Indicators that are not written back onto Data, keyed by (ticker, indicator name).
technical_features = {}

for ticker in Data.columns.get_level_values(0).unique():
    ohlcv = Data[ticker]
    close = ohlcv['Close']
    high = ohlcv['High']
    low = ohlcv['Low']
    volume = ohlcv['Volume']
    prev_close = close.shift(1)

    # Return-based columns are stored on Data itself, so they also reach Final_Data.
    Data[(ticker, 'log_returns')] = np.log(close / prev_close)
    Data[(ticker, 'Pct_change')] = (close - prev_close) / prev_close
    Data[(ticker, 'rolling_logvol_30')] = Data[(ticker, 'log_returns')].rolling(window=30).std()

    macd, signal_line, hist = compute_macd(close)
    upper_band, lower_band = compute_bollinger_bands(close)

    # True range: the largest of the day's high-low range and the two gaps
    # between the previous close and today's high / low.
    true_range = pd.concat(
        [high - low, (high - prev_close).abs(), (low - prev_close).abs()],
        axis=1,
    ).max(axis=1)

    # Sign of the daily close change; used to add or subtract volume in OBV.
    direction = np.sign(close.diff())

    technical_features.update({
        # RSI (14-day)
        (ticker, 'RSI_14'): compute_rsi(close),
        # MACD
        (ticker, 'MACD'): macd,
        (ticker, 'MACD_Signal'): signal_line,
        (ticker, 'MACD_Histogram'): hist,
        # Bollinger Bands
        (ticker, 'Bollinger_Upper'): upper_band,
        (ticker, 'Bollinger_Lower'): lower_band,
        # SMA and EMA
        (ticker, 'SMA_20'): close.rolling(window=20).mean(),
        (ticker, 'EMA_20'): close.ewm(span=20, adjust=False).mean(),
        # Average True Range (ATR) - volatility measure
        (ticker, 'ATR_14'): true_range.rolling(window=14).mean(),
        # On Balance Volume (OBV) - volume momentum
        (ticker, 'OBV'): (direction * volume).fillna(0).cumsum(),
    })

# Turn the indicator dict into a frame with (ticker, feature) columns, sorted on both levels.
tech_indicators_df = pd.DataFrame(technical_features).sort_index(axis=1, level=[0, 1])

# The loop appended new columns to the end of Data; restore (ticker, field) order.
Data.sort_index(axis=1, level=[0, 1], inplace=True)

# Raw prices, return columns and indicators side by side, persisted for later steps.
Final_Data = pd.concat([Data, tech_indicators_df], axis=1).sort_index(axis=1, level=[0, 1])
Final_Data.to_csv('Results/Final_Data.csv')

In [None]:
# Now we do EDA and then feature selection, i.e. identify which features are important for our model

# Let's draw pair plots for the DataFrame to see the important relations between features


# Draw and save one pairplot per ticker.
for ticker in tickers:
    # sns.pairplot() has no `title` keyword (passing one raises TypeError), so
    # the title is set on the figure that pairplot has just created.
    sns.pairplot(Data[ticker], diag_kind='kde')
    plt.suptitle(f'Pairplot for {ticker}', y=1.02)
    plt.tight_layout()
    # bbox_inches='tight' keeps the title, which sits just above the axes area, inside the image.
    plt.savefig(f'Results/{ticker}_pairplot.png', bbox_inches='tight')
    # Release the figure so six large grids do not stay open in memory.
    plt.close()


In [None]:
# Correlation of every feature with the ticker's log returns and with its
# 30-day rolling volatility, plus the ten most strongly related features for each.
# NOTE(review): the correlations use the same-day target and the full sample,
# while the model frames built later use the next-day (shift(-1)) target.
# Confirm the selection should not be done against the shifted target on
# training data only.
corr_matrix_dict_log_returns = {}
corr_matrix_dict_rolling_logvol_30 = {}
Features_log_returns = {}
Features_rolling_logvol_30 = {}

for ticker in tickers:
    ticker_frame = Final_Data[ticker]

    # Signed correlations, most positive first. The target itself and its
    # near-duplicate are dropped from the candidates.
    corr_matrix_dict_log_returns[ticker] = (
        ticker_frame.drop(columns=['log_returns', 'Pct_change'])
        .dropna()
        .corrwith(Final_Data[(ticker, 'log_returns')])
        .sort_values(ascending=False)
    )
    corr_matrix_dict_rolling_logvol_30[ticker] = (
        ticker_frame.drop(columns=['log_returns', 'rolling_logvol_30'])
        .dropna()
        .corrwith(Final_Data[(ticker, 'rolling_logvol_30')])
        .sort_values(ascending=False)
    )

    # Rank by absolute correlation: a strongly negative relationship is as
    # informative as a strongly positive one, and a signed sort would never pick it.
    Features_log_returns[ticker] = list(
        corr_matrix_dict_log_returns[ticker].abs().sort_values(ascending=False).head(10).index
    )
    Features_rolling_logvol_30[ticker] = list(
        corr_matrix_dict_rolling_logvol_30[ticker].abs().sort_values(ascending=False).head(10).index
    )




In [75]:
## Create the final per-ticker data frames (selected features plus the next-day return / rolling-volatility target) used to train and test the models

def _next_step_frame(symbol, selected, target, *, only_where_target_known=False):
    """Return one ticker's selected features joined with its next-row target.

    Args:
        symbol: Ticker whose columns are taken from Final_Data.
        selected: Mapping of ticker -> list of feature column names.
        target: Column whose value on the following row becomes the label.
        only_where_target_known: If True, feature rows are limited to rows
            where ``target`` itself is not NaN before the join.

    Returns:
        A DataFrame of features plus the shifted target with incomplete rows dropped.
    """
    block = Final_Data[symbol]
    rows = block.loc[block[target].notna()] if only_where_target_known else block
    # shift(-1) pairs each row's features with the target of the row after it.
    return pd.concat([rows[selected[symbol]], block[target].shift(-1)], axis=1).dropna()


_MODEL_TICKERS = ('AAPL', 'MSFT', 'AMZN', 'SPY', 'TSLA', 'NVDA')

# Next-day log-return frames, one per ticker.
(Model_AApl_ret, Model_MSFT_ret, Model_AMZN_ret,
 Model_SPY_ret, Model_TSLA_ret, Model_NVDA_ret) = (
    _next_step_frame(symbol, Features_log_returns, 'log_returns')
    for symbol in _MODEL_TICKERS
)

# Next-day rolling-volatility frames, one per ticker.
(Model_AApl_roll, Model_MSFT_roll, Model_AMZN_roll,
 Model_SPY_roll, Model_TSLA_roll, Model_NVDA_roll) = (
    _next_step_frame(symbol, Features_rolling_logvol_30, 'rolling_logvol_30',
                     only_where_target_known=True)
    for symbol in _MODEL_TICKERS
)

os.makedirs('Results/Model_Data', exist_ok=True)

# Write each model frame exactly once, using its variable name as the file stem.
# None of the writes depends on a ticker loop variable, so looping over
# `tickers` only repeated the same twelve writes once per ticker.
model_frames = {
    'Model_AApl_ret': Model_AApl_ret,
    'Model_MSFT_ret': Model_MSFT_ret,
    'Model_AMZN_ret': Model_AMZN_ret,
    'Model_SPY_ret': Model_SPY_ret,
    'Model_TSLA_ret': Model_TSLA_ret,
    'Model_NVDA_ret': Model_NVDA_ret,
    'Model_AApl_roll': Model_AApl_roll,
    'Model_MSFT_roll': Model_MSFT_roll,
    'Model_AMZN_roll': Model_AMZN_roll,
    'Model_SPY_roll': Model_SPY_roll,
    'Model_TSLA_roll': Model_TSLA_roll,
    'Model_NVDA_roll': Model_NVDA_roll,
}
for frame_name, frame in model_frames.items():
    frame.to_csv(f'Results/Model_Data/{frame_name}.csv')

In [None]:
'''
Now we perform the machine learning part, where we create a pipeline for each ticker and select a model for each ticker.
We will use the metrics to score the outcomes of our models.
We will use a linear regressor and a random forest regressor.

'''
