# Week 04 Extraction Features Using Basic Machine Learning Methods

In [35]:
import pandas as pd
import numpy as np
import ta as ta

In [36]:
row_data_path = '../../data/row/'
processed_data_path = '../../data/processed/'
data = pd.read_csv(
    processed_data_path + 'features_NVDA.csv', index_col='date', parse_dates=True)

In [37]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import tree
import numpy as np

## Time Series Split

In [38]:
# Define features and target variable
X = data.drop(['close', '4. close'], axis=1)
y = data['close']

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)

## Gradient Boosting Regression

Train a Gradient Boosting Regression model within a cross-validation loop, compute RMSE for each split, and use PCA if necessary due to high dimensionality:

In [39]:
rmse_scores = []

for test_train, test_index in tscv.split(X_scaled):
    X_train, X_test = X_scaled[test_train], X_scaled[test_index]
    y_train, y_test = y.iloc[test_train], y.iloc[test_index]

    # Train the model
    model = GradientBoostingRegressor()
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Calculate RMSE and append to scores list
    rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))

print(f"Average RMSE: {np.mean(rmse_scores)}")

Average RMSE: 89.35665833609478


## Feature Importance and Model Comparison

After training, extract feature importance and consider removing low-importance features or testing other models for comparison:

In [40]:
feature_importance = model.feature_importances_
# Filter or modify features based on importance as needed
feature_names = X.columns

# Combine feature names and their importances into a DataFrame for easier analysis
importances_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importance})

# Sort the DataFrame by importance in descending order
importances_df.sort_values(by='Importance', ascending=False, inplace=True)

# Display the sorted importances
print("Feature Importances:")
print(importances_df)

Feature Importances:
                 Feature  Importance
8                   MA-5    0.891638
2                    low    0.042493
1                   high    0.023814
15                 EMA12    0.012268
0                   open    0.009404
9                  MA-30    0.006522
14                 SMA10    0.006427
20           Bollinger_U    0.002792
21           Bollinger_L    0.002222
13               z_score    0.001000
19           Bollinger_M    0.000672
12           Williams_%R    0.000380
7          daily_returns    0.000134
17                   RoC    0.000109
11        5-day_variance    0.000082
18                   K15    0.000012
22                 MOM12    0.000011
10                   RSI    0.000007
16                  MACD    0.000006
3                 volume    0.000005
6             log_volume    0.000004
5   8. split coefficient    0.000000
4     7. dividend amount    0.000000


## Reducing the Dimensions with Set Threshold

In [41]:
# Define a threshold below which features are considered of low importance
# For demonstration, using 0.01 (1%) as the threshold
threshold = 0.001

# Identify features with importance below the threshold
low_importance_features = importances_df[importances_df['Importance']
                                         < threshold]['Feature']

X_filtered1 = X.drop(columns=low_importance_features)

In [42]:
def test_train(X, y, model_func=GradientBoostingRegressor()):
    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Initialize TimeSeriesSplit
    tscv = TimeSeriesSplit(n_splits=5)

    rmse_scores = []

    for train_index, test_index in tscv.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train the model
        model = model_func
        model.fit(X_train, y_train)

        # Predict on the test set
        y_pred = model.predict(X_test)

        # Calculate RMSE and append to scores list
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))

    print(f"Average RMSE: {np.mean(rmse_scores)}")
    feature_importance = model.feature_importances_
    # Filter or modify features based on importance as needed
    feature_names = X.columns

    # Combine feature names and their importances into a DataFrame for easier analysis
    importances_df = pd.DataFrame(
        {'Feature': feature_names, 'Importance': feature_importance})

    # Sort the DataFrame by importance in descending order
    importances_df.sort_values(by='Importance', ascending=False, inplace=True)

    # Display the sorted importances
    print("Feature Importances:")
    print(importances_df)

In [43]:
test_train(X_filtered1, y=y)

Average RMSE: 90.57642509232794
Feature Importances:
       Feature  Importance
3         MA-5    0.893428
2          low    0.044031
1         high    0.024079
6        EMA12    0.010687
0         open    0.009076
4        MA-30    0.007242
5        SMA10    0.006507
7  Bollinger_U    0.002627
8  Bollinger_L    0.002323


## PCA for Dimensionality Reduction

If the feature dimension is too large, you might consider applying PCA before training:

In [44]:
# Apply PCA
# Retain 95% of variance or choose the number of components
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

# Transform PCA results into a DataFrame with meaningful column names
pca_columns = [f'PC{i+1}' for i in range(X_pca.shape[1])]
# Ensure to match the original index
X_pca_df = pd.DataFrame(X_pca, columns=pca_columns, index=X.index)

# Now X_pca_df can be used for further analysis or modeling
print(X_pca_df.head())

                  PC1       PC2       PC3       PC4       PC5       PC6  \
date                                                                      
2024-01-30  11.768584  3.795726  0.339991 -3.948547 -0.729412  0.065153   
2024-01-29  10.779546  4.329662  1.270286 -2.579493 -0.203480  0.054095   
2024-01-26  10.450967  4.519557  2.050041 -1.402683 -0.044201 -0.006534   
2024-01-25  10.279822  3.753403  0.368083 -3.313611  0.093792  0.195549   
2024-01-24  10.308375  3.704569  1.022974 -2.402368  0.040738  0.108888   

                 PC7       PC8       PC9  
date                                      
2024-01-30  6.303305  0.507095  0.546658  
2024-01-29  0.977580 -1.158515 -0.862616  
2024-01-26 -0.897449 -1.725984 -1.395414  
2024-01-25 -1.178938 -1.274401 -0.291720  
2024-01-24 -1.138463 -1.346010 -0.594912  


In [45]:
test_train(X=X_pca_df, y=y)

Average RMSE: 49.2893361527947
Feature Importances:
  Feature  Importance
0     PC1    0.950381
7     PC8    0.019594
2     PC3    0.018127
1     PC2    0.007970
6     PC7    0.002331
3     PC4    0.000962
8     PC9    0.000289
4     PC5    0.000177
5     PC6    0.000170


## Encapsulating the Process
Finally, encapsulate your data preprocessing, model training, and prediction steps into a function for reuse:

In [46]:
def add_features(data):
    data = data.rename(columns={"1. open": "open",
                                "2. high": "high",
                                "3. low": "low",
                                "5. adjusted close": "close",
                                "6. volume": "volume"})
    data.dropna(inplace=True)

    data['log_volume'] = np.log(data['volume'])

    data['daily_returns'] = data['close'].diff()

    data['MA-5'] = data['close'].rolling(window=5).mean()

    data['MA-30'] = data['close'].rolling(window=30).mean()

    RSI = ta.momentum.RSIIndicator(data['close'], window=14)
    data['RSI'] = RSI.rsi()

    data['5-day_variance'] = data['close'].rolling(window=5).var()

    WILLR = ta.momentum.WilliamsRIndicator(high=data['high'],
                                           low=data['low'],
                                           close=data['close'],
                                           lbp=14)
    data['Williams_%R'] = WILLR.williams_r()

    data['z_score'] = (data['close'] - data['close'].rolling(window=10).mean()
                       ) / data['close'].rolling(window=10).std()

    data['SMA10'] = data['close'].rolling(window=10).mean()

    data['EMA12'] = data['close'].ewm(span=12, adjust=False).mean()

    MACD = ta.trend.MACD(close=data['close'], window_fast=12, window_slow=26)
    data['MACD'] = MACD.macd()

    RoC = ta.momentum.ROCIndicator(close=data['close'], window=1)
    data['RoC'] = RoC.roc()

    low_min = data['low'].rolling(window=15).min()
    high_max = data['high'].rolling(window=15).max()
    data['K15'] = ((data['close'] - low_min) /
                   (high_max - low_min)) * 100

    data['Bollinger_M'] = data['close'].rolling(window=20).mean()
    data['Bollinger_U'] = data['Bollinger_M'] + \
        2 * data['close'].rolling(window=20).std()
    data['Bollinger_L'] = data['Bollinger_M'] - \
        2 * data['close'].rolling(window=20).std()

    data['MOM12'] = data['close'] - data['close'].shift(12)

    data.dropna(inplace=True)
    return data

def pca_reduction(data):
    # Define features and target variable
    X = data.drop(['close', '4. close'], axis=1)
    y = data['close']
    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Retain 95% of variance or choose the number of components
    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)
    return X_pca, y
    
def train_and_predict(X_pca, y, model_func=GradientBoostingRegressor()):    
    X_scaled = scaler.fit_transform(X_pca)

    # Initialize TimeSeriesSplit
    tscv = TimeSeriesSplit(n_splits=5)

    rmse_scores = []

    for train_index, test_index in tscv.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train the model
        model = model_func
        model.fit(X_train, y_train)

        # Predict on the test set
        y_pred = model.predict(X_test)

        # Calculate RMSE and append to scores list
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))

    print(f"Average RMSE: {np.mean(rmse_scores)}")
    return model

In [28]:
def financial_forecast_pipeline(data, model_func=GradientBoostingRegressor()):
    data_more_features = add_features(data)
    X_pca, y = pca_reduction(data=data_more_features)
    model = train_and_predict(X_pca=X_pca, y=y, model_func=model_func)
    return model

In [30]:
NVDA_5_years_daily_data = pd.read_csv(
    row_data_path + 'NVDA_5_years_daily_data.csv', index_col='date', parse_dates=True)

In [47]:
model_NVDA = financial_forecast_pipeline(NVDA_5_years_daily_data,
                                         model_func=GradientBoostingRegressor())

Average RMSE: 49.27153117355907


In [None]:
financial_forecast_pipeline(NVDA_5_years_daily_data,
                            model_func=LinearRegression())

Average RMSE: 62.90965647670862


In [None]:
financial_forecast_pipeline(NVDA_5_years_daily_data,
                            model_func=tree.DecisionTreeRegressor())

Average RMSE: 48.371460059514895


## Extra Data

In [53]:
extra_stickers = ['AAPL', 'INTC', 'AMD', 'TSM', 'QCOM', 'AVGO', 'SNPS', 'CRUS',
                  'META', 'AMZN', 'MSFT', 'GOOGL', 'MU', 'MRVL', 'NXPI', 'STM', 'HPQ', 'IBM', 'CSCO']

row_data_path = '../../data/row/'
years = 5

In [None]:

from alpha_vantage.timeseries import TimeSeries
from alpha_vantage.fundamentaldata import FundamentalData
import matplotlib.pyplot as plt
import datetime
import os

In [54]:

def get_daily_data(symbol, years=5):
    premium_api_key = "E8YTXNGHL6A0A5L9"
    ts = TimeSeries(key=premium_api_key, output_format='pandas')

    # Get historical stock data for symbol
    daily_adjusted_data, _ = ts.get_daily_adjusted(
        symbol=symbol, outputsize='full')

    # Filter the last n years
    now = pd.Timestamp.now()
    start_date = now - pd.DateOffset(years=years)
    data_last_n_years = daily_adjusted_data.loc[:start_date]
    # data_last_n_years['5. adjusted close'].plot(
    #     title= symbol + ' Adjusted Close Price Over the Last ' + str(years) + ' Years')
    # plt.show()
    return data_last_n_years
    
def get_and_save_daily_data(symbol, years=5):
    data_last_n_years = get_daily_data(symbol, years)
    # today_str = datetime.datetime.now().strftime("%Y%m%d")
    csv_file_path = os.path.join(
        row_data_path, f'{symbol}_{years}_years_daily_data.csv')
    # csv_file_path = os.path.join(
    #     row_data_path, f'{symbol}_5_years_daily_data_until_{today_str}.csv')
    data_last_n_years.to_csv(csv_file_path)

    print(f"Data saved to {csv_file_path}")
    

In [55]:
for sticker in extra_stickers:
    get_and_save_daily_data(sticker, years=years)
    

Data saved to ../../data/row/AAPL_5_years_daily_data.csv
Data saved to ../../data/row/INTC_5_years_daily_data.csv
Data saved to ../../data/row/AMD_5_years_daily_data.csv
Data saved to ../../data/row/TSM_5_years_daily_data.csv
Data saved to ../../data/row/QCOM_5_years_daily_data.csv
Data saved to ../../data/row/AVGO_5_years_daily_data.csv
Data saved to ../../data/row/SNPS_5_years_daily_data.csv
Data saved to ../../data/row/CRUS_5_years_daily_data.csv
Data saved to ../../data/row/META_5_years_daily_data.csv
Data saved to ../../data/row/AMZN_5_years_daily_data.csv
Data saved to ../../data/row/MSFT_5_years_daily_data.csv
Data saved to ../../data/row/GOOGL_5_years_daily_data.csv
Data saved to ../../data/row/MU_5_years_daily_data.csv
Data saved to ../../data/row/MRVL_5_years_daily_data.csv
Data saved to ../../data/row/NXPI_5_years_daily_data.csv
Data saved to ../../data/row/STM_5_years_daily_data.csv
Data saved to ../../data/row/HPQ_5_years_daily_data.csv
Data saved to ../../data/row/IBM_5_y

In [57]:
def get_generalized_model(extra_stickers, model=GradientBoostingRegressor(), years=years):
    for sticker in extra_stickers:
        csv_file_path = os.path.join(
            row_data_path, f'{sticker}_{years}_years_daily_data.csv')
        data = pd.read_csv(csv_file_path, index_col='date', parse_dates=True)
        model = financial_forecast_pipeline(data=data, model_func=model)
    return model


get_generalized_model(extra_stickers=extra_stickers,
                    model=model_NVDA, years=years)

Average RMSE: 22.075409778075986
Average RMSE: 2.1796013806108743
Average RMSE: 8.346300999566463
Average RMSE: 6.220393891797426
Average RMSE: 8.68331814361168
Average RMSE: 59.369540211456275
Average RMSE: 32.12260492903165
Average RMSE: 4.1827059085087175
Average RMSE: 22.785643972594027
Average RMSE: 9.26601443406506
Average RMSE: 20.60355248292143
Average RMSE: 9.73306018824199
Average RMSE: 2.709542693148882
Average RMSE: 3.2298623068524477
Average RMSE: 9.232079841622028
Average RMSE: 2.353037357570634
Average RMSE: 1.4009811833127057
Average RMSE: 3.882987235320262
Average RMSE: 1.2998923022634807
