# Week 04 Extraction Features Using Basic Machine Learning Methods

In [2]:
import pandas as pd
import numpy as np
import ta as ta

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:
row_data_path = '../../data/row/'
processed_data_path = '../../data/processed/'
data = pd.read_csv(
    processed_data_path + 'features_NVDA.csv', index_col='date', parse_dates=True)

In [6]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import tree
import numpy as np

## Time Series Split

In [7]:
# Define features and target variable
X = data.drop(['close', '4. close'], axis=1)
y = data['close']

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)

## Gradient Boosting Regression

Train a Gradient Boosting Regression model within a cross-validation loop, compute RMSE for each split, and use PCA if necessary due to high dimensionality:

In [8]:
rmse_scores = []

for test_train, test_index in tscv.split(X_scaled):
    X_train, X_test = X_scaled[test_train], X_scaled[test_index]
    y_train, y_test = y.iloc[test_train], y.iloc[test_index]

    # Train the model
    model = GradientBoostingRegressor()
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Calculate RMSE and append to scores list
    rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))

print(f"Average RMSE: {np.mean(rmse_scores)}")

Average RMSE: 82.95039864817232


## Feature Importance and Model Comparison

After training, extract feature importance and consider removing low-importance features or testing other models for comparison:

In [11]:
feature_importance = model.feature_importances_
# Filter or modify features based on importance as needed
feature_names = X.columns

# Combine feature names and their importances into a DataFrame for easier analysis
importances_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importance})

# Sort the DataFrame by importance in descending order
importances_df.sort_values(by='Importance', ascending=False, inplace=True)

# Display the sorted importances
print("Feature Importances:")
print(importances_df)

Feature Importances:
                 Feature  Importance
8                   MA-5    0.890773
2                    low    0.042797
1                   high    0.023779
15                 EMA12    0.011197
0                   open    0.009313
20           Bollinger_U    0.006534
9                  MA-30    0.006205
14                 SMA10    0.005310
21           Bollinger_L    0.001665
13               z_score    0.001000
19           Bollinger_M    0.000680
12           Williams_%R    0.000381
7          daily_returns    0.000134
17                   RoC    0.000109
11        5-day_variance    0.000081
22                 MOM12    0.000011
18                   K15    0.000011
10                   RSI    0.000007
16                  MACD    0.000006
3                 volume    0.000005
6             log_volume    0.000003
5   8. split coefficient    0.000000
4     7. dividend amount    0.000000


## Reducing the Dimensions with Set Threshold

In [10]:
# Define a threshold below which features are considered of low importance
# For demonstration, using 0.01 (1%) as the threshold
threshold = 0.001

# Identify features with importance below the threshold
low_importance_features = importances_df[importances_df['Importance']
                                         < threshold]['Feature']

X_filtered1 = X.drop(columns=low_importance_features)

In [None]:
def test_train(X, y, model_func=GradientBoostingRegressor()):
    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Initialize TimeSeriesSplit
    tscv = TimeSeriesSplit(n_splits=5)

    rmse_scores = []

    for train_index, test_index in tscv.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train the model
        model = model_func
        model.fit(X_train, y_train)

        # Predict on the test set
        y_pred = model.predict(X_test)

        # Calculate RMSE and append to scores list
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))

    print(f"Average RMSE: {np.mean(rmse_scores)}")
    feature_importance = model.feature_importances_
    # Filter or modify features based on importance as needed
    feature_names = X.columns

    # Combine feature names and their importances into a DataFrame for easier analysis
    importances_df = pd.DataFrame(
        {'Feature': feature_names, 'Importance': feature_importance})

    # Sort the DataFrame by importance in descending order
    importances_df.sort_values(by='Importance', ascending=False, inplace=True)

    # Display the sorted importances
    print("Feature Importances:")
    print(importances_df)

In [None]:
test_train(X_filtered1, y=y)

Average RMSE: 91.02298686390976
Feature Importances:
       Feature  Importance
3         MA-5    0.893919
2          low    0.044035
1         high    0.023748
6        EMA12    0.010929
0         open    0.009332
4        MA-30    0.006421
7  Bollinger_U    0.006370
5        SMA10    0.004512
8  Bollinger_L    0.000734


## PCA for Dimensionality Reduction

If the feature dimension is too large, you might consider applying PCA before training:

In [None]:
# Apply PCA
# Retain 95% of variance or choose the number of components
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

# Transform PCA results into a DataFrame with meaningful column names
pca_columns = [f'PC{i+1}' for i in range(X_pca.shape[1])]
# Ensure to match the original index
X_pca_df = pd.DataFrame(X_pca, columns=pca_columns, index=X.index)

# Now X_pca_df can be used for further analysis or modeling
print(X_pca_df.head())

                  PC1       PC2       PC3       PC4       PC5       PC6  \
date                                                                      
2024-01-30  11.768584  3.795726  0.339991 -3.948547 -0.729412  0.065153   
2024-01-29  10.779546  4.329662  1.270286 -2.579493 -0.203480  0.054095   
2024-01-26  10.450967  4.519557  2.050041 -1.402683 -0.044201 -0.006534   
2024-01-25  10.279822  3.753403  0.368083 -3.313611  0.093792  0.195549   
2024-01-24  10.308375  3.704569  1.022974 -2.402368  0.040738  0.108888   

                 PC7       PC8       PC9  
date                                      
2024-01-30  6.303305  0.507095  0.546658  
2024-01-29  0.977580 -1.158515 -0.862616  
2024-01-26 -0.897449 -1.725984 -1.395414  
2024-01-25 -1.178938 -1.274401 -0.291720  
2024-01-24 -1.138463 -1.346010 -0.594912  


In [None]:
test_train(X=X_pca_df, y=y)

Average RMSE: 49.18003227590566
Feature Importances:
  Feature  Importance
0     PC1    0.950407
7     PC8    0.019602
2     PC3    0.018132
1     PC2    0.007926
6     PC7    0.002335
3     PC4    0.000964
8     PC9    0.000290
4     PC5    0.000175
5     PC6    0.000170


## Encapsulating the Process
Finally, encapsulate your data preprocessing, model training, and prediction steps into a function for reuse:

In [12]:
def add_features(data):
    data = data.rename(columns={"1. open": "open",
                                "2. high": "high",
                                "3. low": "low",
                                "5. adjusted close": "close",
                                "6. volume": "volume"})
    data.dropna(inplace=True)

    data['log_volume'] = np.log(data['volume'])

    data['daily_returns'] = data['close'].diff()

    data['MA-5'] = data['close'].rolling(window=5).mean()

    data['MA-30'] = data['close'].rolling(window=30).mean()

    RSI = ta.momentum.RSIIndicator(data['close'], window=14)
    data['RSI'] = RSI.rsi()

    data['5-day_variance'] = data['close'].rolling(window=5).var()

    WILLR = ta.momentum.WilliamsRIndicator(high=data['high'],
                                           low=data['low'],
                                           close=data['close'],
                                           lbp=14)
    data['Williams_%R'] = WILLR.williams_r()

    data['z_score'] = (data['close'] - data['close'].rolling(window=10).mean()
                       ) / data['close'].rolling(window=10).std()

    data['SMA10'] = data['close'].rolling(window=10).mean()

    data['EMA12'] = data['close'].ewm(span=12, adjust=False).mean()

    MACD = ta.trend.MACD(close=data['close'], window_fast=12, window_slow=26)
    data['MACD'] = MACD.macd()

    RoC = ta.momentum.ROCIndicator(close=data['close'], window=1)
    data['RoC'] = RoC.roc()

    low_min = data['low'].rolling(window=15).min()
    high_max = data['high'].rolling(window=15).max()
    data['K15'] = ((data['close'] - low_min) /
                   (high_max - low_min)) * 100

    data['Bollinger_M'] = data['close'].rolling(window=20).mean()
    data['Bollinger_U'] = data['Bollinger_M'] + \
        2 * data['close'].rolling(window=20).std()
    data['Bollinger_L'] = data['Bollinger_M'] - \
        2 * data['close'].rolling(window=20).std()

    data['MOM12'] = data['close'] - data['close'].shift(12)

    data.dropna(inplace=True)
    return data

def pca_reduction(data):
    # Define features and target variable
    X = data.drop(['close', '4. close'], axis=1)
    y = data['close']
    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Retain 95% of variance or choose the number of components
    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)
    return X_pca, y
    
def train_and_predict(X_pca, y, model_func=GradientBoostingRegressor()):    
    X_scaled = scaler.fit_transform(X_pca)

    # Initialize TimeSeriesSplit
    tscv = TimeSeriesSplit(n_splits=5)

    rmse_scores = []

    for train_index, test_index in tscv.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train the model
        model = model_func
        model.fit(X_train, y_train)

        # Predict on the test set
        y_pred = model.predict(X_test)

        # Calculate RMSE and append to scores list
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))

    print(f"Average RMSE: {np.mean(rmse_scores)}")

In [14]:
def financial_forecast_pipeline(data, model_func=GradientBoostingRegressor()):
    data_more_features = add_features(data)
    X_pca, y = pca_reduction(data=data_more_features)
    train_and_predict(X_pca=X_pca, y=y, model_func=model_func)

In [17]:
NVDA_5_years_daily_data = pd.read_csv(
    row_data_path + 'NVDA_5_years_daily_data.csv', index_col='date', parse_dates=True)

In [18]:
financial_forecast_pipeline(NVDA_5_years_daily_data,
                            model_func=GradientBoostingRegressor())

Average RMSE: 49.198931642728176


In [19]:
financial_forecast_pipeline(NVDA_5_years_daily_data,
                            model_func=LinearRegression())

Average RMSE: 62.90965647670862


In [20]:
financial_forecast_pipeline(NVDA_5_years_daily_data,
                            model_func=tree.DecisionTreeRegressor())

Average RMSE: 48.371460059514895
