#### Modeling Phase 

The Modeling notebook is broken into the following parts
- Feature Selection 
- Models 
- Results

In [142]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.feature_selection import RFE

In [101]:
df = pd.read_csv('data/QQQ_modified_data.csv', ).rename(columns={'Unnamed: 0': 'Date'}).drop(columns=['Close'])

In [102]:
# remove the first 30 rows of data to eliminate NaN values
df = df.iloc[30:]

In [120]:
df_with_dummies = df[['cross', 'triple cross']]
df_with_dummies = pd.get_dummies(df_with_dummies).astype(int)

df = pd.concat([df, df_with_dummies], axis=1)

df = df.drop(['cross', 'triple cross'], axis=1)

In [121]:
X = df.drop(columns=['Date', 'Future1M_movement_direction', 'Future2M_movement_direction', 'Future15d_movement_direction'])

targets = df[['Future1M_movement_direction', 'Future2M_movement_direction', 'Future15d_movement_direction']].replace(-1, 0)

y1M = targets['Future1M_movement_direction'].values
y2M = targets['Future2M_movement_direction'].values
y15D = targets['Future15d_movement_direction'].values

#### Feature Selection 
- LASSO: a linear regression technique that penalizes the absolute size of the regression coefficients, effectively encouraging sparse solutions where some coefficients are exactly zero. It is commonly used for feature selection by shrinking less important features' coefficients to zero, thereby identifying the most relevant features for prediction.
- RFE (XGBoost): feature selection technique that iteratively removes less important features based on their importance scores obtained from XGBoost. By leveraging the power of XGBoost's gradient boosting algorithm, XGBoost RFE identifies a subset of the most relevant features, improving model interpretability and potentially enhancing predictive performance.

In [128]:
# LASSO util function

def lasso_feature_selection(X, y, model_type):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    lasso = Lasso(alpha=0.1)  
    lasso.fit(X_scaled, y)
    
    feature_coefficients = pd.Series(lasso.coef_, index=X.columns) 

    sorted_coefficients = feature_coefficients.abs().sort_values(ascending=False)
    
    top_10_features = sorted_coefficients.head(10)
    
    results_df = pd.DataFrame({
        'Feature': top_10_features.index,  
        'Coefficient': top_10_features.values,
        'Model_Type': model_type
    })
    
    return results_df

In [123]:
# XGBOOST util function
def xgboost_rfe_feature_selection(X, y, model_type):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    xgb = XGBRegressor()
    
    rfe = RFE(estimator=xgb, n_features_to_select=10)
    rfe.fit(X_scaled, y)
    
    selected_features = X.columns[rfe.support_]

    results_df = pd.DataFrame({
        'Feature': selected_features,
        'Model_Type': model_type
    })
    
    return results_df

In [134]:
# Run function - LASSO 

y1M_lasso_results = lasso_feature_selection(X=X, y=y1M, model_type='Lasso Regression')
y2M_lasso_results = lasso_feature_selection(X=X, y=y2M, model_type='Lasso Regression')
y15D_lasso_results = lasso_feature_selection(X=X, y=y15D, model_type='Lasso Regression')


In [135]:
# Run function - rfe

y1M_rfe_results = xgboost_rfe_feature_selection(X=X, y=y1M, model_type = 'XGBoost RFE')
y2M_rfe_results = xgboost_rfe_feature_selection(X=X, y=y2M, model_type = 'XGBoost RFE')
y15D_rfe_results = xgboost_rfe_feature_selection(X=X, y=y15D, model_type = 'XGBoost RFE')

In [136]:
# stitch together results for each respective target

y1M_results = pd.concat([y1M_lasso_results, y1M_rfe_results])
y2M_results = pd.concat([y2M_lasso_results, y2M_rfe_results])
y15D_results = pd.concat([y15D_lasso_results, y15D_rfe_results])

#### Models

Types of Linear and non-linear models 
- Logistic Regression
- Decision Trees
- XGBoost

Cross-validation Approach: TimeSeries Cross-Validation with 5 folds is a technique used for evaluating time series models, where the data is divided into five consecutive and non-overlapping folds. Each fold acts as a test set once while the preceding data is used for training. This approach helps assess model performance while preserving the temporal order of the data


All models will be evaluated with the following metrics 
- Precision: Proportion of correctly classified instances (both true positives and true negatives) out of the total number of instances
- Accuracy Metric: Proportion of true positive predictions among all positive predictions made by the model
- Recall (True Positive Rate): Proportion of true positive predictions among all actual positive instances in the dataset


In [143]:
# Instantiate the models
logistic_regression = LogisticRegression()
decision_tree = DecisionTreeClassifier()
xgboost = XGBClassifier()

In [147]:
def evaluate_model_with_cross_validation(X, y, model):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    tscv = TimeSeriesSplit(n_splits=5) 
    accuracies = []
    precisions = []
    recalls = []

    for train_index, test_index in tscv.split(X_train):
        X_train_cv, X_val = X_train.iloc[train_index], X_train.iloc[test_index]  
        y_train_cv, y_val = y_train.iloc[train_index], y_train.iloc[test_index]  

        model.fit(X_train_cv, y_train_cv)

        y_pred_val = model.predict(X_val)

        accuracy = accuracy_score(y_val, y_pred_val)
        precision = precision_score(y_val, y_pred_val)
        recall = recall_score(y_val, y_pred_val)

        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)

    average_accuracy = np.mean(accuracies)
    average_precision = np.mean(precisions)
    average_recall = np.mean(recalls)

    results_df = pd.DataFrame({
        'Metric': ['Accuracy', 'Precision', 'Recall'],
        'Average_Value': [average_accuracy, average_precision, average_recall]
    })

    return results_df

    average_recall = np.mean(recalls)

    results_df = pd.DataFrame({
        'Metric': ['Accuracy', 'Precision', 'Recall'],
        'Average_Value': [average_accuracy, average_precision, average_recall]
    })

    return results_df

In [148]:

results_df = evaluate_model_with_cross_validation(X=X, y=y1M, model=logistic_regression)



AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

## Results 