### Imports

In [1]:
import pandas as pd
from datetime import timedelta
from datetime import datetime as dt
import datetime
import yfinance as yf
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import ADASYN
from xgboost import XGBClassifier
import xgboost as xgb
import pytz
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectFromModel
import tensorflow as tf
from tensorflow import keras
from keras import layers, callbacks
from sklearn.utils.class_weight import compute_class_weight
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.pipeline import Pipeline
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.regularizers import l2
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from keras.models import Sequential
import keras_tuner as kt
from kerastuner.tuners import RandomSearch
import joblib
import itertools
import lightgbm as lgb

  from kerastuner.tuners import RandomSearch


### Helper Functions

In [2]:
def next_four_fridays():
    # Get today's date and time in EST timezone
    est = pytz.timezone('US/Eastern')
    today = dt.now(est)
    
    # Find the next Friday
    days_until_friday = (4 - today.weekday() + 7) % 7
    next_friday = today + timedelta(days=days_until_friday)
    
    # Reset the time to 4:30 PM on the next Friday
    next_friday = next_friday.replace(hour=16, minute=30, second=0, microsecond=0)
    
    # If today is Friday and it's past 4:30 PM, consider the next week's Friday
    if today.weekday() == 4 and today.time() > next_friday.time():
        next_friday += timedelta(days=7)
    
    # Remove time from datetime for output
    next_friday = next_friday.date()
    
    # Find the Friday after the next
    following_friday = next_friday + timedelta(days=7)
    
    # Find the Friday after the following
    next_next_friday = following_friday + timedelta(days=7)
    
    # Find the Friday after the next next
    next_next_next_friday = next_next_friday + timedelta(days=7)
    
    return next_friday.strftime('%Y-%m-%d'), following_friday.strftime('%Y-%m-%d'), next_next_friday.strftime('%Y-%m-%d'), next_next_next_friday.strftime('%Y-%m-%d')

# Get the next four Fridays
friday1, friday2, friday3, friday4 = next_four_fridays()
print(f"The next four Fridays are: {friday1}, {friday2}, {friday3}, and {friday4}")

The next four Fridays are: 2023-12-15, 2023-12-22, 2023-12-29, and 2024-01-05


# Viewing option chains

In [3]:
print(friday1)

2023-12-15


In [4]:
file_viewer = pd.read_csv('Data/naked_puts_results.csv')
filtered_data = file_viewer[file_viewer['Expiration Date'] == friday1]

# Display all rows, no limit
pd.set_option('display.max_rows', None)
# Display all columns, no limit
pd.set_option('display.max_columns', None)

filtered_data['ROI (%)'] = filtered_data['ROI (%)'].astype(float)
# filter out rows with ROI (%) greater than 0.8
# filtered_data = filtered_data[filtered_data['ROI (%)'] > 0.8]
# filter out rows with strike price less than 6
# filtered_data = filtered_data[filtered_data['strike'] <= 7]

# sort by ROI (%) in ascending order
filtered_data = filtered_data.sort_values(by=['ROI (%)'], ascending=False)

# Show only the columns 'Expiration Date', 'ETF', 'Stock Price', 'bid', 'strike', 'ROI (%)', 'OTM (%)', 'Implied Volatility', 'volume', 'openInterest', 'highPrice', 'Original Stock Price', 'Original ROI (%)', 'Original OTM (%)', '50-day MA', '100-day MA', '200-day MA', 'RSI', 'MACD', 'VWAP', '52WeekHigh', and '52WeekLow'
filtered_data = filtered_data[['Expiration Date', 'ETF', 'Stock Price', 'bid', 'strike', 'ROI (%)', 'OTM (%)', 'Implied Volatility', 'volume', 'openInterest', 'highPrice', 'Original Stock Price', 'Original ROI (%)',
    'Original OTM (%)', '50-day MA', '100-day MA', '200-day MA', 'RSI', 'MACD', 'VWAP', '52WeekHigh', '52WeekLow']]

# Show only the top 10 rows
filtered_data.head(20)

Unnamed: 0,Expiration Date,ETF,Stock Price,bid,strike,ROI (%),OTM (%),Implied Volatility,volume,openInterest,highPrice,Original Stock Price,Original ROI (%),Original OTM (%),50-day MA,100-day MA,200-day MA,RSI,MACD,VWAP,52WeekHigh,52WeekLow
292,2023-12-15,VCSA,7.17,1.4,2.5,127.27,65.13,23.47,9.0,0.0,1.4,7.17,127.27,65.13,8.22,9.96,13.47,43.28,-0.23,17.33,38.6,6.88
180,2023-12-15,MTEM,4.92,1.25,2.5,100.0,49.19,17.2,,2.0,1.25,4.92,100.0,49.19,5.65,6.87,6.78,52.98,-0.21,7.37,9.3,4.75
138,2023-12-15,HUT,10.25,1.07,3.5,44.03,65.85,11.68,1500.0,3063.0,1.07,10.25,44.03,65.85,10.16,11.62,11.22,54.35,-0.04,11.26,22.35,3.94
148,2023-12-15,INMB,10.81,2.75,10.0,37.93,7.49,7.49,1.0,125.0,2.75,10.81,37.93,7.49,7.69,7.85,7.96,91.77,0.79,8.16,10.9,6.01
98,2023-12-15,FHTX,5.63,1.2,5.0,31.58,11.19,6.42,10.0,10.0,1.2,5.63,31.58,11.19,3.88,5.84,6.24,78.66,0.27,6.25,9.87,2.94
137,2023-12-15,HUT,10.25,0.59,3.0,24.48,70.73,12.22,5.0,17079.0,0.59,10.25,24.48,70.73,10.16,11.62,11.22,54.35,-0.04,11.26,22.35,3.94
229,2023-12-15,QNST,13.0,2.25,12.5,21.95,3.85,4.65,79.0,151.0,2.25,13.0,21.95,3.85,10.73,10.02,10.74,78.83,0.57,10.95,18.03,6.92
201,2023-12-15,ONTF,7.54,1.1,7.5,17.19,0.53,3.64,,2.0,1.1,7.54,17.19,0.53,6.7,6.91,7.6,68.54,0.28,7.83,10.54,5.89
136,2023-12-15,HUT,10.25,0.36,2.5,16.82,75.61,11.23,27.0,4382.0,0.36,10.25,16.82,75.61,10.16,11.62,11.22,54.35,-0.04,11.26,22.35,3.94
225,2023-12-15,PTN,3.02,0.3,2.5,13.64,17.22,5.63,5.0,5.0,0.3,3.02,13.64,17.22,2.0,2.05,2.26,90.77,0.18,2.52,4.64,1.45


In [5]:
# Read in data
df = pd.read_csv('Data/putsDataSuccessFailed.csv')

# Define function to color rows 
def color_row(row):
    color = 'lightgreen' if row['Status'] == 'Success' else 'lightcoral'
    return ['background-color: %s' % color]*len(row)

# Filter columns to keep
columns = ['Expiration Date', 'ETF', 'Original Stock Price', 'Stock Price', 'strike', 'volume',  
           'openInterest', 'highPrice', 'Original ROI (%)', 'Original OTM (%)',
           '50-day MA', '100-day MA', '200-day MA', 'RSI', 'MACD', 'VWAP',
           '52WeekHigh', '52WeekLow', 'Status']


df = df[columns].sort_values('Expiration Date', ascending=False).head(100)
# Take top 100 rows ordered by Original ROI (%)
df = df.sort_values('Original ROI (%)', ascending=False).head(100)

# Style DataFrame 
styled_df = (df.style  
             .applymap(lambda x: 'color: black') # Set text color
             .apply(color_row, axis=1) # Set row colors
             .format('{:.2f}', subset=df.select_dtypes(include=['float64']).columns))

# Display DataFrame 
styled_df

Unnamed: 0,Expiration Date,ETF,Original Stock Price,Stock Price,strike,volume,openInterest,highPrice,Original ROI (%),Original OTM (%),50-day MA,100-day MA,200-day MA,RSI,MACD,VWAP,52WeekHigh,52WeekLow,Status
30,2023-12-08,HUT,11.4,10.25,2.5,298.0,76.0,0.32,14.68,78.07,10.14,11.9,11.21,64.74,0.04,11.24,22.35,3.94,Success
59,2023-12-08,SFIX,3.87,3.99,3.5,944.0,622.0,0.16,4.79,9.56,3.39,3.7,3.93,62.98,0.16,4.02,5.6,2.77,Success
7,2023-12-08,AMC,7.02,6.93,7.0,6227.0,5623.0,0.28,4.17,0.28,8.63,18.78,34.53,38.15,-0.71,19.44,76.1,6.65,Failed
17,2023-12-08,FSR,1.57,1.64,1.5,908.0,7841.0,0.06,4.17,4.46,4.43,5.23,5.62,15.56,-0.85,5.4,8.39,1.57,Success
18,2023-12-08,FTCH,1.18,1.23,1.0,1847.0,2578.0,0.04,4.17,15.25,1.67,2.78,3.86,43.96,-0.13,3.92,7.53,0.97,Success
1,2023-12-08,AAOI,16.13,19.76,16.0,26.0,71.0,0.55,3.56,0.81,10.25,10.65,6.87,67.11,1.73,9.71,16.68,1.64,Success
11,2023-12-08,CHPT,1.99,2.35,1.5,8136.0,3511.0,0.05,3.45,24.62,3.09,5.13,7.16,22.81,-0.35,7.0,13.38,1.86,Success
28,2023-12-08,HOOD,10.53,11.73,10.5,2295.0,234.0,0.3,2.94,0.28,9.11,10.13,9.82,77.51,0.12,9.87,13.12,7.7,Success
40,2023-12-08,MPW,5.12,4.69,5.0,1199.0,1978.0,0.14,2.88,2.34,4.91,6.36,7.33,60.68,0.04,8.06,12.6,4.08,Failed
44,2023-12-08,NVAX,5.59,5.57,5.5,583.0,1821.0,0.15,2.8,1.61,6.51,7.25,7.45,34.74,-0.3,8.47,17.32,5.35,Success


In [6]:
# Load the CSV file into a DataFrame
from sklearn.impute import SimpleImputer


df = pd.read_csv('Data/putsDataSuccessFailed.csv')

# Save the columns to be dropped for later use
retained_columns = df[['contractSymbol', 'Expiration Date', 'ETF']]

# Data Preprocessing
df.drop(['contractSymbol', 'Expiration Date', 'ETF'], axis=1, inplace=True)
df = pd.get_dummies(df, columns=['recommendationKey'], drop_first=True)

# Handle missing values with imputation
imputer = SimpleImputer(strategy='mean')
X = df.drop('Status', axis=1)
y = df['Status']

# Encode the target variable 'Status'
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_imputed = imputer.fit_transform(X)

# Address class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_imputed, y_encoded)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Model Selection and Hyperparameter Tuning (LightGBM)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.05, 0.1, 0.2]
}

# Use GridSearchCV for hyperparameter tuning
lgb_classifier = lgb.LGBMClassifier(random_state=42)
grid_search = GridSearchCV(lgb_classifier, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_lgb_model = grid_search.best_estimator_

# Save the best model and other objects
joblib.dump(best_lgb_model, 'best_lgb_model.joblib')
joblib.dump(label_encoder, 'label_encoder.joblib')
joblib.dump(imputer, 'imputer.joblib')

# Load new data
new_data = pd.read_csv('Data/naked_puts_results.csv')

# Save the columns to be dropped from new data for later use
new_retained_columns = new_data[['contractSymbol', 'Expiration Date', 'ETF']]

# Preprocess new data as per training data
new_data.drop(['contractSymbol', 'Expiration Date', 'ETF'], axis=1, inplace=True)
new_data_encoded = pd.get_dummies(new_data, columns=['recommendationKey'], drop_first=True)

# Align columns of new data with training data
missing_cols = set(X.columns) - set(new_data_encoded.columns)
extra_cols = set(new_data_encoded.columns) - set(X.columns)
for c in missing_cols:
    new_data_encoded[c] = 0
new_data_encoded.drop(extra_cols, axis=1, inplace=True)
new_data_encoded = new_data_encoded[X.columns]

# Impute missing values
new_data_imputed = joblib.load('imputer.joblib').transform(new_data_encoded)

# Load the saved model and make predictions
loaded_model = joblib.load('best_lgb_model.joblib')
new_predictions = loaded_model.predict(new_data_imputed)

# Add predictions to the new data
new_data_with_predictions = new_data_encoded.copy()
new_data_with_predictions['Guess'] = joblib.load('label_encoder.joblib').inverse_transform(new_predictions)

# Reverse one-hot encoding to get original 'recommendationKey' column
def reverse_one_hot(df, original_column_prefix):
    one_hot_cols = [col for col in df if col.startswith(original_column_prefix)]
    df[original_column_prefix] = df[one_hot_cols].idxmax(axis=1).str.replace(original_column_prefix + '_', '')
    df.drop(one_hot_cols, axis=1, inplace=True)
    return df

new_data_with_predictions = reverse_one_hot(new_data_with_predictions, 'recommendationKey')

# Add back the retained columns to the new data with predictions
new_data_with_predictions = pd.concat([new_retained_columns.reset_index(drop=True), new_data_with_predictions.reset_index(drop=True)], axis=1)

# Save the new data with predictions
new_data_with_predictions.to_csv('Data/new_data_with_predictions.csv', index=False)

# Model Evaluation
y_pred = best_lgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Best LightGBM Model: {best_lgb_model}')
print(f'Accuracy: {accuracy}')
print(report)

[LightGBM] [Info] Number of positive: 45, number of negative: 49
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006367 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 977
[LightGBM] [Info] Number of data points in the train set: 94, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.478723 -> initscore=-0.085158
[LightGBM] [Info] Start training from score -0.085158
[LightGBM] [Info] Number of positive: 45, number of negative: 49
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001303 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 985
[LightGBM] [Info] Number of data points in the train set: 94, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.478723 -> initscore=-0.085158
[LightGBM] [Info] Start training from score -0.085158
[LightGBM] [Info] Number of posi

In [7]:
# # Load the CSV file into a DataFrame
# df = pd.read_csv('Data/putsDataSuccessFailed.csv')

# # Data Preprocessing
# # Drop any columns that are not relevant for prediction
# df.drop(['contractSymbol', 'Expiration Date', 'ETF'], axis=1, inplace=True)

# # Encode categorical variables using one-hot encoding
# df = pd.get_dummies(df, columns=['recommendationKey'], drop_first=True)

# # Handle missing values with imputation
# from sklearn.impute import SimpleImputer
# imputer = SimpleImputer(strategy='mean')
# X = df.drop('Status', axis=1)
# y = df['Status']

# # Encode the target variable 'Status'
# label_encoder = LabelEncoder()
# y_encoded = label_encoder.fit_transform(y)

# X_imputed = imputer.fit_transform(X)

# # Address class imbalance with SMOTE
# smote = SMOTE(random_state=42)
# X_resampled, y_resampled = smote.fit_resample(X_imputed, y_encoded)

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# # Model Selection and Hyperparameter Tuning (LightGBM)
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [3, 4, 5],
#     'learning_rate': [0.05, 0.1, 0.2]
# }

# # Use GridSearchCV for hyperparameter tuning
# lgb_classifier = lgb.LGBMClassifier(random_state=42)
# grid_search = GridSearchCV(lgb_classifier, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
# grid_search.fit(X_train, y_train)

# best_lgb_model = grid_search.best_estimator_

# # Model Evaluation
# y_pred = best_lgb_model.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# report = classification_report(y_test, y_pred)

# print(f'Best LightGBM Model: {best_lgb_model}')
# print(f'Accuracy: {accuracy}')
# print(report)


# import joblib

# # Dump the label_encoder to a file
# label_encoder_filename = "label_encoder.joblib"
# joblib.dump(label_encoder, label_encoder_filename)

# # Save the trained LightGBM model and scaler to separate files
# model_filename = "model.joblib"
# scaler_filename = "scaler.joblib"

# joblib.dump(best_lgb_model, model_filename)
# joblib.dump(imputer, scaler_filename)

# # Making Predictions on New Data
# # Load the new data
# new_data = pd.read_csv("Data/naked_puts_results.csv")

# # Make a copy of the new data for predictions
# new_data_with_predictions = new_data.copy()

# # Apply the same preprocessing steps as before
# # Drop columns not relevant for prediction
# new_data.drop(['contractSymbol', 'Expiration Date', 'ETF'], axis=1, inplace=True)

# # Encode categorical variables using one-hot encoding
# new_data = pd.get_dummies(new_data, columns=['recommendationKey'], drop_first=True)

# # Handle missing values with imputation
# imputer = SimpleImputer(strategy='mean')
# new_X_imputed = imputer.fit_transform(new_data)  # No need to drop 'Status'

# # Load the label encoder
# label_encoder = joblib.load("label_encoder.joblib")

# # Load the trained LightGBM model
# best_lgb_model = joblib.load("model.joblib")

# # Make predictions on the new data
# new_predictions = best_lgb_model.predict(new_X_imputed)

# # Convert numerical predictions back to labels ("Success" or "Failed")
# new_predictions_labels = label_encoder.inverse_transform(new_predictions)

# # Add predictions to the new data
# new_data_with_predictions["Guess"] = new_predictions_labels

# # Save the new data with predictions, including 'contractSymbol', 'Expiration Date', and 'ETF'
# new_data_with_predictions.to_csv("Data/new_data_with_predictions.csv", index=False)

In [8]:
# Read in data 
df = pd.read_csv('Data/new_data_with_predictions.csv')

# Sort by Original ROI column  
df = df.sort_values(by='Original ROI (%)', ascending=False)  

# Filter to top 300 rows
df = df.head(300)

# Filter columns to show
columns = ['Expiration Date', 'ETF', 'Stock Price', 'bid', 'strike', 
           'ROI (%)', 'OTM (%)', 'recommendationKey', 'targetLowPrice', 'targetMeanPrice', 'targetHighPrice', 'Guess']
df = df[columns]

# Define row color function
def color_row(row):
    color = 'lightgreen' if row['Guess'] == 'Success' else 'lightcoral'
    return ['background-color: %s' % color]*len(row)

# Apply cell and row styling
styled_df = (df.style
                .applymap(lambda x: 'color: black') # Black text
                .apply(color_row, axis=1) # Color rows
                .format('{:.2f}', subset=df.select_dtypes(include=['float64']).columns)) 

# Display styled DataFrame
styled_df

Unnamed: 0,Expiration Date,ETF,Stock Price,bid,strike,ROI (%),OTM (%),recommendationKey,targetLowPrice,targetMeanPrice,targetHighPrice,Guess
292,2023-12-15,VCSA,7.17,1.4,2.5,127.27,65.13,hold,8.0,10.9,15.0,Success
180,2023-12-15,MTEM,4.92,1.25,2.5,100.0,49.19,hold,15.0,16.5,18.0,Success
138,2023-12-15,HUT,10.25,1.07,3.5,44.03,65.85,hold,2.0,3.06,4.76,Success
148,2023-12-15,INMB,10.81,2.75,10.0,37.93,7.49,hold,14.0,17.33,22.0,Success
98,2023-12-15,FHTX,5.63,1.2,5.0,31.58,11.19,hold,6.0,14.25,20.0,Success
137,2023-12-15,HUT,10.25,0.59,3.0,24.48,70.73,hold,2.0,3.06,4.76,Success
229,2023-12-15,QNST,13.0,2.25,12.5,21.95,3.85,hold,10.0,13.38,19.0,Success
201,2023-12-15,ONTF,7.54,1.1,7.5,17.19,0.53,hold,9.0,9.5,10.0,Success
136,2023-12-15,HUT,10.25,0.36,2.5,16.82,75.61,hold,2.0,3.06,4.76,Success
225,2023-12-15,PTN,3.02,0.3,2.5,13.64,17.22,hold,9.0,43.0,70.0,Success
