### Imports

In [1]:
import pandas as pd
from datetime import timedelta
from datetime import datetime as dt
import datetime
import yfinance as yf
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import ADASYN
from xgboost import XGBClassifier
import xgboost as xgb
import pytz
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectFromModel
import tensorflow as tf
from tensorflow import keras
from keras import layers, callbacks
from sklearn.utils.class_weight import compute_class_weight
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.pipeline import Pipeline
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.regularizers import l2
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from keras.models import Sequential
import keras_tuner as kt
from kerastuner.tuners import RandomSearch
import joblib
import itertools
import lightgbm as lgb

  from kerastuner.tuners import RandomSearch


### Helper Functions

In [2]:
def next_four_fridays():
    # Get today's date and time in EST timezone
    est = pytz.timezone('US/Eastern')
    today = dt.now(est)
    
    # Find the next Friday
    days_until_friday = (4 - today.weekday() + 7) % 7
    next_friday = today + timedelta(days=days_until_friday)
    
    # Reset the time to 4:30 PM on the next Friday
    next_friday = next_friday.replace(hour=16, minute=30, second=0, microsecond=0)
    
    # If today is Friday and it's past 4:30 PM, consider the next week's Friday
    if today.weekday() == 4 and today.time() > next_friday.time():
        next_friday += timedelta(days=7)
    
    # Remove time from datetime for output
    next_friday = next_friday.date()
    
    # Find the Friday after the next
    following_friday = next_friday + timedelta(days=7)
    
    # Find the Friday after the following
    next_next_friday = following_friday + timedelta(days=7)
    
    # Find the Friday after the next next
    next_next_next_friday = next_next_friday + timedelta(days=7)
    
    return next_friday.strftime('%Y-%m-%d'), following_friday.strftime('%Y-%m-%d'), next_next_friday.strftime('%Y-%m-%d'), next_next_next_friday.strftime('%Y-%m-%d')

# Get the next four Fridays
friday1, friday2, friday3, friday4 = next_four_fridays()
print(f"The next four Fridays are: {friday1}, {friday2}, {friday3}, and {friday4}")

The next four Fridays are: 2024-03-15, 2024-03-22, 2024-03-29, and 2024-04-05


# Viewing option chains

In [3]:
print(friday1)

2024-03-15


In [4]:
file_viewer = pd.read_csv('Data/naked_puts_results.csv')
filtered_data = file_viewer[file_viewer['Expiration Date'] == friday1]

# Display all rows, no limit
pd.set_option('display.max_rows', None)
# Display all columns, no limit
pd.set_option('display.max_columns', None)

filtered_data['ROI (%)'] = filtered_data['ROI (%)'].astype(float)
# filter out rows with ROI (%) greater than 0.8
# filtered_data = filtered_data[filtered_data['ROI (%)'] > 0.8]
# filter out rows with strike price less than 6
# filtered_data = filtered_data[filtered_data['strike'] <= 7]

# sort by ROI (%) in ascending order
filtered_data = filtered_data.sort_values(by=['ROI (%)'], ascending=False)

# Show only the columns 'Expiration Date', 'ETF', 'Stock Price', 'bid', 'strike', 'ROI (%)', 'OTM (%)', 'Implied Volatility', 'volume', 'openInterest', 'highPrice', 'Original Stock Price', 'Original ROI (%)', 'Original OTM (%)', '50-day MA', '100-day MA', '200-day MA', 'RSI', 'MACD', 'VWAP', '52WeekHigh', and '52WeekLow'
filtered_data = filtered_data[['Expiration Date', 'ETF', 'Stock Price', 'bid', 'strike', 'ROI (%)', 'OTM (%)', 'Implied Volatility', 'volume', 'openInterest', 'highPrice', 'Original Stock Price', 'Original ROI (%)',
    'Original OTM (%)', '50-day MA', '100-day MA', '200-day MA', 'RSI', 'MACD', 'VWAP', '52WeekHigh', '52WeekLow']]

# Show only the top 10 rows
filtered_data.head(20)

Unnamed: 0,Expiration Date,ETF,Stock Price,bid,strike,ROI (%),OTM (%),Implied Volatility,volume,openInterest,highPrice,Original Stock Price,Original ROI (%),Original OTM (%),50-day MA,100-day MA,200-day MA,RSI,MACD,VWAP,52WeekHigh,52WeekLow
2101,2024-03-15,CRIS,10.5,4.0,5.0,400.0,52.38,26.84,,20.0,4.0,10.5,400.0,52.38,11.01,9.61,5.41,50.1,-0.14,1.91,14.47,0.38
2802,2024-03-15,EGIO,8.66,1.95,2.5,354.55,71.13,24.47,1.0,0.0,1.95,8.66,354.55,71.13,10.11,17.04,22.89,46.37,-0.58,21.11,43.6,7.71
3521,2024-03-15,GOEV,2.13,0.75,1.0,300.0,53.05,24.75,,1.0,0.75,2.13,300.0,53.05,3.8,5.07,8.66,23.26,-0.5,8.33,18.49,2.0
3101,2024-03-15,FATH,4.43,1.75,2.5,233.33,43.57,0.0,,1.0,1.75,4.43,233.33,43.57,4.43,4.35,6.42,41.88,0.02,7.48,17.34,3.0
7215,2024-03-15,SNCR,10.84,1.25,2.5,100.0,76.94,19.66,1.0,100.0,1.25,10.84,100.0,76.94,7.44,5.97,7.28,50.28,0.85,7.48,12.24,3.15
114,2024-03-15,ACB,3.18,0.73,1.5,94.81,52.83,0.0,30.0,48.0,0.73,3.18,94.81,52.83,3.97,4.34,5.01,26.11,-0.28,6.39,9.83,2.9
113,2024-03-15,ACB,3.18,0.42,1.0,72.41,68.55,0.0,20.0,531.0,0.42,3.18,72.41,68.55,3.97,4.34,5.01,26.11,-0.28,6.39,9.83,2.9
2742,2024-03-15,DY,142.83,37.0,135.0,37.76,5.48,5.73,,0.0,37.0,142.83,37.76,5.48,117.56,106.55,103.32,98.42,5.99,100.51,142.83,81.36
1936,2024-03-15,CNS,75.55,14.1,75.0,23.15,0.73,4.09,100.0,1.0,14.1,75.55,23.15,0.73,71.14,65.27,63.31,73.66,1.43,62.84,77.46,50.6
6482,2024-03-15,PTCT,31.16,5.5,31.0,21.57,0.51,3.58,2.0,2.0,5.5,31.16,21.57,0.51,27.58,25.21,31.23,80.56,1.27,32.54,59.1,18.07


In [11]:
# Read in data
df = pd.read_csv('Data/putsDataSuccessFailed.csv')

# Ensure 'Expiration Date' is in datetime format
df['Expiration Date'] = pd.to_datetime(df['Expiration Date'])


df = df[df['Expiration Date'] == '2024-03-08']

# Filter columns to keep
columns = ['Expiration Date', 'ETF', 'Original Stock Price', 'Stock Price', 'strike', 'volume',  
           'openInterest', 'highPrice', 'Original ROI (%)', 'Original OTM (%)',
           '50-day MA', '100-day MA', '200-day MA', 'RSI', 'MACD', 'VWAP',
           '52WeekHigh', '52WeekLow', 'Status']

df = df[columns]

# Take top 100 rows ordered by Original ROI (%)
df = df.sort_values('Original ROI (%)', ascending=False).head(100)

# Define function to color rows 
def color_row(row):
    color = 'lightgreen' if row['Status'] == 'Success' else 'lightcoral'
    return ['background-color: %s' % color]*len(row)

# Style DataFrame 
styled_df = (df.style  
             .applymap(lambda x: 'color: black') # Set text color
             .apply(color_row, axis=1) # Set row colors
             .format('{:.2f}', subset=df.select_dtypes(include=['float64']).columns))

# Display DataFrame 
styled_df


Unnamed: 0,Expiration Date,ETF,Original Stock Price,Stock Price,strike,volume,openInterest,highPrice,Original ROI (%),Original OTM (%),50-day MA,100-day MA,200-day MA,RSI,MACD,VWAP,52WeekHigh,52WeekLow,Status
89,2024-03-08 00:00:00,ACB,3.17,3.18,0.5,2.0,37.0,0.1,25.0,84.23,4.14,4.45,5.09,14.11,-0.28,6.5,9.83,3.14,Success
3330,2024-03-08 00:00:00,PCT,6.07,5.13,6.0,90.0,123.0,0.7,13.21,1.15,4.17,4.28,6.51,67.19,0.5,5.79,11.84,2.4,Failed
3217,2024-03-08 00:00:00,NYCB,3.55,3.42,3.5,6796.0,1243.0,0.35,11.11,1.41,7.85,8.86,10.23,29.73,-1.02,8.38,13.87,3.55,Failed
1665,2024-03-08 00:00:00,EOSE,1.02,1.04,1.0,5.0,1075.0,0.1,11.11,1.96,1.04,1.24,2.09,37.5,-0.03,2.32,5.03,0.77,Success
3836,2024-03-08 00:00:00,SOUN,6.04,5.86,6.0,7103.0,2797.0,0.55,10.09,0.66,2.64,2.3,2.52,72.7,1.18,3.59,7.42,1.54,Failed
2010,2024-03-08 00:00:00,GTLB,73.31,57.7,73.0,127.0,90.0,6.4,9.61,0.42,67.89,58.46,52.24,38.82,1.04,48.23,77.6,26.77,Failed
3663,2024-03-08 00:00:00,SE,51.05,57.72,51.0,1390.0,9.0,4.4,9.44,0.1,40.73,40.85,46.83,80.9,2.29,49.75,88.07,34.82,Success
879,2024-03-08 00:00:00,CHPT,2.08,1.92,2.0,23482.0,3457.0,0.17,9.29,3.85,2.09,2.39,4.97,47.15,-0.01,5.0,11.26,1.65,Failed
2009,2024-03-08 00:00:00,GTLB,73.31,57.7,72.0,72.0,88.0,5.9,8.93,1.79,67.89,58.46,52.24,38.82,1.04,48.23,77.6,26.77,Failed
3329,2024-03-08 00:00:00,PCT,6.07,5.13,5.5,440.0,55.0,0.45,8.91,9.39,4.17,4.28,6.51,67.19,0.5,5.79,11.84,2.4,Failed


In [12]:
# Load the CSV file into a DataFrame
from sklearn.impute import SimpleImputer


df = pd.read_csv('Data/putsDataSuccessFailed.csv')

# Save the columns to be dropped for later use
retained_columns = df[['contractSymbol', 'Expiration Date', 'ETF']]

# Data Preprocessing
df.drop(['contractSymbol', 'Expiration Date', 'ETF'], axis=1, inplace=True)
df = pd.get_dummies(df, columns=['recommendationKey'], drop_first=True)

# Handle missing values with imputation
imputer = SimpleImputer(strategy='mean')
X = df.drop('Status', axis=1)
y = df['Status']

# Encode the target variable 'Status'
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_imputed = imputer.fit_transform(X)

# Address class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_imputed, y_encoded)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Model Selection and Hyperparameter Tuning (LightGBM)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.05, 0.1, 0.2]
}

# Use GridSearchCV for hyperparameter tuning
lgb_classifier = lgb.LGBMClassifier(random_state=42)
grid_search = GridSearchCV(lgb_classifier, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_lgb_model = grid_search.best_estimator_

# Save the best model and other objects
joblib.dump(best_lgb_model, 'best_lgb_model.joblib')
joblib.dump(label_encoder, 'label_encoder.joblib')
joblib.dump(imputer, 'imputer.joblib')

# Load new data
new_data = pd.read_csv('Data/naked_puts_results.csv')

# Save the columns to be dropped from new data for later use
new_retained_columns = new_data[['contractSymbol', 'Expiration Date', 'ETF']]

# Preprocess new data as per training data
new_data.drop(['contractSymbol', 'Expiration Date', 'ETF'], axis=1, inplace=True)
new_data_encoded = pd.get_dummies(new_data, columns=['recommendationKey'], drop_first=True)

# Align columns of new data with training data
missing_cols = set(X.columns) - set(new_data_encoded.columns)
extra_cols = set(new_data_encoded.columns) - set(X.columns)
for c in missing_cols:
    new_data_encoded[c] = 0
new_data_encoded.drop(extra_cols, axis=1, inplace=True)
new_data_encoded = new_data_encoded[X.columns]

# Impute missing values
new_data_imputed = joblib.load('imputer.joblib').transform(new_data_encoded)

# Load the saved model and make predictions
loaded_model = joblib.load('best_lgb_model.joblib')
new_predictions = loaded_model.predict(new_data_imputed)

# Add predictions to the new data
new_data_with_predictions = new_data_encoded.copy()
new_data_with_predictions['Guess'] = joblib.load('label_encoder.joblib').inverse_transform(new_predictions)

# Reverse one-hot encoding to get original 'recommendationKey' column
def reverse_one_hot(df, original_column_prefix):
    one_hot_cols = [col for col in df if col.startswith(original_column_prefix)]
    df[original_column_prefix] = df[one_hot_cols].idxmax(axis=1).str.replace(original_column_prefix + '_', '')
    df.drop(one_hot_cols, axis=1, inplace=True)
    return df

new_data_with_predictions = reverse_one_hot(new_data_with_predictions, 'recommendationKey')

# Add back the retained columns to the new data with predictions
new_data_with_predictions = pd.concat([new_retained_columns.reset_index(drop=True), new_data_with_predictions.reset_index(drop=True)], axis=1)

# Save the new data with predictions
new_data_with_predictions.to_csv('Data/new_data_with_predictions.csv', index=False)

# Model Evaluation
y_pred = best_lgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Best LightGBM Model: {best_lgb_model}')
print(f'Accuracy: {accuracy}')
print(report)

[LightGBM] [Info] Number of positive: 3672, number of negative: 3720
[LightGBM] [Info] Number of positive: 3672, number of negative: 3721
[LightGBM] [Info] Number of positive: 3672, number of negative: 3721
[LightGBM] [Info] Number of positive: 3672, number of negative: 3721
[LightGBM] [Info] Number of positive: 3672, number of negative: 3721
[LightGBM] [Info] Number of positive: 3672, number of negative: 3721
[LightGBM] [Info] Number of positive: 3672, number of negative: 3721
[LightGBM] [Info] Number of positive: 3672, number of negative: 3720
[LightGBM] [Info] Number of positive: 3672, number of negative: 3720
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004928 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005303 seconds.
You can set `force_row_wise=true` to remove the o

In [13]:
# Read in data 
df = pd.read_csv('Data/new_data_with_predictions.csv')

# Sort by Original ROI column  
df = df.sort_values(by='Original ROI (%)', ascending=False)  

# Filter to top 300 rows
df = df.head(300)

# Filter columns to show
columns = ['Expiration Date', 'ETF', 'Stock Price', 'bid', 'strike', 
           'ROI (%)', 'OTM (%)', 'recommendationKey', 'targetLowPrice', 'targetMeanPrice', 'targetHighPrice', 'Guess']
df = df[columns]

# Define row color function
def color_row(row):
    color = 'lightgreen' if row['Guess'] == 'Success' else 'lightcoral'
    return ['background-color: %s' % color]*len(row)

# Apply cell and row styling
styled_df = (df.style
                .applymap(lambda x: 'color: black') # Black text
                .apply(color_row, axis=1) # Color rows
                .format('{:.2f}', subset=df.select_dtypes(include=['float64']).columns)) 

# Display styled DataFrame
styled_df

Unnamed: 0,Expiration Date,ETF,Stock Price,bid,strike,ROI (%),OTM (%),recommendationKey,targetLowPrice,targetMeanPrice,targetHighPrice,Guess
2101,2024-03-15,CRIS,10.5,4.0,5.0,400.0,52.38,hold,15.0,21.75,26.0,Success
2802,2024-03-15,EGIO,8.66,1.95,2.5,354.55,71.13,hold,14.0,34.67,50.0,Success
3521,2024-03-15,GOEV,2.13,0.75,1.0,300.0,53.05,hold,0.25,1.88,4.0,Success
3101,2024-03-15,FATH,4.43,1.75,2.5,233.33,43.57,hold,5.0,5.0,5.0,Success
7215,2024-03-15,SNCR,10.84,1.25,2.5,100.0,76.94,hold,4.05,20.02,36.0,Success
114,2024-03-15,ACB,3.18,0.73,1.5,94.81,52.83,hold,31.99,31.99,31.99,Success
113,2024-03-15,ACB,3.18,0.42,1.0,72.41,68.55,hold,31.99,31.99,31.99,Success
2742,2024-03-15,DY,142.83,37.0,135.0,37.76,5.48,hold,101.0,145.22,165.0,Success
1936,2024-03-15,CNS,75.55,14.1,75.0,23.15,0.73,hold,69.0,73.0,77.0,Failed
6482,2024-03-15,PTCT,31.16,5.5,31.0,21.57,0.51,hold,15.0,27.64,45.0,Failed
