In [None]:
# Load the labelled dataset used for training and evaluation.
data = pd.read_csv("Data/putsDataSuccessFailed.csv")

# Preprocessing Steps
# -------------------

# 1. Fill missing values in every float64 column with that column's median.
#    The filled column is assigned back explicitly: calling
#    `data[col].fillna(..., inplace=True)` operates on a temporary object,
#    which triggers a chained-assignment warning on recent pandas versions
#    and leaves `data` unchanged when Copy-on-Write is enabled.
for col in data.select_dtypes(include=['float64']).columns:
    data[col] = data[col].fillna(data[col].median())

# 2. Replace the 'Status' strings with integer class ids. The fitted encoder
#    is kept so predictions can be mapped back to the original labels later.
label_encoder = LabelEncoder()
data['Status'] = label_encoder.fit_transform(data['Status'])

# 3. Feature selection: the columns fed to the model (edit this list to
#    experiment with other feature sets).
selected_columns = [
    'strike',
    'volume',
    'openInterest',
    'Implied Volatility',
    'Original Stock Price',
    'Original ROI (%)',
    'Original OTM (%)',
    'Original Implied Volatility',
    '50-day MA',
    '100-day MA',
    '200-day MA',
    'RSI',
    'MACD',
    'VWAP',
    'Delta',
    'Gamma',
    'Theta',
    'Vega',
    'Rho',
    '52WeekHigh',
    '52WeekLow',
    'targetHighPrice',
    'targetLowPrice',
    'targetMeanPrice',
    'targetMedianPrice',
    'recommendationMean',
]
y = data['Status']
X = data[selected_columns]

# 4. Split first, then scale.
#    The scaler must learn its mean/variance from the training rows only;
#    fitting it on the full dataset before splitting lets test-set statistics
#    leak into the training features and inflates the test score.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # fit on training rows only
X_test = scaler.transform(X_test_raw)        # reuse the training statistics

# Full dataset in the same scaled space. Nothing visible here reads it after
# this point; it is kept only so that any other cell referencing `X_scaled`
# keeps working.
X_scaled = scaler.transform(X)

# Network definition: four Dense -> BatchNormalization -> Dropout stages of
# decreasing width (256, 128, 64, 32), each Dense layer using ReLU and an
# L2 kernel penalty of 0.001, followed by a single sigmoid output unit.
input_layer = keras.Input(shape=(X_train.shape[1],))
x = input_layer
for units in (256, 128, 64, 32):
    x = layers.Dense(units, activation='relu', kernel_regularizer=l2(0.001))(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.5)(x)
output_layer = layers.Dense(1, activation='sigmoid')(x)

model = keras.Model(inputs=input_layer, outputs=output_layer)

# Compile for binary classification with Adam; the learning rate is the
# starting value and is a natural knob to experiment with.
custom_optimizer = Adam(learning_rate=0.001)
model.compile(
    loss='binary_crossentropy',
    optimizer=custom_optimizer,
    metrics=['accuracy'],
)

# Early stopping: end training once val_loss has not improved for 10 epochs
# and roll the weights back to the best epoch seen.
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss', patience=10, restore_best_weights=True
)

# Learning-rate schedule: halve the learning rate after 5 epochs without a
# val_loss improvement, never going below 1e-7; verbose=1 logs each change.
lr_scheduler = callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=5, min_lr=1e-7, verbose=1
)

# Stratified 5-fold cross-validation on the training split.
# Each fold trains its OWN freshly initialised copy of the network. Fitting
# the shared `model` in every fold would carry weights (and any reduced
# learning rate) from one fold into the next, so later folds would be scored
# on rows the network had already been trained on.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []

for train_idx, val_idx in kfold.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
    y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Oversample the training part of the fold only; the validation part
    # stays untouched so the score reflects real samples.
    smote = SMOTE(random_state=42)
    X_train_fold_balanced, y_train_fold_balanced = smote.fit_resample(X_train_fold, y_train_fold)

    # Class weights keyed by the actual class labels. np.asarray avoids the
    # deprecated Series.ravel(). With SMOTE's default strategy the classes
    # should already be balanced, so these are expected to be close to 1.0.
    fold_labels = np.asarray(y_train_fold_balanced).ravel()
    fold_classes = np.unique(fold_labels)
    class_weights_fold = compute_class_weight('balanced', classes=fold_classes, y=fold_labels)
    class_weight_dict_fold = {
        int(label): float(weight)
        for label, weight in zip(fold_classes, class_weights_fold)
    }

    # Same architecture as `model`, new weights, and a new optimizer built
    # from the main optimizer's configuration.
    fold_model = keras.models.clone_model(model)
    fold_model.compile(optimizer=Adam.from_config(custom_optimizer.get_config()),
                       loss='binary_crossentropy',
                       metrics=['accuracy'])

    # Train the fold model
    history = fold_model.fit(X_train_fold_balanced, y_train_fold_balanced,
                             epochs=100,
                             batch_size=128,
                             validation_data=(X_val_fold, y_val_fold),
                             callbacks=[lr_scheduler, early_stopping],
                             class_weight=class_weight_dict_fold,
                             verbose=0)  # Suppress per-epoch output

    # Score the fold at a 0.5 decision threshold; predictions are flattened
    # so they have the same 1-D shape as the labels.
    y_val_pred_prob = fold_model.predict(X_val_fold)
    y_val_pred_binary = (y_val_pred_prob.ravel() > 0.5).astype(int)
    val_accuracy = accuracy_score(y_val_fold, y_val_pred_binary)
    accuracies.append(val_accuracy)

# Calculate the mean accuracy over all folds
mean_accuracy = np.mean(accuracies)
print(f"Mean Validation Accuracy: {mean_accuracy * 100:.2f}%")

# Final training run on the training split.
# A stratified 20% validation set is held out BEFORE oversampling. Passing
# `validation_split=0.2` to fit() on the SMOTE output would take the last 20%
# of the rows, which is where SMOTE appends its synthetic samples, so early
# stopping and the LR schedule would be driven by synthetic data.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_fit, y_fit)

# Class weights keyed by the actual class labels. np.asarray avoids the
# deprecated Series.ravel(). With SMOTE's default strategy the classes should
# already be balanced, so these are expected to be close to 1.0.
balanced_labels = np.asarray(y_train_balanced).ravel()
balanced_classes = np.unique(balanced_labels)
class_weights = compute_class_weight('balanced', classes=balanced_classes, y=balanced_labels)
class_weight_dict = {
    int(label): float(weight)
    for label, weight in zip(balanced_classes, class_weights)
}

# Train the final model, validating on real (non-synthetic) samples
history = model.fit(X_train_balanced, y_train_balanced,
                    epochs=100,
                    batch_size=128,
                    validation_data=(X_val, y_val),
                    callbacks=[lr_scheduler, early_stopping],
                    class_weight=class_weight_dict)

# Held-out test split: predict probabilities, threshold them at 0.5 and
# report the overall accuracy.
y_test_pred_prob = model.predict(X_test)
y_test_pred_binary = (y_test_pred_prob > 0.5).astype(int)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred_binary)
print(f"Test Set Accuracy: {test_accuracy * 100:.2f}%")

# Per-class precision / recall / F1 on the same predictions.
test_report = classification_report(
    y_test,
    y_test_pred_binary,
    target_names=['Failed', 'Success'],
)
print(test_report)

# Making Predictions on New Data
# ------------------------------
# Load new data
new_data = pd.read_csv("naked_puts_results.csv")

# Impute the model's input features with the medians of the training data.
# Using the new file's own medians gives NaN for a feature that is entirely
# missing (or for a very small file), and NaN inputs yield NaN probabilities
# that the 0.5 threshold silently turns into class 0. `data` was already
# median-filled, which leaves its column medians unchanged.
training_medians = data[selected_columns].median()
new_data[selected_columns] = new_data[selected_columns].fillna(training_medians)

# Any remaining float columns (not model inputs) are filled with their own
# median, as before. The result is assigned back instead of using
# `inplace=True` on a column selection, which is chained assignment and does
# not reliably modify `new_data`.
for col in new_data.select_dtypes(include=['float64']).columns:
    new_data[col] = new_data[col].fillna(new_data[col].median())

# Scale with the scaler fitted during training
new_X = new_data[selected_columns]
new_X_scaled = scaler.transform(new_X)

# Make predictions
new_predictions = model.predict(new_X_scaled)

# Convert predicted probabilities to class ids using a threshold of 0.5
new_predictions_binary = (new_predictions > 0.5).astype(int)

# Map class ids back to the original 'Status' labels. The (n, 1) prediction
# array is flattened because the label encoder expects a 1-D array.
new_predictions_labels = label_encoder.inverse_transform(new_predictions_binary.ravel())

# Add predictions to the new data
new_data["Guess"] = new_predictions_labels

# Save the new data with predictions
new_data.to_csv("Data/new_data_with_predictions.csv", index=False)

In [1]:
# OLD ML ALGORITHM (random forest tuned with grid search), kept commented out for reference

# # Load your dataset
# data = pd.read_csv("Data/putsDataSuccessFailed.csv")

# # Preprocessing Steps
# # -------------------
# # 1. Fill missing values with median
# # Fill missing values with median only for numeric columns
# for col in data.select_dtypes(include=['float64']).columns:
#     data[col].fillna(data[col].median(), inplace=True)

# # 2. Label Encode 'Status' column
# label_encoder = LabelEncoder()
# data['Status'] = label_encoder.fit_transform(data['Status'])

# # 3. Feature selection
# # selected_columns = ['strike', 'volume', 'openInterest', 'Implied Volatility', 'Original Stock Price', 'Original ROI (%)', 'Original OTM (%)', 'Original Implied Volatility', '50-day MA', '100-day MA', '200-day MA', 'RSI', 'MACD', 'VWAP', 'Delta', 'Gamma', 'Theta', 'Vega', 'Rho', '52WeekHigh', '52WeekLow', 'targetHighPrice', 'targetLowPrice', 'targetMeanPrice', 'targetMedianPrice', 'recommendationMean']
# selected_columns = ['Original ROI (%)', 'Delta', 'openInterest', 'Gamma', 'Original OTM (%)', 'VWAP', 'MACD', '200-day MA', '52WeekHigh', 'volume']
# X = data[selected_columns]
# y = data['Status']

# # 4. Feature Scaling
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# # Split the data
# X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# # Apply SMOTE to the training data
# smote = SMOTE(random_state=42)
# X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# # Initialize the model
# random_forest_model = RandomForestClassifier(random_state=42)

# rf_param_grid = {
#     'n_estimators': [50, 100, 150, 200],
#     'max_depth': [None, 10, 20, 30, 40],
#     'min_samples_split': [2, 5, 10, 15],
#     'min_samples_leaf': [1, 2, 4, 8],
#     'max_features': ['auto', 'sqrt', 'log2', None],  # Add 'None' as a valid value
#     'bootstrap': [True, False],
#     'criterion': ['gini', 'entropy']
# }

# rf_grid_search = GridSearchCV(estimator=random_forest_model, param_grid=rf_param_grid,
#                               cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

# rf_grid_search.fit(X_train_balanced, y_train_balanced)

# # Get the best parameters and update the model
# best_params = rf_grid_search.best_params_
# random_forest_model = RandomForestClassifier(**best_params, random_state=42)

# # Train the model with best parameters
# random_forest_model.fit(X_train_balanced, y_train_balanced)

# # Evaluate the model
# y_pred = random_forest_model.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Model Accuracy with Best Parameters: {accuracy * 100:.2f}%")

# # Making Predictions on New Data
# # ------------------------------
# # Load new data
# new_data = pd.read_csv("naked_puts_results.csv")

# # Apply the same preprocessing steps
# for col in new_data.select_dtypes(include=['float64']).columns:
#     new_data[col].fillna(new_data[col].median(), inplace=True)

# new_X = new_data[selected_columns]
# new_X_scaled = scaler.transform(new_X)

# # Make predictions
# new_predictions = random_forest_model.predict(new_X_scaled)

# # Convert numerical predictions back to labels ("Success" or "Failed")
# new_predictions_labels = label_encoder.inverse_transform(new_predictions)

# # Add predictions to the new data
# new_data["Guess"] = new_predictions_labels

# # Save the new data with predictions
# new_data.to_csv("Data/new_data_with_predictions.csv", index=False)