In [None]:
# Load the labelled put-option dataset
data = pd.read_csv("Data/putsDataSuccessFailed.csv")

# Preprocessing Steps
# -------------------

# 1. Fill missing values with the column median for float64 columns.
#    The result is assigned back to the column: calling
#    data[col].fillna(..., inplace=True) operates on an intermediate object
#    (chained assignment), which pandas warns about and which does not update
#    `data` when Copy-on-Write is enabled.
for col in data.select_dtypes(include=['float64']).columns:
    data[col] = data[col].fillna(data[col].median())

# 2. Label Encode 'Status' column (the fitted encoder is reused later to map
#    predictions back to the original labels)
label_encoder = LabelEncoder()
data['Status'] = label_encoder.fit_transform(data['Status'])

# 3. Feature selection (You can experiment with different feature sets)
selected_columns = [
    'strike', 'volume', 'openInterest', 'Implied Volatility',
    'Original Stock Price', 'Original ROI (%)', 'Original OTM (%)',
    'Original Implied Volatility', '50-day MA', '100-day MA', '200-day MA',
    'RSI', 'MACD', 'VWAP', 'Delta', 'Gamma', 'Theta', 'Vega', 'Rho',
    '52WeekHigh', '52WeekLow', 'targetHighPrice', 'targetLowPrice',
    'targetMeanPrice', 'targetMedianPrice', 'recommendationMean',
]
X = data[selected_columns]
y = data['Status']

# Split the data BEFORE scaling so the held-out rows do not influence the
# scaler's mean/variance (fitting on all rows leaks test-set statistics).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# 4. Feature Scaling: fit on the training rows only, then apply everywhere.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for any later code that expects the fully scaled feature matrix.
X_scaled = scaler.transform(X)

# Define a more complex model
def build_model(hp):
    """Assemble and compile a dense binary classifier from hyperparameters.

    The network is an input block followed by a tunable number of hidden
    blocks. Every block is Dense (ReLU, L2 penalty 0.001) ->
    BatchNormalization -> Dropout. A single sigmoid unit forms the output.

    Args:
        hp: Hyperparameter object; values are requested through its ``Int``,
            ``Float`` and ``Choice`` methods.

    Returns:
        The compiled model (Adam optimizer, binary cross-entropy loss,
        accuracy metric).
    """
    def append_block(net, units_name, units_min, units_max, dropout_name, **dense_extra):
        # Request the layer width first and the dropout rate second so the
        # hyperparameters are registered in the order units -> dropout.
        width = hp.Int(units_name, min_value=units_min, max_value=units_max, step=64)
        net.add(Dense(units=width, activation='relu', kernel_regularizer=l2(0.001), **dense_extra))
        net.add(BatchNormalization())
        drop_rate = hp.Float(dropout_name, min_value=0.2, max_value=0.5, step=0.1)
        net.add(Dropout(drop_rate))

    network = Sequential()

    # Input block: the feature count is read from the module-level X_train.
    feature_count = X_train.shape[1]
    append_block(network, 'units_input', 256, 1024, 'dropout_input',
                 input_shape=(feature_count,))

    # Hidden blocks: between 2 and 5 of them, each with its own width/dropout.
    hidden_blocks = hp.Int('num_layers', min_value=2, max_value=5)
    for layer_no in range(hidden_blocks):
        append_block(network, f'units_{layer_no}', 64, 512, f'dropout_{layer_no}')

    # Single-unit sigmoid output for the binary target.
    network.add(Dense(1, activation='sigmoid'))

    # Deliberately small learning rates are offered to the search.
    step_size = hp.Choice('learning_rate', values=[1e-4, 5e-5, 1e-5])
    network.compile(
        optimizer=Adam(learning_rate=step_size),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Random-search tuner over the build_model search space, ranked by
# validation accuracy. max_trials can be adjusted.
tuner_settings = {
    'objective': 'val_accuracy',
    'max_trials': 10,
    'directory': 'keras_tuner',
    'project_name': 'put_option_model',
}
tuner = RandomSearch(build_model, **tuner_settings)

# Run the search; 20% of the training rows are held out for validation in
# each trial and progress output is silenced.
search_settings = {
    'epochs': 100,
    'batch_size': 128,
    'validation_split': 0.2,
    'verbose': 0,
}
tuner.search(X_train, y_train, **search_settings)

# Take the single best hyperparameter set found by the search ...
top_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)
best_hps = top_hyperparameters[0]

# ... and build (and compile) a fresh model from it.
best_model = build_model(best_hps)

# Implement k-fold cross-validation to estimate how well the tuned
# architecture generalises.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []
epochs_per_fold = []  # epochs actually run in each fold (early stopping may cut training short)

for train_idx, val_idx in kfold.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
    y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Apply SMOTE to the training part of the fold only, so the validation
    # part contains real samples exclusively.
    smote = SMOTE(random_state=42)
    X_train_fold_balanced, y_train_fold_balanced = smote.fit_resample(X_train_fold, y_train_fold)

    # Calculate class weights for each fold. np.asarray replaces
    # Series.ravel(), which is deprecated in recent pandas releases.
    # NOTE(review): after SMOTE the classes should already be equal in size,
    # so these weights are expected to be ~1.0 — confirm whether both are wanted.
    y_fold_labels = np.asarray(y_train_fold_balanced)
    class_weights_fold = compute_class_weight('balanced', classes=np.unique(y_fold_labels), y=y_fold_labels)
    class_weight_dict_fold = dict(enumerate(class_weights_fold))

    # Build a FRESH model for every fold. Reusing one model instance would
    # carry weights from earlier folds — which were trained on rows that are
    # now in the validation split — and inflate the reported accuracy.
    fold_model = build_model(best_hps)
    history = fold_model.fit(X_train_fold_balanced, y_train_fold_balanced,
                             epochs=100,
                             batch_size=128,  # Increased batch size for stability
                             validation_data=(X_val_fold, y_val_fold),
                             callbacks=[early_stopping],
                             class_weight=class_weight_dict_fold,
                             verbose=0)  # Suppress verbose output
    epochs_per_fold.append(len(history.history['loss']))

    # Evaluate this fold's model on its validation split
    y_val_pred_prob = fold_model.predict(X_val_fold)
    y_val_pred_binary = (y_val_pred_prob > 0.5).astype(int)
    val_accuracy = accuracy_score(y_val_fold, y_val_pred_binary)
    accuracies.append(val_accuracy)

# Calculate the mean accuracy over all folds
mean_accuracy = np.mean(accuracies)
print(f"Mean Validation Accuracy: {mean_accuracy * 100:.2f}%")

# Retrain the best model on the full (SMOTE-balanced) training set
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# No validation split is left at this point, so early stopping cannot be
# used; train for the median number of epochs the folds needed instead.
final_epochs = max(1, int(np.median(epochs_per_fold)))
best_model.fit(X_train_balanced, y_train_balanced,
               epochs=final_epochs,
               batch_size=128,
               verbose=0)

# Class weights on the original (unbalanced) training labels
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(enumerate(class_weights))

# Configure the XGBoost classifier. scale_pos_weight is the ratio of the
# class-1 weight to the class-0 weight, to compensate for class imbalance.
xgb_settings = {
    'n_estimators': 100,
    'max_depth': 5,
    'learning_rate': 0.1,
    'scale_pos_weight': class_weights[1] / class_weights[0],
}
model = XGBClassifier(**xgb_settings)

# Fit on the (unresampled) training split
model.fit(X_train, y_train)

# Score the classifier on the held-out test split
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"XGBoost Test Set Accuracy: {test_accuracy * 100:.2f}%")

# Per-class precision / recall / F1 on the test split
print("Classification Report for XGBoost Model:")
xgb_report = classification_report(y_test, y_test_pred, target_names=['Failed', 'Success'])
print(xgb_report)


# Making Predictions on New Data
# ------------------------------
# Load new data
new_data = pd.read_csv("naked_puts_results.csv")

# Apply the same median imputation as for the training data. The result is
# assigned back to the column: fillna(inplace=True) on new_data[col] is a
# chained assignment that pandas warns about and that does not update
# `new_data` when Copy-on-Write is enabled.
for col in new_data.select_dtypes(include=['float64']).columns:
    new_data[col] = new_data[col].fillna(new_data[col].median())

# Select the training features and scale them with the already-fitted scaler
new_X = new_data[selected_columns]
new_X_scaled = scaler.transform(new_X)

# Make predictions with the XGBoost classifier
new_predictions = model.predict(new_X_scaled)

# XGBClassifier.predict already returns class labels rather than
# probabilities, so this comparison only normalises them to an int array.
new_predictions_binary = (new_predictions > 0.5).astype(int)

# Convert the encoded labels back to the original labels using the label encoder
new_predictions_labels = label_encoder.inverse_transform(new_predictions_binary)

# Add predictions to the new data
new_data["Guess"] = new_predictions_labels

# Save the new data with predictions
new_data.to_csv("Data/new_data_with_predictions.csv", index=False)

# Save the best (Keras) model.
# NOTE(review): the predictions written above come from the XGBoost `model`,
# not from `best_model` — confirm which model is meant to be persisted.
best_model.save("put_option_best_model.h5")

In [1]:
# Load the labelled put-option dataset
data = pd.read_csv("Data/putsDataSuccessFailed.csv")

# Preprocessing Steps
# -------------------
# 1. Fill missing values with the column median, float64 columns only.
#    The result is assigned back to the column: calling
#    data[col].fillna(..., inplace=True) is a chained assignment that pandas
#    warns about and that does not update `data` under Copy-on-Write.
for col in data.select_dtypes(include=['float64']).columns:
    data[col] = data[col].fillna(data[col].median())

# 2. Label Encode 'Status' column
label_encoder = LabelEncoder()
data['Status'] = label_encoder.fit_transform(data['Status'])

# Save the label encoder to a file
label_encoder_filename = "label_encoder.joblib"
joblib.dump(label_encoder, label_encoder_filename)

# 3. Feature selection
selected_columns = [
    'strike', 'volume', 'openInterest', 'Implied Volatility',
    'Original Stock Price', 'Original ROI (%)', 'Original OTM (%)',
    'Original Implied Volatility', '50-day MA', '100-day MA', '200-day MA',
    'RSI', 'MACD', 'VWAP', 'Delta', 'Gamma', 'Theta', 'Vega', 'Rho',
    '52WeekHigh', '52WeekLow', 'targetHighPrice', 'targetLowPrice',
    'targetMeanPrice', 'targetMedianPrice', 'recommendationMean',
]
X = data[selected_columns]
y = data['Status']

# Split the data BEFORE scaling so the held-out rows do not influence the
# scaler's statistics. The split is stratified on the label so both parts
# keep the same class proportions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# 4. Feature Scaling: fit on the training rows only, then apply everywhere.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for any later code that expects the fully scaled feature matrix.
X_scaled = scaler.transform(X)

# Apply SMOTE to the training data only
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Initialize the model
random_forest_model = RandomForestClassifier(random_state=42)

# Hyperparameter grid for the exhaustive search.
rf_param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 8],
    # 'auto' is intentionally omitted: for classifiers it was an alias of
    # 'sqrt', was deprecated in scikit-learn 1.1 and is rejected from 1.3 on,
    # so every candidate using it would either duplicate 'sqrt' or fail to fit.
    'max_features': ['sqrt', 'log2', None, 0.25, 0.5, 0.75],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}

rf_grid_search = GridSearchCV(estimator=random_forest_model, param_grid=rf_param_grid,
                              cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

rf_grid_search.fit(X_train_balanced, y_train_balanced)

# Get the best parameters and the corresponding model. GridSearchCV (with its
# default refit=True) has already refitted the best candidate on the whole
# balanced training set using the same random_state, so that estimator is
# reused instead of being trained a second time.
best_params = rf_grid_search.best_params_
random_forest_model = rf_grid_search.best_estimator_

# Save the trained model and scaler to separate files
model_filename = "random_forest_model.joblib"
scaler_filename = "scaler.joblib"

joblib.dump(random_forest_model, model_filename)
joblib.dump(scaler, scaler_filename)

print(f"Random Forest model saved to {model_filename}")
print(f"Scaler saved to {scaler_filename}")

# Evaluate the model on the held-out test split
y_pred = random_forest_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy with Best Parameters: {accuracy * 100:.2f}%")

# Making Predictions on New Data
# ------------------------------
# Load new data
new_data = pd.read_csv("naked_puts_results.csv")

# Apply the same median imputation as for the training data. The result is
# assigned back to the column: fillna(inplace=True) on new_data[col] is a
# chained assignment that pandas warns about and that does not update
# `new_data` when Copy-on-Write is enabled.
for col in new_data.select_dtypes(include=['float64']).columns:
    new_data[col] = new_data[col].fillna(new_data[col].median())

# Select the training features and scale them with the already-fitted scaler
new_X = new_data[selected_columns]
new_X_scaled = scaler.transform(new_X)

# Make predictions (encoded class labels)
new_predictions = random_forest_model.predict(new_X_scaled)

# Convert numerical predictions back to labels ("Success" or "Failed")
new_predictions_labels = label_encoder.inverse_transform(new_predictions)

# Add predictions to the new data
new_data["Guess"] = new_predictions_labels

# Save the new data with predictions
new_data.to_csv("Data/new_data_with_predictions.csv", index=False)