In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

# Load the CSV file into a DataFrame
df = pd.read_csv('Data/putsDataSuccessFailed.csv')

# Data Preprocessing
# Drop any columns that are not relevant for prediction
df.drop(['contractSymbol', 'Expiration Date', 'ETF'], axis=1, inplace=True)

# Encode categorical variables using one-hot encoding
df = pd.get_dummies(df, columns=['recommendationKey'], drop_first=True)

# Handle missing values with imputation
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
X = df.drop('Status', axis=1)
y = df['Status']

# Encode the target variable 'Status'
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_imputed = imputer.fit_transform(X)

# Address class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_imputed, y_encoded)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Model Selection and Hyperparameter Tuning (XGBoost)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.05, 0.1, 0.2]
}

# Use GridSearchCV for hyperparameter tuning
xgb_classifier = XGBClassifier(random_state=42)
grid_search = GridSearchCV(xgb_classifier, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_xgb_model = grid_search.best_estimator_

# Model Evaluation
y_pred = best_xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Best XGBoost Model: {best_xgb_model}')
print(f'Accuracy: {accuracy}')
print(report)

# You can also save the trained model for future use if needed
# from joblib import dump
# dump(best_xgb_model, 'trained_xgb_model.joblib')


Best XGBoost Model: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.2, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=200, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, ...)
Accuracy: 0.7719298245614035
              precision    recall  f1-score   support

           0       0.76      0.81      0.78        58
           1       0.79      0.73      0.76        56

    accuracy                           0.

In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

# Load the CSV file into a DataFrame
df = pd.read_csv('Data/putsDataSuccessFailed.csv')

# Data Preprocessing
# Drop any columns that are not relevant for prediction
df.drop(['contractSymbol', 'Expiration Date', 'ETF'], axis=1, inplace=True)

# Encode categorical variables using one-hot encoding
df = pd.get_dummies(df, columns=['recommendationKey'], drop_first=True)

# Handle missing values with imputation
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
X = df.drop('Status', axis=1)
y = df['Status']

# Encode the target variable 'Status'
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_imputed = imputer.fit_transform(X)

# Address class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_imputed, y_encoded)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Model Selection and Hyperparameter Tuning (LightGBM)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.05, 0.1, 0.2]
}

# Use GridSearchCV for hyperparameter tuning
lgb_classifier = lgb.LGBMClassifier(random_state=42)
grid_search = GridSearchCV(lgb_classifier, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_lgb_model = grid_search.best_estimator_

# Model Evaluation
y_pred = best_lgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Best LightGBM Model: {best_lgb_model}')
print(f'Accuracy: {accuracy}')
print(report)

# You can also save the trained model for future use if needed
# best_lgb_model.save_model('trained_lgb_model.txt')


[LightGBM] [Info] Number of positive: 206, number of negative: 204[LightGBM] [Info] Number of positive: 206, number of negative: 205

[LightGBM] [Info] Number of positive: 206, number of negative: 204
[LightGBM] [Info] Number of positive: 206, number of negative: 204
[LightGBM] [Info] Number of positive: 206, number of negative: 204
[LightGBM] [Info] Number of positive: 206, number of negative: 204
[LightGBM] [Info] Number of positive: 206, number of negative: 204
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002662 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002712 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002727 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3105
[LightGBM] 