In [None]:
import shap
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,ConfusionMatrixDisplay
from sklearn.impute import SimpleImputer # Import the SimpleImputer class from the correct module
from sklearn.pipeline import Pipeline  # Import Pipeline for creating the pipeline

In [None]:
# Candidate classifiers as (short label, unfitted estimator) pairs.
# `multi_class` is no longer passed to LogisticRegression: 'auto' was already
# the default, and recent scikit-learn releases deprecate the argument and warn
# whenever it is supplied explicitly.
models = [
    ('LR', LogisticRegression(solver='lbfgs')),
    ('KNN', KNeighborsClassifier()),
    ('NB', GaussianNB()),
    ('SVC', SVC(gamma='scale')),
    ('RFC', RandomForestClassifier(n_estimators=100)),
    ('DTR', DecisionTreeClassifier()),
    ('XGB', XGBClassifier()),
]

In [None]:
# Two independent, initially empty accumulators: one for per-model results and
# one for the matching model names.
results, names = [], []

In [None]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np

# --- Model comparison with 3-fold cross-validation ---------------------------
# Reads `scaled_X_train_resampled_combined` (training features) and
# `y_resampled` (training labels), both defined in earlier cells.

# Encode string labels as integers; skipped when the labels are not object dtype.
if y_resampled.dtype == 'object':
    le = LabelEncoder()
    y_resampled = le.fit_transform(y_resampled)

# Standardise into a NEW array instead of overwriting the input variable.
# Overwriting made this cell non-idempotent and replaced the input with a bare
# NumPy array, discarding any DataFrame column names that a later cell looks for.
scaler = StandardScaler()
X_array = np.array(
    scaler.fit_transform(scaled_X_train_resampled_combined),
    dtype=np.float32,  # float32 kept for XGBoost compatibility
)
y_array = np.array(y_resampled, dtype=np.int32)  # consistent integer labels

# Seed shared by the fold splitter and the stochastic models so that the
# printed scores are reproducible across kernel restarts.
CV_SEED = 42

# Models to compare (this rebinds `models`, replacing any earlier list).
# NOTE(review): `use_label_encoder` is reported as deprecated by newer XGBoost
# releases - confirm against the installed version before removing it.
models = [
    ('Logistic Regression', LogisticRegression(max_iter=500)),
    ('Random Forest', RandomForestClassifier(
        n_jobs=-1,
        class_weight={0: 1, 1: 1.5},  # up-weight class 1 (original note: "boost recall for manual")
        random_state=CV_SEED,
    )),
    ('XGB', XGBClassifier(
        eval_metric='logloss',
        n_jobs=-1,
        use_label_encoder=False,
        random_state=CV_SEED,
    )),
]

kfold = KFold(n_splits=3, shuffle=True, random_state=CV_SEED)

for name, model in models:
    if name == "XGB":
        # XGB is scored with a hand-written loop rather than cross_val_score.
        # The fold variables are deliberately NOT called X_train/X_test/
        # y_train/y_test/y_pred: at notebook scope those names would silently
        # replace a hold-out split that later cells read.
        fold_scores = []
        for fold_train_idx, fold_val_idx in kfold.split(X_array):
            model.fit(X_array[fold_train_idx], y_array[fold_train_idx])
            fold_pred = model.predict(X_array[fold_val_idx])
            fold_scores.append(accuracy_score(y_array[fold_val_idx], fold_pred))
        fold_scores = np.asarray(fold_scores)
    else:
        fold_scores = cross_val_score(model, X_array, y_array, cv=kfold, scoring='accuracy')

    # Mean accuracy over the three folds.
    print(f"{name}: {fold_scores.mean():.4f}")


### **Hyperparameter Tuning**
Since RandomForestClassifier was the best-performing model in the cross-validation results, we will perform hyperparameter tuning to identify the best hyperparameters for prediction.

In [None]:
# Hyperparameter tuning of the random forest classifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error


In [None]:
# Instantiate a RandomForestClassifier with its default settings.
RF = RandomForestClassifier()

# Show the estimator's parameters (all defaults here) as a reference point
# for choosing the values to tune.
RF.get_params()

In [None]:
# Hyperparameter grid for the random-forest search.
# Each key is a RandomForestClassifier parameter and each list holds the
# candidate values; every combination is evaluated (3*3*2*2*2*2 = 144).
# The 'bootstrap' key used to appear twice in this literal. Python keeps only
# the last value for a repeated key, so the duplicate was redundant and could
# hide a later edit to one of the two entries.
param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False],
    'max_features': ['sqrt', 'log2'],
}

In [None]:
# Base estimator for the search. The variable keeps its historical name
# `rf_regressor`, but it holds a RandomForestClassifier (this is a
# classification task). A fixed random_state makes the selected
# hyperparameters reproducible between runs.
rf_regressor = RandomForestClassifier(random_state=42)

# Exhaustive search over `param_grid` with 3-fold cross-validation.
# NOTE(review): candidates are ranked by precision, whereas the model
# comparison and final evaluation report accuracy - confirm precision is the
# intended selection metric. 'precision' presumably requires a binary target
# with positive label 1; verify against the label encoding.
grid_search = GridSearchCV(
    estimator=rf_regressor,
    param_grid=param_grid,
    cv=3,
    scoring='precision',
    n_jobs=-1,   # use all available cores
    verbose=2,   # per-fit progress output
)

In [None]:
# Run the hyperparameter search on the resampled training data.
grid_search.fit(X=scaled_X_train_resampled_combined, y=y_resampled)

# Keep a handle on the winning estimator exposed by the fitted search.
best_rf = grid_search.best_estimator_

In [None]:
# Display the selected estimator (the notebook renders its repr).
best_rf

In [None]:
# --- Fit the tuned model and predict on the hold-out features ----------------
# Requires `grid_search` to have been fitted already (an earlier cell calls
# grid_search.fit), so that `best_estimator_` is available.

# Encode string labels as integers; skipped when the labels are not object dtype.
if y_resampled.dtype == 'object':
    le = LabelEncoder()
    y_resampled = le.fit_transform(y_resampled)

# Recover the feature names: prefer the training frame, fall back to the test
# frame when the training data has already been converted to a NumPy array.
if isinstance(scaled_X_train_resampled_combined, pd.DataFrame):
    training_feature_names = list(scaled_X_train_resampled_combined.columns)
elif isinstance(scaled_X_test_resampled_combined, pd.DataFrame):
    training_feature_names = list(scaled_X_test_resampled_combined.columns)
else:
    raise ValueError("Feature names are lost! Use the original DataFrame before conversion.")

# Work on plain arrays from here on.
scaled_X_train_resampled_combined = np.array(scaled_X_train_resampled_combined)
y_array = np.array(y_resampled)

# Fit the scaler on the training features only, then apply the same transform
# to the test features. The test data is passed as an array because the scaler
# was fitted on an array; handing it a DataFrame triggers scikit-learn's
# "X has feature names" warning.
scaler = StandardScaler()
scaled_X_train_resampled_combined = scaler.fit_transform(scaled_X_train_resampled_combined)
scaled_X_test_resampled_combined = pd.DataFrame(
    scaler.transform(np.asarray(scaled_X_test_resampled_combined)),
    columns=training_feature_names,
)

# Refit only the already-selected best estimator on the scaled training data.
# The previous version called grid_search.fit again, repeating the entire
# hyperparameter search just to obtain one fitted model.
best_rf = grid_search.best_estimator_.fit(scaled_X_train_resampled_combined, y_array)

# Predict from the array form so the input matches how the model was fitted
# (no feature names), avoiding the feature-name mismatch warning.
y_pred = best_rf.predict(scaled_X_test_resampled_combined.to_numpy())


In [None]:
# Predict on the hold-out set.
# The cross-validation cell assigns X_test / y_test inside its K-fold loop, so
# by this point those names refer to the last CV fold of the TRAINING data.
# Predicting on that fold would report an inflated "test" score. Re-point both
# names at the hold-out data so the evaluation cells below score the right set.
# NOTE(review): `scaled_y1_resampled_df` is taken to be the labels matching
# `scaled_X_test_resampled_combined` because the next cell compares it against
# y_pred - confirm, and confirm it uses the same label encoding as training.
X_test = np.asarray(scaled_X_test_resampled_combined)
y_test = np.asarray(scaled_y1_resampled_df).ravel()
assert len(X_test) == len(y_test), (
    f"hold-out features ({len(X_test)}) and labels ({len(y_test)}) differ in length"
)

y_pred = best_rf.predict(X_test)

In [None]:
# Score the predictions against `scaled_y1_resampled_df`, but only when both
# have the same number of rows; otherwise report the two lengths.
if len(scaled_y1_resampled_df) != len(y_pred):
    print(f" Length mismatch: scaled_y1_resampled_df={len(scaled_y1_resampled_df)}, y_pred={len(y_pred)}")
else:
    accuracy = accuracy_score(scaled_y1_resampled_df, y_pred)
    print(f"Accuracy of the best model on the test dataset: {accuracy:.4f}")


In [None]:
# Accuracy of the current predictions `y_pred` measured against `y_test`.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy of the best model on the test dataset: {accuracy:.4f}")

In [None]:
# Overall accuracy of `y_pred` against `y_test`.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy of the best model on the test dataset: {accuracy:.4f}")

# Per-class precision/recall/F1 summary.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

# Confusion matrix of true vs. predicted labels; `cm` is reused by the plot cell.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:")
print(cm)


In [None]:
# Plot the confusion matrix `cm` computed in the previous cell.
# An explicit figure/axes pair is used so the plot can carry a title and still
# makes sense when the notebook is skimmed.
fig, ax = plt.subplots()
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(ax=ax)
ax.set_title("Confusion matrix - tuned random forest")
plt.show()