## StackingCVClassifier

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from sklearn.metrics import roc_curve, auc , f1_score
from sklearn.metrics import confusion_matrix
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from mlxtend.classifier import StackingCVClassifier
import random

In [2]:
import pandas as pd

# Load the training dataset
Final_Dataset_Provider_Train = pd.read_csv('Final_Train.csv')
print("Train columns before dropping:", Final_Dataset_Provider_Train.columns)

# Drop columns if they exist
Final_Dataset_Provider_Train = Final_Dataset_Provider_Train.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')
print("Train columns after dropping:", Final_Dataset_Provider_Train.columns)

# Load the testing dataset
Final_Dataset_Provider_Test = pd.read_csv('Final_Test.csv')
print("Test columns before dropping:", Final_Dataset_Provider_Test.columns)

# Drop columns if they exist
Final_Dataset_Provider_Test = Final_Dataset_Provider_Test.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')
print("Test columns after dropping:", Final_Dataset_Provider_Test.columns)


Train columns before dropping: Index(['Unnamed: 0', 'Provider', 'PotentialFraud', 'InscClaimAmtReimbursed',
       'DeductibleAmtPaid', 'Hospitalization_Duration', 'Claim_Period',
       'ExtraClaimDays', 'Inpatient_or_Outpatient', 'RenalDiseaseIndicator',
       ...
       'PerProviderClmDiagnosisCode_4_count_ClaimID',
       'PerProviderClmDiagnosisCode_5_count_ClaimID',
       'PerProviderClmDiagnosisCode_6_count_ClaimID',
       'PerProviderDiagnosisGroupCode_count_ClaimID', 'Gender_0', 'Gender_1',
       'Race_1', 'Race_2', 'Race_3', 'Race_5'],
      dtype='object', length=248)
Train columns after dropping: Index(['Provider', 'PotentialFraud', 'InscClaimAmtReimbursed',
       'DeductibleAmtPaid', 'Hospitalization_Duration', 'Claim_Period',
       'ExtraClaimDays', 'Inpatient_or_Outpatient', 'RenalDiseaseIndicator',
       'NoOfMonths_PartACov',
       ...
       'PerProviderClmDiagnosisCode_4_count_ClaimID',
       'PerProviderClmDiagnosisCode_5_count_ClaimID',
       'PerProvider

In [3]:
Final_Dataset_Provider_Train.head(2)

Unnamed: 0,Provider,PotentialFraud,InscClaimAmtReimbursed,DeductibleAmtPaid,Hospitalization_Duration,Claim_Period,ExtraClaimDays,Inpatient_or_Outpatient,RenalDiseaseIndicator,NoOfMonths_PartACov,...,PerProviderClmDiagnosisCode_4_count_ClaimID,PerProviderClmDiagnosisCode_5_count_ClaimID,PerProviderClmDiagnosisCode_6_count_ClaimID,PerProviderDiagnosisGroupCode_count_ClaimID,Gender_0,Gender_1,Race_1,Race_2,Race_3,Race_5
0,PRV51001,0,104640,5340.0,30.0,61,0.0,5,8,300,...,297,297,297,405,16,9,21,4,0,0
1,PRV51003,1,605670,66286.0,382.0,617,0.0,62,29,1560,...,3046,4452,5556,4972,78,54,107,24,0,1


In [4]:
Final_Dataset_Provider_Train.shape, Final_Dataset_Provider_Test.shape

((5410, 247), (1353, 246))

#### Split your whole data into train and test(80-20)

In [5]:
# Split your whole data into train and test(80-20)
from sklearn.model_selection import train_test_split
Final_Dataset_Provider_Tr, Final_Dataset_Provider_val = train_test_split(Final_Dataset_Provider_Train, test_size=0.2,
                               random_state=42,stratify=Final_Dataset_Provider_Train['PotentialFraud'])
Final_Dataset_Provider_Tr.shape, Final_Dataset_Provider_val.shape

((4328, 247), (1082, 247))

In [6]:
# Seperate dependent and independent variables
x_train = Final_Dataset_Provider_Tr.drop(axis=1,columns=['Provider','PotentialFraud'])
y_train = Final_Dataset_Provider_Tr['PotentialFraud']
x_train.shape, y_train.shape

((4328, 245), (4328,))

In [7]:
# Seperate dependent and independent variables
x_validation_provider_labels = Final_Dataset_Provider_val[['Provider','PotentialFraud']]
x_validation = Final_Dataset_Provider_val.drop(columns=['Provider','PotentialFraud'],axis=1)
y_validation = Final_Dataset_Provider_val['PotentialFraud']
x_validation.shape, y_validation.shape

((1082, 245), (1082,))

In [8]:
# prepare test data
x_test = Final_Dataset_Provider_Test.drop(axis=1,columns=['Provider'])

In [9]:
from sklearn.preprocessing import StandardScaler
# Standardize the data (train and test)
standard_scaler = StandardScaler()
standard_scaler.fit(x_train)
x_tr = standard_scaler.transform(x_train)
x_val = standard_scaler.transform(x_validation)
x_test_std = standard_scaler.transform(x_test)

# convert pandas.core.series.Series to numpy.ndarray
y_tr = y_train.to_numpy()
y_val = y_validation.to_numpy()

In [10]:
x_val.shape, y_val.shape

((1082, 245), (1082,))

In [11]:
from sklearn.metrics import roc_curve, auc , f1_score
import matplotlib.pyplot as plt
def pred_prob(clf, data):
    # predicts the probabability of class label using the model
    y_pred = clf.predict_proba(data)[:,1]
    return y_pred

def draw_roc(train_fpr, train_tpr, test_fpr, test_tpr):
    # calculate auc for train and test
    train_auc = auc(train_fpr, train_tpr)
    test_auc = auc(test_fpr, test_tpr)
    plt.plot(train_fpr, train_tpr, label="Train AUC ="+"{:.4f}".format(train_auc))
    plt.plot(test_fpr, test_tpr, label="Test AUC ="+"{:.4f}".format(test_auc))
    plt.legend()
    plt.xlabel("False Positive Rate(FPR)", size = 14)
    plt.ylabel("True Positive Rate(TPR)", size = 14)
    plt.title("Area Under Curve", size = 16)
    plt.grid(b=True, which='major', color='g', linestyle='-')
    plt.show()
    
def find_best_threshold(threshold, fpr, tpr):
    t = threshold[np.argmax(tpr*(1-fpr))]
#     print("max(tpr*(1-fpr)) = ", max(tpr*(1-fpr)), "for threshold = ", np.round(t,3))
    return t

def predict_with_best_t(proba, threshold):
    predictions = []
    for i in proba:
        if i>=threshold:
            predictions.append(1)
        else:
            predictions.append(0)
    return predictions

In [12]:
from sklearn.metrics import confusion_matrix
def draw_confusion_matrix(best_t, x_train, x_test, y_train, y_test, y_train_pred, y_test_pred):
    # Confusion matrix for train and test dataset
    fig, ax = plt.subplots(1,2, figsize=(20,6))

    train_prediction = predict_with_best_t(y_train_pred, best_t)
    cm = confusion_matrix(y_train, train_prediction)
    sns.heatmap(cm, annot=True, fmt='d', ax=ax[0])
    ax[0].set_title('Train Dataset Confusion Matrix', size = 16)
    ax[0].set_xlabel("Predicted Label", size = 14)
    ax[0].set_ylabel("Actual Label", size = 14)

    test_prediction = predict_with_best_t(y_test_pred, best_t)
    cm = confusion_matrix(y_test, test_prediction)
    sns.heatmap(cm, annot=True, fmt='d', ax=ax[1])
    ax[1].set_title('Test Dataset Confusion Matrix', size = 16)
    ax[1].set_xlabel("Predicted Label", size = 14)
    ax[1].set_ylabel("Actual Label", size = 14)
    # plt.grid()
    plt.show()
    
    return train_prediction, test_prediction

In [13]:
def validate_model(clf, x_train, x_test, y_train, y_test):
    # predict the probability of train data
    y_train_pred = pred_prob(clf, x_train)
    # predict the probability of test data
    y_test_pred = pred_prob(clf, x_test)
    # calculate tpr, fpr for diffeent thresholds using roc_curve
    train_fpr, train_tpr, tr_thresholds = roc_curve(y_train, y_train_pred)
    test_fpr, test_tpr, te_thresholds = roc_curve(y_test, y_test_pred)
    
    # calculate auc for train and test
    train_auc = auc(train_fpr, train_tpr)
    print("Train AUC = ", train_auc)
    test_auc = auc(test_fpr, test_tpr)
    print("Test AUC = ", test_auc)
    
    draw_roc(train_fpr, train_tpr, test_fpr, test_tpr)
    
    best_t = find_best_threshold(tr_thresholds, train_fpr, train_tpr)
    
    train_prediction, test_prediction = draw_confusion_matrix(best_t, x_train, x_test, y_train, y_test, y_train_pred, y_test_pred)
    
    train_f1_score = f1_score(y_train, train_prediction)
    test_f1_score = f1_score(y_test, test_prediction)
    
    return test_auc, test_f1_score, best_t

## Build StackingCVClassifier

In [14]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from mlxtend.classifier import StackingCVClassifier

### Find best parameters using hyperparameter tuning

In [15]:
pip install --upgrade mlxtend






[notice] A new release of pip is available: 23.3.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.datasets import make_classification

# Create an example dataset (replace this with your actual dataset)
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
x_tr, y_tr = X, y  # Assuming x_tr and y_tr are your training data

# Initialize base classifiers (using only sklearn classifiers)
RANDOM_SEED = 42
clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=RANDOM_SEED)
clf3 = GaussianNB()
clf4 = SVC(kernel='linear')
clf5 = LogisticRegression()
clf6 = DecisionTreeClassifier(class_weight='balanced')

# Meta classifier
lr = LogisticRegression()

# StackingClassifier initialization with base classifiers
estimators = [
    ('knn', clf1),
    ('rf', clf2),
    ('gnb', clf3),
    ('svc', clf4),
    ('log_reg', clf5),
    ('dt', clf6)
]

stacking_clf = StackingClassifier(estimators=estimators, final_estimator=lr, cv=5)

# Hyperparameter grid for RandomizedSearchCV
params = {
    'knn__n_neighbors': [5, 10],
    'knn__leaf_size': [20, 40],
    'rf__n_estimators': [200, 400],
    'rf__max_depth': [5, 10],
    'rf__min_samples_split': [5, 10],
    'gnb__var_smoothing': [1e-5, 1e-9],
    'svc__C': [0.1, 1],
    'log_reg__C': [0.1, 1],
    'dt__max_depth': [20, 40],
    'dt__min_samples_split': [200, 250],
    'final_estimator__C': [0.1, 1]
}

# StratifiedKFold Cross-validation
cv = StratifiedKFold(n_splits=5)

# Wrapping StackingClassifier inside a RandomizedSearchCV
random_cv = RandomizedSearchCV(
    estimator=stacking_clf,
    param_distributions=params,
    scoring='roc_auc',
    n_jobs=-1,
    return_train_score=True,
    cv=cv,  # Using StratifiedKFold here
    refit=True,
    error_score='raise'  # Raise errors to debug
)

# Fit the RandomizedSearchCV with training data
random_cv.fit(x_tr, y_tr)

# Output the best parameters and model
print("Best parameters:", random_cv.best_params_)
print("Best model:", random_cv.best_estimator_)

# Optionally, you can also access results with:
# random_cv.cv_results_


Best parameters: {'svc__C': 1, 'rf__n_estimators': 400, 'rf__min_samples_split': 5, 'rf__max_depth': 10, 'log_reg__C': 1, 'knn__n_neighbors': 5, 'knn__leaf_size': 20, 'gnb__var_smoothing': 1e-09, 'final_estimator__C': 1, 'dt__min_samples_split': 250, 'dt__max_depth': 20}
Best model: StackingClassifier(cv=5,
                   estimators=[('knn', KNeighborsClassifier(leaf_size=20)),
                               ('rf',
                                RandomForestClassifier(max_depth=10,
                                                       min_samples_split=5,
                                                       n_estimators=400,
                                                       random_state=42)),
                               ('gnb', GaussianNB()),
                               ('svc', SVC(C=1, kernel='linear')),
                               ('log_reg', LogisticRegression(C=1)),
                               ('dt',
                                DecisionTreeClassifier(cl

### Use best parameters to train StackingCVClassifier

In [17]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Define base classifiers
clf1 = KNeighborsClassifier(n_neighbors=10, leaf_size=40)
clf2 = RandomForestClassifier(n_estimators=400, min_samples_split=10, max_depth=10)
clf3 = GaussianNB(var_smoothing=1e-05)
clf4 = SVC(C=0.1, kernel='linear')
clf5 = LogisticRegression(C=0.1)
clf6 = DecisionTreeClassifier(max_depth=40, min_samples_split=250, class_weight='balanced')

# Define final estimator
final_estimator = LogisticRegression(C=1)

# Initialize StackingClassifier
stacking_clf = StackingClassifier(estimators=[
    ('knn', clf1),
    ('rf', clf2),
    ('nb', clf3),
    ('svc', clf4),
    ('lr', clf5),
    ('dt', clf6)
], final_estimator=final_estimator)

# Fit the stacking classifier
stacking_clf.fit(x_tr, y_tr)

# # Predict on test data
# y_pred = stacking_clf.predict(x_test_std)

# # Evaluate accuracy (or any other metrics)
# from sklearn.metrics import accuracy_score
# accuracy = accuracy_score(y_test_std, y_pred)
# print(f'Accuracy: {accuracy}')


In [18]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier  # You can use any classifier

# Example classifier (you can replace it with any classifier)
classifier = RandomForestClassifier()

# Example data (replace with your actual dataset)
# X = features, y = labels
X = np.random.rand(100, 10)  # 100 samples, 10 features
y = np.random.randint(0, 2, 100)  # Binary target variable

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the validate_model function
def validate_model(classifier, X_train, X_val, y_train, y_val):
    # Train the classifier on the training data
    classifier.fit(X_train, y_train)
    
    # Get predicted probabilities for the validation set
    y_probs = classifier.predict_proba(X_val)[:, 1]
    
    # Calculate AUC score
    test_auc = roc_auc_score(y_val, y_probs)
    
    # Calculate Precision-Recall curve and F1 Score
    precision, recall, thresholds = precision_recall_curve(y_val, y_probs)
    
    # Calculate F1 scores for each threshold
    f1_scores = 2 * (precision * recall) / (precision + recall)
    
    # Find the best threshold based on F1 score
    best_t = thresholds[f1_scores.argmax()]
    
    # Calculate the F1 score at the best threshold
    y_pred_best = (y_probs >= best_t).astype(int)
    test_f1_score = f1_score(y_val, y_pred_best)
    
    return test_auc, test_f1_score, best_t

# Call the validate_model function and get results
test_auc, test_f1_score, best_t = validate_model(classifier, X_train, X_val, y_train, y_val)

# Print the results
print("Best Threshold = {:.4f}".format(best_t))
print("Model AUC is : {:.4f}".format(test_auc))
print("Model F1 Score is : {:.4f}".format(test_f1_score))


Best Threshold = 0.5000
Model AUC is : 0.8021
Model F1 Score is : 0.8333


In [19]:

# Function to get predicted probabilities (probabilities for the positive class)
def pred_prob(classifier, X_val):
    return classifier.predict_proba(X_val)[:, 1]

# Function to predict with the best threshold
def predict_with_best_t(y_pred_prob, best_t):
    return (y_pred_prob >= best_t).astype(int)

# Example data and classifier
X = np.random.rand(100, 10)  # Replace with your actual data
y = np.random.randint(0, 2, 100)  # Replace with your actual target

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Example classifier (replace with your actual classifier)
classifier = RandomForestClassifier()

# Train the classifier
classifier.fit(X_train, y_train)

# Validate the model and get the best threshold
test_auc, test_f1_score, best_t = validate_model(classifier, X_train, X_val, y_train, y_val)

# Get predicted probabilities for the validation set
y_val_pred = pred_prob(classifier, X_val)

# Get predictions based on the best threshold
y_val_prediction = predict_with_best_t(y_val_pred, best_t)

# Assuming you have a DataFrame to store the results
import pandas as pd

# Example DataFrame, replace with your actual DataFrame
x_validation_provider_labels = pd.DataFrame(X_val, columns=[f"Feature_{i}" for i in range(1, X_val.shape[1] + 1)])

# Add predicted labels to the DataFrame
x_validation_provider_labels['Predicted_Label'] = y_val_prediction

# Reset index if needed
x_validation_provider_labels = x_validation_provider_labels.reset_index(drop=True)

# Output the result
print(x_validation_provider_labels.head())


   Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  Feature_6  \
0   0.540878   0.599052   0.511086   0.489511   0.474792   0.200858   
1   0.439993   0.865022   0.030926   0.190746   0.166529   0.476428   
2   0.700840   0.464298   0.697205   0.567443   0.904723   0.722051   
3   0.410866   0.926026   0.516775   0.487900   0.554579   0.074421   
4   0.777505   0.886357   0.492454   0.930671   0.167379   0.410005   

   Feature_7  Feature_8  Feature_9  Feature_10  Predicted_Label  
0   0.237598   0.492746   0.512410    0.504521                1  
1   0.824067   0.120861   0.450666    0.519638                1  
2   0.864470   0.229974   0.775099    0.470605                1  
3   0.616081   0.383272   0.288337    0.897699                1  
4   0.803970   0.609554   0.753018    0.515502                1  


In [20]:
x_validation_provider_labels.head(10)

Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,Predicted_Label
0,0.540878,0.599052,0.511086,0.489511,0.474792,0.200858,0.237598,0.492746,0.51241,0.504521,1
1,0.439993,0.865022,0.030926,0.190746,0.166529,0.476428,0.824067,0.120861,0.450666,0.519638,1
2,0.70084,0.464298,0.697205,0.567443,0.904723,0.722051,0.86447,0.229974,0.775099,0.470605,1
3,0.410866,0.926026,0.516775,0.4879,0.554579,0.074421,0.616081,0.383272,0.288337,0.897699,1
4,0.777505,0.886357,0.492454,0.930671,0.167379,0.410005,0.80397,0.609554,0.753018,0.515502,1
5,0.250657,0.003734,0.721635,0.238956,0.51252,0.635442,0.651917,0.20494,0.544261,0.6675,1
6,0.345893,0.327521,0.618445,0.678236,0.295261,0.379861,0.582444,0.474327,0.443452,0.093462,1
7,0.15148,0.999976,0.947731,0.888003,0.717475,0.530054,0.089239,0.102027,0.599992,0.704202,1
8,0.360362,0.558732,0.006318,0.735428,0.535645,0.182571,0.286119,0.356301,0.551898,0.10361,1
9,0.500759,0.610452,0.11679,0.160275,0.340378,0.178239,0.810017,0.314559,0.5583,0.74651,1


In [21]:
x_validation_provider_labels.tail(10)

Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,Predicted_Label
10,0.677534,0.892016,0.735542,0.949316,0.16284,0.402519,0.804139,0.785566,0.850846,0.217984,1
11,0.159785,0.770467,0.210313,0.337344,0.195532,0.655915,0.806374,0.008629,0.607257,0.094312,1
12,0.972026,0.876887,0.351172,0.611964,0.031296,0.031457,0.197321,0.95931,0.22389,0.997566,1
13,0.203123,0.082167,0.989045,0.399997,0.22314,0.189154,0.441214,0.544276,0.992514,0.038109,1
14,0.005873,0.412824,0.902082,0.628674,0.789815,0.381954,0.091975,0.979612,0.775869,0.623348,1
15,0.470107,0.892027,0.204918,0.073637,0.757239,0.600166,0.813053,0.962134,0.925227,0.559478,1
16,0.475559,0.227839,0.506784,0.863036,0.125967,0.194856,0.805002,0.452358,0.061799,0.6235,1
17,0.26953,0.782542,0.84516,0.833772,0.651446,0.3997,0.417102,0.400965,0.995448,0.7263,1
18,0.990729,0.554464,0.380074,0.359838,0.525872,0.603651,0.935386,0.91712,0.441179,0.66165,1
19,0.080707,0.187415,0.43411,0.707378,0.344294,0.685027,0.231784,0.463097,0.950662,0.862757,1
