In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, Binarizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, f1_score
from sklearn.impute import SimpleImputer
from skopt import BayesSearchCV
from skopt.space import Real, Integer

# Load the training and test data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')


In [2]:
# Custom function to convert currency strings to numerical values
def currency_to_num(currency_series):
    if pd.api.types.is_numeric_dtype(currency_series):
        return currency_series
    return pd.to_numeric(currency_series.str.replace(r'[^\d.]', '', regex=True), errors='coerce')

# Apply the currency conversion function to the 'Total Assets' and 'Liabilities' columns
for df in [train_df, test_df]:
    df['Total Assets'] = currency_to_num(df['Total Assets'])
    df['Liabilities'] = currency_to_num(df['Liabilities'])
    # Create a new feature 'Assets_to_Liabilities' by dividing 'Total Assets' by 'Liabilities'
    df['Assets_to_Liabilities'] = df['Total Assets'] / np.where(df['Liabilities'] == 0, 0.01, df['Liabilities'])

# Use LabelEncoder to convert 'Education' levels into numerical labels
label_encoder = LabelEncoder()
train_df['Education'] = label_encoder.fit_transform(train_df['Education'])


In [3]:
# Use SimpleImputer to fill missing values in numerical columns with the mean value
imputer = SimpleImputer(strategy='mean')
train_df[numeric_columns] = imputer.fit_transform(train_df[numeric_columns])

# Perform one-hot encoding on the dataset to convert categorical variables into a format that can be provided to ML algorithms
train_df = pd.get_dummies(train_df.drop(['ID', 'Candidate', 'Constituency ∇'], axis=1), drop_first=True)


NameError: name 'numeric_columns' is not defined

In [None]:
# Separate features and the target variable
X = train_df.drop('Education', axis=1)
y = train_df['Education']

# Split the dataset into a training set and a validation set
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [None]:
# Initialize the Bernoulli Naive Bayes classifier
model = BernoulliNB()

# Define the search space for the hyperparameters
search_space = {
    'alpha': Real(0.0001, 10, prior='log-uniform'),  # Smooth parameter
    'binarize': Real(0.0, 1.0)                       # Threshold for binarizing the input features
}

# Set up the Bayesian optimization with cross-validation for hyperparameter tuning
opt = BayesSearchCV(model, search_space, n_iter=32, scoring='f1_weighted', cv=StratifiedKFold(5), n_jobs=-1, random_state=42)

# Fit the optimizer to the training data
opt.fit(X_train, y_train)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Assume 'feature_importances_' is available for your model
feature_importances = opt.best_estimator_.feature_importances_
features = X_train.columns

# Create a bar plot of feature importances
sns.barplot(x=feature_importances, y=features)
plt.title('Feature Importances')
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Calculate the confusion matrix
cm = confusion_matrix(y_val, y_pred)
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
