In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

In [2]:
# Load the dataset
df = pd.read_csv("advertising_ef.csv")

In [3]:
# Drop rows with missing values
df_cleaned = df.dropna()

In [4]:
# Encode categorical features
label_enc = LabelEncoder()
df_cleaned['Gender'] = label_enc.fit_transform(df_cleaned['Gender'])
df_cleaned['Country'] = label_enc.fit_transform(df_cleaned['Country'])
df_cleaned['City'] = label_enc.fit_transform(df_cleaned['City'])

In [5]:
df

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,City,Gender,Country,Clicked on Ad
0,68.95,35.0,61833.90,256.09,Wrightburgh,Female,Tunisia,0
1,,31.0,68441.85,193.77,West Jodi,Male,Nauru,0
2,69.47,26.0,59785.94,236.50,Davidton,Female,San Marino,0
3,74.15,29.0,54806.18,245.89,West Terrifurt,Male,Italy,0
4,68.37,35.0,73889.99,225.58,South Manuel,Female,Iceland,0
...,...,...,...,...,...,...,...,...
1004,72.97,30.0,71384.57,208.58,Duffystad,Male,Lebanon,1
1005,51.30,45.0,67782.17,134.42,New Darlene,Male,Bosnia and Herzegovina,1
1006,51.63,51.0,42415.72,120.37,South Jessica,Male,Mongolia,1
1007,55.55,19.0,41920.79,187.95,West Steven,Female,Guatemala,0


In [6]:

# Define features (X) and target (y)
X = df_cleaned.drop(columns=['Clicked on Ad'])
y = df_cleaned['Clicked on Ad']


In [7]:
# Standardize numerical features
scaler = StandardScaler()
X[['Daily Time Spent on Site', 'Age', 'Area Income', 'Daily Internet Usage']] = scaler.fit_transform(
    X[['Daily Time Spent on Site', 'Age', 'Area Income', 'Daily Internet Usage']]
)
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [8]:
# Gaussian Naïve Bayes (for numerical features)
gnb = GaussianNB()
X_train_gnb = X_train[['Daily Time Spent on Site', 'Age', 'Area Income', 'Daily Internet Usage']]
X_test_gnb = X_test[['Daily Time Spent on Site', 'Age', 'Area Income', 'Daily Internet Usage']]
gnb.fit(X_train_gnb, y_train)
probs_gnb = gnb.predict_proba(X_test_gnb)

In [9]:
# Multinomial Naïve Bayes (for categorical features)
mnb = MultinomialNB()
X_train_mnb = X_train[['City', 'Country']]
X_test_mnb = X_test[['City', 'Country']]
mnb.fit(X_train_mnb, y_train)
probs_mnb = mnb.predict_proba(X_test_mnb)

In [10]:
# Bernoulli Naïve Bayes (for binary feature: Gender)
bnb = BernoulliNB()
X_train_bnb = X_train[['Gender']]
X_test_bnb = X_test[['Gender']]
bnb.fit(X_train_bnb, y_train)
probs_bnb = bnb.predict_proba(X_test_bnb)

In [11]:
# Ensemble Using Probability Multiplication
ensemble_probs = probs_gnb * probs_mnb * probs_bnb  # Multiply probabilities
ensemble_probs = ensemble_probs / ensemble_probs.sum(axis=1, keepdims=True)  # Normalize

# Final predictions
final_predictions = ensemble_probs[:, 1] >= 0.5  # Convert probabilities to binary (0 or 1)

# Model accuracies
accuracy_gnb = accuracy_score(y_test, gnb.predict(X_test_gnb))
accuracy_mnb = accuracy_score(y_test, mnb.predict(X_test_mnb))
accuracy_bnb = accuracy_score(y_test, bnb.predict(X_test_bnb))
ensemble_accuracy = accuracy_score(y_test, final_predictions)

# Print results
print("Gaussian Naïve Bayes Accuracy:", accuracy_gnb)
print("Multinomial Naïve Bayes Accuracy:", accuracy_mnb)
print("Bernoulli Naïve Bayes Accuracy:", accuracy_bnb)
print("Ensemble Model Accuracy:", ensemble_accuracy)

Gaussian Naïve Bayes Accuracy: 0.9477351916376306
Multinomial Naïve Bayes Accuracy: 0.49477351916376305
Bernoulli Naïve Bayes Accuracy: 0.5017421602787456
Ensemble Model Accuracy: 0.9547038327526133


In [12]:
# Convert probabilities to DataFrame for better understanding in the form of table
prob_df = pd.DataFrame({
    'Actual Target': y_test.values,
    'GNB - P(No)': probs_gnb[:, 0], 'GNB - P(Yes)': probs_gnb[:, 1],
    'MNB - P(No)': probs_mnb[:, 0], 'MNB - P(Yes)': probs_mnb[:, 1],
    'BNB - P(No)': probs_bnb[:, 0], 'BNB - P(Yes)': probs_bnb[:, 1],
    'Ensemble - P(No)': ensemble_probs[:, 0], 'Ensemble - P(Yes)': ensemble_probs[:, 1],
    'Final Prediction': final_predictions
})

# Map 0 -> "No", 1 -> "Yes" for better readability
prob_df['Actual Target'] = prob_df['Actual Target'].map({0: "No", 1: "Yes"})
prob_df['Final Prediction'] = prob_df['Final Prediction'].map({0: "No", 1: "Yes"})

# Display DataFrame in tabular format with rounded values
prob_df = prob_df.round(6)

prob_df.head()

Unnamed: 0,Actual Target,GNB - P(No),GNB - P(Yes),MNB - P(No),MNB - P(Yes),BNB - P(No),BNB - P(Yes),Ensemble - P(No),Ensemble - P(Yes),Final Prediction
0,No,0.999381,0.000619,0.941175,0.058825,0.468887,0.531113,0.999956,4.4e-05,
1,Yes,2e-06,0.999998,0.064263,0.935737,0.468887,0.531113,0.0,1.0,
2,No,0.999426,0.000574,0.558725,0.441275,0.468887,0.531113,0.999486,0.000514,
3,No,0.999869,0.000131,0.086554,0.913446,0.468887,0.531113,0.998436,0.001564,
4,Yes,8e-06,0.999992,0.652755,0.347245,0.468887,0.531113,1.3e-05,0.999987,
