In [9]:
# Define the data
data = [
    ('<=30', 'high', 'no', 'fair', 'no'),
    ('<=30', 'high', 'no', 'excellent', 'no'),
    ('31...40', 'high', 'no', 'fair', 'yes'),
    ('>40', 'medium', 'no', 'fair', 'yes'),
    ('>40', 'low', 'yes', 'fair', 'yes'),
    ('>40', 'low', 'yes', 'excellent', 'no'),
    ('31...40', 'low', 'yes', 'excellent', 'yes'),
    ('<=30', 'medium', 'no', 'fair', 'no'),
    ('<=30', 'low', 'yes', 'fair', 'yes'),
    ('>40', 'medium', 'yes', 'fair', 'yes'),
    ('<=30', 'medium', 'yes', 'excellent', 'yes'),
    ('31...40', 'medium', 'no', 'excellent', 'yes'),
    ('31...40', 'high', 'yes', 'fair', 'yes'),
    ('>40', 'medium', 'no', 'excellent', 'no')
]

# Count the occurrences of each class
class_counts = {}
total_instances = len(data)

for instance in data:
    class_label = instance[-1]  # Assuming the class label is in the last column
    class_counts[class_label] = class_counts.get(class_label, 0) + 1

# Calculate prior probabilities
prior_probabilities = {}

for class_label, count in class_counts.items():
    prior_probabilities[class_label] = count / total_instances

# Print the prior probabilities
for class_label, probability in prior_probabilities.items():
    print(f'Prior Probability for Class "{class_label}": {probability:.2f}')


Prior Probability for Class "no": 0.36
Prior Probability for Class "yes": 0.64


In [13]:
from collections import defaultdict

# Assuming the class label is in the last column
class_labels = set(instance[-1] for instance in data)

# Calculate the class conditional probabilities for each categorical feature
class_conditional_probabilities = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

for feature_index in range(len(data[0]) - 1):  # Exclude the last column (class label)
    for class_label in class_labels:
        # Count the occurrences of each category within the specific class
        category_counts = defaultdict(int)
        total_instances_in_class = 0

        for instance in data:
            if instance[-1] == class_label:
                category_counts[instance[feature_index]] += 1
                total_instances_in_class += 1

        # Calculate the class conditional probabilities for each category
        for category, count in category_counts.items():
            probability = count / total_instances_in_class
            class_conditional_probabilities[class_label][feature_index][category] = probability

            # Check if any probability values are zero
            if probability == 0:
                print(f'Zero probability found for Feature {feature_index + 1}, Class "{class_label}", Category "{category}"')

# Print the class conditional probabilities
for class_label, feature_probs in class_conditional_probabilities.items():
    for feature_index, category_probs in feature_probs.items():
        print(f'Class "{class_label}", Feature {feature_index + 1} probabilities:')
        for category, probability in category_probs.items():
            print(f'  Category "{category}": {probability:.2f}')


Class "no", Feature 1 probabilities:
  Category "<=30": 0.60
  Category ">40": 0.40
Class "no", Feature 2 probabilities:
  Category "high": 0.40
  Category "low": 0.20
  Category "medium": 0.40
Class "no", Feature 3 probabilities:
  Category "no": 0.80
  Category "yes": 0.20
Class "no", Feature 4 probabilities:
  Category "fair": 0.40
  Category "excellent": 0.60
Class "yes", Feature 1 probabilities:
  Category "31...40": 0.44
  Category ">40": 0.33
  Category "<=30": 0.22
Class "yes", Feature 2 probabilities:
  Category "high": 0.22
  Category "medium": 0.44
  Category "low": 0.33
Class "yes", Feature 3 probabilities:
  Category "no": 0.33
  Category "yes": 0.67
Class "yes", Feature 4 probabilities:
  Category "fair": 0.67
  Category "excellent": 0.33


In [16]:
from scipy.stats import chi2_contingency
import numpy as np

# Assuming the class label is in the last column
class_labels = set(instance[-1] for instance in data)

# Function to perform Chi-square test for independence
def chi_square_test(feature1, feature2):
    contingency_table = np.zeros((len(class_labels), len(class_labels)))
    
    for i, class_label1 in enumerate(class_labels):
        for j, class_label2 in enumerate(class_labels):
            # Count the occurrences of each combination of categories
            count = sum(1 for instance in data if instance[-1] == class_label1 and instance[feature1] == instance[feature2] == class_label2)
            contingency_table[i, j] = count
    
    # Check if all observed values are zero
    if np.all(contingency_table == 0):
        print(f'Observations for Feature {feature1 + 1} and Feature {feature2 + 1} are all zero. Chi-square test cannot be performed.\n')
        return np.nan, np.nan
    
    # Perform Chi-square test
    chi2_stat, p_value, _, _ = chi2_contingency(contingency_table)
    
    return chi2_stat, p_value

# Test for independence between the four given features
for i in range(len(data[0]) - 1):  # Exclude the last column (class label)
    for j in range(i + 1, len(data[0]) - 1):
        chi2_stat, p_value = chi_square_test(i, j)
        
        print(f'Chi-square test for independence between Feature {i + 1} and Feature {j + 1}:')
        print(f'Chi2 Stat: {chi2_stat:.2f}')
        print(f'P-value: {p_value:.4f}')
        
        # Check if the test was not performed
        if np.isnan(chi2_stat) or np.isnan(p_value):
            continue
        
        # Check significance level (e.g., 0.05)
        if p_value <= 0.05:
            print('There is a significant association between the features.\n')
        else:
            print('There is no significant association between the features.\n')


Observations for Feature 1 and Feature 2 are all zero. Chi-square test cannot be performed.

Chi-square test for independence between Feature 1 and Feature 2:
Chi2 Stat: nan
P-value: nan
Observations for Feature 1 and Feature 3 are all zero. Chi-square test cannot be performed.

Chi-square test for independence between Feature 1 and Feature 3:
Chi2 Stat: nan
P-value: nan
Observations for Feature 1 and Feature 4 are all zero. Chi-square test cannot be performed.

Chi-square test for independence between Feature 1 and Feature 4:
Chi2 Stat: nan
P-value: nan
Observations for Feature 2 and Feature 3 are all zero. Chi-square test cannot be performed.

Chi-square test for independence between Feature 2 and Feature 3:
Chi2 Stat: nan
P-value: nan
Observations for Feature 2 and Feature 4 are all zero. Chi-square test cannot be performed.

Chi-square test for independence between Feature 2 and Feature 4:
Chi2 Stat: nan
P-value: nan
Observations for Feature 3 and Feature 4 are all zero. Chi-square

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Assuming the class label is in the last column
X = [instance[:-1] for instance in data]
y = [instance[-1] for instance in data]

# Convert categorical labels to numerical values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Convert categorical features to numerical values
X_numeric = []
for instance in X:
    numeric_instance = []
    for value in instance:
        if isinstance(value, str):
            # Encode categorical values
            if value in label_encoder.classes_:
                encoded_value = label_encoder.transform([value])[0]
            else:
                # Handle previously unseen label in test set
                encoded_value = label_encoder.transform([label_encoder.classes_[0]])[0]
            numeric_instance.append(encoded_value)
        else:
            # For numerical values, keep them unchanged
            numeric_instance.append(float(value))
    X_numeric.append(numeric_instance)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_numeric, y_encoded, test_size=0.2, random_state=42)

# Build the Gaussian Naive Bayes model
model = GaussianNB()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(classification_rep)


Accuracy: 0.67
Classification Report:
              precision    recall  f1-score   support

          no       0.50      1.00      0.67         1
         yes       1.00      0.50      0.67         2

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3



In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load the Data
df = pd.read_csv("final1")

# Step 2: Data Preprocessing
# Assume 'target_column' is the column you want to predict
X = df.drop('embed_0', axis=1)  # Features
y = df['Label']  # Target variable

# Step 3: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Naive Bayes Model (Gaussian)
model = GaussianNB()

# Step 5: Training
model.fit(X_train, y_train)

# Step 6: Testing and Evaluation
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(classification_rep)


FileNotFoundError: [Errno 2] No such file or directory: 'final1'