In [28]:
import pandas as pd
import math

In [29]:
# Read the zoo dataset from the working directory and preview its first five rows
data = pd.read_csv(filepath_or_buffer='zoo.csv')
print(data.head(n=5))

  animal_name  hair  feathers  eggs  milk  airborne  aquatic  predator  \
0    aardvark     1         0     0     1         0        0         1   
1    antelope     1         0     0     1         0        0         0   
2        bass     0         0     1     0         0        1         1   
3        bear     1         0     0     1         0        0         1   
4        boar     1         0     0     1         0        0         1   

   toothed  backbone  breathes  venomous  fins  legs  tail  domestic  catsize  \
0        1         1         1         0     0     4     0         0        1   
1        1         1         1         0     0     4     1         0        1   
2        1         1         0         0     1     0     1         0        0   
3        1         1         1         0     0     4     0         0        1   
4        1         1         1         0     0     4     1         0        1   

   class_type  
0           1  
1           1  
2           4  
3   

In [30]:
# Remember the name of the label column (currently the last column) before
# any new columns are appended to the frame
class_type_col = data.columns.tolist()[-1]

In [31]:
# Collect the distinct values of 'legs' so the feature can be turned into 0/1 indicators
leg_numbers = data.loc[:, 'legs'].unique()
print(leg_numbers)

[4 0 2 6 8 5]


In [32]:
# One-hot encode 'legs': one 0/1 column per distinct leg count, then drop the
# original column. The guard makes the cell safe to re-run: once 'legs' has been
# dropped, running it again is a no-op instead of raising KeyError.
if 'legs' in data.columns:
    leg_indicators = {
        f'leg_number_is_{value}': (data['legs'] == value).astype(int)
        for value in leg_numbers
    }
    # assign() appends the indicator columns in dict order and returns a new
    # frame, so the previously loaded frame is not mutated in place.
    data = data.assign(**leg_indicators).drop(columns=['legs'])

In [33]:
# Move the label column back to the last position: every other column keeps
# its current order and the label name is appended at the end
cols = data.columns.drop(class_type_col).tolist() + [class_type_col]
data = data.loc[:, cols]

# Preview the reordered frame
print("Updated Dataframe: ")
print(data.head())

Updated Dataframe: 
  animal_name  hair  feathers  eggs  milk  airborne  aquatic  predator  \
0    aardvark     1         0     0     1         0        0         1   
1    antelope     1         0     0     1         0        0         0   
2        bass     0         0     1     0         0        1         1   
3        bear     1         0     0     1         0        0         1   
4        boar     1         0     0     1         0        0         1   

   toothed  backbone  ...  tail  domestic  catsize  leg_number_is_4  \
0        1         1  ...     0         0        1                1   
1        1         1  ...     1         0        1                1   
2        1         1  ...     1         0        0                0   
3        1         1  ...     0         0        1                1   
4        1         1  ...     1         0        1                1   

   leg_number_is_0  leg_number_is_2  leg_number_is_6  leg_number_is_8  \
0                0                0

In [34]:
# 70/30 split: a seeded random sample of 70% of the rows is the training set,
# and every row whose index label was not sampled goes to the test set
train = data.sample(frac=0.7, random_state=1)
test = data.loc[~data.index.isin(train.index)]

In [35]:
# Estimate the prior probability of every class
def get_class_probs(train_data):
    """Return the relative frequency of each 'class_type' value.

    Args:
        train_data: DataFrame containing a 'class_type' column.

    Returns:
        dict mapping each distinct class value (in order of first appearance)
        to the fraction of rows in ``train_data`` that carry it.
    """
    labels = train_data['class_type']
    n_rows = len(train_data)

    return {
        label: len(train_data[labels == label]) / n_rows
        for label in labels.unique()
    }

In [36]:
# Calculate conditional probabilities with additive (Laplace-style) smoothing
def get_conditional_probs(train_data, alpha=1e-4):
    """Estimate P(feature = 1 | class) and P(feature = 0 | class) per class.

    Every column except 'animal_name' and 'class_type' is treated as a feature.
    The 0-count is derived as (rows in class - sum of the column), so the
    features are assumed to be 0/1 encoded.

    Args:
        train_data: DataFrame with 'animal_name', 'class_type' and the
            feature columns.
        alpha: pseudo-count added to each of the two outcomes so that a value
            never observed for a class still gets a non-zero probability.
            Defaults to 1e-4 (the value that used to be hard-coded);
            alpha=1 gives classic Laplace smoothing.

    Returns:
        dict mapping class value -> dict with the keys
        '<feature>_probability_1' and '<feature>_probability_0'.

    Raises:
        ValueError: if ``alpha`` is negative.
    """
    if alpha < 0:
        raise ValueError(f"alpha must be non-negative, got {alpha}")

    conditional_probs = {}
    classes = train_data['class_type'].unique()

    # The feature list is the same for every class, so build it once
    feature_columns = train_data.columns.drop(['animal_name', 'class_type'])

    for class_value in classes:
        class_data = train_data[train_data['class_type'] == class_value]
        class_instance_count = len(class_data)

        # Two possible outcomes (0 and 1), each receiving `alpha` pseudo-counts
        denominator = class_instance_count + alpha * 2

        feature_probs = {}
        for feature in feature_columns:
            count_1 = class_data[feature].sum()  # Number of 1s
            count_0 = class_instance_count - count_1  # Number of 0s

            feature_probs[f'{feature}_probability_1'] = (count_1 + alpha) / denominator
            feature_probs[f'{feature}_probability_0'] = (count_0 + alpha) / denominator

        conditional_probs[class_value] = feature_probs

    return conditional_probs

In [37]:
# Function to predict the class for a given instance
def predict(test_instance, class_probs, conditional_probs):
    """Classify one instance with Naive Bayes and return normalised posteriors.

    Args:
        test_instance: Series holding 'animal_name' plus the feature values;
            the class label must already have been dropped by the caller.
        class_probs: dict mapping class value -> prior probability.
        conditional_probs: dict mapping class value -> dict with the keys
            '<feature>_probability_<value>'.

    Returns:
        (predicted_class, probabilities) where ``probabilities`` maps every
        class to its posterior probability (values sum to 1) and
        ``predicted_class`` is the class with the highest score.

    Raises:
        KeyError: if a feature/value combination has no entry in
            ``conditional_probs``.
    """
    # Skip 'animal_name' only; it is an identifier, not a feature
    features = test_instance.index.drop('animal_name')

    # Accumulate log prior + sum of log likelihoods per class
    log_scores = {}
    for class_value, class_prob in class_probs.items():
        log_score = math.log(class_prob)
        class_conditionals = conditional_probs[class_value]

        for feature in features:
            value = test_instance[feature]
            prob_key = f'{feature}_probability_{value}'
            log_score += math.log(class_conditionals[prob_key])

        log_scores[class_value] = log_score

    # Shift by the largest log score before exponentiating. Exponentiating the
    # raw scores can underflow to 0.0 for every class, which makes the
    # normaliser zero and the division below fail. After the shift the best
    # class maps to exp(0) == 1, so the total is always >= 1.
    max_log_score = max(log_scores.values())
    probabilities = {
        class_value: math.exp(log_score - max_log_score)
        for class_value, log_score in log_scores.items()
    }

    # Normalize probabilities so they sum to 1
    total = sum(probabilities.values())
    probabilities = {class_value: prob / total for class_value, prob in probabilities.items()}

    # Pick the winner in log space so underflowed tails cannot create false ties
    predicted_class = max(log_scores, key=log_scores.get)

    return predicted_class, probabilities

In [38]:
# Fit the model on the training split: class priors and per-class feature likelihoods
class_probs = get_class_probs(train)
conditional_probs = get_conditional_probs(train)

# One output record per test row, plus a running count of correct predictions
output_data = []
correct_predictions = 0

for index, row in test.iterrows():
    actual_class = row['class_type']  # Ground-truth label for this row

    # Remove the label so the classifier only receives the name and the features
    test_instance = row.drop('class_type')
    predicted_class, probabilities = predict(test_instance, class_probs, conditional_probs)

    is_correct = (predicted_class == actual_class)
    correct_predictions += int(is_correct)

    # Original row values followed by the prediction, its probability and a verdict
    verdict = "CORRECT" if is_correct else "wrong"
    output_row = [*row.tolist(), predicted_class, probabilities[predicted_class], verdict]
    output_data.append(output_row)

# Assemble the records into a DataFrame with three extra result columns
output_columns = [*test.columns, 'predicted', 'probability', 'correct?']
output_df = pd.DataFrame(output_data, columns=output_columns)



In [39]:
# Persist the per-row prediction results as CSV (the row index is not written)
output_df.to_csv(path_or_buf='output.csv', index=False)
print("Output saved to output.csv")

Output saved to output.csv


In [40]:
# Share of test rows classified correctly, expressed as a percentage
accuracy = (correct_predictions / test.shape[0]) * 100
print(f"Accuracy: {accuracy:.2f}%")

Accuracy: 96.67%
