In [14]:
import math
import pandas as pd

In [5]:
def custom_mean(arr_data):
    """Return the arithmetic mean of ``arr_data`` as a float.

    Args:
        arr_data: A sized collection of numbers.

    Returns:
        The sum of the values divided by how many there are.

    Raises:
        ZeroDivisionError: If ``arr_data`` is empty.
    """
    total = sum(arr_data)
    count = float(len(arr_data))
    return total / count
def custom_variance(arr_data):
    """Return the sample variance of ``arr_data`` (divides by ``n - 1``).

    Args:
        arr_data: A sized collection of numbers.

    Returns:
        The sum of squared deviations from the mean, divided by
        ``len(arr_data) - 1``, as a float.

    Raises:
        ZeroDivisionError: If ``arr_data`` is empty or holds a single value,
            because the divisor is then zero.
    """
    center = custom_mean(arr_data)
    squared_deviations = [(value - center) ** 2 for value in arr_data]
    return sum(squared_deviations) / float(len(arr_data) - 1)

def liklihood(x, mean, variance):
    """Evaluate the Gaussian probability density at ``x``.

    Args:
        x: The point at which the density is evaluated.
        mean: Mean of the distribution.
        variance: Variance of the distribution.

    Returns:
        ``exp(-(x - mean)^2 / (2 * variance)) / sqrt(2 * pi * variance)``,
        or ``0`` when ``variance`` is exactly zero (the density is undefined
        there, so the function short-circuits instead of dividing by zero).
    """
    if variance == 0:
        return 0
    squared_distance = math.pow(x - mean, 2)
    exponent = math.exp(-(squared_distance / (2 * variance)))
    normaliser = math.sqrt(2 * math.pi * variance)
    return (1 / normaliser) * exponent

def training(arr_data):
    """Compute per-class feature statistics from labelled rows.

    Args:
        arr_data: A list of rows. The last element of each row is used as the
            class label; every element before it is treated as a feature.

    Returns:
        A dict keyed by class label. Each value is a dict with:
            'summaries': a list of ``(mean, variance)`` tuples, one per feature,
            'count': the number of rows carrying that label,
            'total': the number of rows in ``arr_data``.
    """
    # Bucket the rows by their label (the last element of each row).
    rows_by_class = {}
    for sample in arr_data:
        rows_by_class.setdefault(sample[-1], []).append(sample)

    total_rows = len(arr_data)
    model_stats = {}
    for class_value, rows in rows_by_class.items():
        # The feature count is taken from the first row of the class.
        feature_count = len(rows[0]) - 1
        class_summaries = []
        for idx in range(feature_count):
            column = [row[idx] for row in rows]
            class_summaries.append((custom_mean(column), custom_variance(column)))
        model_stats[class_value] = {
            'summaries': class_summaries,
            'count': len(rows),
            'total': total_rows,
        }
    return model_stats

def predict(model_stats, test_sample):
    """Return the class label with the highest score for ``test_sample``.

    Args:
        model_stats: The dict produced by ``training`` (per class:
            'summaries', 'count' and 'total').
        test_sample: Feature values, indexed in the same order as the
            per-class 'summaries'.

    Returns:
        The label whose score (class prior multiplied by the likelihood of
        every feature) is largest. On a tie the first label encountered wins.
        ``None`` if ``model_stats`` is empty.
    """
    scores = {}
    for class_value, class_stats in model_stats.items():
        # Start from the class prior, then fold in each feature's likelihood.
        score = class_stats['count'] / class_stats['total']
        for idx, (mean, var) in enumerate(class_stats['summaries']):
            score *= liklihood(test_sample[idx], mean, var)
        scores[class_value] = score

    winner, winner_score = None, -1
    for class_value, score in scores.items():
        # Keep the current winner unless this score is strictly larger.
        if winner is not None and not (score > winner_score):
            continue
        winner, winner_score = class_value, score
    return winner

In [13]:
# Lab tasks
# Toy training data. Each row is [Age, Loan, class label]; the label is the
# last element, which is what training() groups rows by.
dataset = [
    [25, 40000, 0],
    [35, 60000, 0],
    [45, 80000, 0],
    [20, 20000, 0],
    [35, 120000, 0],
    [52, 18000, 0],
    [23, 95000, 1],
    [40, 62000, 1],
    [60, 100000, 1],
    [48, 220000, 1],
    [33, 150000, 1]
]
# Column names for the two feature positions (not referenced again in this cell).
feature_names = ["Age", "Loan"]
# Fit the per-class statistics on the whole toy dataset.
full_model = training(dataset)

# Task 1
# Classify a single unlabelled sample given as [Age, Loan].
sample = [48, 142000]
prediction = predict(full_model, sample)
# NOTE(review): this message ends with a trailing comma that is printed as-is.
print(f'Test Sample: Age:{sample[0]}, Loan = {sample[1]},')
# Label 1 is reported as 'Defaulter', anything else as 'Non-Defaulter'.
# NOTE(review): the single quotes just inside the double-quoted f-string are
# part of the text, so the printed line is wrapped in literal quotes.
print(f"'Predicted Class: {prediction}, ({'Defaulter' if prediction == 1 else 'Non-Defaulter'})'")

Test Sample: Age:48, Loan = 142000,
'Predicted Class: 1, (Defaulter)'


In [21]:
# Task 2
def evaluate_fold(train_rows, test_rows):
    """Train on one fold and measure accuracy on the other.

    Args:
        train_rows: Labelled rows (label in the last position) used to fit
            the model via ``training``.
        test_rows: Labelled rows used for scoring. Each row's features
            (everything but the last element) are passed to ``predict`` and
            the result is compared with the row's last element.

    Returns:
        A ``(model, accuracy)`` tuple, where ``accuracy`` is the percentage
        of ``test_rows`` predicted correctly.
    """
    model = training(train_rows)
    correct = sum(1 for row in test_rows if predict(model, row[:-1]) == row[-1])
    return model, (correct / len(test_rows)) * 100


df = pd.read_csv('./Heart Disease dataset/diabetes.csv')
dataset = df.values.tolist()

# Two-fold evaluation: split the rows in file order into two halves, then
# train on each half in turn and test on the other.
mid = len(dataset)//2
fold1 = dataset[:mid]
fold2 = dataset[mid:]

accuracies = []
print(f'Training on {len(fold1)} samples and testing on {len(fold2)} samples ^___^')
model_1, acc1 = evaluate_fold(fold1, fold2)
accuracies.append(acc1)
print(f"Accuracy Fold 1: {acc1:.2f}%")
print(f"Fold 2: Training on {len(fold2)} samples, Testing on {len(fold1)} samples")
model_2, acc2 = evaluate_fold(fold2, fold1)
accuracies.append(acc2)
print(f"Accuracy Fold 2: {acc2:.2f}%")

avg_accuracy = sum(accuracies) / len(accuracies)
print(f"\nAverage Accuracy: {avg_accuracy:.2f}%")

Training on 384 samples and testing on 384 samples ^___^
Accuracy Fold 1: 77.08%
Fold 2: Training on 384 samples, Testing on 384 samples
Accuracy Fold 2: 73.70%

Average Accuracy: 75.39%
