In [21]:
import numpy as np
from math import sqrt
import pandas as pd
from sklearn.metrics import accuracy_score

from labs.helpers import read_dataset

In [22]:
data_dir = "../../data/train"
classes = ["field", "road", "town", "water"]
file_pattern = "*{}*.jpg"
standard_shape = (50, 50, 3)

In [23]:
dataset, features = read_dataset(data_dir, classes, file_pattern, standard_shape)
train_df = dataset.sample(frac=0.85, random_state=18)
test_df = dataset.drop(train_df.index)

In [24]:
class Classifier:

    head = None
    __features = None
    __classes_centers = None
    __radius = None
    __next_level_predictor = None

    def __init__(self, head=True):
        self.head = head

    def fit(self, df, train_features, class_target, depth=5):
        self.__features = train_features
        self.__classes_centers = df.groupby(by=class_target).mean()[train_features]
        self.__radius = self.__calc_classes_radius(df, class_target)
        self.__next_level_predictor = self.__build_next_level_predict_action(df, class_target, depth)

    def __calc_classes_radius(self, df, class_target):
        radius = {}
        for class_name in self.__classes_centers.index:
            distances = []
            class_center = self.__classes_centers.loc[class_name].values
            class_df = df[df[class_target] == class_name]
            for measure_idx in class_df.index:
                measure = class_df.loc[measure_idx, self.__features].values
                measure_evclid_distance = self.__calc_evclid_distance(class_center, measure)
                distances.append(measure_evclid_distance)
            radius[class_name] = max(distances)
        return pd.DataFrame.from_dict(radius, orient="index", columns=["Radius"])

    def __build_next_level_predict_action(self, df, class_target, depth):
        uncertain_measures = self.__define_uncertain_measures(df)
        if depth != 0 and len(uncertain_measures) != 0:
            next_level_model = Classifier(head=False)
            next_level_model.fit(uncertain_measures, self.__features, class_target, depth - 1)
            return next_level_model.predict_class

    def __define_uncertain_measures(self, df):
        uncertain_measures_idxs = list()
        for measure_idx in df.index:
            measure = df.loc[measure_idx, self.__features].values
            measure_classes = self.__measure_belong_to_classes(measure)
            if len(measure_classes) > 1:
                uncertain_measures_idxs.append(measure_idx)
        return df.loc[uncertain_measures_idxs]

    def __measure_belong_to_classes(self, measure):
        owner_classes = list()
        for class_name in self.__classes_centers.index:
            class_center = self.__classes_centers.loc[class_name, self.__features].values
            evclid_dist = self.__calc_evclid_distance(measure, class_center)
            class_radius = self.__radius.loc[class_name, "Radius"]
            if evclid_dist < class_radius:
                owner_classes.append(class_name)
        return owner_classes

    @staticmethod
    def __calc_evclid_distance(measure1, measure2):
        difference = measure1 - measure2
        squared_diff = np.square(difference)
        sum_of_squared_diff = np.sum(squared_diff)
        return sqrt(sum_of_squared_diff)

    def predict(self, df):

        def iterate_test_df(measure_df):
            measure = measure_df.values
            return self.predict_class(measure)

        result = df[self.__features].apply(iterate_test_df, axis=1)
        return result

    def predict_class(self, measure):
        defined_classes = self.__measure_belong_to_classes(measure)
        defined_clases_number = len(defined_classes)
        if defined_clases_number == 1:
            return defined_classes[0]
        elif not self.__next_level_predictor is None:
            defined_class = self.__next_level_predictor(measure)
            if defined_class is None and self.head:
                return self.__predict_class_by_standard_method(measure)
            else:
                return defined_class
        elif not self.head:
            return None
        else:
            return self.__predict_class_by_standard_method(measure)

    def __predict_class_by_standard_method(self, measure):
        distances_to_classes = []
        for class_name in self.__classes_centers.index:
            class_standard = self.__classes_centers.loc[class_name].values
            evclid_dst = self.__calc_evclid_distance(measure, class_standard)
            distances_to_classes.append(evclid_dst)
        best_class_index = np.argmin(distances_to_classes)
        return self.__classes_centers.index[best_class_index]



In [25]:
test_df

Unnamed: 0,1:1:1,1:1:2,1:1:3,1:2:1,1:2:2,1:2:3,1:3:1,1:3:2,1:3:3,1:4:1,...,50:48:1,50:48:2,50:48:3,50:49:1,50:49:2,50:49:3,50:50:1,50:50:2,50:50:3,class
2,187.0,208.0,133.0,184.0,205.0,130.0,181.0,202.0,127.0,180.0,...,145.0,179.0,92.0,143.0,179.0,92.0,143.0,179.0,92.0,field
5,196.0,207.0,131.0,194.0,205.0,129.0,190.0,200.0,127.0,186.0,...,174.0,186.0,114.0,174.0,192.0,116.0,174.0,192.0,116.0,field
8,137.0,171.0,87.0,139.0,173.0,89.0,141.0,175.0,91.0,143.0,...,150.0,184.0,107.0,149.0,183.0,109.0,148.0,182.0,108.0,field
19,204.0,216.0,142.0,197.0,208.0,130.0,190.0,202.0,120.0,187.0,...,178.0,188.0,101.0,166.0,189.0,101.0,165.0,188.0,100.0,road
24,201.0,201.0,129.0,202.0,201.0,134.0,203.0,201.0,140.0,209.0,...,207.0,199.0,162.0,205.0,209.0,150.0,207.0,211.0,152.0,town
34,61.0,87.0,74.0,70.0,96.0,83.0,80.0,107.0,90.0,94.0,...,97.0,129.0,106.0,91.0,120.0,102.0,91.0,120.0,102.0,water
42,132.0,139.0,85.0,137.0,144.0,92.0,137.0,145.0,94.0,140.0,...,74.0,134.0,100.0,76.0,131.0,99.0,76.0,131.0,99.0,water


In [26]:
model = Classifier()
model.fit(train_df, features, "class")

In [27]:
true_y = test_df["class"]



In [28]:
pred_y = model.predict(test_df)

In [29]:
accuracy_score(true_y, pred_y)

1.0

In [30]:
for i in range(7500, 100, -100):
    local_features_to_use = features[:i]
    model.fit(train_df, local_features_to_use, "class")
    local_predicted_classes = model.predict(test_df)
    local_acc = round(accuracy_score(true_y, local_predicted_classes), 3)
    print(f"{local_acc * 100}% accuracy score with {len(local_features_to_use)} features")

100.0% accuracy score with 7500 features
100.0% accuracy score with 7400 features
100.0% accuracy score with 7300 features
100.0% accuracy score with 7200 features
100.0% accuracy score with 7100 features
100.0% accuracy score with 7000 features
100.0% accuracy score with 6900 features
100.0% accuracy score with 6800 features
100.0% accuracy score with 6700 features
100.0% accuracy score with 6600 features
100.0% accuracy score with 6500 features
100.0% accuracy score with 6400 features
100.0% accuracy score with 6300 features
100.0% accuracy score with 6200 features
100.0% accuracy score with 6100 features
100.0% accuracy score with 6000 features
100.0% accuracy score with 5900 features
100.0% accuracy score with 5800 features
100.0% accuracy score with 5700 features
100.0% accuracy score with 5600 features
100.0% accuracy score with 5500 features
100.0% accuracy score with 5400 features
100.0% accuracy score with 5300 features
100.0% accuracy score with 5200 features
100.0% accuracy 

In [31]:
for class_number in range(len(classes), 1, -1):
    local_classes = classes[:class_number]
    local_dataset = dataset[dataset["class"].isin(local_classes)]
    local_train_df = local_dataset.sample(frac=0.85)
    local_test_df = local_dataset.drop(local_train_df.index)
    model = Classifier()
    model.fit(local_train_df, features, "class")
    local_predicted = model.predict(test_df)
    real = test_df["class"].values
    acc_val = accuracy_score(real, local_predicted)
    print(f"{acc_val * 100} % accuracy with {class_number} classes")



100.0 % accuracy with 4 classes
71.42857142857143 % accuracy with 3 classes
57.14285714285714 % accuracy with 2 classes
