In [11]:
# Preview the raw contents of the test-point file.
# A context manager closes the handle even if reading fails; `file` and
# `content` stay bound at module level exactly as before.
with open("C:\\Users\\NedyaIbrahim-AI23GBG\\Documents\\Github\\Python-Nedya-Ibrahim\\data\\testpoints.txt", "r") as file:
    content = file.readlines()

print(content)

['Test points:\n', '1. (25, 32)\n', '2. (24.2, 31.5)\n', '3. (22, 34)\n', '4. (20.5, 34)\n']


In [2]:
import matplotlib.pyplot as plt
import random as rnd
from math import dist

def generate_data_from_file(file_path):
    """
    Read and process data from the main data file.

    The first line is treated as a header and skipped. Every following
    non-blank line must hold three comma-separated numbers in the order
    ``width, height, label``. A label equal to 1 is stored as Pikachu, any
    other label as Pichu. Whitespace around the numbers is ignored.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A dictionary of the form
        ``{"pikachu": {"width": [...], "height": [...]},
        "pichu": {"width": [...], "height": [...]}}`` holding floats.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-blank data line does not consist of exactly
            three numbers.
    """
    data = {"pikachu": {"width": [], "height": []}, "pichu": {"width": [], "height": []}}
    with open(file_path, "r") as file_r:
        lines = file_r.read().splitlines()[1:]

    for line in lines:
        # Blank lines (e.g. trailing empty lines) carry no data point.
        if not line.strip():
            continue
        # float() ignores surrounding whitespace, so splitting on the bare
        # comma accepts "a,b,c" as well as "a, b, c".
        width, height, pokemon = map(float, line.split(","))
        category = "pikachu" if pokemon == 1 else "pichu"
        data[category]["width"].append(width)
        data[category]["height"].append(height)
    return data

def generate_test_data_from_file(file_path):
    """
    Read and process data from the test data file.

    The first line is treated as a header and skipped. Every following
    non-blank line is expected to look like ``"<index>. (<width>, <height>)"``;
    the coordinates are taken from the text between the first "(" and the
    last ")", so the length of the index prefix does not matter.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A dictionary ``{"undefined": {"width": [...], "height": [...]}}``
        holding the parsed coordinates as floats.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-blank line has no parenthesised pair of numbers.
    """
    test_data = {"undefined": {"width": [], "height": []}}
    with open(file_path, "r") as file_r:
        lines = file_r.read().splitlines()[1:]

    for line in lines:
        if not line.strip():
            continue
        # Locate the parentheses instead of slicing at fixed offsets:
        # splitlines() has already removed the newline, so a fixed "-2"
        # end offset would cut off the last digit of the height.
        start = line.find("(")
        end = line.rfind(")")
        if start == -1 or end < start:
            raise ValueError(f"Malformed test point line: {line!r}")
        width, height = map(float, line[start + 1:end].split(","))
        test_data["undefined"]["width"].append(width)
        test_data["undefined"]["height"].append(height)
    return test_data

def plot_data(data, test_data=None, incorrect_predictions=None):
    """
    Draw a scatterplot of the labelled data, optionally overlaid with test
    points and with incorrectly predicted points, and show it.

    Args:
        data: Dictionary with "pikachu" and "pichu" entries, each holding
            "width" and "height" sequences.
        test_data: Optional dictionary; any of the keys "undefined",
            "pikachu" and "pichu" that are present (and truthy) are plotted
            as extra series.
        incorrect_predictions: Optional dictionary; the keys "pikachu" and
            "pichu", when present (and truthy), are plotted last.
    """
    # Each entry is (points, colour, legend label).
    series = [
        (data["pikachu"], "yellow", "Pikachus"),
        (data["pichu"], "green", "Pichus"),
    ]

    if test_data:
        test_colours = {"undefined": "red", "pikachu": "black", "pichu": "blue"}
        for name, shade in test_colours.items():
            points = test_data.get(name)
            if not points:
                continue
            series.append((points, shade, f"Undefined ({name.capitalize()})"))

    for points, shade, legend_label in series:
        plt.scatter(points["width"], points["height"], color=shade, label=legend_label)

    if incorrect_predictions:
        for name, shade in (("pikachu", "green"), ("pichu", "purple")):
            points = incorrect_predictions.get(name)
            if not points:
                continue
            legend_label = f"Incorrect (actual {name.capitalize()})"
            plt.scatter(points["width"], points["height"], color=shade, label=legend_label)

    plt.title("Scatterplot of width/height measurements of Pokemon")
    plt.xlabel("Width")
    plt.ylabel("Height")
    plt.legend()
    plt.show()

def clean_user_input():
    """
    Prompt the user for a data point until a valid one is entered.

    The input must be 2-3 comma-separated positive numbers: width, height
    and, optionally, the number of votes. The number of votes must be a
    whole number; it defaults to 1 when omitted. After every rejected input
    an explanation is printed and the prompt is shown again.

    Returns:
        A tuple ``(width, height, amount_votes)`` where width and height are
        floats and amount_votes is an int of at least 1.
    """
    while True:
        user_input = input("Enter a data point for categorization, formatted as 'width, height' (x, y):\n(You may optionally enter a third number for the number of votes, default: 1)")
        try:
            values = [float(part.strip()) for part in user_input.split(",")]
        except ValueError as err:
            print(f"{err} is not a valid input, make sure to enter 2-3 positive numbers")
            continue

        # A single number would otherwise fail with an uncaught IndexError.
        if not 2 <= len(values) <= 3:
            print("Enter 2-3 numbers separated by commas.")
            continue
        if not all(value > 0 for value in values):
            print("All values must be positive.")
            continue
        # int() would silently truncate e.g. 0.5 to 0 votes, so reject
        # anything that is not a whole number (this also rejects "inf").
        if len(values) == 3 and not values[2].is_integer():
            print("The number of votes must be a whole number.")
            continue

        amount_votes = int(values[2]) if len(values) == 3 else 1
        return values[0], values[1], amount_votes

def categorize_point(point, data, amount_votes=1):
    """
    Categorize a point (x, y) by majority vote among its nearest data points.

    The Euclidean distance from ``point`` to every Pikachu and Pichu data
    point is computed, the ``amount_votes`` closest points are selected, and
    each of them votes for its own category.

    Args:
        point: Sequence whose first two items are the width and height.
        data: Dictionary with "pikachu" and "pichu" entries, each holding
            "width" and "height" sequences.
        amount_votes: Number of nearest neighbours that vote (default 1).

    Returns:
        A tuple ``(category, certainty)``: the winning category name and the
        share of the votes it received. A tied vote is resolved as "pichu".

    Raises:
        ValueError: If ``amount_votes`` is smaller than 1 or ``data``
            contains no points.
    """
    if amount_votes < 1:
        raise ValueError("amount_votes must be at least 1")

    target = (point[0], point[1])

    # Pichu points are listed first: sorted() is stable, so points at equal
    # distance keep this order and distance ties favour "pichu", in line
    # with the tie rule for the vote itself.
    labelled_distances = []
    for category in ("pichu", "pikachu"):
        widths = data[category]["width"]
        heights = data[category]["height"]
        for w, h in zip(widths, heights):
            labelled_distances.append((dist((w, h), target), category))

    if not labelled_distances:
        raise ValueError("data contains no points to compare against")

    nearest = sorted(labelled_distances, key=lambda pair: pair[0])[:amount_votes]

    pikachu_votes = sum(1 for _, category in nearest if category == "pikachu")
    pichu_votes = len(nearest) - pikachu_votes

    # Divide by the number of votes actually cast; this is smaller than
    # amount_votes when fewer data points are available.
    if pikachu_votes > pichu_votes:
        return "pikachu", pikachu_votes / len(nearest)
    return "pichu", pichu_votes / len(nearest)

def generate_test_training_data(data, test_size=25):
    """
    Split the data into test and training datasets randomly.

    For each category ("pikachu" first, then "pichu") ``test_size`` distinct
    indices are drawn with ``rnd.sample``; the points at those indices form
    the test set (in drawn order) and all remaining points form the training
    set (in their original order). Width and height of a point always stay
    together.

    Args:
        data: Dictionary with "pikachu" and "pichu" entries, each holding
            "width" and "height" sequences.
        test_size: Number of points per category to move into the test set
            (default 25).

    Returns:
        A tuple ``(test_data, training_data)`` of dictionaries with the same
        layout as ``data``.

    Raises:
        ValueError: If ``test_size`` is negative or larger than the number
            of points in a category (raised by ``rnd.sample``).
    """
    test_data = {}
    training_data = {}

    for category in ("pikachu", "pichu"):
        widths = data[category]["width"]
        heights = data[category]["height"]
        # Use the real size of the category instead of assuming 75 points.
        # NOTE(review): assumes "width" and "height" have equal length.
        total = len(widths)

        picked = rnd.sample(range(total), test_size)
        picked_lookup = set(picked)  # O(1) membership tests below
        remaining = [i for i in range(total) if i not in picked_lookup]

        test_data[category] = {
            "width": [widths[i] for i in picked],
            "height": [heights[i] for i in picked],
        }
        training_data[category] = {
            "width": [widths[i] for i in remaining],
            "height": [heights[i] for i in remaining],
        }

    return test_data, training_data

def calculate_accuracy(test_data, data, amount_votes=1):
    """
    Calculate the accuracy of categorizing test data points.

    Every test point is classified with ``categorize_point`` against
    ``data`` and compared with its actual category. "pikachu" is treated as
    the positive class:

    * TP: actual pikachu, predicted pikachu
    * TN: actual pichu, predicted pichu
    * FN: actual pikachu, predicted pichu
    * FP: actual pichu, predicted pikachu

    Args:
        test_data: Dictionary with "pikachu" and "pichu" entries, each
            holding "width" and "height" sequences of labelled test points.
        data: Reference data passed on to ``categorize_point``.
        amount_votes: Number of votes passed on to ``categorize_point``.

    Returns:
        A tuple ``(accuracy, TP, TN, FP, FN, incorrect_predictions)``.
        ``incorrect_predictions`` holds the misclassified points grouped by
        their actual category. ``accuracy`` is 0.0 when there are no test
        points.
    """
    incorrect_predictions = {"pikachu": {"width": [], "height": []}, "pichu": {"width": [], "height": []}}
    TP, TN, FP, FN = 0, 0, 0, 0

    for category in ("pikachu", "pichu"):
        widths = test_data[category]["width"]
        heights = test_data[category]["height"]
        for width, height in zip(widths, heights):
            result = categorize_point([width, height], data, amount_votes)
            if result[0] == category:
                if category == "pikachu":
                    TP += 1
                else:
                    TN += 1
                continue

            # A missed pikachu is a false negative; a pichu mistaken for a
            # pikachu is a false positive.
            if category == "pikachu":
                FN += 1
            else:
                FP += 1
            incorrect_predictions[category]["width"].append(width)
            incorrect_predictions[category]["height"].append(height)

    total = TP + TN + FP + FN
    # Avoid ZeroDivisionError when no test points were supplied.
    accuracy = (TP + TN) / total if total else 0.0
    return accuracy, TP, TN, FP, FN, incorrect_predictions