# Data Quality Assessment für die Bachelorarbeit: Schadenserkennung bei Büchern mit Machine Learning

## Version 1.0

Besteht aus einem YOLO und COCO Teil. 
Für den Datenexport sollte COCO verwendet werden. 

# YOLO DATA

In [None]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os
import glob
import shutil
import copy
import random

In [None]:
# Load the annotation export from JSON and key every record by its "id" column.
annotation_file = "../../../old/BAA/Annotierte_Bilder/2024-02-21-Schadenserkennung-Annotationen.json"
data = pd.read_json(annotation_file).set_index("id")

## Verwerfen von nicht brauchbaren Daten

In [None]:
# Discard columns that are not useful for training:
#
#   created_at / updated_at  say nothing about the image or the annotation
#   lead_time                not of interest
#   annotation_id            is the id + 54
#                            (NOTE(review): the original note called this
#                            column "annotator_id" - confirm it is the same)
#   annotator                is always 1
#   choice                   has 974/1081 null values (NaN)
#
# The remaining columns may be relevant for training the algorithm.
unused_columns = ["created_at", "updated_at", "lead_time", "annotation_id", "annotator", "choice"]
data_dropped = data.drop(columns=unused_columns)

## Kategorie "label" weiter aufteilen
Extrahieren der einzelnen Schadensarten und Festlegen, ob ein Bild überhaupt Schäden aufweist.

In [None]:
# Expand the nested "label" column: one cell per annotation, None where a row
# has fewer annotations than the widest row.
label = pd.json_normalize(data_dropped["label"])
total = ['Schimmel', 'Wasser', 'Sonstiges', 'Schäden', 'Schmutz', 'Schädlinge']

# One boolean flag column per damage category plus an overall damage flag.
for x in total:
    data_dropped[x] = False
data_dropped["hat_schäden"] = None

# Read `label` by position and write through the real index label of
# `data_dropped`. Addressing rows as "position + 1" is only correct while the
# ids are exactly 1..N; with any gap .loc would silently append new rows.
for pos, row_id in enumerate(data_dropped.index):
    annotations = [cell for cell in label.iloc[pos] if cell is not None]
    # Only the first polygon label of every annotation is evaluated.
    categories = {cell["polygonlabels"][0] for cell in annotations}
    if label.iloc[pos, 0] is None:
        # No first annotation means the image has no damage at all.
        data_dropped.loc[row_id, "hat_schäden"] = False
        continue
    for category in categories:
        data_dropped.loc[row_id, category] = True
    data_dropped.loc[row_id, "hat_schäden"] = True

## Datentypen kontrollieren
`image` ist vom Typ `object`, da Strings in pandas so gespeichert werden.

`label` ist vom Typ `object`, da es gemischte Datentypen enthält: Strings, int, float, bool.

In [None]:
# Cast "hat_schäden" to bool, since it only contains True and False.
data_preprocessed = data_dropped.astype({"hat_schäden" : "bool"})

In [None]:
data_preprocessed.dtypes

## Auf None/Nan kontrollieren
Wenn diese in "image" auftreten --> löschen

In [None]:
# `nonetest` is truthy as soon as a single "image" entry is None/NaN.
nonetest = data_preprocessed["image"].isnull().any()

if nonetest:
    # Remove rows that are empty across every column.
    data_preprocessed.dropna(how="all", axis=0, inplace=True)
    # Without an image reference the matching picture cannot be found,
    # so those rows are removed as well.
    data_preprocessed.dropna(subset=["image"], axis=0, inplace=True)

## Mengen und verschiedene Anhaltspunkte

In [None]:
hat_schäden = data_preprocessed["hat_schäden"].value_counts()
print(dict(hat_schäden))

Es gibt 882 Einträge, die Schäden aufweisen, und 199, die keine haben.

In [None]:
# Number of images per damage category. Summing the boolean flag column also
# works for a category without a single hit, where value_counts()[True] would
# raise a KeyError.
truthlist = {}
for x in total:
    truthlist[x] = int(data_preprocessed[x].sum())
print(truthlist)

Sehr grosse Unterschiede in den Datenmengen: viel Schmutz und viel Sonstiges, aber wenige Schimmel- und Schädlingsfälle.
In 871 von 882 Bildern mit Beschädigungen sind Schmutzschäden enthalten.

In [None]:
# Add the overall counts (images with / without damage) to the per-category counts.
truthlist.update({"hat_schäden":hat_schäden[True], "ohne_schäden": hat_schäden[False]})

# Plot of the values

# Dict order is preserved, so labels and values stay aligned.
labels = list(truthlist.keys())
values = list(truthlist.values())


plt.figure(figsize=(12, 6))  
bars = plt.bar(labels, values, color='skyblue')
plt.xlabel('Kategorien')
plt.ylabel('Anzahl Bilder')
plt.title('Schadensverteilung')
plt.xticks(rotation=45)  
# Add value on top of bars
for bar in bars:
    yval = bar.get_height()
    # Centre the count horizontally on the bar and anchor it at the bar's top edge.
    plt.text(bar.get_x() + bar.get_width()/2.0, yval, int(yval), va='bottom', ha="center")
plt.tight_layout() 
plt.show()

In [None]:
# Reuses `label` from the step "Kategorie weiter aufteilen".
# Counts every non-null cell, i.e. the total number of annotated damage instances.
label.notnull().sum().sum()


## Extrahieren des image namens und droppen von image

In [None]:
# Reduce every image reference to its file name (last path component). The
# names are kept as a plain list and as the new column "image_name", which
# replaces the "image" column.
image_namen_liste = [f"{reference.split('/')[-1]}" for reference in data_preprocessed["image"]]
data_preprocessed["image_name"] = image_namen_liste
data_preprocessed = data_preprocessed.drop(columns="image")

## Koordinaten in Listen zusammenfassen

In [None]:


# One cell per annotation, None where a row has fewer annotations.
label = pd.json_normalize(data_preprocessed["label"])

# For every damaged image, store {annotation position: [labels..., points]}.
data_preprocessed["boxpoints"] = None
for pos, row_id in enumerate(data_preprocessed.index):
    info = {}
    for y, cell in enumerate(label.iloc[pos]):
        if cell is not None:
            info[y] = [*cell["polygonlabels"], cell["points"]]
    # np.asarray turns the dict into a 0-d object array - presumably so that
    # .loc stores it as a single cell value.
    info = np.asarray(info)
    # Rows are addressed by their real index label; "position + 1" is only
    # correct while the ids are exactly 1..N with no dropped rows.
    if data_preprocessed.loc[row_id, "hat_schäden"]:
        data_preprocessed.loc[row_id, "boxpoints"] = info
    

## Extrahieren eines kleinen Datensets für erste Experimente

Das Ziel ist es ein möglichst ausgeglichenes Datenset zu erhalten

In [None]:

data_sample_temp = []
random_state = 42
sample_size = 7

# Draw up to `sample_size` image ids per damage category; a category with
# fewer entries contributes everything it has.
for x in total:
    candidates = data_preprocessed.loc[data_preprocessed[x].eq(True)]
    size = len(candidates.index)
    if size < sample_size:
        print("Sample size for " + x + " is " + str(size) + " instead of the wanted sample size of " + str(sample_size) + ". Has not more entries then that.")
    data_sample_temp.append(candidates.sample(n=min(size, sample_size), random_state=random_state).index)

# Same procedure for pictures of undamaged books.
candidates = data_preprocessed.loc[data_preprocessed["hat_schäden"].eq(False)]
size = len(candidates.index)
if size < sample_size:
    print("Sample size for \"hat_schäden\" is " + str(size) + " instead of the wanted sample size of " + str(sample_size) + ". Has not more entries then that.")
data_sample_temp.append(candidates.sample(n=min(size, sample_size), random_state=random_state).index)

# One row per category, one column per drawn sample (NaN-padded when a
# category delivered fewer ids).
data_sample = pd.DataFrame(data_sample_temp)
print(data_sample)
# NOTE: despite their names, testrow/valrow select COLUMNS of data_sample.
testrow = 3
valrow = 4
# Only usable in this small case while the test data has not been deleted yet.
data_test = pd.DataFrame(data_sample[testrow])
data_val = pd.DataFrame(data_sample[valrow])
# Training data is everything except BOTH held-out columns. Excluding only the
# test column would leave the validation ids inside the training set.
data_train = data_sample.drop(columns=[testrow, valrow])

## Verschiedene Datenlisten erstellen für die Verwendung im COCO-Dataset (Filtern nach den korrekten Daten)

In [None]:
def into_list(data):
    """Flatten a frame of index values into a list of ints, skipping NaN cells."""
    values = data.to_numpy().ravel()
    is_present = np.logical_not(np.isnan(values))
    # list() is optional; without it the result would stay a numpy array.
    return list(values[is_present].astype(int))


# Flat lists of index values for the small training and validation samples.
datalist_train = into_list(data_train)
datalist_val = into_list(data_val)
# The test list is currently disabled.
# datalist_test = into_list(data_test)


## Export data into separate datasets
Werden nicht weiter verwendet.

In [None]:
def extractsublist(originaldata, datalist):
    """Return the rows of *originaldata* whose index labels are in *datalist*.

    The lists built in this notebook hold index labels (the "id" values), so
    the lookup has to be label based. A positional lookup (.iloc) returns
    shifted rows or raises once a label exceeds the row count.

    Args:
        originaldata: DataFrame indexed by id.
        datalist: Iterable of index labels to select.

    Returns:
        A new DataFrame with the selected rows, in the order of *datalist*.

    Raises:
        KeyError: If a label is not present in the index.
    """
    return originaldata.loc[list(datalist)].copy()

def extract_nameending(data):
    """Replace the "image" column by "image_name" (last path component).

    The input frame is left untouched; a new frame is returned.

    Args:
        data: DataFrame that must contain the string column "image".

    Returns:
        A copy of *data* without "image" and with "image_name" appended as
        the last column.
    """
    result = data.drop("image", axis=1)
    # Values are assigned by position, so duplicate index labels are harmless.
    result["image_name"] = [reference.split("/")[-1] for reference in data["image"]]
    return result

def export(data, name):
    """Serialise *data* to JSON in the file ``<name>.json``."""
    target = "{}.json".format(name)
    data.to_json(target)

# Build the training export from the raw frame `data`, which still has the "image" column.
dataexport_train = extractsublist(data, datalist_train)
dataexport_train = extract_nameending(dataexport_train)
dataexport_train.head()
# Left commented out so that nothing is written to disk.
# export(dataexport_train, "Trainingdata")

# COCO Datasets
https://cocodataset.org/#format-data

## Export from COCO json file

Erstellen eines zweiten Files, aus dem die doppelten und nicht annotierten Bilddaten gelöscht wurden.

Diverse Zeilen sind auskommentiert, um zu verhindern, dass bei einem «Run All» aus Versehen Daten auf die Festplatte geschrieben werden.

In [None]:
import json
import datetime

# Original COCO annotation file (input of the cleaning step).
path = "../../../old/BAA/coco_annotations.json"

# Base folder below which the train/val datasets are written.
newpath = "../../../old/BAA/Data/"
# NOTE(review): not referenced anywhere in the visible code.
newfilename = "coco_annotations_cleaned.json"
# NOTE(review): this is not newpath + newfilename; the cleaned file sits one level above Data/.
completeNewPath = "../../../old/BAA/coco_annotations_cleaned.json" # cleaned dataset

## filtern des hineingegebenen COCO files, anhand der YOLO Daten
Nur nötig wenn das hineingeladene coco.json nicht zu verwendende daten enthält

Nur einmal durchlaufen lassen

In [None]:
# with open(path, "r") as f:
#     cocoOriginal = json.load(f)

def createSubsetStructure(originalData, description):
    """Create an empty COCO structure that can be filled and exported as JSON.

    Args:
        originalData: COCO dict; its "categories" list is reused as is
            (same object, not copied).
        description: Text stored under ``info["description"]``.

    Returns:
        dict with the keys "info", "categories", "images" and "annotations";
        "images" and "annotations" start as empty lists.
    """
    now = datetime.datetime.now()
    return {
        # The COCO format defines "info" as ONE object with these fields
        # (including the spelling "contributor"), not a list of one-key dicts.
        "info": {
            "year": int(now.year),
            "version": "1.0",
            "description": description,
            "contributor": "Michael Infanger",
            "url": "",
            "date_created": str(now),
        },
        "categories": originalData["categories"],
        "images": [],
        "annotations": [],
    }

def clearCOCO(cocoOriginal, image_namen_ls):
    """Build a cleaned copy of a COCO export.

    Only images whose id lies between the image id of the first annotation
    and the id of the last image are considered. Of those, an image is kept
    when its file name (last path component) is in *image_namen_ls*.
    Annotations are kept when they belong to a kept image.

    Images are matched by their "id" field rather than by list position, and
    *cocoOriginal* itself is not modified; kept images are shallow copies with
    the shortened file name.

    Args:
        cocoOriginal: COCO dict with "categories", "images", "annotations".
        image_namen_ls: Iterable of file names that may be kept.

    Returns:
        New COCO dict; empty "images"/"annotations" when the input has no
        images or no annotations.
    """
    descr = "gecleantes COCO Datenset, ohne doppelte Bilder, ohne nicht annotierten Bilder, nur daten die in dem Ursprungsexport dieses Projekts enthalten waren"
    cocoNew = createSubsetStructure(cocoOriginal, description=descr)
    images = cocoOriginal["images"]
    annotations = cocoOriginal["annotations"]
    if not images or not annotations:
        return cocoNew

    wanted_names = set(image_namen_ls)
    minID = annotations[0]["image_id"]  # assumed to be the smallest annotated image id
    lastID = images[-1]["id"]
    kept_ids = set()
    for image in images:
        image_id = image["id"]
        if not minID <= image_id <= lastID:
            continue
        short_name = image["file_name"].split("/")[-1]
        # HACK: the first three annotated images are always kept. Their names
        # start with "%" and therefore never matched the name list.
        if short_name in wanted_names or image_id <= minID + 2:
            kept_ids.add(image_id)
            cocoNew["images"].append({**image, "file_name": short_name})
    cocoNew["annotations"].extend(
        [annotation for annotation in annotations if annotation["image_id"] in kept_ids]
    )
    return cocoNew

def writeJson(jsonObject, filename):
    """Write an already serialised JSON string to *filename*.

    ".json" is appended when the text after the last "." of *filename* is not
    "json". A missing parent folder is created, and the file is written as
    UTF-8 regardless of the platform default.

    Args:
        jsonObject: JSON text, e.g. the result of ``json.dumps``.
        filename: Target path, with or without the ".json" ending.
    """
    if filename.split(".")[-1] != "json":
        filename = f"{filename}.json"
    parent = os.path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(filename, "w", encoding="utf-8") as g:
        g.write(jsonObject)
        
# Uncomment if a new dataset needs to be loaded and cleared (thogether with the load statement)
# cocoCleared = clearCOCO(cocoOriginal=cocoOriginal, image_namen_ls=image_namen_liste)
# coco_cleared = json.dumps(cocoCleared, indent=4)
# writeJson(coco_cleared, completeNewPath)

## Erstellen eines COCO Subsets (Train, Validation, Test)
Das Testset ist ausgeklammert (auskommentiert), da die Daten aus dem ursprünglichen Datenset entfernt wurden.

In [None]:
# Load the cleaned COCO file. JSON is UTF-8 by specification, so the encoding
# is given explicitly instead of relying on the platform default.
with open(completeNewPath, "r", encoding="utf-8") as f:
    cocox = json.load(f)

# Descriptions stored in the "info" section of the small subsets.
description_train = "trainingset_small"
description_val = "validierungsset_small"
description_test = "testset_small"

# Carries the original split (made on the YOLO-format data) over to the newer
# COCO export, whose image ids are much higher.
def adapt_IDList(cocodata, subsetIDList):
    """Translate index labels of ``data_preprocessed`` into COCO image ids.

    The file names of the given rows are looked up in the module-level frame
    ``data_preprocessed`` (column "image_name") and matched against
    ``cocodata["images"]``.

    Args:
        cocodata: COCO dict with an "images" list.
        subsetIDList: Iterable of index labels of ``data_preprocessed``.

    Returns:
        List of COCO image ids, in the order of ``cocodata["images"]``.

    Raises:
        KeyError: If a label is missing from ``data_preprocessed``.
    """
    # A set makes every name test O(1) instead of scanning a list per image.
    wanted_names = {data_preprocessed["image_name"].loc[n] for n in subsetIDList}
    return [image["id"] for image in cocodata["images"] if image["file_name"] in wanted_names]

def createSubsetImages(subset, cocodata, subsetIDList):
    """Append each image of *cocodata* whose "id" is in *subsetIDList* to ``subset["images"]``."""
    # With a cleaned dataset the file names are already bare names, so no
    # path stripping is needed here.
    selected = [image for image in cocodata["images"] if image["id"] in subsetIDList]
    subset["images"].extend(selected)

def createSubsetAnnotations(subset, cocodata, subsetIDList, label_filter=None):
    """Append the annotations of the selected images to ``subset["annotations"]``.

    Args:
        subset: COCO dict that is filled in place.
        cocodata: COCO dict providing the "annotations" list.
        subsetIDList: Iterable of image ids whose annotations are wanted.
        label_filter: None keeps every category; an int keeps only
            annotations with that "category_id".

    Raises:
        ValueError: If *label_filter* is neither None nor an int.
    """
    is_plain_int = isinstance(label_filter, int) and not isinstance(label_filter, bool)
    if label_filter is not None and not is_plain_int:
        raise ValueError("Only None or Integer are accepted")

    # One set lookup per annotation instead of a list scan.
    wanted_ids = set(subsetIDList)
    for annotation in cocodata["annotations"]:
        if annotation["image_id"] not in wanted_ids:
            continue
        if label_filter is None or annotation["category_id"] == label_filter:
            subset["annotations"].append(annotation)

def createCOCOSubset(cocodata, subsetIDList, description, lb = None):
    """Assemble a COCO subset for the given image ids.

    Creates an empty structure, then fills in the matching images and the
    matching annotations (optionally restricted to category id *lb*).
    """
    result = createSubsetStructure(cocodata, description=description)
    createSubsetImages(result, cocodata, subsetIDList)
    createSubsetAnnotations(result, cocodata, subsetIDList, label_filter=lb)
    return result

# Translate the split lists into the image ids used by the COCO dataset,
# whose id numbers are much higher.
d_train = adapt_IDList(cocox, datalist_train)
d_val = adapt_IDList(cocox, datalist_val)
# d_test = adapt_IDList(cocox, datalist_test)


# Build the small train/validation subsets (all categories, no label filter).
trainset = createCOCOSubset(cocox, d_train, description=description_train)
valset = createCOCOSubset(cocox, d_val, description=description_val)
# testset = createCOCOSubset(cocox, d_test, description=description_test)

# Serialise the subsets to JSON text.
json_train = json.dumps(trainset, indent=4)
json_val = json.dumps(valset, indent=4)
# json_test = json.dumps(testset, indent=4)

# Folder of the annotation export; the images are read from its "images/" subfolder.
target_directory = "../../../old/BAA/Annotierte_Bilder/2024-02-21-Schadenserkennung-Annotationen/"


# export data to separate files
trainpath = newpath + "train/"
valpath = newpath + "val/"
# testpath = newpath + "test/"


# Writing is left commented out so that nothing is written to disk.
# writeJson(json_train, trainpath + "coco_train.json")
# writeJson(json_val, valpath + "coco_val.json")
# writeJson(json_test, testpath + "coco_test.json")


# Source folder read by copyImages / moveImages.
image_folder = target_directory + "images/"




## Löschen von Testdaten und verschieben/kopieren der restlichen Daten

In [None]:
def copyImages(image_list, target_dict):
    """Copy selected PNG images from ``image_folder`` into a target folder.

    Only ``*.png`` files directly inside the module-level ``image_folder``
    are considered.

    Args:
        image_list: Iterable of file names (without path) to copy.
        target_dict: Path of the target directory; created when missing so
            that the files never end up under an unintended name.
    """
    wanted = set(image_list)
    os.makedirs(target_dict, exist_ok=True)
    for source in glob.glob(os.path.join(image_folder, "*.png")):
        if os.path.basename(source) in wanted:
            shutil.copy(source, target_dict)

def moveImages(image_list, target_dict):
    """Move selected PNG images from ``image_folder`` into a target folder.

    Only ``*.png`` files directly inside the module-level ``image_folder``
    are considered.

    Args:
        image_list: Iterable of file names (without path) to move.
        target_dict: Path of the target directory. It is created when
            missing: shutil.move to a non-existing path renames the file to
            that path instead of moving it into a folder.
    """
    wanted = set(image_list)
    os.makedirs(target_dict, exist_ok=True)
    for source in glob.glob(os.path.join(image_folder, "*.png")):
        if os.path.basename(source) in wanted:
            shutil.move(source, target_dict)

def delInfos(dataset, dataset_to_delete_stuff):
    """Return a copy of a COCO dict without the images of another COCO dict.

    Intended to be used once, to remove the test data from the full dataset.
    Every image whose id occurs in ``dataset["images"]`` is removed, together
    with every annotation that refers to such an image. Neither input is
    modified.

    Args:
        dataset: COCO dict whose images are to be removed (e.g. the test set).
        dataset_to_delete_stuff: COCO dict to remove them from.

    Returns:
        Deep copy of *dataset_to_delete_stuff* with the images and
        annotations filtered out; the remaining entries keep their order.
    """
    return_data = copy.deepcopy(dataset_to_delete_stuff)
    remove_ids = {image["id"] for image in dataset["images"]}
    # Filtering by id is independent of list order. Deleting by shifted list
    # index breaks as soon as the indices are unsorted or repeated.
    return_data["images"] = [
        image for image in return_data["images"] if image["id"] not in remove_ids
    ]
    return_data["annotations"] = [
        annotation
        for annotation in return_data["annotations"]
        if annotation["image_id"] not in remove_ids
    ]
    return return_data


def getImageNames(cocodata):
    """Return the "file_name" of every image in *cocodata*, in list order."""
    return [image["file_name"] for image in cocodata["images"]]

# File names of the images in the small train/validation subsets.
train_names = getImageNames(trainset)
val_names = getImageNames(valset)
# test_names = getImageNames(testset)

# File operations are left commented out so that nothing is copied or moved.
# copyImages(train_names, trainpath)
# copyImages(val_names, valpath)
# moveImages(test_names, testpath)

# Removes the test data from the cleaned dataset and overwrites the cleaned file.
# data_deleted = delInfos(testset, cocox)
# d_delete = json.dumps(data_deleted, indent=4)
# writeJson(d_delete, completeNewPath)


## grosses Datenset erstellen (Mit allen Daten)

In [None]:
# Ids of all images in the cleaned COCO dataset.
indexListg = [image["id"] for image in cocox["images"]]


### Train / Validation Split

In [None]:
np.random.seed(42) # fixed seed so the experiment can be reproduced
# Number of ids that make up 80% of the data: 80/20 split into training and
# validation data (the test data was already removed in an earlier step).
splitvalue = int(len(indexListg) * 0.8)
indexListn = np.asarray(indexListg, dtype=int)
np.random.shuffle(indexListn)  # shuffles in place
training, validation = indexListn[:splitvalue], indexListn[splitvalue:] # data split

### Trainings und Validierungsset erstellen und in benannte ordner kopieren / verschieben, und Annotationen als json dazuschreiben

In [None]:
# Descriptions stored in the "info" section of the full-size subsets.
description_train_max = "trainingset_max"
description_val_max = "validierungsset_max"

# Full-size subsets with all categories (no label filter).
train_set_max = createCOCOSubset(cocox, training, description_train_max)
val_set_max = createCOCOSubset(cocox, validation, description_val_max)

json_train_max = json.dumps(train_set_max, indent=4)
json_val_max = json.dumps(val_set_max, indent=4)

save_path = "../../../old/BAA/Data/"
train_set_max_path = os.path.join(save_path, "train_max/coco_train_max.json")
val_set_max_path = os.path.join(save_path, "val_max/coco_val_max.json")

# Writing is left commented out so that nothing is written to disk.
#writeJson(json_train_max, train_set_max_path)
#writeJson(json_val_max, val_set_max_path)

In [None]:
# File names of the images in the full-size train/validation subsets.
train_names_max = getImageNames(train_set_max)
val_names_max = getImageNames(val_set_max)

# Copying is left commented out so that nothing is written to disk.
#copyImages(train_names_max, os.path.join(save_path, "train_max/"))
#copyImages(val_names_max, os.path.join(save_path, "val_max/"))

## Dataset mit nur Schmutzschäden nichts sonst

In [None]:
# Same seed and same steps as for the full dataset, so the 80/20 split is reproduced.
np.random.seed(42)
splitvalue = int(len(indexListg) * 0.8)
indexListn = np.asarray(indexListg, dtype=int)
np.random.shuffle(indexListn)  # shuffles in place
training, validation = indexListn[:splitvalue], indexListn[splitvalue:]

In [None]:
description_train = "trainingset_Schmutz"
description_val = "validierungsset_Schmutz"

# Keep only annotations with category_id 3 (presumably "Schmutz" - verify
# against cocox["categories"]).
train_set = createCOCOSubset(cocox, training, description_train, 3)
val_set = createCOCOSubset(cocox, validation, description_val, 3)

json_train = json.dumps(train_set, indent=4)
json_val = json.dumps(val_set, indent=4)

save_path = "../../../old/BAA/Data/"
folder_train = os.path.join(save_path, "train_Schmutz/")
folder_val = os.path.join(save_path, "val_Schmutz/")
# makedirs also creates missing parent folders (including save_path) and does
# nothing for folders that already exist.
for folder in (folder_train, folder_val):
    os.makedirs(folder, exist_ok=True)
train_set_path = os.path.join(folder_train, "coco_train.json")
val_set_path = os.path.join(folder_val, "coco_val.json")

# Writing is left commented out so that nothing is written to disk.
#writeJson(json_train, train_set_path)
#writeJson(json_val, val_set_path)

train_names = getImageNames(train_set)
val_names = getImageNames(val_set)

#copyImages(train_names, folder_train)
#copyImages(val_names, folder_val)

## Dataset mit nur Wasserschäden nichts sonst

In [None]:
# Same seed and same steps as for the full dataset, so the 80/20 split is reproduced.
np.random.seed(42)
splitvalue = int(len(indexListg) * 0.8)
indexListn = np.asarray(indexListg, dtype=int)
np.random.shuffle(indexListn)
training, validation = indexListn[:splitvalue], indexListn[splitvalue:]

description_train = "trainingset_Wasser"
description_val = "validierungsset_Wasser"

# Keep only annotations with category_id 9 (presumably "Wasser" - verify
# against cocox["categories"]).
train_set = createCOCOSubset(cocox, training, description_train, 9)
val_set = createCOCOSubset(cocox, validation, description_val, 9)

json_train = json.dumps(train_set, indent=4)
json_val = json.dumps(val_set, indent=4)

save_path = "../../../old/BAA/Data/"
folder_train = os.path.join(save_path, "train_Wasser/")
folder_val = os.path.join(save_path, "val_Wasser/")
# makedirs also creates missing parent folders (including save_path) and does
# nothing for folders that already exist.
for folder in (folder_train, folder_val):
    os.makedirs(folder, exist_ok=True)
train_set_path = os.path.join(folder_train, "coco_train.json")
val_set_path = os.path.join(folder_val, "coco_val.json")

# Writing is left commented out so that nothing is written to disk.
#writeJson(json_train, train_set_path)
#writeJson(json_val, val_set_path)

train_names = getImageNames(train_set)
val_names = getImageNames(val_set)

#copyImages(train_names, folder_train)
#copyImages(val_names, folder_val)

## Dataset mit nur Schäden nichts sonst

In [None]:
# Same seed and same steps as for the full dataset, so the 80/20 split is reproduced.
np.random.seed(42)
splitvalue = int(len(indexListg) * 0.8)
indexListn = np.asarray(indexListg, dtype=int)
np.random.shuffle(indexListn)
training, validation = indexListn[:splitvalue], indexListn[splitvalue:]

description_train = "trainingset_Schäden"
description_val = "validierungsset_Schäden"

# Keep only annotations with category_id 4 (presumably "Schäden" - verify
# against cocox["categories"]).
train_set = createCOCOSubset(cocox, training, description_train, 4)
val_set = createCOCOSubset(cocox, validation, description_val, 4)

json_train = json.dumps(train_set, indent=4)
json_val = json.dumps(val_set, indent=4)

save_path = "../../../old/BAA/Data/"
folder_train = os.path.join(save_path, "train_Schäden/")
folder_val = os.path.join(save_path, "val_Schäden/")
# makedirs also creates missing parent folders (including save_path) and does
# nothing for folders that already exist.
for folder in (folder_train, folder_val):
    os.makedirs(folder, exist_ok=True)
train_set_path = os.path.join(folder_train, "coco_train.json")
val_set_path = os.path.join(folder_val, "coco_val.json")

# Writing is left commented out so that nothing is written to disk.
#writeJson(json_train, train_set_path)
#writeJson(json_val, val_set_path)

train_names = getImageNames(train_set)
val_names = getImageNames(val_set)

#copyImages(train_names, folder_train)
#copyImages(val_names, folder_val)

## Dataset mit nur Sonstigen Sachen nichts sonst

In [None]:
# Same seed and same steps as for the full dataset, so the 80/20 split is reproduced.
np.random.seed(42)
splitvalue = int(len(indexListg) * 0.8)
indexListn = np.asarray(indexListg, dtype=int)
np.random.shuffle(indexListn)
training, validation = indexListn[:splitvalue], indexListn[splitvalue:]

description_train = "trainingset_Sonstiges"
description_val = "validierungsset_Sonstiges"

# Keep only annotations with category_id 6 (presumably "Sonstiges" - verify
# against cocox["categories"]).
train_set = createCOCOSubset(cocox, training, description_train, 6)
val_set = createCOCOSubset(cocox, validation, description_val, 6)

json_train = json.dumps(train_set, indent=4)
json_val = json.dumps(val_set, indent=4)

save_path = "../../../old/BAA/Data/"
folder_train = os.path.join(save_path, "train_Sonstiges/")
folder_val = os.path.join(save_path, "val_Sonstiges/")
# makedirs also creates missing parent folders (including save_path) and does
# nothing for folders that already exist.
for folder in (folder_train, folder_val):
    os.makedirs(folder, exist_ok=True)
train_set_path = os.path.join(folder_train, "coco_train.json")
val_set_path = os.path.join(folder_val, "coco_val.json")

# Writing is left commented out so that nothing is written to disk.
#writeJson(json_train, train_set_path)
#writeJson(json_val, val_set_path)

train_names = getImageNames(train_set)
val_names = getImageNames(val_set)

#copyImages(train_names, folder_train)
#copyImages(val_names, folder_val)

## Dataset mit nur Schimmelschäden nichts sonst

In [None]:
# Same seed and same steps as for the full dataset, so the 80/20 split is reproduced.
np.random.seed(42)
splitvalue = int(len(indexListg) * 0.8)
indexListn = np.asarray(indexListg, dtype=int)
np.random.shuffle(indexListn)
training, validation = indexListn[:splitvalue], indexListn[splitvalue:]

description_train = "trainingset_Schimmel"
description_val = "validierungsset_Schimmel"

# Keep only annotations with category_id 2 (presumably "Schimmel" - verify
# against cocox["categories"]).
train_set = createCOCOSubset(cocox, training, description_train, 2)
val_set = createCOCOSubset(cocox, validation, description_val, 2)

json_train = json.dumps(train_set, indent=4)
json_val = json.dumps(val_set, indent=4)

save_path = "../../../old/BAA/Data/"
folder_train = os.path.join(save_path, "train_Schimmel/")
folder_val = os.path.join(save_path, "val_Schimmel/")
# makedirs also creates missing parent folders (including save_path) and does
# nothing for folders that already exist.
for folder in (folder_train, folder_val):
    os.makedirs(folder, exist_ok=True)
train_set_path = os.path.join(folder_train, "coco_train.json")
val_set_path = os.path.join(folder_val, "coco_val.json")

# Writing is left commented out so that nothing is written to disk.
#writeJson(json_train, train_set_path)
#writeJson(json_val, val_set_path)

train_names = getImageNames(train_set)
val_names = getImageNames(val_set)

#copyImages(train_names, folder_train)
#copyImages(val_names, folder_val)

## Dataset mit nur Schädlingen nichts sonst

In [None]:
# Same seed and same steps as for the full dataset, so the 80/20 split is reproduced.
np.random.seed(42)
splitvalue = int(len(indexListg) * 0.8)
indexListn = np.asarray(indexListg, dtype=int)
np.random.shuffle(indexListn)
training, validation = indexListn[:splitvalue], indexListn[splitvalue:]

description_train = "trainingset_Schädlinge"
description_val = "validierungsset_Schädlinge"

# Keep only annotations with category_id 5 (presumably "Schädlinge" - verify
# against cocox["categories"]).
train_set = createCOCOSubset(cocox, training, description_train, 5)
val_set = createCOCOSubset(cocox, validation, description_val, 5)

json_train = json.dumps(train_set, indent=4)
json_val = json.dumps(val_set, indent=4)

save_path = "../../../old/BAA/Data/"
folder_train = os.path.join(save_path, "train_Schädlinge/")
folder_val = os.path.join(save_path, "val_Schädlinge/")
# makedirs also creates missing parent folders (including save_path) and does
# nothing for folders that already exist.
for folder in (folder_train, folder_val):
    os.makedirs(folder, exist_ok=True)
train_set_path = os.path.join(folder_train, "coco_train.json")
val_set_path = os.path.join(folder_val, "coco_val.json")

# Writing is left commented out so that nothing is written to disk.
#writeJson(json_train, train_set_path)
#writeJson(json_val, val_set_path)

train_names = getImageNames(train_set)
val_names = getImageNames(val_set)

#copyImages(train_names, folder_train)
#copyImages(val_names, folder_val)

Zeile 216 hat die meisten Schadenseinträge, insgesamt 37.

## Fazit

1. Es ist im Grossen und Ganzen ein guter Datensatz.
    - Es mussten keine Zeilen gelöscht werden.
2. Die Verlinkungen der einzelnen Bilder sind vollständig.
3. Die Datenmenge ist sehr dürftig für einen KI-Algorithmus.
    - Data Augmentation als Gegenmassnahme
4. Die Verteilung der Daten ist sehr ungleich. Bsp.: Es gibt 871 Bilder mit Schmutzschäden, aber nur vier mit Schädlingen.
    - Es braucht mehr Daten. Es ist eine Überlegung wert, die Fälle mit den wenigsten Vorkommnissen nicht oder erst in einem späteren Stadium zu verwenden. Alternativ könnten generell weniger Daten verwendet werden, um die Unterschiede in den Mengen zu verkleinern.