In [2]:
# Notwendige Bibliotheken importieren

import pandas as pd
import random
import matplotlib.pyplot as plt
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

In [5]:
# Load the three CSV files into one DataFrame each

df_Water, df_Micro, df_Knocking = (
    pd.read_csv(csv_name)
    for csv_name in ('waterFull.csv', 'microFull.csv', 'knockingFull.csv')
)

In [None]:
# Data preparation

def _prepare_recording(frame, label):
    """Return a cleaned copy of one raw frame, ready for sampling.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data as read from CSV; must contain a ``dBFS`` column.
    label : str
        Class name written to the ``Label`` column of every row.

    Returns
    -------
    pandas.DataFrame
        New frame without the ``time`` column, with a constant ``Label``
        column appended and ``dBFS`` rounded to whole numbers.  The input
        frame is not modified.
    """
    # errors='ignore' keeps this cell re-runnable: on a second execution the
    # 'time' column is already gone and a plain drop would raise KeyError.
    prepared = frame.drop(columns=['time'], errors='ignore')

    # Attach the class label
    prepared['Label'] = label

    # Round to whole numbers
    prepared['dBFS'] = prepared['dBFS'].round(decimals=0)
    return prepared


df_Water = _prepare_recording(df_Water, 'Water')
df_Knocking = _prepare_recording(df_Knocking, 'Knocking')
df_Micro = _prepare_recording(df_Micro, 'Microwave')

In [None]:
# Daten in einzelne Samples aufteilen

def split_dataframe(dataframe, sampleSize, sampleCount, max_attempts=100):
    """Cut up to ``sampleCount`` distinct windows of consecutive rows.

    Window start positions are drawn at random without replacement, so the
    same window is never returned twice.  Different windows may still
    overlap each other.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source rows; windows are taken by position (``iloc``).
    sampleSize : int
        Number of consecutive rows in every window.
    sampleCount : int
        Desired number of windows.
    max_attempts : int, optional
        No longer used.  It limited the retries of the former
        draw-and-reject loop and is kept only so existing calls stay valid.

    Returns
    -------
    list of pandas.DataFrame
        Independent copies of the selected windows, in random order.  The
        list is shorter than ``sampleCount`` when the frame does not contain
        that many distinct windows, and empty when ``sampleSize`` or
        ``sampleCount`` is not positive or ``sampleSize`` exceeds the number
        of rows.
    """
    df_length = len(dataframe)
    if sampleSize <= 0 or sampleCount <= 0 or sampleSize > df_length:
        return []

    # Each valid start position yields a different window, so drawing the
    # positions without replacement guarantees uniqueness directly; no
    # per-window hashing or retry counter is needed.
    window_count = df_length - sampleSize + 1
    start_positions = random.sample(range(window_count), min(sampleCount, window_count))

    return [
        dataframe.iloc[start:start + sampleSize].copy()
        for start in start_positions
    ]

# Choose the length and number of training samples:
# 10 rows ~ 1 second (per the original author's note)
# the number of samples is limited by the number of rows in the CSV file

split_Water, split_Knocking, split_Micro = (
    split_dataframe(class_frame, 50, 2000)
    for class_frame in (df_Water, df_Knocking, df_Micro)
)

In [None]:
# Feature engineering: reduce every sample to one row of dBFS statistics
# and merge the rows of all classes into a single DataFrame

def _summarize_sample(sample):
    """Return the dBFS summary statistics and the label of one sample.

    Parameters
    ----------
    sample : pandas.DataFrame
        Non-empty frame with a ``dBFS`` column and a ``Label`` column.

    Returns
    -------
    dict
        One value per feature column plus ``"Label"``, taken from the
        sample's first row.
    """
    levels = sample["dBFS"]
    return {
        "dBFS_Varianz": levels.var(),
        "dBFS_STD": levels.std(),
        "dBFS_mean": levels.mean(),
        "dBFS_min": levels.min(),
        "dBFS_max": levels.max(),
        "dBFS_absMax": levels.abs().max(),
        "dBFS_sum": levels.sum(),
        "dBFS_median": levels.median(),
        "Label": sample["Label"].iloc[0],
    }


# Row order is Knocking, Water, Microwave.
labelled_samples = [
    sample
    for sample_group in (split_Knocking, split_Water, split_Micro)
    for sample in sample_group
    if not sample.empty  # an empty sample has no label and contributes no row
]
if not labelled_samples:
    raise ValueError("No samples available for feature extraction")

# Build the frame once from plain records instead of creating a full-length
# frame per sample and keeping only its first row.
df_final = pd.DataFrame(
    [_summarize_sample(sample) for sample in labelled_samples],
    # Each row keeps the index label of its sample's first row.
    index=[sample.index[0] for sample in labelled_samples],
)

In [None]:
# Split the data into training and test sets
# NOTE(review): the samples are randomly positioned windows of the same
# files, so windows in the training and test set may overlap — confirm that
# this does not inflate the scores below.

x = df_final.drop(columns = ["Label"])
y = df_final["Label"]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Choice of classifier; seeded like the split so that refitting on the same
# split produces the same model

classifier = RandomForestClassifier(random_state=42)

# Train the classifier

classifier.fit(X_train, y_train)

# Predict the labels of the test data

y_pred = classifier.predict(X_test)

In [None]:
# Score the classifier's predictions on the test set

accuracy = accuracy_score(y_test, y_pred)
x_ = classification_report(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

for caption, value in (
    ('Genauigkeit:', accuracy),
    ('Präzision:', precision),
    ('Recall:', recall),
    ('Neu:', x_),
):
    print(caption, value)

# Compute the confusion matrix and print it below its heading

cm = confusion_matrix(y_test, y_pred)
print('Konfusionsmatrix:', cm, sep='\n')

# Plot the feature importances. This part applies to the random forest only,
# so it is skipped when another classifier (e.g. KNeighborsClassifier) is
# selected instead of failing on the missing attributes.

if isinstance(classifier, RandomForestClassifier):
    importance = classifier.feature_importances_
    # Spread of each feature's importance across the individual trees,
    # drawn as error bars
    std = np.std([tree.feature_importances_ for tree in classifier.estimators_], axis=0)

    forest_importances = pd.Series(importance, index=x.columns)

    fig, ax = plt.subplots()
    forest_importances.plot.bar(yerr=std, ax=ax)

    ax.set_title("Feature importances using MDI")
    ax.set_ylabel("Mean decrease in impurity")
    fig.tight_layout()