In [1]:
import pandas as pd
from scipy import stats

# Load the combined sensor log. The file is semicolon-separated and is read
# without a header row, so the columns are addressed by position and the
# file's own header lines end up as ordinary data rows.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
data = pd.read_csv(
    "/Users/olivialawinski/IKT/analytics/Semesterabgabe/ikt-semesterabgabe/combined_dataset_sorted.csv",
    sep=';',
    header=None,
    low_memory=False,
)

print(data.columns)

Index([0, 1, 2, 3, 4, 5, 6], dtype='int64')


In [2]:
# Print the raw frame (pandas abbreviates long frames) to inspect the loaded rows
print(data)

                    0              1              2                    3  \
0          EPOCH_TIME    DEVICE_NAME    SENSOR_TYPE    MEASUREMENT_VALUE   
1       #[EPOCH_TIME]  [DEVICE_NAME]  [SENSOR_TYPE]  [MEASUREMENT_VALUE]   
2       1629745197972    LOCAL_PHONE              Y                   42   
3       1629745197972    LOCAL_PHONE             80                 SL_0   
4       1629745197972    LOCAL_PHONE             83                 RT_O   
...               ...            ...            ...                  ...   
424236  1630527720257    LOCAL_PHONE             20       -0.72,0.05,0.0   
424237  1630527720266    LOCAL_PHONE             20     -0.64,0.06,-0.19   
424238  1630527720276    LOCAL_PHONE             20        -0.72,0.1,0.2   
424239  1630527720286    LOCAL_PHONE             20      -0.7,-0.03,0.05   
424240  1630527720299    LOCAL_PHONE             20      -0.52,0.18,0.28   

              4         5              6  
0       GPS_LAT  GPS_LONG  SOURCE_FOLDER  
1

In [3]:
# Keep only the rows whose sensor type (column 2) is '20', i.e. acceleration readings
acceleration_data = data[data[2] == '20']

# Column 3 holds the reading as one comma-separated "x,y,z" string;
# split it into three float columns
acceleration_values = acceleration_data[3].str.split(',', expand=True).astype(float)
acceleration_values.columns = ['acceleration_x', 'acceleration_y', 'acceleration_z']

# Attach the source_folder column (column 6); the assignment aligns on the row index
acceleration_values['source_folder'] = acceleration_data[6]

# Outlier removal: keep only rows whose z-score is below 3 in absolute value
# on all three axes.
# nan_policy='omit' matters because missing values are only dropped in a later
# step: with the default policy a single NaN turns the z-scores of its whole
# column into NaN, every comparison below becomes False and no row survives.
# With 'omit', NaNs are ignored when computing mean/std, and rows that contain
# a NaN are still excluded here because NaN < 3 evaluates to False.
z_scores = stats.zscore(
    acceleration_values[['acceleration_x', 'acceleration_y', 'acceleration_z']],
    nan_policy='omit',
)
filtered_data = acceleration_values[(abs(z_scores) < 3).all(axis=1)]

# Show the data after cleaning
print(filtered_data)

        acceleration_x  acceleration_y  acceleration_z source_folder
10                1.16           -0.59           -0.56   bumpy_roads
11               -0.32            0.39            0.16   bumpy_roads
12                0.30           -0.50           -1.04   bumpy_roads
13                0.91            0.21           -0.40   bumpy_roads
14               -0.82           -0.03            0.08   bumpy_roads
...                ...             ...             ...           ...
424236           -0.72            0.05            0.00  flat_streets
424237           -0.64            0.06           -0.19  flat_streets
424238           -0.72            0.10            0.20  flat_streets
424239           -0.70           -0.03            0.05  flat_streets
424240           -0.52            0.18            0.28  flat_streets

[390874 rows x 4 columns]


In [4]:
# Discard every row that still has a missing value in any column
filtered_data = filtered_data.dropna()

In [8]:
import pandas as pd
import numpy as np

# Sliding-window parameters
fenstergröße = 100  # samples per window (about 1 second according to the author's note)
schrittweite = 50   # offset between window starts, so neighbouring windows share 50 samples

# Road-type labels and z-axis acceleration of the cleaned data as plain arrays.
# NOTE(review): this rebinds `data`, which previously held the raw DataFrame;
# re-running the earlier filtering cell afterwards will no longer work.
straßentypen = filtered_data['source_folder'].values
data = filtered_data['acceleration_z'].values

# Sliding-window helper
def sliding_window(data, fenstergröße, schrittweite):
    """Cut ``data`` into fixed-length, possibly overlapping windows.

    Parameters:
        data: Any object that supports ``len()`` and slicing.
        fenstergröße: Number of elements in each window.
        schrittweite: Distance between the start positions of two
            consecutive windows.

    Returns:
        A list with one slice of ``data`` per window, in order. Only full
        windows are produced; a trailing remainder shorter than
        ``fenstergröße`` is left out, and the list is empty when ``data``
        is shorter than one window.
    """
    last_start = len(data) - fenstergröße
    return [
        data[start:start + fenstergröße]
        for start in range(0, last_start + 1, schrittweite)
    ]

# Cut the z-axis acceleration values into windows
windows = sliding_window(data, fenstergröße, schrittweite)

# Feature engineering: one (mean, standard deviation) pair per window,
# collected in a DataFrame
feature_list = [(np.mean(segment), np.std(segment)) for segment in windows]
features_df = pd.DataFrame(feature_list, columns=['Durchschnitt', 'Standardabweichung'])

# Label each window with the road type that occurs most often inside it.
# The start positions are generated exactly like in sliding_window, so the
# n-th label belongs to the n-th feature row.
labels_for_windows = []
for start in range(0, len(straßentypen) - fenstergröße + 1, schrittweite):
    road_types, occurrences = np.unique(
        straßentypen[start:start + fenstergröße], return_counts=True
    )
    labels_for_windows.append(road_types[occurrences.argmax()])

# Convert the labels to a Series
labels = pd.Series(labels_for_windows)

# Compare the number of labels with the number of feature rows
print(f"Anzahl der Labels: {len(labels)}, Anzahl der Features: {len(features_df)}")

Anzahl der Labels: 7816, Anzahl der Features: 7816


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# Mean and standard deviation of every window, one tuple per window
feature_list = [(np.mean(window), np.std(window)) for window in windows]

# Store the computed features in a DataFrame
features_df = pd.DataFrame(
    feature_list,
    columns=['Durchschnitt', 'Standardabweichung'],
)

# Numeric code for each road type.
# NOTE(review): this mapping is not applied anywhere in the visible part of the
# notebook; the labels built below stay as road-type strings.
label_mapping = {'bumpy_roads': 1, 'cobblestone_street': 2, 'flat_streets': 3}

# Build one label per sliding window so that labels and feature rows line up:
# each window gets the road type that occurs most often among its rows.
road_type_column = filtered_data['source_folder']
labels_for_windows = []
for start in range(0, len(road_type_column) - fenstergröße + 1, schrittweite):
    # Positional slice: rows start .. start + fenstergröße - 1
    window_labels = road_type_column.iloc[start:start + fenstergröße]
    road_types, occurrences = np.unique(window_labels, return_counts=True)
    labels_for_windows.append(road_types[occurrences.argmax()])

# Labels as a Series
labels = pd.Series(labels_for_windows)

# 1. Split into 70 % training and 30 % test data with a fixed seed.
# NOTE(review): the windows overlap (length 100, step 50) and train_test_split
# shuffles by default, so neighbouring windows that share samples can end up on
# both sides of the split — confirm this is acceptable for the evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    features_df,
    labels,
    test_size=0.3,
    random_state=42,
)

# 2. + 4. k-nearest-neighbours (k=3, open to tuning): fit, then predict on the test set
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train, y_train)
knn_predictions = knn_model.predict(X_test)

# 3. + 4. Random forest with 100 trees and a fixed seed: fit, then predict on the test set
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

# 5./6. Model evaluation: for KNN first, then the random forest, print the
# confusion matrix, the per-class report and the overall accuracy
for title, predictions in (
    ("KNN Model Evaluation", knn_predictions),
    ("Random Forest Model Evaluation", rf_predictions),
):
    print(title)
    print(confusion_matrix(y_test, predictions))
    print(classification_report(y_test, predictions))
    print("Accuracy:", accuracy_score(y_test, predictions))

KNN Model Evaluation
[[  56   82  114]
 [ 106  440  204]
 [ 107  151 1085]]
                    precision    recall  f1-score   support

       bumpy_roads       0.21      0.22      0.21       252
cobblestone_street       0.65      0.59      0.62       750
      flat_streets       0.77      0.81      0.79      1343

          accuracy                           0.67      2345
         macro avg       0.55      0.54      0.54      2345
      weighted avg       0.67      0.67      0.67      2345

Accuracy: 0.6742004264392324
Random Forest Model Evaluation
[[  21   91  140]
 [  37  466  247]
 [  32  158 1153]]
                    precision    recall  f1-score   support

       bumpy_roads       0.23      0.08      0.12       252
cobblestone_street       0.65      0.62      0.64       750
      flat_streets       0.75      0.86      0.80      1343

          accuracy                           0.70      2345
         macro avg       0.54      0.52      0.52      2345
      weighted avg      