In [2]:
import os
import pandas as pd

# Pfad zum Hauptordner
root_directory = 'measurement_data'

# Initialisieren des DataFrames
# Haupt-DataFrames
df = pd.DataFrame(columns=["EPOCH_TIME", "DEVICE_NAME", "SENSOR_TYPE", "MEASUREMENT_VALUE", "GPS_LAT", "GPS_LONG", "SOURCE_FOLDER", "Z_VALUE"])

# Funktion zum Extrahieren der Daten
def extract_data(file_path, source_folder):
    data = []
    gps_data = {}  # Zwischenablage für GPS-Daten
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if line:
                parts = line.split(';')
                if len(parts) == 4:
                    epoch_time, device_name, sensor_type, measurement_value = parts
                    if sensor_type == "AB":
                        gps_lat, gps_long = measurement_value.split(',')
                        gps_data[epoch_time] = (gps_lat, gps_long)
                        # die Zeile mit GPS-Daten ebenfalls in die data-Liste einfügen
                        data.append([epoch_time, device_name, sensor_type, measurement_value, gps_lat, gps_long, source_folder])
                    elif sensor_type == "20":
                        x, y, z = measurement_value.split(',')
                        # die Zeile mit z_daten ebenfalls in die data-Liste einfügen
                        data.append([epoch_time, device_name, sensor_type, measurement_value, None, None, source_folder, z])
                    else:
                        gps_lat, gps_long = gps_data.get(epoch_time, (None, None))
                        data.append([epoch_time, device_name, sensor_type, measurement_value, gps_lat, gps_long, source_folder])
    return data

# Rekursiv alle txt-Dateien durchsuchen
for subdir, _, files in os.walk(root_directory):
    for file in files:
        if file.endswith('.txt'):
            file_path = os.path.join(subdir, file)
            source_folder = os.path.relpath(subdir, root_directory)
            df = pd.concat([df, pd.DataFrame(extract_data(file_path, source_folder), columns=df.columns)], ignore_index=True)

# Speichern in CSV
output_file = 'measurement_data/combined_dataset_with_ab.csv'
df.to_csv(output_file, sep=';', index=False)

print(f'Daten wurden erfolgreich in {output_file} gespeichert.')


Daten wurden erfolgreich in measurement_data/combined_dataset_with_ab.csv gespeichert.


In [5]:
# Daten nach Sensortyp 20 filtern
df_filtered = df[df["SENSOR_TYPE"] == "20"]

# Spalten löschen bis auf epoch_time, measurement_value, source_folder, z
df_filtered = df_filtered[["EPOCH_TIME", "SOURCE_FOLDER", "Z_VALUE"]]

output_filtered_file = 'measurement_data/filtered_dataset.csv'
df_filtered.to_csv(output_filtered_file, sep=";", index=False)

print(f'Gefilterte Daten wurden erfolgreich in {output_filtered_file} gespeichert.')

Gefilterte Daten wurden erfolgreich in measurement_data/filtered_dataset.csv gespeichert.


In [7]:
# Hier lassen wir uns die ersten 10 Zeilen ausgeben, um einen ersten Eindruck zubekommen

df_filtered.head(10)

print(df_filtered.head)

<bound method NDFrame.head of            EPOCH_TIME SOURCE_FOLDER Z_VALUE
9       1629745198027   bumpy_roads   -0.56
10      1629745198028   bumpy_roads    0.16
11      1629745198029   bumpy_roads   -1.04
12      1629745198029   bumpy_roads    -0.4
13      1629745198029   bumpy_roads    0.08
...               ...           ...     ...
424235  1630527720257  flat_streets     0.0
424236  1630527720266  flat_streets   -0.19
424237  1630527720276  flat_streets     0.2
424238  1630527720286  flat_streets    0.05
424239  1630527720299  flat_streets    0.28

[411543 rows x 3 columns]>


In [8]:
# Statistische Zusammenfassung
df_filtered.describe()

Unnamed: 0,EPOCH_TIME,SOURCE_FOLDER,Z_VALUE
count,411543,411543,411543.0
unique,411260,3,1120.0
top,1630851379101,flat_streets,0.14
freq,6,231240,3320.0


In [9]:
# Informationen über die Datentypen und Nullwerte
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424240 entries, 0 to 424239
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   EPOCH_TIME         424240 non-null  object
 1   DEVICE_NAME        424240 non-null  object
 2   SENSOR_TYPE        424240 non-null  object
 3   MEASUREMENT_VALUE  424240 non-null  object
 4   GPS_LAT            1485 non-null    object
 5   GPS_LONG           1485 non-null    object
 6   SOURCE_FOLDER      424240 non-null  object
 7   Z_VALUE            411543 non-null  object
dtypes: object(8)
memory usage: 25.9+ MB


In [13]:
for col in df_filtered.columns:
    print(col)
    print(df_filtered[col] .nunique())

EPOCH_TIME
411260
SOURCE_FOLDER
3
Z_VALUE
1120
