In [2]:
import os
import pandas as pd

In [3]:
activity_map = {
    "A": "walking", 
    "B": "jogging", 
    "C": "stairs", 
    "D": "sitting", 
    "E": "standing", 
    "F": "typing", 
    "G": "teeth", 
    "H": "soup", 
    "I": "chips", 
    "J": "pasta", 
    "K": "drinking", 
    "L": "sandwich", 
    "M": "kicking", 
    "O": "catch", 
    "P": "dribbling", 
    "Q": "writing", 
    "R": "clapping", 
    "S": "folding"
}

In [4]:
def load_txt_data(file_path, header_line):
    """
    Wczytuje dane z pliku tekstowego, dodaje nagłówki i zwraca DataFrame.

    Args:
        file_path (str): Ścieżka do pliku tekstowego.
        header_line (str): Nagłówki kolumn w formacie CSV.

    Returns:
        pd.DataFrame: DataFrame z wczytanymi danymi.
    """
    try:
        # Rozdziel nagłówki na listę
        headers = header_line.split(",")
        
        # Wczytaj dane, usuwając średnik na końcu każdej linii
        data = pd.read_csv(file_path, sep=",", header=None, names=headers, engine="python")
        
        # Usuń ewentualne spacje w nazwach kolumn
        data.columns = data.columns.str.strip()
        
        return data
    except Exception as e:
        print(f"Error loading file {file_path}: {e}")
        return None

In [5]:
def load_all_files_in_folder(folder_path, header_line):
    """
    Wczytuje wszystkie pliki z danego folderu, łączy je w jedną ramkę danych.

    Args:
        folder_path (str): Ścieżka do folderu z plikami.
        header_line (str): Nagłówki kolumn w formacie CSV.

    Returns:
        pd.DataFrame: Połączona ramka danych z wszystkich plików.
    """
    all_data = []
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if os.path.isfile(file_path) and file_name.endswith(".txt"):
            data = load_txt_data(file_path, header_line)
            if data is not None:
                all_data.append(data)
    
    # Połącz wszystkie ramki danych w jedną
    combined_data = pd.concat(all_data, ignore_index=True)
    return combined_data

In [6]:

accel_header_line = "Subject-id,Activity Label,Timestamp,ac_x,ac_y,ac_z"
gyro_header_line = "Subject-id,Activity Label,Timestamp,g_x,g_y,g_z"
accel_dir_path = "d:/metamotion-ml/data_loader/wisdm-dataset/raw/accel/"
gyro_dir_path = "d:/metamotion-ml/data_loader/wisdm-dataset/raw/gyro/"

accel_data = load_all_files_in_folder(accel_dir_path, accel_header_line)
print(accel_data.info())
gyro_data = load_all_files_in_folder(gyro_dir_path, gyro_header_line)
print(gyro_data.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3777046 entries, 0 to 3777045
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   Subject-id      int64  
 1   Activity Label  object 
 2   Timestamp       int64  
 3   ac_x            float64
 4   ac_y            float64
 5   ac_z            object 
dtypes: float64(2), int64(2), object(2)
memory usage: 172.9+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3440342 entries, 0 to 3440341
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   Subject-id      int64  
 1   Activity Label  object 
 2   Timestamp       int64  
 3   g_x             float64
 4   g_y             float64
 5   g_z             object 
dtypes: float64(2), int64(2), object(2)
memory usage: 157.5+ MB
None


In [7]:
merged_df = pd.merge(accel_data, gyro_data, on=["Subject-id", "Timestamp", "Activity Label"], how="inner")

In [8]:
merged_df['Activity Label'] = merged_df['Activity Label'].replace(activity_map)

In [9]:
allowed_activities = ['walking', 'jogging', 'stairs', 'sitting', 'standing']  # Lista dozwolonych wartości
filtered_df = merged_df[merged_df['Activity Label'].isin(allowed_activities)]

In [10]:
filtered_df['Activity Label'].value_counts()

Activity Label
standing    195324
sitting     192594
walking     186149
stairs      183264
jogging     181618
Name: count, dtype: int64

In [11]:
filtered_df

Unnamed: 0,Subject-id,Activity Label,Timestamp,ac_x,ac_y,ac_z,g_x,g_y,g_z
0,1600,walking,90426757696641,4.972757,-0.158317,6.6967316;,0.314944,-1.022277,-0.3099616;
1,1600,walking,90426807196641,3.253720,-0.191835,6.107758;,0.387382,-0.618541,-0.048971802;
2,1600,walking,90426856696641,2.801216,-0.155922,5.997625;,0.070999,-0.209480,-0.1959783;
3,1600,walking,90426906196641,3.770868,-1.051354,7.731027;,0.037975,0.254976,-0.1565635;
4,1600,walking,90426955696641,4.661511,0.169689,9.684695;,0.073129,0.719431,-0.0010349044;
...,...,...,...,...,...,...,...,...,...
3321742,1650,standing,2425530011432855,2.357989,-1.719186,10.213066;,-0.248304,-0.548543,-0.22623792;
3321743,1650,standing,2425530061425505,2.487276,-1.676091,10.12448;,-0.268544,-0.394079,-0.18149683;
3321744,1650,standing,2425530111418155,2.851195,-1.702427,9.988011;,-0.255761,-0.246007,-0.19960631;
3321745,1650,standing,2425530161410805,2.884714,-1.611447,9.664794;,-0.322873,-0.096870,-0.23582532;


In [31]:
filtered_df.to_parquet("wsidm.parquet", compression="snappy")