In [None]:
import os
import pandas as pd

In [17]:
activity_map = {
    "A": "walking", 
    "B": "jogging", 
    "C": "stairs", 
    "D": "sitting", 
    "E": "standing", 
    "F": "typing", 
    "G": "teeth", 
    "H": "soup", 
    "I": "chips", 
    "J": "pasta", 
    "K": "drinking", 
    "L": "sandwich", 
    "M": "kicking", 
    "O": "catch", 
    "P": "dribbling", 
    "Q": "writing", 
    "R": "clapping", 
    "S": "folding"
}

In [3]:
def load_txt_data(file_path, header_line):
    """
    Wczytuje dane z pliku tekstowego, dodaje nagłówki i zwraca DataFrame.

    Args:
        file_path (str): Ścieżka do pliku tekstowego.
        header_line (str): Nagłówki kolumn w formacie CSV.

    Returns:
        pd.DataFrame: DataFrame z wczytanymi danymi.
    """
    try:
        # Rozdziel nagłówki na listę
        headers = header_line.split(",")
        
        # Wczytaj dane, usuwając średnik na końcu każdej linii
        data = pd.read_csv(file_path, sep=",", header=None, names=headers, engine="python")
        
        # Usuń ewentualne spacje w nazwach kolumn
        data.columns = data.columns.str.strip()
        
        return data
    except Exception as e:
        print(f"Error loading file {file_path}: {e}")
        return None

In [11]:
def load_all_files_in_folder(folder_path, header_line):
    """
    Wczytuje wszystkie pliki z danego folderu, łączy je w jedną ramkę danych.

    Args:
        folder_path (str): Ścieżka do folderu z plikami.
        header_line (str): Nagłówki kolumn w formacie CSV.

    Returns:
        pd.DataFrame: Połączona ramka danych z wszystkich plików.
    """
    all_data = []
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if os.path.isfile(file_path) and file_name.endswith(".txt"):
            data = load_txt_data(file_path, header_line)
            if data is not None:
                all_data.append(data)
    
    # Połącz wszystkie ramki danych w jedną
    combined_data = pd.concat(all_data, ignore_index=True)
    return combined_data

In [25]:

accel_header_line = "Subject-id,Activity Label,Timestamp,ac_x,ac_y,ac_z"
gyro_header_line = "Subject-id,Activity Label,Timestamp,g_x,g_y,g_z"
accel_dir_path = "d:/metamotion-ml/data_loader/wisdm-dataset/raw/accel/"
gyro_dir_path = "d:/metamotion-ml/data_loader/wisdm-dataset/raw/gyro/"

accel_data = load_all_files_in_folder(accel_dir_path, accel_header_line)
print(accel_data.info())
gyro_data = load_all_files_in_folder(gyro_dir_path, gyro_header_line)
print(gyro_data.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3777046 entries, 0 to 3777045
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   Subject-id      int64  
 1   Activity Label  object 
 2   Timestamp       int64  
 3   ac_x            float64
 4   ac_y            float64
 5   ac_z            object 
dtypes: float64(2), int64(2), object(2)
memory usage: 172.9+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3440342 entries, 0 to 3440341
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   Subject-id      int64  
 1   Activity Label  object 
 2   Timestamp       int64  
 3   g_x             float64
 4   g_y             float64
 5   g_z             object 
dtypes: float64(2), int64(2), object(2)
memory usage: 157.5+ MB
None


In [26]:
merged_df = pd.merge(accel_data, gyro_data, on=["Subject-id", "Timestamp", "Activity Label"], how="inner")

In [27]:
merged_df['Activity Label'] = merged_df['Activity Label'].replace(activity_map)

In [28]:
allowed_activities = ['walking', 'jogging', 'stairs', 'sitting', 'standing']  # Lista dozwolonych wartości
filtered_df = merged_df[merged_df['Activity Label'].isin(allowed_activities)]

In [29]:
filtered_df['Activity Label'].value_counts()

Activity Label
standing    195324
sitting     192594
walking     186149
stairs      183264
jogging     181618
Name: count, dtype: int64

In [32]:
filtered_df['Subject-id'].value_counts()

Subject-id
1629    72064
1621    24534
1628    23509
1645    20158
1650    19761
1647    18671
1602    18111
1625    18030
1624    18027
1600    18019
1614    18019
1630    18018
1626    18018
1627    18017
1631    18017
1632    18017
1611    18016
1634    18016
1604    18016
1622    18016
1635    18015
1636    18015
1610    18015
1606    18015
1617    18015
1633    18015
1615    18015
1608    18015
1609    18015
1612    18015
1620    18015
1618    18014
1605    18012
1603    18011
1607    18010
1648    18009
1649    18007
1601    18007
1613    18006
1641    18005
1619    18000
1623    18000
1646    17994
1644    17992
1643    17991
1616    14412
1642    14396
1638    11682
1639     6819
1640     5965
1637     4398
Name: count, dtype: int64

In [31]:
filtered_df.to_parquet("wsidm.parquet", compression="snappy")