## Kids DataFrame

In [2]:
# Library imports
import os
import pandas as pd


In [47]:
import os
import pandas as pd

def _read_sensor_file(file_path, prefix):
    """Read one tab-separated sensor recording and add its vector magnitude.

    Args:
        file_path: Path to a headerless, tab-separated text file; only its
            first six columns are read.
        prefix: Short sensor tag ('ro' or 'la') used to name the columns.

    Returns:
        DataFrame with columns ``<prefix>_timestamp``, ``<prefix>_internal_ts``,
        ``<prefix>_x``, ``<prefix>_y``, ``<prefix>_z``, ``<prefix>_app_id`` and
        ``<prefix>_mag`` (Euclidean norm of the x/y/z columns).
    """
    sensor_df = pd.read_csv(file_path, delimiter='\t', header=None, usecols=[0, 1, 2, 3, 4, 5])
    sensor_df.columns = [
        f'{prefix}_timestamp', f'{prefix}_internal_ts',
        f'{prefix}_x', f'{prefix}_y', f'{prefix}_z', f'{prefix}_app_id',
    ]
    sensor_df[f'{prefix}_mag'] = (
        sensor_df[f'{prefix}_x']**2 + sensor_df[f'{prefix}_y']**2 + sensor_df[f'{prefix}_z']**2
    )**0.5
    return sensor_df


def load_and_concat_sensor_data(root_dir, i, label=1):
    """Load paired rotation / linear-acceleration recordings for one subject.

    The function looks inside ``root_dir`` for the first sub-directory (in
    sorted name order) whose name contains ``'_kid'``, then pairs the sorted
    ``.txt`` files of its ``rotation`` and ``LinearAcceleration`` folders by
    position. Each pair is read and joined side by side (``axis=1``, i.e.
    aligned on row position, not on timestamp), so a pair with different row
    counts yields NaN-padded rows.

    Args:
        root_dir: Directory expected to contain the ``*_kid`` folder.
        i: Subject identifier written to the ``id`` column of every frame.
        label: Class label written to the ``label`` column (default 1, the
            value that was previously hard-coded).

    Returns:
        List of DataFrames, one per successfully read file pair. The list is
        empty when ``root_dir``, the ``*_kid`` folder or a sensor folder is
        missing; a message is printed in each of those cases.
    """
    concatenated_data_frames = []

    # A missing Sensors directory must not abort a multi-subject loading loop.
    if not os.path.isdir(root_dir):
        print(f"Sensors directory not found: {root_dir}")
        return concatenated_data_frames

    # Sorted scan so the chosen folder does not depend on os.listdir() order.
    dynamic_folder = next(
        (d for d in sorted(os.listdir(root_dir))
         if '_kid' in d and os.path.isdir(os.path.join(root_dir, d))),
        None,
    )
    if dynamic_folder is None:
        print("No folder matching the pattern '*_kid' found.")
        return concatenated_data_frames

    dynamic_folder_path = os.path.join(root_dir, dynamic_folder).replace("\\", "/")

    # Sensor folder name -> column prefix
    sensor_prefixes = {'rotation': 'ro', 'LinearAcceleration': 'la'}
    sensor_paths = {
        sensor: os.path.join(dynamic_folder_path, sensor).replace("\\", "/")
        for sensor in sensor_prefixes
    }

    # isdir (not exists): a plain file with a sensor's name cannot be listed.
    if not all(os.path.isdir(sensor_path) for sensor_path in sensor_paths.values()):
        print(f"One or more sensor folders not found: {sensor_paths}")
        return concatenated_data_frames

    sensor_files = {
        sensor: sorted(f for f in os.listdir(sensor_paths[sensor]) if f.endswith('.txt'))
        for sensor in sensor_prefixes
    }

    # zip() below stops at the shorter list; report the dropped recordings
    # instead of losing them silently.
    n_rotation = len(sensor_files['rotation'])
    n_linear = len(sensor_files['LinearAcceleration'])
    if n_rotation != n_linear:
        print(
            f"Warning: {n_rotation} rotation files vs {n_linear} LinearAcceleration files "
            f"in {dynamic_folder_path}; only the first {min(n_rotation, n_linear)} pairs are used."
        )

    for rotation_file, linear_acc_file in zip(sensor_files['rotation'], sensor_files['LinearAcceleration']):
        rotation_file_path = os.path.join(sensor_paths['rotation'], rotation_file).replace("\\", "/")
        linear_acc_file_path = os.path.join(sensor_paths['LinearAcceleration'], linear_acc_file).replace("\\", "/")

        # A pair is skipped as soon as one of its two files cannot be parsed.
        try:
            rotation_df = _read_sensor_file(rotation_file_path, 'ro')
        except Exception as e:
            print(f"Error reading {rotation_file_path}: {e}")
            continue

        try:
            linear_acc_df = _read_sensor_file(linear_acc_file_path, 'la')
        except Exception as e:
            print(f"Error reading {linear_acc_file_path}: {e}")
            continue

        # Side-by-side join on row position
        concatenated_df = pd.concat([rotation_df, linear_acc_df], axis=1)
        concatenated_df['id'] = i
        concatenated_df['label'] = label

        concatenated_data_frames.append(concatenated_df)

    return concatenated_data_frames

# Load the recordings of Kid1 .. Kid25 into a dict keyed "kid<N>_sensors";
# each value is the list of per-recording DataFrames returned by the loader.
root_base_dir = 'D:/internship/code/KidsOnThePhone_dataset/Kids'
kids_sensors = {}
kid_numbers = range(1, 26)

for i in kid_numbers:
    root_dir = os.path.join(root_base_dir, f'Kid{i}', 'Sensors')
    kid_frames = load_and_concat_sensor_data(root_dir, i)
    kids_sensors[f"kid{i}_sensors"] = kid_frames

# Show the first row of Kid1's first recording, when at least one was loaded
kid1_frames = kids_sensors.get("kid1_sensors")
if kid1_frames:
    print(kid1_frames[0].head(1))


   ro_timestamp  ro_internal_ts      ro_x      ro_y      ro_z  ro_app_id  \
0  1.498776e+12    5.338069e+14  0.309998  0.073929  0.143112      105.0   

     ro_mag   la_timestamp   la_internal_ts      la_x      la_y      la_z  \
0  0.349349  1498776153671  533806775477233  0.177094 -0.224823 -0.130081   

   la_app_id   la_mag  id  label  
0      105.0  0.31437   1      1  


In [43]:
# Number of recording DataFrames loaded for Kid1
kid1_recordings = kids_sensors["kid1_sensors"]
print(len(kid1_recordings))


23


## Data Preprocessing

In [44]:
# Data-quality report before cleaning: for every recording of every kid,
# count fully duplicated rows and missing cells.
report_separator = "-" * 30

for key, data_frames in kids_sensors.items():
    print(f"Processing {key}:")
    for position, df in enumerate(data_frames, start=1):
        print(f"DataFrame {position}:")

        # Rows that are exact copies of an earlier row
        duplicate_rows = df.duplicated().sum()
        print(f"Number of duplicates: {duplicate_rows}")

        # Missing cells summed over all columns
        missing_cells = df.isna().sum().sum()
        print(f"Number of null values: {missing_cells}")

        print(report_separator)


Processing kid1_sensors:
DataFrame 1:
Number of duplicates: 0
Number of null values: 36
------------------------------
DataFrame 2:
Number of duplicates: 0
Number of null values: 0
------------------------------
DataFrame 3:
Number of duplicates: 0
Number of null values: 6
------------------------------
DataFrame 4:
Number of duplicates: 0
Number of null values: 6
------------------------------
DataFrame 5:
Number of duplicates: 0
Number of null values: 0
------------------------------
DataFrame 6:
Number of duplicates: 0
Number of null values: 6
------------------------------
DataFrame 7:
Number of duplicates: 0
Number of null values: 12
------------------------------
DataFrame 8:
Number of duplicates: 0
Number of null values: 0
------------------------------
DataFrame 9:
Number of duplicates: 0
Number of null values: 12
------------------------------
DataFrame 10:
Number of duplicates: 0
Number of null values: 6
------------------------------
DataFrame 11:
Number of duplicates: 0
Num

In [45]:
# Clean every recording in place: the DataFrame objects stored in
# `kids_sensors` are modified directly, so nothing has to be reassigned.
frame_separator = "-" * 30
kid_separator = "=" * 40

for key, data_frames in kids_sensors.items():
    print(f"Processing {key}:")
    for position, df in enumerate(data_frames, start=1):
        print(f"DataFrame {position}:")

        # Remove exact duplicate rows first, then rows with any missing value
        df.drop_duplicates(inplace=True)
        df.dropna(inplace=True)

        remaining_rows = df.shape[0]
        print(f"Number of rows after cleaning: {remaining_rows}")

        print(frame_separator)

    print(f"Finished processing {key}")
    print(kid_separator)


Processing kid1_sensors:
DataFrame 1:
Number of rows after cleaning: 1730
------------------------------
DataFrame 2:
Number of rows after cleaning: 1738
------------------------------
DataFrame 3:
Number of rows after cleaning: 1738
------------------------------
DataFrame 4:
Number of rows after cleaning: 1737
------------------------------
DataFrame 5:
Number of rows after cleaning: 1738
------------------------------
DataFrame 6:
Number of rows after cleaning: 1738
------------------------------
DataFrame 7:
Number of rows after cleaning: 1737
------------------------------
DataFrame 8:
Number of rows after cleaning: 1738
------------------------------
DataFrame 9:
Number of rows after cleaning: 1737
------------------------------
DataFrame 10:
Number of rows after cleaning: 1738
------------------------------
DataFrame 11:
Number of rows after cleaning: 1738
------------------------------
DataFrame 12:
Number of rows after cleaning: 1738
------------------------------
DataFrame 13

In [46]:
# Re-run the data-quality report after cleaning; every count should now be 0.
for key, data_frames in kids_sensors.items():
    print(f"Processing {key}:")
    for df_number, df in enumerate(data_frames, start=1):
        remaining_duplicates = df.duplicated().sum()
        remaining_nulls = df.isna().sum().sum()

        print(f"DataFrame {df_number}:")
        print(f"Number of duplicates: {remaining_duplicates}")
        print(f"Number of null values: {remaining_nulls}")
        print("-" * 30)


Processing kid1_sensors:
DataFrame 1:
Number of duplicates: 0
Number of null values: 0
------------------------------
DataFrame 2:
Number of duplicates: 0
Number of null values: 0
------------------------------
DataFrame 3:
Number of duplicates: 0
Number of null values: 0
------------------------------
DataFrame 4:
Number of duplicates: 0
Number of null values: 0
------------------------------
DataFrame 5:
Number of duplicates: 0
Number of null values: 0
------------------------------
DataFrame 6:
Number of duplicates: 0
Number of null values: 0
------------------------------
DataFrame 7:
Number of duplicates: 0
Number of null values: 0
------------------------------
DataFrame 8:
Number of duplicates: 0
Number of null values: 0
------------------------------
DataFrame 9:
Number of duplicates: 0
Number of null values: 0
------------------------------
DataFrame 10:
Number of duplicates: 0
Number of null values: 0
------------------------------
DataFrame 11:
Number of duplicates: 0
Number

In [68]:
# Peek at the first 30 samples of Kid2's third recording (list index 2)
kid2_third_recording = kids_sensors["kid2_sensors"][2]
print(kid2_third_recording.head(30))

     ro_timestamp   ro_internal_ts      ro_x      ro_y      ro_z  ro_app_id  \
0   1499123805646  881458752269552  0.375717  0.587265  0.647903       99.0   
1   1499123805666  881458772411154  0.376541  0.587738  0.647156       99.0   
2   1499123805683  881458792552756  0.377228  0.587830  0.646469       99.0   
3   1499123805701  881458812694357  0.377594  0.587753  0.646164       99.0   
4   1499123805723  881458832835959  0.377777  0.587814  0.646332       99.0   
5   1499123805737  881458852977560  0.378006  0.587860  0.647079       99.0   
6   1499123805771  881458873119162  0.378433  0.587570  0.648209       99.0   
7   1499123805790  881458893260763  0.378555  0.587189  0.649628       99.0   
8   1499123805807  881458913402365  0.378799  0.586426  0.650970       99.0   
9   1499123805824  881458933543967  0.378830  0.585724  0.652222       99.0   
10  1499123805843  881458953685568  0.378967  0.584641  0.653564       99.0   
11  1499123805861  881458973827170  0.379135  0.5829

In [86]:
import numpy as np
import pandas as pd

# Statistics computed for every signal, in the order they appear in the table
FEATURE_STATS = ('mean', 'max', 'min', 'rmse', 'std')
# Column prefixes of the two sensors: rotation and linear acceleration
SENSOR_PREFIXES = ('ro', 'la')
AXES = ('x', 'y', 'z')


def _feature_column_names():
    """Return the feature-table column names in their fixed output order.

    Per sensor: all axes for each statistic (x/y/z mean, x/y/z max, ...),
    followed by the magnitude statistics; 'id' comes first and 'label' last.
    """
    columns = ['id']
    for sensor in SENSOR_PREFIXES:
        columns += [f'{sensor}_{axis}_{stat}' for stat in FEATURE_STATS for axis in AXES]
        columns += [f'{sensor}_mag_{stat}' for stat in FEATURE_STATS]
    columns.append('label')
    return columns


def _signal_stats(signal):
    """Summarise one numeric Series; keys match FEATURE_STATS."""
    return {
        'mean': signal.mean(),
        'max': signal.max(),
        'min': signal.min(),
        # Named "rmse" in the feature table, but computed as the root mean
        # square of the signal itself (there is no reference signal).
        'rmse': np.sqrt(np.mean(signal ** 2)),
        'std': signal.std(),
    }


def _extract_recording_features(df):
    """Build one feature row (dict) from a single cleaned recording.

    `df` must be non-empty: 'id' and 'label' are taken from its first row
    (they are constant within a recording).
    """
    row = {'id': df['id'].iloc[0], 'label': df['label'].iloc[0]}
    for sensor in SENSOR_PREFIXES:
        axis_signals = {axis: df[f'{sensor}_{axis}'] for axis in AXES}

        for axis, signal in axis_signals.items():
            for stat, value in _signal_stats(signal).items():
                row[f'{sensor}_{axis}_{stat}'] = value

        # Euclidean norm of the three axes, recomputed from the cleaned axes
        magnitude = np.sqrt(
            axis_signals['x'] ** 2 + axis_signals['y'] ** 2 + axis_signals['z'] ** 2
        )
        for stat, value in _signal_stats(magnitude).items():
            row[f'{sensor}_mag_{stat}'] = value
    return row


FEATURE_COLUMNS = _feature_column_names()

# One feature table per key of kids_sensors ('kid1_sensors', 'kid2_sensors', ...),
# with one row per recording.
extracted_features = {}

for key, data_frames in kids_sensors.items():
    # Recordings with no rows left (e.g. after dropping duplicates/nulls) have
    # no first row to read 'id'/'label' from, so they are skipped instead of
    # raising IndexError.
    feature_rows = [_extract_recording_features(df) for df in data_frames if not df.empty]

    # `columns=` fixes the column order and still yields the expected columns
    # when a kid has no usable recording.
    extracted_features[key] = pd.DataFrame(feature_rows, columns=FEATURE_COLUMNS)

# Print the first few rows of the extracted DataFrame for 'kid1_sensors'
if "kid1_sensors" in extracted_features and not extracted_features["kid1_sensors"].empty:
    print(extracted_features["kid1_sensors"].head())


   id  ro_x_mean  ro_y_mean  ro_z_mean  ro_x_max  ro_y_max  ro_z_max  \
0   1   0.211515  -0.089528   0.003718  0.334061  0.109970  0.345322   
1   1   0.064150  -0.334205   0.141335  0.112015 -0.273285  0.183792   
2   1   0.047013  -0.372772   0.121156  0.111160 -0.322327  0.254898   
3   1   0.055543  -0.348347   0.147268  0.152435 -0.277802  0.205978   
4   1   0.078702  -0.352490   0.169944  0.139175 -0.284180  0.275711   

   ro_x_min  ro_y_min  ro_z_min  ...  la_z_rmse  la_x_std  la_y_std  la_z_std  \
0  0.057495 -0.322662 -0.204834  ...   0.371293  0.366098  0.304171  0.371249   
1  0.024780 -0.422302  0.086090  ...   0.361042  0.196294  0.133895  0.357994   
2 -0.023392 -0.461151  0.069412  ...   0.368192  0.210187  0.225796  0.365996   
3 -0.016815 -0.445572 -0.002411  ...   0.396734  0.212522  0.179589  0.393421   
4 -0.005203 -0.404251  0.107437  ...   0.251352  0.233390  0.208717  0.244713   

   la_mag_mean  la_mag_max  la_mag_min  la_mag_rmse  la_mag_std  label  
0     0

In [87]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
extracted_features["kid1_sensors"].info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 42 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           23 non-null     int64  
 1   ro_x_mean    23 non-null     float64
 2   ro_y_mean    23 non-null     float64
 3   ro_z_mean    23 non-null     float64
 4   ro_x_max     23 non-null     float64
 5   ro_y_max     23 non-null     float64
 6   ro_z_max     23 non-null     float64
 7   ro_x_min     23 non-null     float64
 8   ro_y_min     23 non-null     float64
 9   ro_z_min     23 non-null     float64
 10  ro_x_rmse    23 non-null     float64
 11  ro_y_rmse    23 non-null     float64
 12  ro_z_rmse    23 non-null     float64
 13  ro_x_std     23 non-null     float64
 14  ro_y_std     23 non-null     float64
 15  ro_z_std     23 non-null     float64
 16  ro_mag_mean  23 non-null     float64
 17  ro_mag_max   23 non-null     float64
 18  ro_mag_min   23 non-null     float64
 19  ro_mag_rms

In [88]:
# Stack the per-kid feature tables into a single table; ignore_index=True
# replaces the per-table row labels with one continuous 0..n-1 index.
per_kid_feature_tables = list(extracted_features.values())
kids_data = pd.concat(per_kid_feature_tables, ignore_index=True)


In [89]:
print(kids_data.head(5))
# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the extra "None" line that print(kids_data.info()) would emit.
kids_data.info()

   id  ro_x_mean  ro_y_mean  ro_z_mean  ro_x_max  ro_y_max  ro_z_max  \
0   1   0.211515  -0.089528   0.003718  0.334061  0.109970  0.345322   
1   1   0.064150  -0.334205   0.141335  0.112015 -0.273285  0.183792   
2   1   0.047013  -0.372772   0.121156  0.111160 -0.322327  0.254898   
3   1   0.055543  -0.348347   0.147268  0.152435 -0.277802  0.205978   
4   1   0.078702  -0.352490   0.169944  0.139175 -0.284180  0.275711   

   ro_x_min  ro_y_min  ro_z_min  ...  la_z_rmse  la_x_std  la_y_std  la_z_std  \
0  0.057495 -0.322662 -0.204834  ...   0.371293  0.366098  0.304171  0.371249   
1  0.024780 -0.422302  0.086090  ...   0.361042  0.196294  0.133895  0.357994   
2 -0.023392 -0.461151  0.069412  ...   0.368192  0.210187  0.225796  0.365996   
3 -0.016815 -0.445572 -0.002411  ...   0.396734  0.212522  0.179589  0.393421   
4 -0.005203 -0.404251  0.107437  ...   0.251352  0.233390  0.208717  0.244713   

   la_mag_mean  la_mag_max  la_mag_min  la_mag_rmse  la_mag_std  label  
0     0

In [90]:
# Shuffle the rows: frac=1 draws every row exactly once, and the fixed
# random_state makes the permutation reproducible. Original row labels are kept.
shuffled_kids_data = kids_data.sample(frac=1, random_state=42)
kids_data = shuffled_kids_data
print(kids_data.head())

     id  ro_x_mean  ro_y_mean  ro_z_mean  ro_x_max  ro_y_max  ro_z_max  \
634  25   0.391358  -0.024481   0.177286  0.418823  0.002319  0.192184   
220  11   0.082882  -0.127687   0.465300  0.110275 -0.091537  0.493652   
426  19   0.048871   0.212403   0.723906  0.116348  0.249390  0.836884   
428  19   0.217404   0.065850  -0.078850  0.247879  0.112488  0.143860   
72    4  -0.065928  -0.299645  -0.331634 -0.000870 -0.176041 -0.181244   

     ro_x_min  ro_y_min  ro_z_min  ...  la_z_rmse  la_x_std  la_y_std  \
634  0.361465 -0.042343  0.167801  ...   0.146353  0.051768  0.048705   
220  0.026962 -0.282455  0.394089  ...   0.505619  0.246158  0.235012   
426 -0.016693  0.170715  0.583511  ...   0.539683  0.214234  0.244968   
428  0.192719 -0.012497 -0.299057  ...   0.268936  0.178997  0.176665   
72  -0.230560 -0.432114 -0.478546  ...   0.954466  0.447212  0.369620   

     la_z_std  la_mag_mean  la_mag_max  la_mag_min  la_mag_rmse  la_mag_std  \
634  0.086618     0.166570    0.47922

In [91]:
# Destination of the feature table (relative to the notebook's working directory)
output_file = 'kids.csv'

# index=False: the row labels are not written to the file
kids_data.to_csv(path_or_buf=output_file, index=False)

print(f"Concatenated features saved to {output_file}")


Concatenated features saved to kids.csv


In [92]:
# List the column names of the final feature table
feature_column_index = kids_data.columns
print(feature_column_index)

Index(['id', 'ro_x_mean', 'ro_y_mean', 'ro_z_mean', 'ro_x_max', 'ro_y_max',
       'ro_z_max', 'ro_x_min', 'ro_y_min', 'ro_z_min', 'ro_x_rmse',
       'ro_y_rmse', 'ro_z_rmse', 'ro_x_std', 'ro_y_std', 'ro_z_std',
       'ro_mag_mean', 'ro_mag_max', 'ro_mag_min', 'ro_mag_rmse', 'ro_mag_std',
       'la_x_mean', 'la_y_mean', 'la_z_mean', 'la_x_max', 'la_y_max',
       'la_z_max', 'la_x_min', 'la_y_min', 'la_z_min', 'la_x_rmse',
       'la_y_rmse', 'la_z_rmse', 'la_x_std', 'la_y_std', 'la_z_std',
       'la_mag_mean', 'la_mag_max', 'la_mag_min', 'la_mag_rmse', 'la_mag_std',
       'label'],
      dtype='object')
