## Parents Data Frame

In [2]:
# Library imports
import os

import numpy as np
import pandas as pd


In [3]:
import os
import pandas as pd

def _read_sensor_file(file_path, prefix):
    """Read one headerless, tab-separated sensor file and add its vector magnitude.

    Args:
        file_path: Path of the ``.txt`` recording to read.
        prefix: Column prefix, e.g. ``'ro'`` or ``'la'``.

    Returns:
        A DataFrame with columns ``<prefix>_timestamp``, ``<prefix>_internal_ts``,
        ``<prefix>_x``, ``<prefix>_y``, ``<prefix>_z``, ``<prefix>_app_id`` and
        ``<prefix>_mag`` (Euclidean norm of x, y, z), or ``None`` when the file
        cannot be read or parsed (the error is printed).
    """
    try:
        frame = pd.read_csv(file_path, delimiter='\t', header=None, usecols=[0, 1, 2, 3, 4, 5])
        frame.columns = [f'{prefix}_{name}' for name in ('timestamp', 'internal_ts', 'x', 'y', 'z', 'app_id')]
        frame[f'{prefix}_mag'] = (
            frame[f'{prefix}_x']**2 + frame[f'{prefix}_y']**2 + frame[f'{prefix}_z']**2
        )**0.5
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole participant.
        print(f"Error reading {file_path}: {e}")
        return None
    return frame


def load_and_concat_sensor_data(root_dir, i, id_offset=25, label=0):
    """Load paired rotation / linear-acceleration recordings of one participant.

    The first sub-folder of ``root_dir`` (in sorted order) whose name contains
    ``'parent'`` is searched for ``rotation`` and ``LinearAcceleration`` folders.
    Their ``.txt`` files are sorted by name and paired positionally; each pair is
    joined side by side into one DataFrame.

    Args:
        root_dir: The participant's ``Sensors`` directory.
        i: 1-based participant number.
        id_offset: Added to ``i`` to build the ``id`` column (default 25).
        label: Value written to the ``label`` column (default 0).

    Returns:
        A list with one DataFrame per file pair. The list is empty when
        ``root_dir``, the parent folder or a sensor folder is missing; a message
        is printed in each of those cases.
    """
    concatenated_data_frames = []

    # A missing participant directory would make os.listdir raise and abort the
    # caller's loop over all participants; report it and return instead.
    if not os.path.isdir(root_dir):
        print(f"Sensors directory not found: {root_dir}")
        return concatenated_data_frames

    # os.listdir order is arbitrary, so sort to make the selected folder deterministic.
    dynamic_folder = next(
        (d for d in sorted(os.listdir(root_dir))
         if os.path.isdir(os.path.join(root_dir, d)) and 'parent' in d),
        None,
    )
    if dynamic_folder is None:
        print("No folder matching the pattern '*_parent' found.")
        return concatenated_data_frames

    dynamic_folder_path = os.path.join(root_dir, dynamic_folder).replace("\\", "/")

    # Sensor folder name -> column prefix used in the resulting DataFrame
    sensor_prefixes = {'rotation': 'ro', 'LinearAcceleration': 'la'}
    sensor_paths = {
        sensor: os.path.join(dynamic_folder_path, sensor).replace("\\", "/")
        for sensor in sensor_prefixes
    }

    if not all(os.path.exists(sensor_path) for sensor_path in sensor_paths.values()):
        print(f"One or more sensor folders not found: {sensor_paths}")
        return concatenated_data_frames

    sensor_files = {
        sensor: sorted(f for f in os.listdir(path) if f.endswith('.txt'))
        for sensor, path in sensor_paths.items()
    }

    # zip() silently drops unpaired files, so make a count mismatch visible.
    n_rotation = len(sensor_files['rotation'])
    n_linear_acc = len(sensor_files['LinearAcceleration'])
    if n_rotation != n_linear_acc:
        print(
            f"Warning: {n_rotation} rotation files vs {n_linear_acc} LinearAcceleration "
            f"files in {dynamic_folder_path}; unpaired files are ignored."
        )

    for rotation_file, linear_acc_file in zip(sensor_files['rotation'], sensor_files['LinearAcceleration']):
        rotation_file_path = os.path.join(sensor_paths['rotation'], rotation_file).replace("\\", "/")
        linear_acc_file_path = os.path.join(sensor_paths['LinearAcceleration'], linear_acc_file).replace("\\", "/")

        rotation_df = _read_sensor_file(rotation_file_path, sensor_prefixes['rotation'])
        if rotation_df is None:
            continue

        linear_acc_df = _read_sensor_file(linear_acc_file_path, sensor_prefixes['LinearAcceleration'])
        if linear_acc_df is None:
            continue

        # NOTE(review): axis=1 pairs rows by position, not by timestamp; when the
        # two recordings differ in length the shorter one is padded with NaN.
        concatenated_df = pd.concat([rotation_df, linear_acc_df], axis=1)

        concatenated_df['id'] = i + id_offset
        concatenated_df['label'] = label

        concatenated_data_frames.append(concatenated_df)

    return concatenated_data_frames

# Load the sensor recordings of Parent1 .. Parent25.
# Raw string: the Windows path is full of backslashes, and sequences such as
# "\i" or "\c" are invalid escapes that newer Python versions warn about.
root_base_dir = r'D:\internship\code\KidsOnThePhone_dataset\Parents'
parents_sensors = {}

for i in range(1, 26):
    root_dir = os.path.join(root_base_dir, f'Parent{i}', 'Sensors')
    parents_sensors[f"parent{i}_sensors"] = load_and_concat_sensor_data(root_dir, i)

# Display the first row of the first concatenated DataFrame for parent1_sensors
# (skipped when that participant is missing or has no recordings).
if parents_sensors.get("parent1_sensors"):
    print(parents_sensors["parent1_sensors"][0].head(1))


   ro_timestamp  ro_internal_ts      ro_x      ro_y      ro_z  ro_app_id  \
0  1.498777e+12    5.351215e+14  0.256882  0.096115  0.324371      105.0   

     ro_mag   la_timestamp   la_internal_ts      la_x      la_y      la_z  \
0  0.424786  1498777468262  535121377667630 -0.027222  0.030685  0.206451   

   la_app_id    la_mag  id  label  
0      105.0  0.210487  26      0  


In [4]:
# How many recordings were loaded for parent1_sensors
parent1_frames = parents_sensors["parent1_sensors"]
print(len(parent1_frames))


18


## Data Preprocessing

In [5]:
# Data-quality report over the `parents_sensors` dictionary:
# duplicate rows and missing cells for every recording.
divider = "-" * 30

for key in parents_sensors:
    print(f"Processing {key}:")
    for position, frame in enumerate(parents_sensors[key], start=1):
        print(f"DataFrame {position}:")

        # Rows that are identical across all columns
        num_duplicates = frame.duplicated().sum()
        print(f"Number of duplicates: {num_duplicates}")

        # Missing cells, summed over every column
        num_null_values = frame.isnull().sum().sum()
        print(f"Number of null values: {num_null_values}")

        print(divider)


Processing parent1_sensors:
DataFrame 1:
Number of duplicates: 0
Number of null values: 28
------------------------------
DataFrame 2:
Number of duplicates: 0
Number of null values: 7
------------------------------
DataFrame 3:
Number of duplicates: 0
Number of null values: 0
------------------------------
DataFrame 4:
Number of duplicates: 0
Number of null values: 7
------------------------------
DataFrame 5:
Number of duplicates: 0
Number of null values: 7
------------------------------
DataFrame 6:
Number of duplicates: 0
Number of null values: 0
------------------------------
DataFrame 7:
Number of duplicates: 0
Number of null values: 7
------------------------------
DataFrame 8:
Number of duplicates: 0
Number of null values: 0
------------------------------
DataFrame 9:
Number of duplicates: 0
Number of null values: 0
------------------------------
DataFrame 10:
Number of duplicates: 0
Number of null values: 7
------------------------------
DataFrame 11:
Number of duplicates: 0
Nu

In [6]:

# Clean every recording in place: the frames stored in parents_sensors are
# modified directly, so nothing has to be written back to the dictionary.
for key in parents_sensors:
    print(f"Processing {key}:")
    for position, frame in enumerate(parents_sensors[key], start=1):
        print(f"DataFrame {position}:")

        # Remove exact duplicate rows first, then any row with a missing value
        frame.drop_duplicates(inplace=True)
        frame.dropna(inplace=True)

        # Report what is left
        print(f"Number of rows after cleaning: {len(frame)}")

        print("-" * 30)

    print(f"Finished processing {key}")
    print("=" * 40)


Processing parent1_sensors:
DataFrame 1:
Number of rows after cleaning: 1732
------------------------------
DataFrame 2:
Number of rows after cleaning: 1737
------------------------------
DataFrame 3:
Number of rows after cleaning: 1739
------------------------------
DataFrame 4:
Number of rows after cleaning: 1738
------------------------------
DataFrame 5:
Number of rows after cleaning: 1738
------------------------------
DataFrame 6:
Number of rows after cleaning: 1738
------------------------------
DataFrame 7:
Number of rows after cleaning: 1737
------------------------------
DataFrame 8:
Number of rows after cleaning: 1738
------------------------------
DataFrame 9:
Number of rows after cleaning: 1738
------------------------------
DataFrame 10:
Number of rows after cleaning: 1738
------------------------------
DataFrame 11:
Number of rows after cleaning: 1738
------------------------------
DataFrame 12:
Number of rows after cleaning: 1738
------------------------------
DataFrame

In [7]:
# Repeat the data-quality report on `parents_sensors` to check the recordings
# for remaining duplicate rows and missing cells.
for key, recordings in parents_sensors.items():
    print(f"Processing {key}:")
    for number, recording in enumerate(recordings, start=1):
        print(f"DataFrame {number}:")

        # Boolean mask of repeated rows -> count
        duplicate_mask = recording.duplicated()
        print(f"Number of duplicates: {duplicate_mask.sum()}")

        # Per-column null counts -> grand total
        nulls_per_column = recording.isnull().sum()
        print(f"Number of null values: {nulls_per_column.sum()}")

        print("-" * 30)


Processing parent1_sensors:
DataFrame 1:
Number of duplicates: 0
Number of null values: 0
------------------------------
DataFrame 2:
Number of duplicates: 0
Number of null values: 0
------------------------------
DataFrame 3:
Number of duplicates: 0
Number of null values: 0
------------------------------
DataFrame 4:
Number of duplicates: 0
Number of null values: 0
------------------------------
DataFrame 5:
Number of duplicates: 0
Number of null values: 0
------------------------------
DataFrame 6:
Number of duplicates: 0
Number of null values: 0
------------------------------
DataFrame 7:
Number of duplicates: 0
Number of null values: 0
------------------------------
DataFrame 8:
Number of duplicates: 0
Number of null values: 0
------------------------------
DataFrame 9:
Number of duplicates: 0
Number of null values: 0
------------------------------
DataFrame 10:
Number of duplicates: 0
Number of null values: 0
------------------------------
DataFrame 11:
Number of duplicates: 0
Num

In [9]:
# Show the first 30 rows of the recording at index 2 of parent2_sensors
selected_recording = parents_sensors["parent2_sensors"][2]
print(selected_recording.head(30))

     ro_timestamp  ro_internal_ts      ro_x      ro_y      ro_z  ro_app_id  \
0   1499364820294  67771112321485  0.110077  0.342987  0.916183       95.0   
1   1499364820311  67771132463087  0.110062  0.342957  0.916199       95.0   
2   1499364820347  67771152604688  0.109955  0.342972  0.916290       95.0   
3   1499364820364  67771172746290  0.109848  0.343033  0.916306       95.0   
4   1499364820381  67771192887891  0.109879  0.343048  0.916275       95.0   
5   1499364820399  67771213029493  0.109985  0.343018  0.916245       95.0   
6   1499364820417  67771233171094  0.110001  0.343018  0.916214       95.0   
7   1499364820434  67771253312696  0.109894  0.343033  0.916260       95.0   
8   1499364820452  67771273454298  0.109909  0.343033  0.916260       95.0   
9   1499364820487  67771293595899  0.109940  0.343063  0.916245       95.0   
10  1499364820506  67771313737501  0.109863  0.343155  0.916260       95.0   
11  1499364820522  67771333879102  0.109818  0.343155  0.916275 

In [17]:
# Per-recording summary features, keyed like parents_sensors.
# Created here so the cell also works on a fresh kernel instead of relying on
# a dictionary left over from an earlier session.
extracted_features = {}

_SENSOR_PREFIXES = ('ro', 'la')   # rotation, linear acceleration
_AXES = ('x', 'y', 'z')

# Statistic suffix -> function applied to one signal (a pandas Series).
# NOTE: the 'rmse' suffix is kept for column compatibility; the value is the
# root mean square of the signal itself, not an error against a reference.
_STATS = {
    'mean': lambda s: s.mean(),
    'max': lambda s: s.max(),
    'min': lambda s: s.min(),
    'rmse': lambda s: np.sqrt(np.mean(s**2)),
    'std': lambda s: s.std(),
}


def _feature_columns():
    """Return the output column names in their fixed order."""
    columns = ['id']
    for sensor in _SENSOR_PREFIXES:
        # Per-axis statistics are grouped by statistic (x, y, z of mean, then of max, ...)
        columns += [f'{sensor}_{axis}_{stat}' for stat in _STATS for axis in _AXES]
        columns += [f'{sensor}_mag_{stat}' for stat in _STATS]
    columns.append('label')
    return columns


def _summarize_recording(df):
    """Collapse one non-empty recording into a dict of summary features.

    Takes ``id`` and ``label`` from the first row and computes mean, max, min,
    RMS and standard deviation for each axis and for the vector magnitude of
    both sensors.
    """
    features = {'id': df['id'].iloc[0], 'label': df['label'].iloc[0]}
    for sensor in _SENSOR_PREFIXES:
        signals = {axis: df[f'{sensor}_{axis}'] for axis in _AXES}
        # Magnitude is recomputed from the axes of the rows passed in
        signals['mag'] = np.sqrt(signals['x']**2 + signals['y']**2 + signals['z']**2)
        for signal_name, values in signals.items():
            for stat_name, stat in _STATS.items():
                features[f'{sensor}_{signal_name}_{stat_name}'] = stat(values)
    return features


for key, data_frames in parents_sensors.items():
    # An empty recording has no first row to take id/label from, so it is skipped.
    records = [_summarize_recording(df) for df in data_frames if not df.empty]

    # columns= fixes the column order and keeps the schema even when there are no records
    extracted_features[key] = pd.DataFrame(records, columns=_feature_columns())

# Print the first few rows of the extracted DataFrame for 'parent1_sensors'
if "parent1_sensors" in extracted_features and not extracted_features["parent1_sensors"].empty:
    print(extracted_features["parent1_sensors"].head())


   id  ro_x_mean  ro_y_mean  ro_z_mean  ro_x_max  ro_y_max  ro_z_max  \
0  26   0.205590  -0.083862   0.502972  0.314972  0.123840  0.761444   
1  26   0.163781  -0.188551   0.593173  0.176697 -0.173920  0.602371   
2  26   0.174730  -0.171892   0.596189  0.193970 -0.161102  0.602570   
3  26   0.163071  -0.157918   0.600157  0.175644 -0.146378  0.604904   
4  26   0.168763  -0.193237   0.599287  0.204163 -0.151459  0.618607   

   ro_x_min  ro_y_min  ro_z_min  ...  la_z_rmse  la_x_std  la_y_std  la_z_std  \
0  0.138000 -0.216690  0.321747  ...   0.272019  0.252229  0.212338  0.269916   
1  0.155502 -0.198181  0.586685  ...   0.143083  0.073080  0.085827  0.142348   
2  0.159927 -0.186386  0.587280  ...   0.174780  0.126461  0.119795  0.172402   
3  0.153671 -0.173447  0.593552  ...   0.167808  0.081594  0.075217  0.159663   
4  0.133133 -0.209152  0.590118  ...   0.152900  0.085408  0.116846  0.138199   

   la_mag_mean  la_mag_max  la_mag_min  la_mag_rmse  la_mag_std  label  
0     0

In [18]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
extracted_features["parent1_sensors"].info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 42 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           18 non-null     int64  
 1   ro_x_mean    18 non-null     float64
 2   ro_y_mean    18 non-null     float64
 3   ro_z_mean    18 non-null     float64
 4   ro_x_max     18 non-null     float64
 5   ro_y_max     18 non-null     float64
 6   ro_z_max     18 non-null     float64
 7   ro_x_min     18 non-null     float64
 8   ro_y_min     18 non-null     float64
 9   ro_z_min     18 non-null     float64
 10  ro_x_rmse    18 non-null     float64
 11  ro_y_rmse    18 non-null     float64
 12  ro_z_rmse    18 non-null     float64
 13  ro_x_std     18 non-null     float64
 14  ro_y_std     18 non-null     float64
 15  ro_z_std     18 non-null     float64
 16  ro_mag_mean  18 non-null     float64
 17  ro_mag_max   18 non-null     float64
 18  ro_mag_min   18 non-null     float64
 19  ro_mag_rms

In [19]:
# Stack the feature table of every key into a single frame with a fresh index
feature_tables = list(extracted_features.values())
parents_data = pd.concat(feature_tables, ignore_index=True)



    

In [20]:
print(parents_data.head(5))
# info() writes its own report and returns None; print(...) around it would
# add a stray "None" line.
parents_data.info()

   id  ro_x_mean  ro_y_mean  ro_z_mean  ro_x_max  ro_y_max  ro_z_max  \
0  26   0.205590  -0.083862   0.502972  0.314972  0.123840  0.761444   
1  26   0.163781  -0.188551   0.593173  0.176697 -0.173920  0.602371   
2  26   0.174730  -0.171892   0.596189  0.193970 -0.161102  0.602570   
3  26   0.163071  -0.157918   0.600157  0.175644 -0.146378  0.604904   
4  26   0.168763  -0.193237   0.599287  0.204163 -0.151459  0.618607   

   ro_x_min  ro_y_min  ro_z_min  ...  la_z_rmse  la_x_std  la_y_std  la_z_std  \
0  0.138000 -0.216690  0.321747  ...   0.272019  0.252229  0.212338  0.269916   
1  0.155502 -0.198181  0.586685  ...   0.143083  0.073080  0.085827  0.142348   
2  0.159927 -0.186386  0.587280  ...   0.174780  0.126461  0.119795  0.172402   
3  0.153671 -0.173447  0.593552  ...   0.167808  0.081594  0.075217  0.159663   
4  0.133133 -0.209152  0.590118  ...   0.152900  0.085408  0.116846  0.138199   

   la_mag_mean  la_mag_max  la_mag_min  la_mag_rmse  la_mag_std  label  
0     0

In [24]:
# Shuffle all rows; the fixed seed makes the permutation reproducible.
# NOTE: the result overwrites parents_data, so re-running this cell shuffles
# the already shuffled frame again.
parents_data = parents_data.sample(frac=1, random_state=42)
print(parents_data.head())
# info() prints by itself and returns None, so it is not wrapped in print().
parents_data.info()

     id  ro_x_mean  ro_y_mean  ro_z_mean  ro_x_max  ro_y_max  ro_z_max  \
243  37  -0.087406  -0.100299  -0.723997  0.012756  0.014877 -0.714233   
489  48   0.217373  -0.050505  -0.154720  0.236740 -0.038193 -0.140717   
449  46   0.093807   0.297225   0.829103  0.160873  0.436707  0.949921   
407  45   0.107668  -0.090337   0.608525  0.125336 -0.077972  0.635208   
105  31   0.263340  -0.345649   0.658038  0.369156 -0.272964  0.726883   

     ro_x_min  ro_y_min  ro_z_min  ...  la_z_rmse  la_x_std  la_y_std  \
243 -0.167053 -0.184952 -0.734116  ...   0.442168  0.140109  0.090728   
489  0.177994 -0.067032 -0.171722  ...   0.214333  0.102980  0.100348   
449  0.035751  0.173920  0.758698  ...   0.243026  0.186260  0.196596   
407  0.097412 -0.096466  0.599411  ...   0.317568  0.120684  0.147029   
105  0.206436 -0.402512  0.599060  ...   0.729412  0.265786  0.267761   

     la_z_std  la_mag_mean  la_mag_max  la_mag_min  la_mag_rmse  la_mag_std  \
243  0.381589     0.284684   10.87295

In [25]:
# Destination file for the parent features
output_file = 'parents.csv'

# Write the frame to disk, leaving out the index column
parents_data.to_csv(path_or_buf=output_file, index=False)

print(f"Concatenated features saved to {output_file}")


Concatenated features saved to parents.csv


Concatenate the two CSV files to obtain the whole data set:

In [28]:
import pandas as pd

# Read the two per-group feature tables
parents_df = pd.read_csv('parents.csv')
kids_df = pd.read_csv('kids.csv')

# Stack them with a fresh index, then shuffle the rows with a fixed seed
data = (
    pd.concat([parents_df, kids_df], ignore_index=True)
    .sample(frac=1, random_state=42)
)

# Persist the shuffled data set without the index column
data.to_csv('data.csv', index=False)



In [29]:
# Peek at the first five rows of the combined data set
first_rows = data.head(n=5)
print(first_rows)

      id  ro_x_mean  ro_y_mean  ro_z_mean  ro_x_max  ro_y_max  ro_z_max  \
319   40   0.038443  -0.107738  -0.495036  0.085754 -0.047119 -0.039917   
956   19  -0.058543   0.182942   0.811790  0.128311  0.238663  0.977158   
1094  20   0.028148   0.019518  -0.103616  0.153564  0.359055  0.982727   
86    37   0.166467   0.305184   0.749486  0.179657  0.310333  0.754242   
990   22   0.175506   0.003455  -0.028937  0.400620  0.071609  0.208176   

      ro_x_min  ro_y_min  ro_z_min  ...  la_z_rmse  la_x_std  la_y_std  \
319  -0.142303 -0.308090 -0.623459  ...   0.296502  0.268850  0.131000   
956  -0.120255 -0.208908 -0.974197  ...   0.355238  0.246884  0.221454   
1094 -0.021713 -0.198868 -0.987534  ...   0.496602  0.377996  0.656716   
86    0.156876  0.299896  0.746658  ...   0.159691  0.053282  0.045621   
990   0.053802 -0.128281 -0.156189  ...   0.339760  0.203997  0.167537   

      la_z_std  la_mag_mean  la_mag_max  la_mag_min  la_mag_rmse  la_mag_std  \
319   0.285267     0.202