<center><h1>Projet Robotique</h1></center>
<center><h2>Préparation des données</h2></center>

# Importation des bibliothèques

In [41]:
import os 
import pandas as pd

from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

# Extraction des données

In [2]:
def get_index(string):
    """Return the position of the first alphabetic character at index 9 or later.

    The scan starts at index 9 (inclusive). When no alphabetic character is
    found from there to the end of ``string``, -1 is returned.
    """
    return next(
        (pos for pos in range(9, len(string)) if string[pos].isalpha()),
        -1,
    )

def extract_data(data_dir='OpportunityUCIDataset/dataset'):
    """Load every non-Drill ``.dat`` file found in ``data_dir`` into one DataFrame.

    Column names are parsed from ``column_names.txt`` in the same folder: each
    line containing ``Column`` contributes the text that starts at the
    position returned by ``get_index`` and stops before the first ``;``.

    Parameters
    ----------
    data_dir : str, optional
        Folder that holds the ``.dat`` files and ``column_names.txt``.

    Returns
    -------
    pandas.DataFrame
        The selected files stacked row-wise (files taken in sorted name
        order) with a fresh 0..n-1 index. When no file matches, an empty
        frame carrying the parsed column names is returned.
    """
    # Sorted so the row order does not depend on the order in which the
    # file system happens to list the directory.
    list_of_files = sorted(
        name for name in os.listdir(data_dir)
        if name.endswith('.dat') and 'Drill' not in name
    )

    with open(os.path.join(data_dir, 'column_names.txt'), 'r') as f:
        columns = [
            line[get_index(line):].split(';')[0]
            for line in f.read().splitlines()
            if 'Column' in line
        ]

    # Collect the per-file frames and concatenate once: concatenating inside
    # the loop re-copies all previously loaded rows on every iteration.
    frames = []
    for file in list_of_files:
        proc_data = pd.read_table(os.path.join(data_dir, file), header=None, sep=r'\s+')
        proc_data.columns = columns
        frames.append(proc_data)

    if not frames:
        # pd.concat rejects an empty list; keep returning an empty frame.
        return pd.DataFrame(columns=columns)

    return pd.concat(frames, ignore_index=True)

In [25]:
# Load the non-Drill recordings into a single DataFrame.
data_collection = extract_data()

# Last expression of the cell: shows the loaded frame as the cell output.
data_collection

Unnamed: 0,MILLISEC,Accelerometer RKN^ accX,Accelerometer RKN^ accY,Accelerometer RKN^ accZ,Accelerometer HIP accX,Accelerometer HIP accY,Accelerometer HIP accZ,Accelerometer LUA^ accX,Accelerometer LUA^ accY,Accelerometer LUA^ accZ,...,LOCATION TAG4 X,LOCATION TAG4 Y,LOCATION TAG4 Z,Locomotion,HL_Activity,LL_Left_Arm,LL_Left_Arm_Object,LL_Right_Arm,LL_Right_Arm_Object,ML_Both_Arms
0,0,87.0,975.0,-287.0,11.0,1001.0,163.0,95.0,975.0,152.0,...,5789.0,2907.0,1447.0,0,0,0,0,0,0,0
1,33,124.0,978.0,-389.0,-7.0,1014.0,199.0,124.0,968.0,123.0,...,5789.0,2908.0,1443.0,0,0,0,0,0,0,0
2,67,102.0,996.0,-440.0,-49.0,1024.0,193.0,127.0,1001.0,113.0,...,5789.0,2910.0,1440.0,0,0,0,0,0,0,0
3,100,59.0,861.0,-384.0,-9.0,1023.0,202.0,110.0,1007.0,106.0,...,5789.0,2912.0,1440.0,0,0,0,0,0,0,0
4,133,119.0,946.0,-426.0,-22.0,1026.0,188.0,98.0,1001.0,92.0,...,5791.0,2915.0,1442.0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644630,1017390,,,,,,,,,,...,,,,0,0,0,0,0,0,0
644631,1017423,,,,,,,,,,...,,,,0,0,0,0,0,0,0
644632,1017456,,,,,,,,,,...,,,,0,0,0,0,0,0,0
644633,1017490,,,,,,,,,,...,,,,0,0,0,0,0,0,0


# Nettoyage des données

In [48]:
def data_cleaning(data_collection):
    """Return a numeric, gap-free copy of ``data_collection``.

    Steps, in order:

    1. Drop the rows in which fewer than 90% of the columns hold a value.
    2. Coerce every column to numeric; cells that cannot be parsed become NaN.
    3. Fill the gaps by linear interpolation along each column.
    4. Replace whatever is still missing (for example leading gaps, which
       forward interpolation cannot reach) with 0.

    The input frame is not modified.

    Parameters
    ----------
    data_collection : pandas.DataFrame
        Raw frame to clean.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame; the index keeps the labels of the surviving rows.
    """
    # dropna works on rows by default: thresh is the minimum number of
    # non-NaN cells a row needs in order to be kept.
    min_valid_cells = int(len(data_collection.columns) * 0.9)
    cleaned = data_collection.dropna(thresh=min_valid_cells)

    # Non-numeric cells become NaN so they are treated like any other gap.
    cleaned = cleaned.apply(pd.to_numeric, errors='coerce')

    # Interpolate before the zero fill: filling with 0 first would leave
    # nothing for interpolate() to do and would plant artificial zeros in
    # the signals.
    cleaned = cleaned.interpolate()

    return cleaned.fillna(0)

In [49]:
# Apply the cleaning step to the loaded frame.
cleaned_dataCollection = data_cleaning(data_collection)

# Last expression of the cell: shows the cleaned frame as the cell output.
cleaned_dataCollection

Unnamed: 0,MILLISEC,Accelerometer RKN^ accX,Accelerometer RKN^ accY,Accelerometer RKN^ accZ,Accelerometer HIP accX,Accelerometer HIP accY,Accelerometer HIP accZ,Accelerometer LUA^ accX,Accelerometer LUA^ accY,Accelerometer LUA^ accZ,...,LOCATION TAG4 X,LOCATION TAG4 Y,LOCATION TAG4 Z,Locomotion,HL_Activity,LL_Left_Arm,LL_Left_Arm_Object,LL_Right_Arm,LL_Right_Arm_Object,ML_Both_Arms
0,0,87.0,975.0,-287.0,11.0,1001.0,163.0,95.0,975.0,152.0,...,5789.0,2907.0,1447.0,0,0,0,0,0,0,0
1,33,124.0,978.0,-389.0,-7.0,1014.0,199.0,124.0,968.0,123.0,...,5789.0,2908.0,1443.0,0,0,0,0,0,0,0
2,67,102.0,996.0,-440.0,-49.0,1024.0,193.0,127.0,1001.0,113.0,...,5789.0,2910.0,1440.0,0,0,0,0,0,0,0
3,100,59.0,861.0,-384.0,-9.0,1023.0,202.0,110.0,1007.0,106.0,...,5789.0,2912.0,1440.0,0,0,0,0,0,0,0
4,133,119.0,946.0,-426.0,-22.0,1026.0,188.0,98.0,1001.0,92.0,...,5791.0,2915.0,1442.0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644088,999323,243.0,951.0,311.0,-79.0,919.0,368.0,293.0,951.0,282.0,...,5756.0,2160.0,1458.0,0,0,0,0,0,0,0
644089,999357,179.0,950.0,368.0,-97.0,919.0,365.0,304.0,934.0,239.0,...,5755.0,2158.0,1457.0,0,0,0,0,0,0,0
644090,999390,255.0,915.0,377.0,-137.0,921.0,405.0,285.0,940.0,239.0,...,5755.0,2156.0,1455.0,0,0,0,0,0,0,0
644091,999423,406.0,865.0,356.0,-149.0,957.0,366.0,275.0,962.0,212.0,...,5756.0,2154.0,1454.0,0,0,0,0,0,0,0


# Encodage des labels

In [35]:
def reset_label(dataCollection, labels_path='OpportunityUCIDataset/dataset/label_legend.txt'):
    """Re-encode the label columns with small integer codes.

    The legend file lists, for every label track, the original codes
    ("Unique index"). Each track named in the legend must be a column of
    ``dataCollection``.

    * ``Locomotion`` uses the fixed mapping 1 -> 1, 2 -> 2, 4 -> 3, 5 -> 0.
    * Every other track maps its codes to 0, 1, 2, ... in legend order.

    Values that are not listed in the mapping are left as they are.

    NOTE(review): a value of 0 already present in a label column is left
    unchanged, so it ends up sharing code 0 with Locomotion 5 and with the
    first legend entry of every other track. Confirm this merge is intended.

    Parameters
    ----------
    dataCollection : pandas.DataFrame
        Frame containing one column per track named in the legend.
    labels_path : str, optional
        Location of the legend file. Forward slashes are used so the default
        works on every platform.

    Returns
    -------
    pandas.DataFrame
        A re-encoded copy; the input frame is not modified.
    """
    # A multi-character separator is interpreted as a regular expression,
    # which only the Python parsing engine supports; requesting it explicitly
    # avoids the implicit fallback.
    labels = pd.read_csv(labels_path, sep='   -   ', header=0, engine='python')

    locomotion_codes = {1: 1, 2: 2, 4: 3, 5: 0}

    relabelled = dataCollection.copy()
    for track in labels['Track name'].unique():
        if track == 'Locomotion':
            mapping = locomotion_codes
        else:
            track_codes = labels.loc[labels['Track name'] == track, 'Unique index'].tolist()
            # dict.fromkeys removes duplicates while keeping legend order.
            mapping = {code: rank for rank, code in enumerate(dict.fromkeys(track_codes))}

        # Build every mask from the untouched input column so that a freshly
        # written code can never be picked up and remapped a second time.
        original_values = dataCollection[track]
        for old_code, new_code in mapping.items():
            relabelled.loc[original_values == old_code, track] = new_code

    return relabelled

In [50]:
# Re-encode the label columns of the cleaned frame.
df = reset_label(dataCollection=cleaned_dataCollection)

# Last expression of the cell: shows the re-encoded frame as the cell output.
df

Unnamed: 0,MILLISEC,Accelerometer RKN^ accX,Accelerometer RKN^ accY,Accelerometer RKN^ accZ,Accelerometer HIP accX,Accelerometer HIP accY,Accelerometer HIP accZ,Accelerometer LUA^ accX,Accelerometer LUA^ accY,Accelerometer LUA^ accZ,...,LOCATION TAG4 X,LOCATION TAG4 Y,LOCATION TAG4 Z,Locomotion,HL_Activity,LL_Left_Arm,LL_Left_Arm_Object,LL_Right_Arm,LL_Right_Arm_Object,ML_Both_Arms
0,0,87.0,975.0,-287.0,11.0,1001.0,163.0,95.0,975.0,152.0,...,5789.0,2907.0,1447.0,0,0,0,0,0,0,0
1,33,124.0,978.0,-389.0,-7.0,1014.0,199.0,124.0,968.0,123.0,...,5789.0,2908.0,1443.0,0,0,0,0,0,0,0
2,67,102.0,996.0,-440.0,-49.0,1024.0,193.0,127.0,1001.0,113.0,...,5789.0,2910.0,1440.0,0,0,0,0,0,0,0
3,100,59.0,861.0,-384.0,-9.0,1023.0,202.0,110.0,1007.0,106.0,...,5789.0,2912.0,1440.0,0,0,0,0,0,0,0
4,133,119.0,946.0,-426.0,-22.0,1026.0,188.0,98.0,1001.0,92.0,...,5791.0,2915.0,1442.0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644088,999323,243.0,951.0,311.0,-79.0,919.0,368.0,293.0,951.0,282.0,...,5756.0,2160.0,1458.0,0,0,0,0,0,0,0
644089,999357,179.0,950.0,368.0,-97.0,919.0,365.0,304.0,934.0,239.0,...,5755.0,2158.0,1457.0,0,0,0,0,0,0,0
644090,999390,255.0,915.0,377.0,-137.0,921.0,405.0,285.0,940.0,239.0,...,5755.0,2156.0,1455.0,0,0,0,0,0,0,0
644091,999423,406.0,865.0,356.0,-149.0,957.0,366.0,275.0,962.0,212.0,...,5756.0,2154.0,1454.0,0,0,0,0,0,0,0


# Normalisation des données 

In [51]:
def normalize_data(df, n_label_columns=7):
    """Standardise the feature columns with ``StandardScaler``.

    The last ``n_label_columns`` columns are treated as labels and copied
    through unchanged; every column before them is replaced by the scaler's
    output. The scaler is fitted on the frame passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose trailing columns are the labels.
    n_label_columns : int, optional
        Number of trailing label columns to leave untouched (default 7).

    Returns
    -------
    pandas.DataFrame
        A scaled copy; the input frame is not modified.
    """
    # Computed from the length so that n_label_columns=0 selects every
    # column (a plain [:-0] slice would select none).
    feature_columns = df.columns[:len(df.columns) - n_label_columns]

    scaled = df.copy()
    if len(feature_columns) == 0:
        # Nothing to scale; the scaler cannot be fitted on zero features.
        return scaled

    scaled[feature_columns] = StandardScaler().fit_transform(df[feature_columns])
    return scaled

# Standardise the feature columns of the re-encoded frame.
df = normalize_data(df)

# Last expression of the cell: shows the normalised frame as the cell output.
df

Unnamed: 0,MILLISEC,Accelerometer RKN^ accX,Accelerometer RKN^ accY,Accelerometer RKN^ accZ,Accelerometer HIP accX,Accelerometer HIP accY,Accelerometer HIP accZ,Accelerometer LUA^ accX,Accelerometer LUA^ accY,Accelerometer LUA^ accZ,...,LOCATION TAG4 X,LOCATION TAG4 Y,LOCATION TAG4 Z,Locomotion,HL_Activity,LL_Left_Arm,LL_Left_Arm_Object,LL_Right_Arm,LL_Right_Arm_Object,ML_Both_Arms
0,-1.585284,0.146747,0.337532,-1.410530,0.693260,0.434434,0.074936,-0.199117,0.621667,-0.455638,...,0.147947,0.811792,0.186199,0,0,0,0,0,0,0
1,-1.585185,0.263802,0.342378,-1.626921,0.626338,0.487110,0.200034,-0.108635,0.598101,-0.542260,...,0.147947,0.812691,0.178270,0,0,0,0,0,0,0
2,-1.585083,0.194202,0.371457,-1.735116,0.470187,0.527630,0.179184,-0.099275,0.709202,-0.572130,...,0.147947,0.814490,0.172322,0,0,0,0,0,0,0
3,-1.584985,0.058164,0.153369,-1.616313,0.618902,0.523578,0.210459,-0.152316,0.729402,-0.593039,...,0.147947,0.816289,0.172322,0,0,0,0,0,0,0
4,-1.584886,0.247984,0.290684,-1.705415,0.570570,0.535734,0.161809,-0.189757,0.709202,-0.634857,...,0.149055,0.818988,0.176287,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644088,1.407215,0.640278,0.298761,-0.141884,0.358651,0.102168,0.787298,0.418657,0.540867,-0.067329,...,0.129671,0.139901,0.208006,0,0,0,0,0,0,0
644089,1.407317,0.437804,0.297145,-0.020960,0.291729,0.102168,0.776873,0.452977,0.483633,-0.195770,...,0.129117,0.138102,0.206023,0,0,0,0,0,0,0
644090,1.407416,0.678242,0.240604,-0.001866,0.143014,0.110272,0.915871,0.393696,0.503833,-0.195770,...,0.129117,0.136303,0.202059,0,0,0,0,0,0,0
644091,1.407514,1.155955,0.159831,-0.046417,0.098399,0.256145,0.780348,0.362495,0.577900,-0.276418,...,0.129671,0.134504,0.200076,0,0,0,0,0,0,0


# Sauvegarde du jeu de données

In [66]:
# Write the processed frame to a CSV file in the working directory, without the row index.
df.to_csv('preprocessed_data.csv', index=False)