In [1]:
### Version : Python 3.10.12

In [2]:
### We import packages

# To manage dataframes
import numpy as np
import pandas as pd

# To do the preprocessing
from sklearn import preprocessing

In [3]:
### We import our python files

import path_perso

In [4]:
### We import our datasets
# One parquet file per difficulty level; the file locations come from path_perso

lidar_easy, lidar_medium, lidar_hard, lidar_extrahard = (
    pd.read_parquet(parquet_file)
    for parquet_file in (
        path_perso.lidar_easy_file,
        path_perso.lidar_medium_file,
        path_perso.lidar_hard_file,
        path_perso.lidar_extrahard_file,
    )
)

In [5]:
# Preview the first five rows of the "easy" dataset to see what the data look like
lidar_easy.head(5)

Unnamed: 0,x,y,z
2211,6.196634,-13.157755,10.582272
2905,-10.440643,17.26446,10.926065
1411,2.977432,-3.552676,10.072814
251,5.924494,-10.97496,10.342501
794,-7.062873,12.850664,10.557997


In [6]:
### We reorder our datasets
# Each frame gets its original row labels stored in an 'index' column, is sorted by
# that column, and finally receives a fresh 0..n-1 row index.
# The frames are modified in place so that lidar_easy, list_df and dict_df keep
# referring to the same (reordered) objects.

list_difficulty = ['easy', 'medium', 'hard', 'extrahard']
list_df = [lidar_easy, lidar_medium, lidar_hard, lidar_extrahard]
dict_df = dict(zip(list_difficulty, list_df))

for df in list_df :
    # Only move the row labels into a column once: on a re-run the 'index' column
    # already exists and reset_index(drop = False) would raise a ValueError
    if 'index' not in df.columns :
        df.reset_index(drop = False, inplace = True)
    df.sort_values(['index'], ascending = True, inplace = True)
    df.reset_index(drop = True, inplace = True)

In [7]:
# We check result: after the reordering cell, the rows should be sorted by the 'index' column
lidar_easy

Unnamed: 0,index,x,y,z
0,0,12.027449,-21.919416,11.557937
1,7,11.780415,-21.631120,11.554015
2,8,11.802714,-21.606439,11.468175
3,12,11.725748,-21.450815,11.473142
4,13,11.658920,-21.392229,11.492596
...,...,...,...,...
1497,2997,-12.594755,21.327550,11.458369
1498,2998,-12.662449,21.386325,11.509484
1499,2999,-12.702087,21.339329,11.480640
1500,3001,-12.749286,21.452882,11.526176


In [8]:
### Descriptive stats
# For every difficulty level we print the shape, the number of missing values per
# column, the column types and the summary statistics of the dataset

for difficulty, frame in dict_df.items() :
    report = [
        f"{difficulty} :\n",
        f"values : {frame.shape}\n",
        f"missing values : {frame.isnull().sum()}\n",
        f"types : {frame.dtypes}\n",
        f"stats : {frame.describe()}\n",
        "\n",
    ]
    # One print per entry, exactly as if each line were printed on its own
    print(*report, sep = "\n")


easy :

values : (1502, 4)

missing values : index    0
x        0
y        0
z        0
dtype: int64

types : index      int64
x        float64
y        float64
z        float64
dtype: object

stats :              index            x            y            z
count  1502.000000  1502.000000  1502.000000  1502.000000
mean   1482.069907     0.103807    -0.186761    10.506456
std     865.964752     6.860583    12.517748     0.460115
min       0.000000   -12.749286   -22.386120     9.950736
25%     727.500000    -5.811606   -10.936971    10.092309
50%    1467.500000     0.182992    -0.506426    10.378612
75%    2229.750000     5.905952    10.778874    10.854110
max    3002.000000    12.778636    22.128342    11.630552



medium :

values : (2803, 4)

missing values : index    0
x        0
y        0
z        0
dtype: int64

types : index      int64
x        float64
y        float64
z        float64
dtype: object

stats :              index            x            y            z
count  2803

In [9]:
# We can see that there is no missing value in the four datasets
# They have between 601 and 2803 data points, and all data are float, so there is nothing to change
# However, some variables have wider ranges than others
# Hence, their weight would be more important than that of the others
# We must normalize the data

In [13]:
### We normalize our data
# Every coordinate is rescaled with a MinMaxScaler fitted separately on each dataset,
# and the result is stored in new *_norm columns next to the raw values

feature_cols = ["x", "y", "z"]
norm_cols = ["x_norm", "y_norm", "z_norm"]

scaler = preprocessing.MinMaxScaler()
for df in list_df :
    # Select the raw coordinates explicitly (rather than "everything except 'index'")
    # so that re-running the cell does not feed the *_norm columns back to the scaler
    X1 = pd.DataFrame(
        scaler.fit_transform(df[feature_cols]),
        columns = norm_cols,
        index = df.index,  # keep the rows aligned with df during the assignment below
    )
    df[norm_cols] = X1

In [15]:
# We check result: the frame should now also contain the x_norm, y_norm and z_norm columns
lidar_easy

Unnamed: 0,index,x,y,z,x_norm,y_norm,z_norm
0,0,11.976350,-21.933835,11.565186,0.973599,0.000000,0.981050
1,7,11.827631,-21.680405,11.562325,0.967754,0.005756,0.979309
2,14,11.662012,-21.305979,11.501609,0.961244,0.014261,0.942374
3,22,11.477836,-21.014806,11.399616,0.954004,0.020875,0.880328
4,29,11.299635,-20.703266,11.426619,0.947000,0.027951,0.896755
...,...,...,...,...,...,...,...
596,2961,-11.728845,19.708505,11.288180,0.041821,0.945862,0.812539
597,2994,-12.586627,21.211981,11.499281,0.008104,0.980012,0.940958
598,2997,-12.659579,21.344602,11.512136,0.005236,0.983025,0.948778
599,2999,-12.670900,21.425689,11.480889,0.004791,0.984866,0.929769
