# Data Preprocessing

Here, we load the data generated by the SUMO simulator. The data is then cleaned so that only the fields suitable for our machine-learning-based approach remain.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Load the CSV file containing the simulation data for 25000 vehicles over 50000 timesteps.

In [None]:
# Load the raw simulator export; its fields are separated by semicolons.
data = pd.read_csv(
    "emission50000t_25000v.csv",
    sep=";",
)

A total of 7979907 rows of data are obtained.

In [3]:
# Display the loaded frame as the cell output.
data

Unnamed: 0,timestep_time,vehicle_CO,vehicle_CO2,vehicle_HC,vehicle_NOx,vehicle_PMx,vehicle_angle,vehicle_eclass,vehicle_electricity,vehicle_fuel,vehicle_id,vehicle_lane,vehicle_noise,vehicle_pos,vehicle_route,vehicle_speed,vehicle_type,vehicle_waiting,vehicle_x,vehicle_y
0,0.0,164.78,2624.72,0.81,1.20,0.07,186.58,HBEFA3/PC_G_EU4,0.0,1.13,0.0,-338817486_0,55.94,5.10,!0,0.00,DEFAULT_VEHTYPE,0.0,13229.54,13858.50
1,1.0,147.37,3191.28,0.75,1.42,0.07,186.58,HBEFA3/PC_G_EU4,0.0,1.37,0.0,-338817486_0,64.14,6.81,!0,1.71,DEFAULT_VEHTYPE,0.0,13229.34,13856.80
2,1.0,164.78,2624.72,0.81,1.20,0.07,300.61,HBEFA3/PC_G_EU4,0.0,1.13,1.0,-62518623#1_0,55.94,5.10,!1,0.00,DEFAULT_VEHTYPE,0.0,8550.01,7263.51
3,2.0,146.32,5001.07,0.78,2.21,0.11,186.58,HBEFA3/PC_G_EU4,0.0,2.15,0.0,-338817486_0,68.71,11.05,!0,4.24,DEFAULT_VEHTYPE,0.0,13228.85,13852.59
4,2.0,147.60,3044.17,0.74,1.36,0.07,300.61,HBEFA3/PC_G_EU4,0.0,1.31,1.0,-62518623#1_0,63.18,6.61,!1,1.51,DEFAULT_VEHTYPE,0.0,8548.71,7264.28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7979902,49995.0,,,,,,,,,,,,,,,,,,,
7979903,49996.0,,,,,,,,,,,,,,,,,,,
7979904,49997.0,,,,,,,,,,,,,,,,,,,
7979905,49998.0,,,,,,,,,,,,,,,,,,,


The columns of the loaded data are listed below.

In [4]:
# List the column labels of the loaded frame.
data.columns

Index(['timestep_time', 'vehicle_CO', 'vehicle_CO2', 'vehicle_HC',
       'vehicle_NOx', 'vehicle_PMx', 'vehicle_angle', 'vehicle_eclass',
       'vehicle_electricity', 'vehicle_fuel', 'vehicle_id', 'vehicle_lane',
       'vehicle_noise', 'vehicle_pos', 'vehicle_route', 'vehicle_speed',
       'vehicle_type', 'vehicle_waiting', 'vehicle_x', 'vehicle_y'],
      dtype='object')

We do not need all the data obtained from the simulator, so we drop a few columns from the dataset.

In [5]:
# Drop the simulator columns that are not needed downstream. Only the timestep,
# fuel, id, speed, waiting and x/y columns remain afterwards.
# The frame is rebound instead of mutated in place so the cell can be re-run safely.
data = data.drop(
    columns=[
        'vehicle_CO', 'vehicle_CO2', 'vehicle_HC', 'vehicle_NOx', 'vehicle_PMx',
        'vehicle_angle', 'vehicle_eclass', 'vehicle_electricity', 'vehicle_lane',
        'vehicle_noise', 'vehicle_pos', 'vehicle_route', 'vehicle_type',
    ],
    # Columns already removed by a previous run of this cell are skipped
    # instead of raising KeyError.
    errors="ignore",
)

In [6]:
# Display the frame again to confirm which columns remain after the drop.
data

Unnamed: 0,timestep_time,vehicle_fuel,vehicle_id,vehicle_speed,vehicle_waiting,vehicle_x,vehicle_y
0,0.0,1.13,0.0,0.00,0.0,13229.54,13858.50
1,1.0,1.37,0.0,1.71,0.0,13229.34,13856.80
2,1.0,1.13,1.0,0.00,0.0,8550.01,7263.51
3,2.0,2.15,0.0,4.24,0.0,13228.85,13852.59
4,2.0,1.31,1.0,1.51,0.0,8548.71,7264.28
...,...,...,...,...,...,...,...
7979902,49995.0,,,,,,
7979903,49996.0,,,,,,
7979904,49997.0,,,,,,
7979905,49998.0,,,,,,


In [7]:
# Bind `df` to the same DataFrame object as `data` (no copy is made).
df = data

After dropping certain features, we need to remove the rows containing null values from the dataset.

In [8]:
# Discard every row that has at least one missing value.
df = df.dropna(axis="index", how="any")

The cleaned dataset is then stored in a CSV file so that we can use it later.

In [9]:
# Persist the cleaned frame to disk.
# NOTE(review): this writes to the same path the notebook loads its raw data from
# ('emission50000t_25000v.csv'). to_csv uses its default comma separator and also
# writes the index column, while the load cell parses the file with delimiter=";".
# Re-running the notebook after this cell would therefore read the cleaned file
# with the wrong separator. Consider a distinct output filename, and confirm what
# downstream readers of this file expect before changing it.
df.to_csv('emission50000t_25000v.csv')

In [10]:
# Display the cleaned frame (rows with null values removed).
df

Unnamed: 0,timestep_time,vehicle_fuel,vehicle_id,vehicle_speed,vehicle_waiting,vehicle_x,vehicle_y
0,0.0,1.13,0.0,0.00,0.0,13229.54,13858.50
1,1.0,1.37,0.0,1.71,0.0,13229.34,13856.80
2,1.0,1.13,1.0,0.00,0.0,8550.01,7263.51
3,2.0,2.15,0.0,4.24,0.0,13228.85,13852.59
4,2.0,1.31,1.0,1.51,0.0,8548.71,7264.28
...,...,...,...,...,...,...,...
7955858,25951.0,4.47,24775.0,15.53,0.0,8864.90,9829.10
7955859,25952.0,2.50,24775.0,16.29,0.0,8850.86,9837.35
7955860,25953.0,2.78,24775.0,17.13,0.0,8836.09,9846.03
7955861,25954.0,0.00,24775.0,17.08,0.0,8821.36,9854.68


Total number of unique vehicles present in the simulation results

In [11]:
# Number of distinct vehicle ids left in the cleaned frame.
df["vehicle_id"].nunique(dropna=True)

24508

After cleaning the dataset and removing the rows with null values, we finally have 7955863 rows of data.

In [12]:
# Display the cleaned frame once more; `df` has not been modified since the earlier display.
df

Unnamed: 0,timestep_time,vehicle_fuel,vehicle_id,vehicle_speed,vehicle_waiting,vehicle_x,vehicle_y
0,0.0,1.13,0.0,0.00,0.0,13229.54,13858.50
1,1.0,1.37,0.0,1.71,0.0,13229.34,13856.80
2,1.0,1.13,1.0,0.00,0.0,8550.01,7263.51
3,2.0,2.15,0.0,4.24,0.0,13228.85,13852.59
4,2.0,1.31,1.0,1.51,0.0,8548.71,7264.28
...,...,...,...,...,...,...,...
7955858,25951.0,4.47,24775.0,15.53,0.0,8864.90,9829.10
7955859,25952.0,2.50,24775.0,16.29,0.0,8850.86,9837.35
7955860,25953.0,2.78,24775.0,17.13,0.0,8836.09,9846.03
7955861,25954.0,0.00,24775.0,17.08,0.0,8821.36,9854.68
