In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Loading Data

In [None]:
# Read the two source CSV files into DataFrames: `tele` and `trip`.
tele = pd.read_csv(filepath_or_buffer='/content/EV_Energy_Consumption_Dataset.csv')
trip = pd.read_csv(
    filepath_or_buffer='/content/Electric Vehicle Trip Energy Consumption Data.csv'
)

In [None]:
# Overview of the trip dataset: label, (rows, columns) and per-column dtypes,
# each printed on its own line.
print("EV trip Energy Consumption Data(Trip):", trip.shape, trip.dtypes, sep="\n")

# Same overview for the tele dataset.
print("Tele:", tele.shape, tele.dtypes, sep="\n")

# Missing-value count per column, trip first and then tele.
print(trip.isna().sum(), tele.isna().sum(), sep="\n")

EV trip Energy Consumption Data(Trip):
(10151, 13)
Trip Energy Consumption                float64
Vehicle ID                               int64
Trip Distance                            int64
Time of Day                            float64
Day of the Week                          int64
Longitude                              float64
Latitude                               float64
Speed                                  float64
Current                                float64
Total Voltage                          float64
Maximum Cell Temperature of Battery    float64
Minimum Cell Temperature of Battery    float64
Trip Time Length                         int64
dtype: object
Tele:
(5000, 19)
Vehicle_ID                  int64
Timestamp                  object
Speed_kmh                 float64
Acceleration_ms2          float64
Battery_State_%           float64
Battery_Voltage_V         float64
Battery_Temperature_C     float64
Driving_Mode                int64
Road_Type                   int64
T

In [None]:
def normalize_cols(df):
    """Normalize the column labels of ``df`` to lower-case, underscore-separated names.

    Each label is stripped of surrounding whitespace and lower-cased; spaces
    become ``_``, ``%`` becomes ``pct`` and parentheses are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are rewritten. The frame is modified in
        place (its ``columns`` attribute is reassigned).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the call can be used in an assignment.
    """
    # regex=False makes every replacement a literal substring match. '(' and
    # ')' are regex metacharacters, and the default of `regex` is not the same
    # across pandas versions, so the intent is spelled out explicitly.
    df.columns = (
        df.columns.str.strip()
                  .str.lower()
                  .str.replace(' ', '_', regex=False)
                  .str.replace('%', 'pct', regex=False)
                  .str.replace('(', '', regex=False)
                  .str.replace(')', '', regex=False)
    )
    return df

# Give both frames the same normalised column labels.
trip, tele = normalize_cols(trip), normalize_cols(tele)

# Convert the timestamp column to datetimes; values that cannot be parsed
# become NaT instead of raising (errors='coerce').
tele['timestamp'] = pd.to_datetime(arg=tele['timestamp'], errors='coerce')

In [None]:
# Preview the first five rows of the trip data (head() defaults to n=5).
trip.head()

Unnamed: 0,trip_energy_consumption,vehicle_id,trip_distance,time_of_day,day_of_the_week,longitude,latitude,speed,current,total_voltage,maximum_cell_temperature_of_battery,minimum_cell_temperature_of_battery,trip_time_length
0,0.672,1,6,10.333333,4,121.497948,31.281574,246.0,2.583348,308.283333,31.0,30.833333,13
1,0.896,1,6,16.0,4,121.587564,31.25607,393.714286,2.985729,304.485714,29.0,28.0,18
2,1.344,1,7,16.090909,2,121.576968,31.262034,192.0,2.35456,308.463636,31.272727,30.0,21
3,1.344,1,8,19.0,5,121.549709,31.257796,369.24,1.540015,308.06,30.0,30.0,16
4,0.896,1,6,14.166667,6,121.58228,31.21503,413.450617,9.659892,304.473457,28.0,28.0,129


In [None]:
# Preview the first five rows of the tele data (head() defaults to n=5).
tele.head()

Unnamed: 0,vehicle_id,timestamp,speed_kmh,acceleration_ms2,battery_state_pct,battery_voltage_v,battery_temperature_c,driving_mode,road_type,traffic_condition,slope_pct,weather_condition,temperature_c,humidity_pct,wind_speed_ms,tire_pressure_psi,vehicle_weight_kg,distance_travelled_km,energy_consumption_kwh
0,1102,2024-01-01 00:00:00,111.507366,-2.773816,30.415148,378.091525,25.314786,2,1,1,6.879446,4,0.74177,42.172533,7.829253,31.11202,1822.967368,20.757508,12.054317
1,1435,2024-01-01 00:01:00,48.612323,-0.796982,97.385534,392.718377,18.240755,1,2,1,-3.007212,4,-3.495516,57.018427,4.495572,31.504366,2091.831914,0.642918,4.488701
2,1860,2024-01-01 00:02:00,108.73332,0.2538,84.9126,398.993495,44.449145,1,1,3,0.029585,1,9.248275,69.028911,5.144489,33.838015,1816.702497,40.842824,11.701377
3,1270,2024-01-01 00:03:00,38.579484,-2.111395,28.777904,358.128273,28.980155,1,2,2,8.271943,3,2.868409,86.638349,4.518283,33.256014,1283.102642,5.305229,7.389266
4,1106,2024-01-01 00:04:00,57.172438,1.477883,29.74016,310.888162,33.184551,2,1,1,2.776814,2,16.750244,27.189185,4.263406,33.579678,2160.350788,5.825926,6.761205


In [None]:
# Unit sanity check: print the median speed of tele and of trip side by side.
# Heuristic: a tele median below 30 together with a trip median above 60
# would hint that tele speeds are in m/s rather than km/h.
print(
    tele.loc[:, 'speed_kmh'].median(),
    trip.loc[:, 'speed'].median(),
)

58.646793545901446 31.83333333


In [None]:
# Seconds elapsed since the previous row of the same vehicle_id. Rows with no
# usable difference (NaN, e.g. the first row of a vehicle) are set to 0.
tele['dt_seconds'] = (
    tele.groupby('vehicle_id')['timestamp']
        .diff()
        .dt.total_seconds()
        .fillna(0)
)

# Zero gaps are replaced by the median of all positive gaps, then every gap is
# bounded to [0.1, 3600] seconds.
# NOTE(review): the median is taken over the whole frame, not per vehicle, and
# large gaps are clipped rather than replaced by the median - confirm intent.
median_dt = tele['dt_seconds'].loc[lambda gaps: gaps > 0].median()
tele['dt_seconds'] = (
    tele['dt_seconds']
        .replace(0, median_dt)
        .clip(lower=0.1, upper=3600)  # reasonable limits
)

In [None]:
# Frequency of each distinct dt_seconds value, most common first.
tele.loc[:, 'dt_seconds'].value_counts()

Unnamed: 0_level_0,count
dt_seconds,Unnamed: 1_level_1
3600.0,4700
120.0,11
2400.0,10
2220.0,9
1440.0,8
1680.0,8
2100.0,8
900.0,8
2820.0,7
3180.0,7


## **Outlier Detection**

In [None]:
# Bound both speed columns to [0, 200]; values outside are set to the nearest bound.
# NOTE(review): the unit of trip['speed'] is not stated and several preview rows
# exceed 200 - confirm that the same cap is appropriate for trip.
tele['speed_kmh'] = tele['speed_kmh'].clip(lower=0, upper=200)
trip['speed'] = trip['speed'].clip(lower=0, upper=200)

In [None]:
# Bound acceleration to [-10, 10]; values outside are set to the nearest bound.
tele['acceleration_ms2'] = tele['acceleration_ms2'].clip(lower=-10, upper=10)

In [None]:
# Keep only rows whose energy value is >= 0. Rows with a negative value are
# dropped, and so are rows with a missing value (NaN compares as False).
tele = tele.loc[tele['energy_consumption_kwh'].ge(0)]
trip = trip.loc[trip['trip_energy_consumption'].ge(0)]

In [None]:
# Validity mask for trip_time_length: strictly greater than 0 and strictly
# less than 1440 (24*60, i.e. one day if the column is in minutes).
trip_duration = trip['trip_time_length'].gt(0) & trip['trip_time_length'].lt(1440)
# Count how many rows pass (True) or fail (False) the check.
trip_duration.value_counts()

Unnamed: 0_level_0,count
trip_time_length,Unnamed: 1_level_1
True,10151


In [None]:
# Validity mask for trip_distance: strictly greater than 0 and at most 1000
# (the original note treats the column as km).
distance = trip['trip_distance'].gt(0) & trip['trip_distance'].le(1000)
# Count how many rows pass (True) or fail (False) the check.
distance.value_counts()

Unnamed: 0_level_0,count
trip_distance,Unnamed: 1_level_1
True,10151
