In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
df = pd.read_csv('cars.csv', low_memory=False)

In [3]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50084 entries, 0 to 50083
Columns: 205 entries, Brand to Fuel consumption (economy) - extra urban (Ethanol - E85) (NEDC)
dtypes: float64(3), object(202)
memory usage: 392.1 MB


In [4]:
columns_to_drop = [
    'Fuel consumption (economy) - combined', 
    'Fuel consumption (economy) - combined (CLTC)', 
    'Fuel consumption (economy) - combined (CNG)', 
    'Fuel consumption (economy) - combined (CNG) (NEDC)', 
    'Fuel consumption (economy) - combined (CNG) (NEDC, WLTP equivalent)', 
    'Fuel consumption (economy) - combined (EPA)', 
    'Fuel consumption (economy) - combined (Ethanol - E85)', 
    'Fuel consumption (economy) - combined (Ethanol - E85) (NEDC)', 
    'Fuel consumption (economy) - combined (LPG)', 
    'Fuel consumption (economy) - combined (LPG) (NEDC)', 
    'Fuel consumption (economy) - combined (LPG) (NEDC, WLTP equivalent)', 
    'Fuel consumption (economy) - combined (NEDC)', 
    'Fuel consumption (economy) - combined (NEDC, WLTP equivalent)', 
    'Fuel consumption (economy) - combined (WLTC)',
    'Acceleration 0 - 62 mph',
    'Acceleration 0 - 200 km/h',
    'Acceleration 0 - 300 km/h',
    'Acceleration 0 - 60 mph',
    'Acceleration 0 - 60 mph (Calculated by Auto-Data.net)',
    'Engine oil specification',
    '200 km/h - 0',
    'Combined fuel consumption (WLTP)', 
    'Combined fuel consumption (WLTP) (CNG)', 
    'Combined fuel consumption (WLTP) (LPG)',
    'Power per litre',  #engine displacement
    'Power per litre (CNG)', 
    'Power per litre (Ethanol - E100)', 
    'Power per litre (Ethanol - E85)', 
    'Power per litre (LPG)',
    'Weight-to-power ratio', # kerb weight to power ratio
    'Weight-to-torque ratio' #kerb weight
]

df = df.drop(columns_to_drop, axis=1, errors='ignore')

In [5]:
len(df.columns)

174

In [6]:
print(sorted(list(df.columns)))

['100 km/h - 0', 'Acceleration 0 - 100 km/h', 'Acceleration 0 - 100 km/h (CNG)', 'Acceleration 0 - 100 km/h (Ethanol - E100)', 'Acceleration 0 - 100 km/h (Ethanol - E85)', 'Acceleration 0 - 100 km/h (LPG)', 'AdBlue tank', 'All-electric range', 'All-electric range (CLTC)', 'All-electric range (EPA)', 'All-electric range (NEDC)', 'All-electric range (NEDC, WLTP equivalent)', 'All-electric range (WLTC)', 'All-electric range (WLTP)', 'Approach angle', 'Assisting systems', 'Average Energy consumption', 'Average Energy consumption (CLTC)', 'Average Energy consumption (EPA)', 'Average Energy consumption (NEDC)', 'Average Energy consumption (NEDC, WLTP equivalent)', 'Average Energy consumption (WLTC)', 'Average Energy consumption (WLTP)', 'Battery location', 'Battery technology', 'Battery voltage', 'Battery weight', 'Body type', 'Brand', 'CNG cylinder capacity', 'CO emissions', 'CO emissions (CNG)', 'CO emissions (CNG) (NEDC)', 'CO emissions (CNG) (NEDC, WLTP equivalent)', 'CO emissions (CNG) 

In [7]:
def parse_number_or_range(val, hmean=False):
    """
    Parse a number or range from a string, ignoring units at the end.
    Returns the arithmetic mean if a range, or the number itself.
    """
    if pd.isna(val):
        return np.nan
    
    s = str(val).strip().lower()
    
    # Single regex to capture:
    #  - optional leading spaces
    #  - first number (integer or decimal)
    #  - optional range separator and second number
    #  - ignore any text after numbers (units)
    match = re.match(r"^\s*(\d+(?:\.\d+)?)\s*(?:[-–]\s*(\d+(?:\.\d+)?))?", s)
    if match:
        num1 = float(match.group(1))
        num2 = match.group(2)
        if num2:
            num2 = float(num2)
            if hmean:
                return 2.0 * num1 * num2 / (num1+num2)
            else:
                return (num1 + num2) / 2  # arithmetic mean of range
        else:
            return num1
    
    return np.nan

def hmean_across(data):
    return (data.notna().sum(axis=1)) / (1 / data).sum(axis=1, skipna=True)

In [8]:
columns = [
    'All-electric range', 
    'All-electric range (CLTC)', 
    'All-electric range (EPA)', 
    'All-electric range (NEDC)', 
    'All-electric range (NEDC, WLTP equivalent)', 
    'All-electric range (WLTC)', 
    'All-electric range (WLTP)'
]

data = df[columns].map(parse_number_or_range)
df["All-electric average range (km)"] = data.mean(axis=1, skipna=True)
df = df.drop(columns, axis=1)
#data[data[columns].notna().sum(axis=1)>=2]
df[df["All-electric average range (km)"].notna()]

Unnamed: 0,Brand,Model,Generation,Start of production,End of production,Modification (Engine),Powertrain Architecture,Body type,Fuel Type,Max. weight,...,Drag coefficient (C),Doors,Engine displacement,Acceleration 0 - 100 km/h (CNG),System torque,Torque (Ethanol - E85),Number of valves per cylinder,Fuel consumption (economy) - urban (Ethanol - E85),Fuel consumption (economy) - extra urban (Ethanol - E85) (NEDC),All-electric average range (km)
8,BYD,e6,e6,2017 year,,80 kWh (122 Hp) 4WD Electric,BEV (Electric Vehicle),MPV,Electricity,,...,,5,,,,,,,,400.0
9,BYD,e2,e2,"September, 2019 year","April, 2021 year",47.3 kWh (95 Hp) BEV,BEV (Electric Vehicle),Hatchback,Electricity,,...,,5,,,180 Nm,,,,,405.0
10,BYD,e2,e2,"September, 2019 year","April, 2021 year",35.2 kWh (95 Hp) BEV,BEV (Electric Vehicle),Hatchback,Electricity,,...,,5,,,180 Nm,,,,,305.0
17,BYD,e2,e2,"April, 2021 year",,33.2 kWh (95 Hp) BEV,BEV (Electric Vehicle),Hatchback,Electricity,,...,,5,,,180 Nm,,,,,301.0
21,BYD,ETP3,ETP3,"October, 2023 year",,44.9 kWh (136 Hp) Electric,BEV (Electric Vehicle),Van,Electricity,2420 kg,...,,5,,,180 Nm,,,,,233.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49634,Renault,Twingo,Twingo III (facelift 2019),"August, 2020 year","July, 2024 year",Z.E. 22 kWh (82 Hp),BEV (Electric Vehicle),Hatchback,Electricity,1518 kg,...,,5,,,160 Nm @ 500-3590 rpm.,,,,,180.0
50014,Renault,Megane,"Megane IV (Phase II, 2020) Grandtour","April, 2020 year","July, 2023 year",1.6 E-TECH (158 Hp) Plug-in Hybrid Multimode,PHEV (Plug-in Hybrid Electric Vehicle),Station wagon (estate),Petrol / electricity,2131 kg,...,,5,1598 cm,,,,4.0,,,50.0
50031,Renault,Megane,Megane V E-Tech Electric,"February, 2022 year",,EV40 (130 Hp),BEV (Electric Vehicle),Hatchback,Electricity,2045 kg,...,,5,,,250 Nm,,,,,300.0
50059,Renault,Megane,Megane V E-Tech Electric,"February, 2022 year",,EV60 (220 Hp),BEV (Electric Vehicle),Hatchback,Electricity,2158 kg,...,,5,,,300 Nm,,,,,470.0


In [9]:
columns = [
    'Average Energy consumption', 
    'Average Energy consumption (CLTC)', 
    'Average Energy consumption (EPA)', 
    'Average Energy consumption (NEDC)', 
    'Average Energy consumption (NEDC, WLTP equivalent)', 
    'Average Energy consumption (WLTC)', 
    'Average Energy consumption (WLTP)'
]
data = df[columns].map(lambda x: parse_number_or_range(x, hmean=True))
data = data.replace(0, np.nan)
df["Average Energy consumption (kWh/100km)"] = hmean_across(data) # (data.notna().sum(axis=1)) / (1 / data).sum(axis=1, skipna=True)
df = df.drop(columns, axis=1)
#data[data[columns].notna().any(axis=1)][columns]
#df[df[columns].notna().any(axis=1)][columns]

In [10]:
df[df['Average Energy consumption (kWh/100km)'].notna()]

Unnamed: 0,Brand,Model,Generation,Start of production,End of production,Modification (Engine),Powertrain Architecture,Body type,Fuel Type,Max. weight,...,Doors,Engine displacement,Acceleration 0 - 100 km/h (CNG),System torque,Torque (Ethanol - E85),Number of valves per cylinder,Fuel consumption (economy) - urban (Ethanol - E85),Fuel consumption (economy) - extra urban (Ethanol - E85) (NEDC),All-electric average range (km),Average Energy consumption (kWh/100km)
8,BYD,e6,e6,2017 year,,80 kWh (122 Hp) 4WD Electric,BEV (Electric Vehicle),MPV,Electricity,,...,5,,,,,,,,400.0,19.500000
126,BYD,M3e,M3e,"December, 2020 year",,50.3 kWh (95 Hp) BEV,BEV (Electric Vehicle),MPV,Electricity,2360 kg,...,5,,,180 Nm,,,,,300.0,19.400000
162,BYD,Sealion 6,Sealion 6,2024 year,,DM-i 1.5L (324 Hp) Plug-in Hybrid AWD E-CVT,PHEV (Plug-in Hybrid Electric Vehicle),SUV,Petrol / electricity,2510 kg,...,5,1497 cm,,550 Nm,,4.0,,,81.0,17.900000
165,BYD,Sealion 6,Sealion 6,2024 year,,DM-i 1.5L (218 Hp) Plug-in Hybrid E-CVT,PHEV (Plug-in Hybrid Electric Vehicle),SUV,Petrol / electricity,2350 kg,...,5,1498 cm,,300 Nm,,4.0,,,92.0,16.900000
186,BYD,Seal 6,Seal 6 Touring,"September, 2025 year",,DM-i 1.5L 19 kWh (212 Hp) Plug-in Hybrid E-CVT,PHEV (Plug-in Hybrid Electric Vehicle),Station wagon (estate),Petrol / electricity,2240 kg,...,5,1498 cm,,,,4.0,,,100.0,16.400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49634,Renault,Twingo,Twingo III (facelift 2019),"August, 2020 year","July, 2024 year",Z.E. 22 kWh (82 Hp),BEV (Electric Vehicle),Hatchback,Electricity,1518 kg,...,5,,,160 Nm @ 500-3590 rpm.,,,,,180.0,16.148607
50014,Renault,Megane,"Megane IV (Phase II, 2020) Grandtour","April, 2020 year","July, 2023 year",1.6 E-TECH (158 Hp) Plug-in Hybrid Multimode,PHEV (Plug-in Hybrid Electric Vehicle),Station wagon (estate),Petrol / electricity,2131 kg,...,5,1598 cm,,,,4.0,,,50.0,13.928571
50031,Renault,Megane,Megane V E-Tech Electric,"February, 2022 year",,EV40 (130 Hp),BEV (Electric Vehicle),Hatchback,Electricity,2045 kg,...,5,,,250 Nm,,,,,300.0,15.800000
50059,Renault,Megane,Megane V E-Tech Electric,"February, 2022 year",,EV60 (220 Hp),BEV (Electric Vehicle),Hatchback,Electricity,2158 kg,...,5,,,300 Nm,,,,,470.0,16.100000


In [11]:
fuel_cols = [
    'Fuel consumption (economy) - extra urban', 
    'Fuel consumption (economy) - extra urban (EPA)', 
    'Fuel consumption (economy) - extra urban (NEDC)', 
    'Fuel consumption (economy) - extra urban (NEDC, WLTP equivalent)', 
    'Fuel consumption (economy) - extra urban (WLTC)', 
    'Fuel consumption (economy) - urban', 
    'Fuel consumption (economy) - urban (EPA)', 
    'Fuel consumption (economy) - urban (NEDC)', 
    'Fuel consumption (economy) - urban (NEDC, WLTP equivalent)', 
    'Fuel consumption (economy) - urban (WLTC)'
]

cng_cols = [
    'Fuel consumption (economy) - extra urban (CNG)', 
    'Fuel consumption (economy) - extra urban (CNG) (NEDC)', 
    'Fuel consumption (economy) - extra urban (CNG) (NEDC, WLTP equivalent)', 
        'Fuel consumption (economy) - urban (CNG)', 
    'Fuel consumption (economy) - urban (CNG) (NEDC)', 
    'Fuel consumption (economy) - urban (CNG) (NEDC, WLTP equivalent)', 
]

lpg_cols = [
    'Fuel consumption (economy) - extra urban (LPG)', 
    'Fuel consumption (economy) - extra urban (LPG) (NEDC)', 
    'Fuel consumption (economy) - extra urban (LPG) (NEDC, WLTP equivalent)',
        'Fuel consumption (economy) - urban (LPG)', 
    'Fuel consumption (economy) - urban (LPG) (NEDC)', 
    'Fuel consumption (economy) - urban (LPG) (NEDC, WLTP equivalent)', 
]

ethanol_cols = [
    'Fuel consumption (economy) - extra urban (Ethanol - E100)', 
    'Fuel consumption (economy) - extra urban (Ethanol - E85)', 
    'Fuel consumption (economy) - extra urban (Ethanol - E85) (NEDC)', 
        'Fuel consumption (economy) - urban (Ethanol - E100)', 
    'Fuel consumption (economy) - urban (Ethanol - E85)', 
    'Fuel consumption (economy) - urban (Ethanol - E85) (NEDC)', 
]

fuel_columns = [
    'Fuel consumption (economy) - extra urban', 
    'Fuel consumption (economy) - extra urban (CNG)', 
    'Fuel consumption (economy) - extra urban (CNG) (NEDC)', 
    'Fuel consumption (economy) - extra urban (CNG) (NEDC, WLTP equivalent)', 
    'Fuel consumption (economy) - extra urban (EPA)', 
    'Fuel consumption (economy) - extra urban (Ethanol - E100)', 
    'Fuel consumption (economy) - extra urban (Ethanol - E85)', 
    'Fuel consumption (economy) - extra urban (Ethanol - E85) (NEDC)', 
    'Fuel consumption (economy) - extra urban (LPG)', 
    'Fuel consumption (economy) - extra urban (LPG) (NEDC)', 
    'Fuel consumption (economy) - extra urban (LPG) (NEDC, WLTP equivalent)', 
    'Fuel consumption (economy) - extra urban (NEDC)', 
    'Fuel consumption (economy) - extra urban (NEDC, WLTP equivalent)', 
    'Fuel consumption (economy) - extra urban (WLTC)', 
    'Fuel consumption (economy) - urban', 
    'Fuel consumption (economy) - urban (CNG)', 
    'Fuel consumption (economy) - urban (CNG) (NEDC)', 
    'Fuel consumption (economy) - urban (CNG) (NEDC, WLTP equivalent)', 
    'Fuel consumption (economy) - urban (EPA)', 
    'Fuel consumption (economy) - urban (Ethanol - E100)', 
    'Fuel consumption (economy) - urban (Ethanol - E85)', 
    'Fuel consumption (economy) - urban (Ethanol - E85) (NEDC)', 
    'Fuel consumption (economy) - urban (LPG)', 
    'Fuel consumption (economy) - urban (LPG) (NEDC)', 
    'Fuel consumption (economy) - urban (LPG) (NEDC, WLTP equivalent)', 
    'Fuel consumption (economy) - urban (NEDC)', 
    'Fuel consumption (economy) - urban (NEDC, WLTP equivalent)', 
    'Fuel consumption (economy) - urban (WLTC)', 
    'Fuel consumption at Low speed (WLTP)', 
    'Fuel consumption at Low speed (WLTP) (CNG)', 
    'Fuel consumption at Low speed (WLTP) (LPG)', 
    'Fuel consumption at Medium speed (WLTP)', 
    'Fuel consumption at Medium speed (WLTP) (CNG)', 
    'Fuel consumption at Medium speed (WLTP) (LPG)', 
    'Fuel consumption at high speed (WLTP)', 
    'Fuel consumption at high speed (WLTP) (CNG)', 
    'Fuel consumption at high speed (WLTP) (LPG)', 
    'Fuel consumption at very high speed (WLTP)', 
    'Fuel consumption at very high speed (WLTP) (CNG)', 
    'Fuel consumption at very high speed (WLTP) (LPG)'
]

df[fuel_columns] = df[fuel_columns].map(lambda x: parse_number_or_range(x, hmean=True))
df['Fuel consumption (L/100km)'] = hmean_across(df[fuel_cols])
df['Fuel consumption CNG (L/100km)'] = hmean_across(df[cng_cols])
df['Fuel consumption LPG (L/100km)'] = hmean_across(df[lpg_cols])
df['Fuel consumption Ethanol (L/100km)'] = hmean_across(df[ethanol_cols])
df = df.drop(fuel_columns, axis=1)

In [12]:
new_cols = [
    'Fuel consumption (L/100km)',
    'Fuel consumption CNG (L/100km)',
    'Fuel consumption LPG (L/100km)',
    'Fuel consumption Ethanol (L/100km)'
]
df[df[new_cols].notna().any(axis=1)][new_cols]

Unnamed: 0,Fuel consumption (L/100km),Fuel consumption CNG (L/100km),Fuel consumption LPG (L/100km),Fuel consumption Ethanol (L/100km)
4,8.795897,,,
11,6.600000,,,
12,6.900000,,,
16,8.479000,,,
23,12.107801,,,
...,...,...,...,...
50076,5.520000,,,
50077,4.077108,,,
50080,5.403509,,,
50081,3.757895,,,


In [13]:
print(sorted(list(df.columns)))

['100 km/h - 0', 'Acceleration 0 - 100 km/h', 'Acceleration 0 - 100 km/h (CNG)', 'Acceleration 0 - 100 km/h (Ethanol - E100)', 'Acceleration 0 - 100 km/h (Ethanol - E85)', 'Acceleration 0 - 100 km/h (LPG)', 'AdBlue tank', 'All-electric average range (km)', 'Approach angle', 'Assisting systems', 'Average Energy consumption (kWh/100km)', 'Battery location', 'Battery technology', 'Battery voltage', 'Battery weight', 'Body type', 'Brand', 'CNG cylinder capacity', 'CO emissions', 'CO emissions (CNG)', 'CO emissions (CNG) (NEDC)', 'CO emissions (CNG) (NEDC, WLTP equivalent)', 'CO emissions (CNG) (WLTP)', 'CO emissions (EPA)', 'CO emissions (Ethanol - E100)', 'CO emissions (Ethanol - E85)', 'CO emissions (Ethanol - E85) (NEDC)', 'CO emissions (LPG)', 'CO emissions (LPG) (NEDC)', 'CO emissions (LPG) (NEDC, WLTP equivalent)', 'CO emissions (LPG) (WLTP)', 'CO emissions (NEDC)', 'CO emissions (NEDC, WLTP equivalent)', 'CO emissions (WLTC)', 'CO emissions (WLTP)', 'Climb angle', 'Compression ratio

In [14]:
len(df.columns)

126

In [15]:
df.head()

Unnamed: 0,Brand,Model,Generation,Start of production,End of production,Modification (Engine),Powertrain Architecture,Body type,Fuel Type,Max. weight,...,Acceleration 0 - 100 km/h (CNG),System torque,Torque (Ethanol - E85),Number of valves per cylinder,All-electric average range (km),Average Energy consumption (kWh/100km),Fuel consumption (L/100km),Fuel consumption CNG (L/100km),Fuel consumption LPG (L/100km),Fuel consumption Ethanol (L/100km)
0,Alpina,B9,B9 Coupe (E24),1982 year,1985 year,3.5 (245 Hp),Internal Combustion engine,Coupe,Petrol (Gasoline),,...,,,,,,,,,,
1,Alpina,B11,B11 (E32),1987 year,1987 year,3.5 (250 Hp),Internal Combustion engine,Sedan,Petrol (Gasoline),,...,,,,,,,,,,
2,Alpina,B9,B9 (E28),1981 year,1985 year,3.0 (245 Hp),Internal Combustion engine,Sedan,Petrol (Gasoline),,...,,,,,,,,,,
3,Alpina,B11,B11 (E32),1987 year,1993 year,3.5 (254 Hp),Internal Combustion engine,Sedan,Petrol (Gasoline),,...,,,,,,,,,,
4,Alpina,C2,C2 Cabrio (E30),"February, 1986 year","July, 1987 year",2.7 (209 Hp),Internal Combustion engine,Cabriolet,Petrol (Gasoline),,...,,,,,,,8.795897,,,


In [16]:
co2_columns = [
    'CO emissions', 
    'CO emissions (CNG)', 
    'CO emissions (CNG) (NEDC)', 
    'CO emissions (CNG) (NEDC, WLTP equivalent)', 
    'CO emissions (CNG) (WLTP)', 
    'CO emissions (EPA)', 
    'CO emissions (Ethanol - E100)', 
    'CO emissions (Ethanol - E85)', 
    'CO emissions (Ethanol - E85) (NEDC)', 
    'CO emissions (LPG)', 
    'CO emissions (LPG) (NEDC)', 
    'CO emissions (LPG) (NEDC, WLTP equivalent)', 
    'CO emissions (LPG) (WLTP)', 
    'CO emissions (NEDC)', 
    'CO emissions (NEDC, WLTP equivalent)', 
    'CO emissions (WLTC)', 
    'CO emissions (WLTP)'
]

co2_fuel_cols = [
    'CO emissions', 
    'CO emissions (EPA)', 
    'CO emissions (NEDC)', 
    'CO emissions (NEDC, WLTP equivalent)', 
    'CO emissions (WLTC)', 
    'CO emissions (WLTP)'
]

co2_cng_cols = [
    'CO emissions (CNG)', 
    'CO emissions (CNG) (NEDC)', 
    'CO emissions (CNG) (NEDC, WLTP equivalent)', 
    'CO emissions (CNG) (WLTP)'
]

co2_lpg_cols = [
    'CO emissions (LPG)', 
    'CO emissions (LPG) (NEDC)', 
    'CO emissions (LPG) (NEDC, WLTP equivalent)', 
    'CO emissions (LPG) (WLTP)', 
]

co2_ethanol_cols = [
    'CO emissions (Ethanol - E100)', 
    'CO emissions (Ethanol - E85)', 
    'CO emissions (Ethanol - E85) (NEDC)'
]

df[co2_columns] = df[co2_columns].map(lambda x: parse_number_or_range(x, hmean=True))
df['CO2 emission (g/km)'] = hmean_across(df[co2_fuel_cols])
df['CO2 emission CNG (g/km)'] = hmean_across(df[co2_cng_cols])
df['CO2 emission LPG (g/km)'] = hmean_across(df[co2_lpg_cols])
df['CO2 emission Ethanol (g/km)'] = hmean_across(df[co2_ethanol_cols])
df = df.drop(co2_columns, axis=1)

In [17]:
new_cols = [
    'CO2 emission (g/km)',
    'CO2 emission CNG (g/km)',
    'CO2 emission LPG (g/km)',
    'CO2 emission Ethanol (g/km)'
]
df[df[new_cols].notna().any(axis=1)][new_cols]

Unnamed: 0,CO2 emission (g/km),CO2 emission CNG (g/km),CO2 emission LPG (g/km),CO2 emission Ethanol (g/km)
36,183.0,,,
38,268.0,,,
43,139.0,,,
47,155.0,,,
50,139.0,,,
...,...,...,...,...
50076,125.0,,,
50077,103.0,,,
50080,120.0,,,
50081,95.0,,,


In [18]:
print(sorted(list(df.columns)))

['100 km/h - 0', 'Acceleration 0 - 100 km/h', 'Acceleration 0 - 100 km/h (CNG)', 'Acceleration 0 - 100 km/h (Ethanol - E100)', 'Acceleration 0 - 100 km/h (Ethanol - E85)', 'Acceleration 0 - 100 km/h (LPG)', 'AdBlue tank', 'All-electric average range (km)', 'Approach angle', 'Assisting systems', 'Average Energy consumption (kWh/100km)', 'Battery location', 'Battery technology', 'Battery voltage', 'Battery weight', 'Body type', 'Brand', 'CNG cylinder capacity', 'CO2 emission (g/km)', 'CO2 emission CNG (g/km)', 'CO2 emission Ethanol (g/km)', 'CO2 emission LPG (g/km)', 'Climb angle', 'Compression ratio', 'Coolant', 'Cylinder Bore', 'Departure angle', 'Doors', 'Drag coefficient (C)', 'Drive wheel', 'Drivetrain Architecture', 'Emission standard', 'End of production', 'Engine Model/Code', 'Engine aspiration', 'Engine configuration', 'Engine displacement', 'Engine layout', 'Engine oil capacity', 'Engine systems', 'Front brakes', 'Front overhang', 'Front suspension', 'Front track', 'Fuel Type',

In [19]:
len(df.columns)

113

In [20]:
e100e85_cols = [
    'Power (Ethanol - E100)', 
    'Power (Ethanol - E85)',
    'Torque (Ethanol - E100)', 
    'Torque (Ethanol - E85)',
    'Maximum speed (Ethanol - E100)', 
    'Maximum speed (Ethanol - E85)',
    'Acceleration 0 - 100 km/h (Ethanol - E100)', 
    'Acceleration 0 - 100 km/h (Ethanol - E85)'
]

power_ethanol_cols = [
    'Power (Ethanol - E100)', 
    'Power (Ethanol - E85)'
]

torque_ethanol_cols = [
    'Torque (Ethanol - E100)', 
    'Torque (Ethanol - E85)'
]

max_speed_ethanol_cols = [
    'Maximum speed (Ethanol - E100)', 
    'Maximum speed (Ethanol - E85)'
]

acc_ethanol_cols = [
    'Acceleration 0 - 100 km/h (Ethanol - E100)', 
    'Acceleration 0 - 100 km/h (Ethanol - E85)'
]

df[e100e85_cols] = df[e100e85_cols].map(lambda x: parse_number_or_range(x, hmean=False))
df['Power (Ethanol)'] = df[power_ethanol_cols].mean(axis=1, skipna=True)
df['Torque (Ethanol)'] = df[torque_ethanol_cols].mean(axis=1, skipna=True)
df['Maximum speed (Ethanol)'] = df[max_speed_ethanol_cols].mean(axis=1, skipna=True)
df['Acceleration 0 - 100 km/h (Ethanol)'] = df[acc_ethanol_cols].mean(axis=1, skipna=True)
df = df.drop(e100e85_cols, axis=1)

In [21]:
new_cols = [
    'Power (Ethanol)',
    'Torque (Ethanol)',
    'Maximum speed (Ethanol)',
    'Acceleration 0 - 100 km/h (Ethanol)'
]
df[df[new_cols].notna().any(axis=1)][new_cols]

Unnamed: 0,Power (Ethanol),Torque (Ethanol),Maximum speed (Ethanol),Acceleration 0 - 100 km/h (Ethanol)
8284,326.0,472.0,,
8289,330.0,472.0,,
8295,330.0,472.0,,
8298,330.0,475.0,,
8305,330.0,475.0,,
...,...,...,...,...
31844,297.0,447.0,,
31863,326.0,472.0,,
31892,326.0,475.0,,
48199,170.0,,,


In [22]:
print(sorted(list(df.columns)))

['100 km/h - 0', 'Acceleration 0 - 100 km/h', 'Acceleration 0 - 100 km/h (CNG)', 'Acceleration 0 - 100 km/h (Ethanol)', 'Acceleration 0 - 100 km/h (LPG)', 'AdBlue tank', 'All-electric average range (km)', 'Approach angle', 'Assisting systems', 'Average Energy consumption (kWh/100km)', 'Battery location', 'Battery technology', 'Battery voltage', 'Battery weight', 'Body type', 'Brand', 'CNG cylinder capacity', 'CO2 emission (g/km)', 'CO2 emission CNG (g/km)', 'CO2 emission Ethanol (g/km)', 'CO2 emission LPG (g/km)', 'Climb angle', 'Compression ratio', 'Coolant', 'Cylinder Bore', 'Departure angle', 'Doors', 'Drag coefficient (C)', 'Drive wheel', 'Drivetrain Architecture', 'Emission standard', 'End of production', 'Engine Model/Code', 'Engine aspiration', 'Engine configuration', 'Engine displacement', 'Engine layout', 'Engine oil capacity', 'Engine systems', 'Front brakes', 'Front overhang', 'Front suspension', 'Front track', 'Fuel Type', 'Fuel consumption (L/100km)', 'Fuel consumption CNG

In [23]:
len(df.columns)

109

In [24]:
df[df['Number of gears and type of gearbox'].notna()]['Number of gears and type of gearbox']

8            1 gears, automatic transmission
9            1 gears, automatic transmission
10           1 gears, automatic transmission
11              5 gears, manual transmission
12              5 gears, manual transmission
                        ...                 
50079        1 gears, automatic transmission
50080           6 gears, manual transmission
50081           6 gears, manual transmission
50082           6 gears, manual transmission
50083    7 gears, automatic transmission EDC
Name: Number of gears and type of gearbox, Length: 47305, dtype: object

In [25]:
df['Number of gears and type of gearbox'].str.extract(
    r'(\d+)\s+gears, (\S+) transmission', expand=True
)

Unnamed: 0,0,1
0,,
1,,
2,,
3,,
4,,
...,...,...
50079,1,automatic
50080,6,manual
50081,6,manual
50082,6,manual


In [26]:
df[['Number of gears', 'Transmission type']] = df['Number of gears and type of gearbox'].str.extract(
    r'(\d+)\s+gears, (\S+) transmission', expand=True
)

df['Transmission type'] = df['Transmission type'].astype('category')
print(df['Transmission type'].cat.categories)

df['Number of gears'] = df['Number of gears'].fillna(0).astype('uint8')

Index(['automatic', 'manual'], dtype='object')


In [27]:
df = df.drop('Number of gears and type of gearbox', axis=1)

In [28]:
for col, dtype in df.dtypes.items():
    #if str(dtype) == 'object':
    print(col, '[' + str(dtype) + ']')

Brand [object]
Model [object]
Generation [object]
Start of production [object]
End of production [object]
Modification (Engine) [object]
Powertrain Architecture [object]
Body type [object]
Fuel Type [object]
Max. weight [object]
Length [object]
Width [object]
Height [object]
Front suspension [object]
Battery weight [object]
Engine aspiration [object]
Recuperation output [object]
Gross battery capacity [object]
Power [object]
Emission standard [object]
Compression ratio [object]
Permitted trailer load with brakes (12%) [object]
Wheelbase [object]
Acceleration 0 - 100 km/h [object]
Maximum speed (CNG) [object]
Ramp-over (brakeover) angle [object]
Power (CNG) [object]
AdBlue tank [object]
CNG cylinder capacity [object]
Power (LPG) [object]
Number of cylinders [float64]
Drivetrain Architecture [object]
Permitted trailer load with brakes (8%) [object]
Fuel tank capacity (LPG) [object]
Torque [object]
System power [object]
Front track [object]
Fuel injection system [object]
Engine systems [o

In [29]:
df['Brand'] = df['Brand'].astype('category')
print(df['Brand'].cat.categories)

Index(['Acura', 'Alfa Romeo', 'Alpina', 'Aston Martin', 'Audi', 'BMW', 'BYD',
       'Bentley', 'Bugatti', 'Cadillac', 'Chevrolet', 'Chrysler', 'Citroen',
       'Cupra', 'DS', 'Dacia', 'Daewoo', 'Daihatsu', 'Dodge', 'Ferrari',
       'Fiat', 'Ford', 'GMC', 'Genesis', 'Great Wall', 'Haval', 'Honda',
       'Hongqi', 'Hummer', 'Hyundai', 'Infiniti', 'Jaguar', 'Jeep', 'Kia',
       'Koenigsegg', 'Lada', 'Lamborghini', 'Lancia', 'Land Rover', 'Lexus',
       'Lotus', 'MG', 'Maserati', 'Mazda', 'McLaren', 'Mercedes-Benz', 'Mini',
       'Mitsubishi', 'NIO', 'Nissan', 'Opel', 'Pagani', 'Peugeot', 'Porsche',
       'RAM', 'Renault', 'Rolls-Royce', 'Rover', 'Saab', 'Seat', 'Skoda',
       'Smart', 'Subaru', 'Suzuki', 'Tesla', 'Toyota', 'Vauxhall',
       'Volkswagen', 'Volvo'],
      dtype='object')


In [30]:
df['Model'] = df['Model'].astype('category')
print(df['Model'].cat.categories)

Index(['#1', '#3', '#5', '/8', '02', '1 Series', '10', '100', '100 NX', '1000',
       ...
       'iQ', 'iX', 'iX1', 'iX2', 'iX3', 'ix20', 'ix25', 'ix35', 'ix55',
       'nanuk quattro concept'],
      dtype='object', length=2110)


In [31]:
df['Generation'] = df['Generation'].astype('category')
print(df['Generation'].cat.categories)

Index(['#1', '#3', '#5', '/8 (W114)', '/8 (W114, facelift 1973)', '/8 (W115)',
       '/8 (W115, facelift 1973)', '/8 Coupe (W114)',
       '/8 Coupe (W114, facelift 1973)', '02 (E10)',
       ...
       'iX3 (G08, facelift 2021)', 'iX3 (NA5)', 'ix20', 'ix20 (facelift 2015)',
       'ix25', 'ix35', 'ix35 (Facelift 2013)', 'ix35 FCEV', 'ix55',
       'nanuk quattro concept'],
      dtype='object', length=7926)


In [32]:
dates = df['Start of production']
dates_clean = dates.str.replace('year', '', regex=False).str.strip()
dates_parsed = pd.to_datetime(dates_clean, format='mixed')
df['Start of production'] = dates_parsed

In [33]:
dates = df['End of production']
dates_clean = dates.str.replace('year', '', regex=False).str.strip()
dates_parsed = pd.to_datetime(dates_clean, format='mixed')
df['End of production'] = dates_parsed

In [34]:
df['Modification (Engine)'] = df['Modification (Engine)'].astype('category')
print(df['Modification (Engine)'].cat.categories)

Index(['(134 Hp) CVT', '(136 Hp) Fuel Cell Automatic',
       '(1360 Hp) 4WD Electric', '(174 Hp) Fuel Cell Automatic', '(226 Hp)',
       '(226 Hp) Electric', '(306 Hp) Electric', '(435 Hp) AWD',
       '(600 Hp) AWD', '0.4 (16 Hp)',
       ...
       'iV 1.5 TSI (204 Hp) Plug-in Hybrid DSG', 'iV 36.8 kWh (83 Hp)',
       'performance 100 kWh (326 Hp)', 'performance 100 kWh (381 Hp)',
       'performance 105 kWh (925 Hp) quattro',
       'performance 2.9 TFSI V6 (470 Hp) quattro tiptronic',
       'performance 4.0 TFSI V8 (630 Hp) Mild Hybrid quattro tiptronic',
       'performance 4.0 TFSI V8 (640 Hp) Mild Hybrid quattro tiptronic',
       'vRS 2.0 TSI (245 Hp)', 'vRS 2.0 TSI (245 Hp) DSG'],
      dtype='object', length=25269)


In [35]:
df['Powertrain Architecture'] = df['Powertrain Architecture'].astype('category')
print(df['Powertrain Architecture'].cat.categories)

Index(['BEV (Electric Vehicle)', 'FCEV (Fuel Cell Electric Vehicle)',
       'FHEV (Full Hybrid Electric Vehicle)', 'Internal Combustion engine',
       'MHEV (Mild Hybrid Electric Vehicle, power-assist hybrid, battery-assisted hybrid vehicles, BAHV)',
       'PFCEV (Plug-in Fuel Cell Electric Vehicle)',
       'PHEV (Plug-in Hybrid Electric Vehicle)'],
      dtype='object')


In [36]:
df['Body type'] = df['Body type'].astype('category')
print(df['Body type'].cat.categories)

Index(['CUV', 'Cabriolet', 'Cabriolet, Coupe', 'Cabriolet, Hatchback',
       'Cabriolet, SUV', 'Coupe', 'Coupe - Cabriolet',
       'Coupe - Cabriolet, Roadster', 'Coupe, CUV', 'Coupe, Crossover',
       'Coupe, Fastback', 'Coupe, Hatchback', 'Coupe, Liftback', 'Coupe, SUV',
       'Coupe, SUV, Crossover', 'Coupe, SUV, Fastback', 'Crossover',
       'Crossover, MPV', 'Fastback', 'Grand Tourer', 'Hatchback',
       'Hatchback, Crossover', 'Hatchback, Fastback', 'Liftback', 'MPV',
       'MPV, Van', 'Minivan', 'Minivan, Crossover', 'Minivan, MPV',
       'Off-road vehicle', 'Off-road vehicle, Cabriolet',
       'Off-road vehicle, Cabriolet, SUV', 'Off-road vehicle, Coupe',
       'Off-road vehicle, Pick-up', 'Off-road vehicle, SUV',
       'Off-road vehicle, Station wagon (estate)', 'Pick-up', 'Pick-up, Targa',
       'Quadricycle', 'Roadster', 'SAC', 'SAV', 'SUV', 'SUV, Crossover',
       'SUV, Crossover, Fastback', 'SUV, Fastback', 'SUV, MPV', 'SUV, Targa',
       'Sedan', 'Sedan, Cro

In [37]:
df['Fuel Type'] = df['Fuel Type'].astype('category')
print(df['Fuel Type'].cat.categories)

Index(['Diesel', 'Diesel / electricity', 'Electricity', 'Ethanol - E85',
       'Hydrogen', 'Hydrogen / electricity', 'LPG',
       'Mixture of two stroke engine', 'Petrol (Gasoline)', 'Petrol / CNG',
       'Petrol / Ethanol - E100', 'Petrol / Ethanol - E85',
       'Petrol / Ethanol - E85 / electricity', 'Petrol / LPG',
       'Petrol / electricity', 'Synthetic gasoline'],
      dtype='object')


In [38]:
df['Max. weight'] = df['Max. weight'].map(parse_number_or_range)
df['Max. weight'] = df['Max. weight'].fillna(0).astype('uint16')
df = df.rename(columns={'Max. weight': 'Max. weight (kg)'})
df['Max. weight (kg)']

0           0
1           0
2           0
3           0
4           0
         ... 
50079    2116
50080    1806
50081    1841
50082    1806
50083    1839
Name: Max. weight (kg), Length: 50084, dtype: uint16

In [39]:
df['Length'] = df['Length'].map(parse_number_or_range)
df['Length'] = df['Length'].fillna(0).astype('uint16')
df = df.rename(columns={'Length': 'Length (mm)'})
df['Length (mm)']

0           0
1           0
2           0
3           0
4           0
         ... 
50079    4200
50080    4359
50081    4359
50082    4359
50083    4359
Name: Length (mm), Length: 50084, dtype: uint16

In [40]:
df['Width'] = df['Width'].map(parse_number_or_range)
df['Width'] = df['Width'].fillna(0).astype('uint16')
df = df.rename(columns={'Width': 'Width (mm)'})
df['Width (mm)']

0           0
1           0
2           0
3           0
4           0
         ... 
50079    1768
50080    1814
50081    1814
50082    1814
50083    1814
Name: Width (mm), Length: 50084, dtype: uint16

In [41]:
df['Height'] = df['Height'].map(parse_number_or_range)
df['Height'] = df['Height'].fillna(0).astype('uint16')
df = df.rename(columns={'Height': 'Height (mm)'})
df['Height (mm)']

0           0
1           0
2           0
3           0
4           0
         ... 
50079    1505
50080    1447
50081    1447
50082    1447
50083    1447
Name: Height (mm), Length: 50084, dtype: uint16

In [42]:
df['Front suspension']
df['Front suspension'] = df['Front suspension'].astype('category')
print(df['Front suspension'].cat.categories)

Index(['Air suspension', 'Coil spring',
       'Coil spring, Air Suspension - Optional',
       'Coil spring, Double wishbone',
       'Coil spring, Double wishbone, Transverse stabilizer',
       'Coil spring, Double wishbone, Transverse stabilizer, Air Suspension - Optional',
       'Coil spring, Double wishbone, Wishbone',
       'Coil spring, Hydro-pneumatic element, Wishbone, Transverse stabilizer',
       'Coil spring, Independent coil spring, Wishbone',
       'Coil spring, Independent multi-link suspension',
       ...
       'Spring-loaded rack', 'Torsion', 'Trailing arm',
       'Trailing arm, Torsion', 'Transverse stabilizer',
       'Transverse stabilizer, Torsion', 'Wishbone',
       'Wishbone, Air suspension', 'Wishbone, Torsion',
       'Wishbone, Transverse stabilizer'],
      dtype='object', length=122)


In [43]:
df['Battery weight'] = df['Battery weight'].map(parse_number_or_range)
df['Battery weight'] = df['Battery weight'].fillna(0).astype('uint16')
df = df.rename(columns={'Battery weight': 'Battery weight (kg)'})
df['Battery weight (kg)']

0          0
1          0
2          0
3          0
4          0
        ... 
50079    394
50080      0
50081      0
50082      0
50083      0
Name: Battery weight (kg), Length: 50084, dtype: uint16

In [44]:
df['Engine aspiration']
df['Engine aspiration'] = df['Engine aspiration'].astype('category')
print(df['Engine aspiration'].cat.categories)

Index(['2 x Electric Assisted Turbocharger, Intercooler',
       '2 x Twin-Turbo, Intercooler', '2 x Twin-scroll turbo, Intercooler',
       '4 Turbochargers, Intercooler', 'BiTurbo', 'BiTurbo, Intercooler',
       'Electric Assisted Turbocharger, Intercooler',
       'Naturally aspirated engine', 'Naturally aspirated engine, Intercooler',
       'Supercharger', 'Supercharger, Intercooler', 'Turbocharger',
       'Turbocharger and Electric Powered Compressor, Intercooler',
       'Turbocharger, Intercooler',
       'Turbocharging and Supercharger, Intercooler', 'Twin-Turbo',
       'Twin-Turbo and Electric Powered Compressor, Intercooler',
       'Twin-Turbo, Intercooler', 'Twin-power turbo, Intercooler',
       'Twin-scroll turbo, Intercooler'],
      dtype='object')


In [45]:
#df[df['Recuperation output'].notna()]['Recuperation output'].value_counts()
df['Recuperation output'] = df['Recuperation output'].map(parse_number_or_range)
df['Recuperation output'] = df['Recuperation output'].fillna(0).astype('uint16')
df = df.rename(columns={'Recuperation output': 'Recuperation output (kW)'})
df['Recuperation output (kW)']

0        0
1        0
2        0
3        0
4        0
        ..
50079    0
50080    0
50081    0
50082    0
50083    0
Name: Recuperation output (kW), Length: 50084, dtype: uint16

In [46]:
# kWh
df['Gross battery capacity'] = df['Gross battery capacity'].map(parse_number_or_range)
df['Gross battery capacity'] = df['Gross battery capacity'].astype('float16')
df = df.rename(columns={'Gross battery capacity': 'Gross battery capacity (kWh)'})

df[df['Gross battery capacity (kWh)'].notna()]['Gross battery capacity (kWh)']

  has_large_values = (abs_vals > 1e6).any()


8        80.000000
9        47.312500
10       35.187500
17       33.187500
21       44.906250
           ...    
49483    44.093750
49500    25.906250
49615    27.500000
49634    22.000000
50014    10.460938
Name: Gross battery capacity (kWh), Length: 2461, dtype: float16

In [47]:
# Hp
df['Power'] = df['Power'].map(parse_number_or_range)
df['Power'] = df['Power'] * 0.7355
df['Power'] = df['Power'].astype('float16')
df = df.rename(columns={'Power': 'Power (kW)'})
df['Power (kW)']

  has_large_values = (abs_vals > 1e6).any()


0        180.2500
1        183.8750
2        180.2500
3        186.8750
4        153.7500
           ...   
50079         NaN
50080     73.5625
50081     66.1875
50082     95.6250
50083    117.6875
Name: Power (kW), Length: 50084, dtype: float16

In [48]:
df['Emission standard'] = df['Emission standard'].astype('category')
print(df['Emission standard'].cat.categories)

Index(['AT-PZEV', 'BIN 125', 'BS 6', 'BS VI', 'BS VI 2.0', 'BS VI Phase 2',
       'BS-IV', 'California LEV III', 'China 6', 'China 6b+RDE',
       ...
       'ULEV2', 'ULEV50', 'ULEV70', 'ULEV70 SULEV30', 'WCC', 'WCC + UCC',
       'WCC + UCC / SULEV 30', 'WCC+UCC', 'ZEV', 'euro 4'],
      dtype='object', length=294)


In [49]:
df['Compression ratio'] = df['Compression ratio'].astype('category')
print(df['Compression ratio'].cat.categories)

Index(['10.01:1', '10.1:1', '10.25:1', '10.2:1', '10.3:1', '10.478:1',
       '10.4:1', '10.55:1', '10.5:1', '10.67:1',
       ...
       '9.65:1', '9.6:1', '9.75:1', '9.7:1', '9.85:1', '9.8:1', '9.91:1',
       '9.9:1', '90:1', '9:1'],
      dtype='object', length=203)


In [50]:
df['Permitted trailer load with brakes (12%)'].value_counts()
df['Permitted trailer load with brakes (12%)'] = df['Permitted trailer load with brakes (12%)'].map(parse_number_or_range)
df['Permitted trailer load with brakes (12%)'] = df['Permitted trailer load with brakes (12%)'].fillna(0).astype('uint16')
df = df.rename(columns={'Permitted trailer load with brakes (12%)': 'Permitted trailer load with brakes (12%) (kg)'})
df['Permitted trailer load with brakes (12%) (kg)']

0           0
1           0
2           0
3           0
4           0
         ... 
50079     500
50080    1300
50081    1300
50082    1300
50083    1650
Name: Permitted trailer load with brakes (12%) (kg), Length: 50084, dtype: uint16

In [51]:
df['Wheelbase']
df['Wheelbase'] = df['Wheelbase'].map(parse_number_or_range)
df['Wheelbase'] = df['Wheelbase'].fillna(0).astype('uint16')
df = df.rename(columns={'Wheelbase': 'Wheelbase (mm)'})
df['Wheelbase (mm)']

0           0
1           0
2           0
3           0
4           0
         ... 
50079    2685
50080    2669
50081    2669
50082    2669
50083    2669
Name: Wheelbase (mm), Length: 50084, dtype: uint16

In [52]:
df['Acceleration 0 - 100 km/h']
df['Acceleration 0 - 100 km/h'] = df['Acceleration 0 - 100 km/h'].map(parse_number_or_range)
df['Acceleration 0 - 100 km/h'] = df['Acceleration 0 - 100 km/h'].astype('float16')
df['Acceleration 0 - 100 km/h']

  has_large_values = (abs_vals > 1e6).any()


0              NaN
1              NaN
2              NaN
3              NaN
4         6.898438
           ...    
50079    10.500000
50080    12.296875
50081    13.398438
50082    10.601562
50083     8.203125
Name: Acceleration 0 - 100 km/h, Length: 50084, dtype: float16

In [53]:
df['Ramp-over (brakeover) angle'] = df['Ramp-over (brakeover) angle'].map(parse_number_or_range)
df['Ramp-over (brakeover) angle'] = df['Ramp-over (brakeover) angle'].astype('float16')
df[df['Ramp-over (brakeover) angle'].notna()]['Ramp-over (brakeover) angle']

  has_large_values = (abs_vals > 1e6).any()


54       19.406250
219      17.000000
2298     14.796875
2300     14.796875
2380     15.898438
           ...    
49195    20.000000
49200    20.000000
49216    20.000000
49229    20.000000
49235    20.000000
Name: Ramp-over (brakeover) angle, Length: 5361, dtype: float16

In [54]:
df['AdBlue tank'] = df['AdBlue tank'].map(parse_number_or_range)
df['AdBlue tank'] = df['AdBlue tank'].astype('float16')
df = df.rename(columns={'AdBlue tank': 'AdBlue tank (l)'})
df[df['AdBlue tank (l)'].notna()]['AdBlue tank (l)']

  has_large_values = (abs_vals > 1e6).any()


54       20.0
395      18.5
408      18.5
418      18.5
956      21.5
         ... 
50063    16.0
50064    16.0
50070    16.0
50071    16.0
50074    16.0
Name: AdBlue tank (l), Length: 1859, dtype: float16

In [55]:
df['Number of cylinders'] = df['Number of cylinders'].fillna(0).astype('uint8')
df['Number of cylinders']

0        0
1        0
2        0
3        0
4        6
        ..
50079    0
50080    4
50081    4
50082    4
50083    4
Name: Number of cylinders, Length: 50084, dtype: uint8

In [56]:
df['Drivetrain Architecture'] = df['Drivetrain Architecture'].astype('category')
print(df['Drivetrain Architecture'].cat.categories)

Index(['An Internal combustion engine (ICE) drives the front wheels, one electric motor drives the front wheels, one electric motor drives the rear wheels. There is an ability for running in full electric or mixed mode.',
       'An Internal combustion engine (ICE) drives the front wheels, one electric motor drives the front wheels, one electric motor drives the rear wheels. There is an ability for running in full electric or mixed mode. There are parallel and serial hybrid modes.',
       'An Internal combustion engine (ICE) drives the rear wheels, one electric motor drives the front wheels, one electric motor drives the rear wheels. There is an ability for running in full electric or mixed mode.',
       'Four electric motors drive each wheel individually.',
       'One electric motor drives the front wheels, one electric motor drives the rear wheels.',
       'One electric motor drives the front wheels.',
       'One electric motor drives the front wheels. One electric motor drives 

In [57]:
df['Permitted trailer load with brakes (8%)']
df['Permitted trailer load with brakes (8%)'] = df['Permitted trailer load with brakes (8%)'].map(parse_number_or_range)
df['Permitted trailer load with brakes (8%)'] = df['Permitted trailer load with brakes (8%)'].fillna(0).astype('uint16')
df = df.rename(columns={'Permitted trailer load with brakes (8%)': 'Permitted trailer load with brakes (8%) (kg)'})
df['Permitted trailer load with brakes (8%) (kg)']

0        0
1        0
2        0
3        0
4        0
        ..
50079    0
50080    0
50081    0
50082    0
50083    0
Name: Permitted trailer load with brakes (8%) (kg), Length: 50084, dtype: uint16

In [58]:
df['Torque'] = df['Torque'].map(parse_number_or_range)
df['Torque'] = df['Torque'].fillna(0).astype('uint16')
df = df.rename(columns={'Torque': 'Torque (Nm)'})
df['Torque (Nm)']

0          0
1          0
2          0
3          0
4        267
        ... 
50079      0
50080    175
50081    220
50082    205
50083    270
Name: Torque (Nm), Length: 50084, dtype: uint16

In [59]:
df['System power']
df['System power'] = df['System power'].map(parse_number_or_range)
df['System power'] = df['System power'] * 0.7355
df['System power'] = df['System power'].astype('float16')
df = df.rename(columns={'System power': 'System power (kW)'})
df['System power (kW)']

  has_large_values = (abs_vals > 1e6).any()


0           NaN
1           NaN
2           NaN
3           NaN
4           NaN
          ...  
50079    95.625
50080       NaN
50081       NaN
50082       NaN
50083       NaN
Name: System power (kW), Length: 50084, dtype: float16

In [60]:
df['Front track']
df['Front track'] = df['Front track'].map(parse_number_or_range)
df['Front track'] = df['Front track'].fillna(0).astype('uint16')
df = df.rename(columns={'Front track': 'Front track (mm)'})
df['Front track (mm)']

0           0
1           0
2           0
3           0
4           0
         ... 
50079       0
50080    1591
50081    1591
50082    1591
50083    1591
Name: Front track (mm), Length: 50084, dtype: uint16

In [61]:
df['Fuel injection system'] = df['Fuel injection system'].astype('category')
print(df['Fuel injection system'].cat.categories)

Index(['Carburettor', 'Diesel Commonrail', 'Direct injection',
       'Direct injection and Multi-port manifold injection',
       'Dual-point throttle body fuel injection',
       'Indirect injection with two injectors on each intake port',
       'Multi-port manifold injection', 'Precombustion chamber injection',
       'Pump-nozzle (Unit Injector)', 'Single-point injection'],
      dtype='object')


In [62]:
df['Engine systems'] = df['Engine systems'].astype('category')
print(df['Engine systems'].cat.categories)

Index(['Cylinder deactivation system', 'Particulate filter',
       'Start & Stop System',
       'Start & Stop SystemCylinder deactivation systemParticulate filter',
       'Start & Stop SystemParticulate filter'],
      dtype='object')


In [63]:
df['Battery voltage'] = df['Battery voltage'].map(parse_number_or_range)
df['Battery voltage'] = df['Battery voltage'].fillna(0).astype('uint16')
df = df.rename(columns={'Battery voltage': 'Battery voltage (V)'})
df['Battery voltage (V)']

0          0
1          0
2          0
3          0
4          0
        ... 
50079    400
50080      0
50081      0
50082      0
50083      0
Name: Battery voltage (V), Length: 50084, dtype: uint16

In [64]:
df['Rear suspension'] = df['Rear suspension'].astype('category')
print(df['Rear suspension'].cat.categories)

Index(['Air suspension', 'Air suspension, Trailing arm',
       'Air suspension, Transverse stabilizer',
       'Air suspension, Transverse stabilizer, Leaf spring',
       'Air suspension, Transverse stabilizer, Trailing arm', 'Coil spring',
       'Coil spring, Air Suspension - Optional', 'Coil spring, Elastic beam',
       'Coil spring, Torsion',
       'Dependent spring suspension with transverse stabilizer',
       ...
       'Wishbone, Transverse stabilizer, Trailing arm',
       'Wishbone, Transverse stabilizer, Trailing arm, Coil spring',
       'dependent spring suspension',
       'dependent spring suspension, Leaf spring',
       'dependent spring suspension, Trailing arm',
       'dependent spring suspension, Transverse stabilizer',
       'dependent spring suspension, Transverse stabilizer, Trailing arm',
       'independent torsion suspension',
       'independent torsion suspension, Trailing arm',
       'independent torsion suspension, Wishbone'],
      dtype='object', 

In [65]:
df['Max. roof load']
df['Max. roof load'] = df['Max. roof load'].map(parse_number_or_range)
df['Max. roof load'] = df['Max. roof load'].fillna(0).astype('uint16')
df = df.rename(columns={'Max. roof load': 'Max. roof load (kg)'})
df['Max. roof load (kg)']

0         0
1         0
2         0
3         0
4         0
         ..
50079     0
50080     0
50081     0
50082     0
50083    80
Name: Max. roof load (kg), Length: 50084, dtype: uint16

In [66]:
df['Piston Stroke']
df['Piston Stroke'] = df['Piston Stroke'].map(parse_number_or_range)
df['Piston Stroke'] = df['Piston Stroke'].astype('float16')
df = df.rename(columns={'Piston Stroke': 'Piston Stroke (mm)'})
df['Piston Stroke (mm)']

  has_large_values = (abs_vals > 1e6).any()


0           NaN
1           NaN
2           NaN
3           NaN
4           NaN
          ...  
50079       NaN
50080    73.125
50081    80.500
50082    73.125
50083       NaN
Name: Piston Stroke (mm), Length: 50084, dtype: float16

In [67]:
df['Max load']
df['Max load'] = df['Max load'].map(parse_number_or_range)
df['Max load'] = df['Max load'].fillna(0).astype('uint16')
df = df.rename(columns={'Max load': 'Max load (kg)'})
df['Max load (kg)']

0          0
1          0
2          0
3          0
4          0
        ... 
50079    492
50080    601
50081    636
50082    601
50083    560
Name: Max load (kg), Length: 50084, dtype: uint16

In [68]:
df['Engine configuration'] = df['Engine configuration'].astype('category')
print(df['Engine configuration'].cat.categories)

Index(['180° flat V-engine', 'Boxer', 'Inline', 'Rotary (Wankel)', 'V-engine',
       'VR-engine', 'W-engine'],
      dtype='object')


In [69]:
df['Engine layout'] = df['Engine layout'].astype('category')
print(df['Engine layout'].cat.categories)

Index(['Front axle, Longitudinal', 'Front, Longitudinal', 'Front, Transverse',
       'Middle, Longitudinal', 'Middle, Transverse', 'Rear axle, Longitudinal',
       'Rear, Longitudinal', 'Rear, Transverse'],
      dtype='object')


In [70]:
df['Maximum speed (CNG)'] = df['Maximum speed (CNG)'].map(parse_number_or_range)
df['Maximum speed (CNG)'] = df['Maximum speed (CNG)'].fillna(0).astype('uint8')
df = df.rename(columns={'Maximum speed (CNG)': 'Maximum speed (CNG) (km/h)'})
df['Maximum speed (CNG) (km/h)']

0        0
1        0
2        0
3        0
4        0
        ..
50079    0
50080    0
50081    0
50082    0
50083    0
Name: Maximum speed (CNG) (km/h), Length: 50084, dtype: uint8

In [71]:
df['Power (CNG)'] = df['Power (CNG)'].map(parse_number_or_range)
df['Power (CNG)'] = df['Power (CNG)'] * 0.7355
df['Power (CNG)'] = df['Power (CNG)'].astype('float16')
df = df.rename(columns={'Power (CNG)': 'Power (CNG) (kW)'})
df['Power (CNG) (kW)']

  has_large_values = (abs_vals > 1e6).any()


0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
         ..
50079   NaN
50080   NaN
50081   NaN
50082   NaN
50083   NaN
Name: Power (CNG) (kW), Length: 50084, dtype: float16

In [72]:
df['CNG cylinder capacity'] = df['CNG cylinder capacity'].map(parse_number_or_range)
df['CNG cylinder capacity'] = df['CNG cylinder capacity'].astype('float16')
df = df.rename(columns={'CNG cylinder capacity': 'CNG cylinder capacity (kg)'})
df[df['CNG cylinder capacity (kg)'].notna()]['CNG cylinder capacity (kg)']

  has_large_values = (abs_vals > 1e6).any()


2686     17.296875
2879     19.000000
2880     19.000000
3210     17.296875
3222     17.296875
           ...    
43857    13.398438
43945    25.000000
43969    25.000000
44306    25.000000
44312    25.000000
Name: CNG cylinder capacity (kg), Length: 146, dtype: float16

In [73]:
df['Power (LPG)'] = df['Power (LPG)'].map(parse_number_or_range)
df['Power (LPG)'] = df['Power (LPG)'] * 0.7355
df['Power (LPG)'] = df['Power (LPG)'].astype('float16')
df = df.rename(columns={'Power (LPG)': 'Power (LPG) (kW)'})
df[df['Power (LPG) (kW)'].notna()]['Power (LPG) (kW)']

  has_large_values = (abs_vals > 1e6).any()


10535     72.06250
13543     72.06250
13558     72.06250
14153     60.31250
16454     72.06250
19985    103.00000
19986    103.00000
20539     77.93750
24490     58.84375
24491     58.84375
24586     49.28125
24638     74.31250
24644     74.31250
24772     74.31250
24798     74.31250
24803     75.00000
24818     73.56250
24898     80.18750
25058     80.18750
25081     74.31250
25662     67.68750
26322     61.78125
26328     73.56250
26331     60.31250
26352     60.31250
26354     73.56250
26579     74.31250
26781     60.31250
26847     52.96875
26857     74.31250
26870     74.31250
26872     74.31250
27066     52.96875
40074     74.31250
41461     61.03125
41470     64.00000
41473     57.37500
41496     64.00000
41500     61.03125
41502     57.37500
42260     53.68750
43044     83.87500
43476     82.37500
48180     73.56250
48511     74.31250
48723     72.06250
Name: Power (LPG) (kW), dtype: float16

In [74]:
df[df['Fuel tank capacity (LPG)'].notna()]['Fuel tank capacity (LPG)']
df['Fuel tank capacity (LPG)'] = df['Fuel tank capacity (LPG)'].map(parse_number_or_range)
df['Fuel tank capacity (LPG)'] = df['Fuel tank capacity (LPG)'].astype('float16')
df = df.rename(columns={'Fuel tank capacity (LPG)': 'Fuel tank capacity (LPG) (l)'})
df[df['Fuel tank capacity (LPG) (l)'].notna()]['Fuel tank capacity (LPG) (l)']

  has_large_values = (abs_vals > 1e6).any()


6342     38.0000
8524     49.0000
8605     49.0000
9074     52.8125
9375     52.8125
          ...   
48180    50.0000
48438    32.0000
48511    32.0000
48723    24.0000
49324    40.0000
Name: Fuel tank capacity (LPG) (l), Length: 96, dtype: float16

In [75]:
df['Width including mirrors'] = df['Width including mirrors'].map(parse_number_or_range)
df['Width including mirrors'] = df['Width including mirrors'].fillna(0).astype('uint16')
df = df.rename(columns={'Width including mirrors': 'Width including mirrors (mm)'})
df['Width including mirrors (mm)']

0           0
1           0
2           0
3           0
4           0
         ... 
50079    2055
50080    2058
50081    2058
50082    2058
50083    2058
Name: Width including mirrors (mm), Length: 50084, dtype: uint16

In [76]:
df[df['Wading depth'].notna()]['Wading depth']
df['Wading depth'] = df['Wading depth'].map(parse_number_or_range)
df['Wading depth'] = df['Wading depth'].fillna(0).astype('uint16')
df = df.rename(columns={'Wading depth': 'Wading depth (mm)'})
df[df['Wading depth (mm)']>0]['Wading depth (mm)']

54       500
219      700
2298     500
2300     500
2380     400
        ... 
47832    450
47839    450
47841    450
47843    450
48387    813
Name: Wading depth (mm), Length: 2056, dtype: uint16

In [77]:
df[df['100 km/h - 0'].notna()]['100 km/h - 0']
df['100 km/h - 0'] = df['100 km/h - 0'].map(parse_number_or_range)
df['100 km/h - 0'] = df['100 km/h - 0'].astype('float16')
df = df.rename(columns={'100 km/h - 0': '100 km/h - 0 (m)'})
df[df['100 km/h - 0 (m)'].notna()]['100 km/h - 0 (m)']

  has_large_values = (abs_vals > 1e6).any()


6043     33.00000
6211     33.00000
6212     33.00000
6240     36.00000
7336     38.00000
           ...   
44561    33.90625
44567    37.00000
44571    34.50000
44574    35.00000
48321    37.09375
Name: 100 km/h - 0 (m), Length: 553, dtype: float16

In [78]:
df['Assisting systems'] = df['Assisting systems'].astype('category')
print(df['Assisting systems'].cat.categories)

Index(['4-wheel steering (4WS, active rear steering)',
       'ABS (Anti-lock braking system)',
       'ABS (Anti-lock braking system)4-wheel steering (4WS, active rear steering)'],
      dtype='object')


In [79]:
df['Drive wheel'] = df['Drive wheel'].astype('category')
print(df['Drive wheel'].cat.categories)

Index(['All wheel drive (4x4)', 'Front wheel drive', 'Rear wheel drive'], dtype='object')


In [80]:
df['Approach angle'] = df['Approach angle'].map(parse_number_or_range)
df['Approach angle'] = df['Approach angle'].astype('float16')
df[df['Approach angle'].notna()]['Approach angle']

  has_large_values = (abs_vals > 1e6).any()


8        21.000000
21       21.000000
54       25.703125
219      31.000000
231      20.000000
           ...    
49195    18.000000
49200    18.000000
49216    18.000000
49229    18.000000
49235    18.000000
Name: Approach angle, Length: 7221, dtype: float16

In [81]:
df[df['Fuel tank capacity'].notna()]['Fuel tank capacity']
df['Fuel tank capacity'] = df['Fuel tank capacity'].map(parse_number_or_range)
df['Fuel tank capacity'] = df['Fuel tank capacity'].astype('float16')
df = df.rename(columns={'Fuel tank capacity': 'Fuel tank capacity (l)'})
df[df['Fuel tank capacity (l)'].notna()]['Fuel tank capacity (l)']

  has_large_values = (abs_vals > 1e6).any()


13       95.0
23       90.0
24       95.0
25       73.0
30       50.0
         ... 
50078    47.0
50080    47.0
50081    47.0
50082    47.0
50083    50.0
Name: Fuel tank capacity (l), Length: 45142, dtype: float16

In [82]:
df['Rear (Back) track'].value_counts()
df['Rear (Back) track'] = df['Rear (Back) track'].map(parse_number_or_range)
df['Rear (Back) track'] = df['Rear (Back) track'].fillna(0).astype('uint16')
df = df.rename(columns={'Rear (Back) track': 'Rear (Back) track (mm)'})
df[df['Rear (Back) track (mm)']>0]['Rear (Back) track (mm)']

8        1560
9        1470
10       1470
11       1551
12       1551
         ... 
50078    1586
50080    1586
50081    1586
50082    1586
50083    1586
Name: Rear (Back) track (mm), Length: 41342, dtype: uint16

In [83]:
df['Steering type'] = df['Steering type'].astype('category')
print(df['Steering type'].cat.categories)

Index(['Cone worm with recirculation balls', 'Steering rack and pinion',
       'Worm-reduction unit'],
      dtype='object')


In [84]:
df[df['Maximum speed (LPG)'].notna()]['Maximum speed (LPG)'].max()
df['Maximum speed (LPG)'] = df['Maximum speed (LPG)'].map(parse_number_or_range)
df['Maximum speed (LPG)'] = df['Maximum speed (LPG)'].fillna(0).astype('uint8')
df = df.rename(columns={'Maximum speed (LPG)': 'Maximum speed (LPG) (km/h)'})
df[df['Maximum speed (LPG) (km/h)']>0]['Maximum speed (LPG) (km/h)']

8524     178
8605     178
10535    181
13543    163
13558    163
14153    174
16454    185
20539    183
24490    158
24491    158
24638    175
24644    175
24772    175
24798    175
24803    162
24898    167
25058    169
25081    168
26322    165
26354    186
26857    183
26870    177
26872    177
27066    158
34590    155
41461    155
41470    170
41473    166
41496    170
41500    155
41502    166
42229    187
43044    189
48180    168
49324    184
Name: Maximum speed (LPG) (km/h), dtype: uint8

In [85]:
df['Rear overhang'].value_counts()
df['Rear overhang'] = df['Rear overhang'].map(parse_number_or_range)
df['Rear overhang'] = df['Rear overhang'].fillna(0).astype('uint16')
df = df.rename(columns={'Rear overhang': 'Rear overhang (mm)'})
df[df['Rear overhang (mm)']>0]['Rear overhang (mm)']

8         810
21        910
50       1043
63       1043
90       1098
         ... 
50079     715
50080     771
50081     771
50082     771
50083     771
Name: Rear overhang (mm), Length: 14340, dtype: uint16

In [86]:
df['Acceleration 0 - 100 km/h (LPG)'] = df['Acceleration 0 - 100 km/h (LPG)'].map(parse_number_or_range)
df['Acceleration 0 - 100 km/h (LPG)'] = df['Acceleration 0 - 100 km/h (LPG)'].astype('float16')
df[df['Acceleration 0 - 100 km/h (LPG)'].notna()]['Acceleration 0 - 100 km/h (LPG)']

  has_large_values = (abs_vals > 1e6).any()


6342     10.296875
8524     13.796875
8605     14.000000
10535    12.601562
13543    13.898438
13558    14.500000
14153    13.398438
16454    12.101562
19985    11.500000
19986    10.601562
20539    13.398438
24484    12.000000
24487    12.000000
24490    15.101562
24491    15.101562
24638    13.000000
24644    12.296875
24772    12.296875
24798    13.000000
24803    12.796875
24863    10.000000
24898    11.398438
25058    16.593750
25081    13.203125
26322    13.648438
26331    15.296875
26352    15.296875
26847    15.101562
27066    15.203125
31403    10.500000
40074    12.500000
41221    13.203125
41461    14.101562
41470    12.601562
41473    14.101562
41496    12.601562
41500    14.101562
41502    14.101562
42142    11.796875
42229    11.796875
43044    12.203125
48180    13.203125
48511    11.796875
Name: Acceleration 0 - 100 km/h (LPG), dtype: float16

In [87]:
df['Front overhang']
df['Front overhang'] = df['Front overhang'].map(parse_number_or_range)
df['Front overhang'] = df['Front overhang'].fillna(0).astype('uint16')
df = df.rename(columns={'Front overhang': 'Front overhang (mm)'})
df[df['Front overhang (mm)']>0]['Front overhang (mm)']

8        920
21       825
50       787
63       787
90       941
        ... 
50079    800
50080    919
50081    919
50082    919
50083    919
Name: Front overhang (mm), Length: 14357, dtype: uint16

In [88]:
df[df['Net (usable) battery capacity'].notna()]['Net (usable) battery capacity']
df['Net (usable) battery capacity'] = df['Net (usable) battery capacity'].map(parse_number_or_range)
df['Net (usable) battery capacity'] = df['Net (usable) battery capacity'].astype('float16')
df = df.rename(columns={'Net (usable) battery capacity': 'Net (usable) battery capacity (kWh)'})
df[df['Net (usable) battery capacity (kWh)'].notna()]['Net (usable) battery capacity (kWh)']

  has_large_values = (abs_vals > 1e6).any()


679      18.796875
687      18.796875
689      37.906250
690      27.203125
691      66.125000
           ...    
49634    21.296875
50014     9.796875
50031    40.000000
50059    60.000000
50079    60.000000
Name: Net (usable) battery capacity (kWh), Length: 930, dtype: float16

In [89]:
#Trunk (boot) space - maximum [object]
#Trunk (boot) space - minimum [object]

df['Trunk (boot) space - maximum'] = df['Trunk (boot) space - maximum'].map(parse_number_or_range)
df['Trunk (boot) space - maximum'] = df['Trunk (boot) space - maximum'].fillna(0).astype('uint16')
df['Trunk (boot) space - minimum'] = df['Trunk (boot) space - minimum'].map(parse_number_or_range)
df['Trunk (boot) space - minimum'] = df['Trunk (boot) space - minimum'].fillna(0).astype('uint16')
df = df.rename(columns={
    'Trunk (boot) space - maximum': 'Trunk (boot) space - maximum (l)',
    'Trunk (boot) space - minimum': 'Trunk (boot) space - minimum (l)'
})

In [90]:
df['Engine Model/Code'] = df['Engine Model/Code'].astype('category')
print(df['Engine Model/Code'].cat.categories)

Index(['005.15', '005.26', '015.44, 015.88, 015.90', '015.63', '016.00',
       '016.00, 016.08', '016.23, 016.55', '016.44', '016.55', '016.78',
       ...
       'ZTGA', 'ZTJD', 'ZY66, ZY84', 'Zetec', 'Zetec / C20HDEZ',
       'Zetec / L1L, L1N, L1Q', 'Zetec / NGA, NGB, NGC, NGD',
       'Zetec / RKB, RKF, RKH, RKJ, RKK', 'e-Skyactiv', 'e-Skyactiv G'],
      dtype='object', length=7470)


In [91]:
df['Torque (CNG)'] = df['Torque (CNG)'].map(parse_number_or_range)
df['Torque (CNG)'] = df['Torque (CNG)'].fillna(0).astype('uint16')
df = df.rename(columns={'Torque (CNG)': 'Torque (CNG) (Nm)'})
df[df['Torque (CNG) (Nm)'] > 0]['Torque (CNG) (Nm)']

7830     200
8151     200
8153     200
8704     295
11215    200
11252    200
11257    200
11262    200
20005    192
20013    192
21014    192
21027    192
21061    192
23720     95
24481     98
29207    140
29222    140
29378    104
29475    104
29606    130
29696    104
29811    104
29828    104
31721    135
Name: Torque (CNG) (Nm), dtype: uint16

In [92]:
df['Power steering'] = df['Power steering'].astype('category')
print(df['Power steering'].cat.categories)

Index(['Electric Steering', 'Hydraulic Steering'], dtype='object')


In [93]:
df['Battery location'] = df['Battery location'].astype('category')
print(df['Battery location'].cat.categories)

Index(['Behind the back wall of the passenger cabin', 'Below the floor',
       'Below the floor, between front and rear seats',
       'Below the floor, under the front seats',
       'Below the floor, under the front seatsBelow the floor, between front and rear seats',
       'Below the floor, under the rear seats',
       'Below the floor, under the rear seatsBelow the floor, between front and rear seats',
       'Below the floor, under the rear seatsBelow the floor, under the front seats',
       'Below the floor, under the rear seatsBelow the floor, under the front seatsIn the central tunnel',
       'Below the floor, under the rear seatsInside the trunk',
       'Below the floorBelow the floor, under the rear seats',
       'Below the floorUnder the trunk', 'In the central tunnel',
       'In the central tunnelBehind the back wall of the passenger cabin',
       'Inside the trunk', 'Inside the trunkIn the central tunnel',
       'Inside the trunkUnder the front hood', 'Under the 

In [94]:
df[df['Width with mirrors folded'].notna()]['Width with mirrors folded']
df['Width with mirrors folded'] = df['Width with mirrors folded'].map(parse_number_or_range)
df['Width with mirrors folded'] = df['Width with mirrors folded'].fillna(0).astype('uint16')
df = df.rename(columns={'Width with mirrors folded': 'Width with mirrors folded (mm)'})
df[df['Width with mirrors folded (mm)']>0]['Width with mirrors folded (mm)']

1436     1989
1446     1989
1447     1989
1452     1989
1453     1989
         ... 
49659    1890
49660    1890
49661    1890
49662    1890
49676    1890
Name: Width with mirrors folded (mm), Length: 3039, dtype: uint16

In [95]:
df[df['Cylinder Bore'].notna()]['Cylinder Bore']

df['Cylinder Bore'] = df['Cylinder Bore'].map(parse_number_or_range)
df['Cylinder Bore'] = df['Cylinder Bore'].astype('float16')
df = df.rename(columns={'Cylinder Bore': 'Cylinder Bore (mm)'})
df[df['Cylinder Bore (mm)']>0]['Cylinder Bore (mm)']

  has_large_values = (abs_vals > 1e6).any()


13       86.0000
23       84.0000
24       89.0000
27       84.0000
32       93.0000
          ...   
50077    80.0000
50078    72.1875
50080    72.1875
50081    76.0000
50082    72.1875
Name: Cylinder Bore (mm), Length: 39076, dtype: float16

In [96]:
df['Valvetrain'] = df['Valvetrain'].astype('category')
print(df['Valvetrain'].cat.categories)

Index(['B4164T4', 'CAMTRONIC', 'CIH', 'CVCTS', 'CVTC', 'CVTCS', 'CVTCS, DOHC',
       'CVTCS,DOHC', 'CVVD', 'CVVL',
       ...
       'dual C-VTC', 'e-VGT', 'i-DTEC', 'i-VCT', 'i-VTEC', 'i-VVT',
       'iVCT, DOHC', 'iVCT, DOHC, Atkinson cycle', 'ohc', 'ohv'],
      dtype='object', length=212)


In [97]:
df['Wheel rims size'] = df['Wheel rims size'].astype('category')
print(df['Wheel rims size'].cat.categories)

Index(['10.0 J x 19; 12.0 J x 20', '10.0J x 20; 11.5J x 20', '10.5 J x 22',
       '10.5J x 19; 11J x 19', '10.5J x 21', '10J x 20',
       '10J x 20; 10.5J x 20', '10J x 20; 10.5J x 21; 10.5J x 22',
       '10J x 20; 11J x 20', '10J x 20; 13.5J x 21',
       ...
       'Front wheel rims: 9J x 20Rear wheel rims: 10.5J x 20',
       'Front wheel rims: 9J x 20Rear wheel rims: 11.5J x 21',
       'Front wheel rims: 9J x 20Rear wheel rims: 11J x 20',
       'Front wheel rims: 9J x 20Rear wheel rims: 12.5J x 21',
       'Front wheel rims: 9J x 21', 'Front wheel rims: 9J x 21; 9.5J x 21',
       'Front wheel rims: 9J x 21Rear wheel rims: 11.5J x 21',
       'Front wheel rims: R14', 'R14', 'R15'],
      dtype='object', length=1700)


In [98]:
df['Front brakes'] = df['Front brakes'].astype('category')
print(df['Front brakes'].cat.categories)

Index(['Disc', 'Disc, 209.55 mm', 'Disc, 21.8 mm', 'Disc, 227 mm',
       'Disc, 228 mm', 'Disc, 228.6 mm', 'Disc, 231 mm', 'Disc, 232 mm',
       'Disc, 235 mm', 'Disc, 236 mm',
       ...
       'Ventilated discs, 415 mm', 'Ventilated discs, 415x33 mm',
       'Ventilated discs, 415x40 mm', 'Ventilated discs, 420 mm',
       'Ventilated discs, 420-440 mm', 'Ventilated discs, 420x40 mm',
       'Ventilated discs, 431.8 mm', 'Ventilated discs, 440 mm',
       'Ventilated discs, 440x40 mm', 'Ventilated discs, 457.2 mm'],
      dtype='object', length=520)


In [99]:
df['Permitted towbar download'] = df['Permitted towbar download'].map(parse_number_or_range)
df['Permitted towbar download'] = df['Permitted towbar download'].fillna(0).astype('uint16')
df = df.rename(columns={'Permitted towbar download': 'Permitted towbar download (kg)'})
df[df['Permitted towbar download (kg)']>0]['Permitted towbar download (kg)']

54       100
62        75
119       75
318       75
354       75
        ... 
50073     75
50074     75
50075     75
50076     75
50083     75
Name: Permitted towbar download (kg), Length: 12069, dtype: uint16

In [100]:
df['Seats'] = df['Seats'].fillna("0").astype(str).str.extract(r"(\d+)(?:-(\d+))?").fillna(method="ffill", axis=1).iloc[:, 1].astype("uint8") 
df[df['Seats']>0]['Seats']

  df['Seats'] = df['Seats'].fillna("0").astype(str).str.extract(r"(\d+)(?:-(\d+))?").fillna(method="ffill", axis=1).iloc[:, 1].astype("uint8")


4        5
8        5
9        5
10       5
11       5
        ..
50079    5
50080    5
50081    5
50082    5
50083    5
Name: Seats, Length: 48513, dtype: uint8

In [101]:
df['Coolant'] = df['Coolant'].map(parse_number_or_range)
df['Coolant'] = df['Coolant'].astype('float16')
df = df.rename(columns={'Coolant': 'Coolant (l)'})
df[df['Coolant (l)'].notna()]['Coolant (l)']

  has_large_values = (abs_vals > 1e6).any()


11        6.300781
12        7.199219
18        6.800781
35        6.800781
58       10.000000
           ...    
50078     6.500000
50080     6.500000
50081     6.000000
50082     6.500000
50083     6.500000
Name: Coolant (l), Length: 35050, dtype: float16

In [102]:
df['Permitted trailer load without brakes'] = df['Permitted trailer load without brakes'].map(parse_number_or_range)
df['Permitted trailer load without brakes'] = df['Permitted trailer load without brakes'].fillna(0).astype('uint16')
df = df.rename(columns={'Permitted trailer load without brakes': 'Permitted trailer load without brakes (kg)'})
df[df['Permitted trailer load without brakes (kg)']>0]['Permitted trailer load without brakes (kg)']

36       750
43       750
47       750
50       750
54       750
        ... 
50079    500
50080    640
50081    640
50082    640
50083    675
Name: Permitted trailer load without brakes (kg), Length: 17391, dtype: uint16

In [103]:
df['Max speed (electric)'] = df['Max speed (electric)'].map(parse_number_or_range)
df['Max speed (electric)'] = df['Max speed (electric)'].fillna(0).astype('uint8')
df = df.rename(columns={'Max speed (electric)': 'Max speed (electric) (km/h)'})
df[df['Max speed (electric) (km/h)']>0]['Max speed (electric) (km/h)']

822       60
844       60
889      140
955      140
957      140
        ... 
49059    130
49060    130
49063    130
49406     80
50014    135
Name: Max speed (electric) (km/h), Length: 478, dtype: uint8

In [104]:
df['Battery technology'] = df['Battery technology'].astype('category')
print(df['Battery technology'].cat.categories)

Index(['Lead-Acid', 'Lithium iron phosphate (LiFePO)',
       'Lithium nickel cobalt manganese aluminum (Li-NCMA)',
       'Lithium nickel manganese cobalt oxides (Li-NMC)',
       'Lithium-ion (Li-Ion)', 'Lithium-polymer (LiPo)',
       'Nickel-metal hydride (NiMH)', 'Semi-solid-state lithium-ion battery'],
      dtype='object')


In [105]:
df['Climb angle'] = df['Climb angle'].map(parse_number_or_range)
df['Climb angle'] = df['Climb angle'].astype('float16')
df[df['Climb angle'].notna()]['Climb angle']

  has_large_values = (abs_vals > 1e6).any()


219      31.00000
3983     26.59375
3995     26.59375
4297     27.00000
4298     27.00000
           ...   
48494    30.00000
48496    30.00000
48500    30.00000
48505    30.00000
48516    30.00000
Name: Climb angle, Length: 528, dtype: float16

In [106]:
df['Departure angle'] = df['Departure angle'].map(parse_number_or_range)
df['Departure angle'] = df['Departure angle'].astype('float16')
df[df['Departure angle'].notna()]['Departure angle']

  has_large_values = (abs_vals > 1e6).any()


8        25.000000
21       22.000000
54       22.593750
219      19.296875
231      21.000000
           ...    
49195    27.000000
49200    27.000000
49216    27.000000
49229    27.000000
49235    27.000000
Name: Departure angle, Length: 7165, dtype: float16

In [107]:
df['Engine oil capacity'] = df['Engine oil capacity'].map(parse_number_or_range)
df['Engine oil capacity'] = df['Engine oil capacity'].astype('float16')
df = df.rename(columns={'Engine oil capacity': 'Engine oil capacity (l)'})
df[df['Engine oil capacity (l)'].notna()]['Engine oil capacity (l)']

  has_large_values = (abs_vals > 1e6).any()


11       3.000000
12       3.800781
18       3.000000
35       3.000000
49       3.000000
           ...   
50078    4.601562
50080    4.601562
50081    4.699219
50082    4.601562
50083    5.500000
Name: Engine oil capacity (l), Length: 43349, dtype: float16

In [108]:
df['Ride height (ground clearance)'] = df['Ride height (ground clearance)'].map(parse_number_or_range)
df['Ride height (ground clearance)'] = df['Ride height (ground clearance)'].fillna(0).astype('uint8')
df = df.rename(columns={'Ride height (ground clearance)': 'Ride height (ground clearance) (mm)'})
df[df['Ride height (ground clearance) (mm)']>0]['Ride height (ground clearance) (mm)']

8        120
26       120
31       170
33       170
41       170
        ... 
50079    135
50080    145
50081    145
50082    145
50083    145
Name: Ride height (ground clearance) (mm), Length: 18186, dtype: uint8

In [109]:
df['Maximum engine speed'] = df['Maximum engine speed'].map(parse_number_or_range)
df['Maximum engine speed'] = df['Maximum engine speed'].fillna(0).astype('uint16')
df = df.rename(columns={'Maximum engine speed': 'Maximum engine speed (RPM)'})
df[df['Maximum engine speed (RPM)']>0]['Maximum engine speed (RPM)']

382      7200
880      7000
1605     6100
1889     4750
1894     4750
         ... 
48643    5000
48651    6500
48660    6500
48662    5200
48664    5200
Name: Maximum engine speed (RPM), Length: 2723, dtype: uint16

In [110]:
df['Rear brakes'] = df['Rear brakes'].astype('category')
print(df['Rear brakes'].cat.categories)

Index(['Disc', 'Disc, 10.8 mm', 'Disc, 12.2 mm', 'Disc, 176 mm',
       'Disc, 180.34 mm', 'Disc, 200x40 mm', 'Disc, 203 mm',
       'Disc, 203-262 mm', 'Disc, 203.2 mm', 'Disc, 224 mm',
       ...
       'Ventilated discs, 396x24 mm', 'Ventilated discs, 397-410 mm',
       'Ventilated discs, 398 mm', 'Ventilated discs, 398x28 mm',
       'Ventilated discs, 400 mm', 'Ventilated discs, 406.4 mm',
       'Ventilated discs, 408x36 mm', 'Ventilated discs, 410 mm',
       'Ventilated discs, 410x32 mm', 'Ventilated discs, 431.8 mm'],
      dtype='object', length=542)


In [111]:
df['Torque (LPG)'] = df['Torque (LPG)'].map(parse_number_or_range)
df['Torque (LPG)'] = df['Torque (LPG)'].fillna(0).astype('uint16')
df = df.rename(columns={'Torque (LPG)': 'Torque (LPG) (Nm)'})
df[df['Torque (LPG) (Nm)']>0]['Torque (LPG) (Nm)']

10535    144
13543    144
13558    144
14153    126
16454    144
19985    214
19986    214
20539    155
24484    145
24487    145
24490    131
24491    131
24586     90
24638    170
24644    170
24772    170
24798    170
24803    144
24818    170
24898    156
25058    144
25081    170
25662    123
26322    124
26328    170
26331    131
26352    131
26354    170
26579    170
26781    131
26847    103
26857    170
26870    170
26872    170
27066    103
40074    170
41221    125
41461    110
41470    119
41473    105
41496    119
41500    110
41502    105
41676    124
42260     92
43044    147
43476    147
48180    170
48511    170
48538    170
Name: Torque (LPG) (Nm), dtype: uint16

In [112]:
df['Minimum turning circle (turning diameter)'] = df['Minimum turning circle (turning diameter)'].map(parse_number_or_range)
df['Minimum turning circle (turning diameter)'] = df['Minimum turning circle (turning diameter)'].astype('float16')
df = df.rename(columns={'Minimum turning circle (turning diameter)': 'Minimum turning circle (turning diameter) (m)'})
df[df['Minimum turning circle (turning diameter) (m)'].notna()]['Minimum turning circle (turning diameter) (m)']

  has_large_values = (abs_vals > 1e6).any()


8        11.398438
9        10.796875
10       10.796875
11       10.203125
12       10.203125
           ...    
50079    10.562500
50080    11.796875
50081    11.796875
50082    11.796875
50083    11.203125
Name: Minimum turning circle (turning diameter) (m), Length: 31295, dtype: float16

In [113]:
df['Maximum speed'] = df['Maximum speed'].map(parse_number_or_range)
df['Maximum speed'] = df['Maximum speed'].fillna(0).astype('uint16')
df = df.rename(columns={'Maximum speed': 'Maximum speed (km/h)'})
df[df['Maximum speed (km/h)']>0]['Maximum speed (km/h)']

4        230
8        140
11       185
12       185
13       280
        ... 
50079    150
50080    179
50081    174
50082    197
50083    212
Name: Maximum speed (km/h), Length: 37887, dtype: uint16

In [114]:
df['Tires size'] = df['Tires size'].astype('category')
print(df['Tires size'].cat.categories)

Index(['05/60 R16 91V; 215/50 R17 90V', '135 R12', '135/45 R12', '135/70 R13',
       '135/75 R13', '135/75 R13 S', '135/80 R12', '135/80 R12 S',
       '135/80 R13', '135/80 SR13',
       ...
       'P245/50 R17', 'P245/50 R17 98V', 'P255/50 R19 103H', 'P255/55 R18',
       'P265/70 R16', 'P285/30 ZR19; P335/25 ZR20', 'R12', 'R15', 'R20; R21',
       'R37'],
      dtype='object', length=3848)


In [115]:
df['Kerb Weight'] = df['Kerb Weight'].map(parse_number_or_range)
df['Kerb Weight'] = df['Kerb Weight'].astype('float16')
df = df.rename(columns={'Kerb Weight': 'Kerb Weight (kg)'})
df[df['Kerb Weight (kg)'].notna()]['Kerb Weight (kg)']

  has_large_values = (abs_vals > 1e6).any()


8        2420.0
11       1435.0
12       1435.0
13       2035.0
18       1170.0
          ...  
50079    1624.0
50080    1205.0
50081    1205.0
50082    1205.0
50083    1279.0
Name: Kerb Weight (kg), Length: 43410, dtype: float16

In [116]:
df['Drag coefficient (C)'] = df['Drag coefficient (C)'].map(parse_number_or_range)
df['Drag coefficient (C)'] = df['Drag coefficient (C)'].astype('float16')
df[df['Drag coefficient (C)'].notna()]['Drag coefficient (C)']

  has_large_values = (abs_vals > 1e6).any()


60       0.310059
125      0.350098
266      0.233032
281      0.233032
282      0.233032
           ...   
50033    0.330078
50039    0.330078
50047    0.330078
50050    0.330078
50061    0.330078
Name: Drag coefficient (C), Length: 12926, dtype: float16

In [117]:
df['Doors'] = df['Doors'].map(parse_number_or_range)
df['Doors'] = df['Doors'].fillna(0).astype('uint8')
df[df['Doors']>0]['Doors']

4        2
8        5
9        5
10       5
11       4
        ..
50079    5
50080    5
50081    5
50082    5
50083    5
Name: Doors, Length: 48584, dtype: uint8

In [118]:
df['Engine displacement'] = df['Engine displacement'].map(parse_number_or_range)
df['Engine displacement'] = df['Engine displacement'].fillna(0).astype('uint16')
df = df.rename(columns={'Engine displacement': 'Engine displacement (cm^3)'})
df[df['Engine displacement (cm^3)']>0]['Engine displacement (cm^3)']

4        2693
11       1991
12       2378
13       5646
16       2316
         ... 
50078    1198
50080    1198
50081    1461
50082    1198
50083    1332
Name: Engine displacement (cm^3), Length: 45992, dtype: uint16

In [119]:
df['Acceleration 0 - 100 km/h (CNG)']
df['Acceleration 0 - 100 km/h (CNG)'] = df['Acceleration 0 - 100 km/h (CNG)'].map(parse_number_or_range)
df['Acceleration 0 - 100 km/h (CNG)'] = df['Acceleration 0 - 100 km/h (CNG)'].astype('float16')
df[df['Acceleration 0 - 100 km/h (CNG)'].notna()]['Acceleration 0 - 100 km/h (CNG)']

  has_large_values = (abs_vals > 1e6).any()


7941     10.000000
7950      9.898438
20005    11.898438
20013    11.000000
21014    11.000000
21027    11.898438
21061    11.000000
29148    15.703125
29207    15.703125
29222    15.703125
29227    15.703125
29696    16.906250
29811    16.906250
29828    14.898438
31721    14.398438
37885    10.796875
Name: Acceleration 0 - 100 km/h (CNG), dtype: float16

In [120]:
df['System torque'] = df['System torque'].map(parse_number_or_range)
df['System torque'] = df['System torque'].fillna(0).astype('uint16')
df = df.rename(columns={'System torque': 'System torque (Nm)'})
df[df['System torque (Nm)']>0]['System torque (Nm)']

9        180
10       180
17       180
21       180
26       180
        ... 
49615    175
49634    160
50031    250
50059    300
50079    250
Name: System torque (Nm), Length: 1864, dtype: uint16

In [121]:
for col, dtype in df.dtypes.items():
    if str(dtype) == 'float64':
        df[col] = df[col].astype('float16')

In [122]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50084 entries, 0 to 50083
Columns: 110 entries, Brand to Transmission type
dtypes: category(30), datetime64[ns](2), float16(40), uint16(30), uint8(8)
memory usage: 15.1 MB


In [123]:
df.to_parquet('auto-cars.parquet')