In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
df = pd.read_csv('cars.csv', low_memory=False)

In [3]:
columns_to_drop = [
    'Fuel consumption (economy) - combined', 
    'Fuel consumption (economy) - combined (CLTC)', 
    'Fuel consumption (economy) - combined (CNG)', 
    'Fuel consumption (economy) - combined (CNG) (NEDC)', 
    'Fuel consumption (economy) - combined (CNG) (NEDC, WLTP equivalent)', 
    'Fuel consumption (economy) - combined (EPA)', 
    'Fuel consumption (economy) - combined (Ethanol - E85)', 
    'Fuel consumption (economy) - combined (Ethanol - E85) (NEDC)', 
    'Fuel consumption (economy) - combined (LPG)', 
    'Fuel consumption (economy) - combined (LPG) (NEDC)', 
    'Fuel consumption (economy) - combined (LPG) (NEDC, WLTP equivalent)', 
    'Fuel consumption (economy) - combined (NEDC)', 
    'Fuel consumption (economy) - combined (NEDC, WLTP equivalent)', 
    'Fuel consumption (economy) - combined (WLTC)',
    'Acceleration 0 - 62 mph',
    'Acceleration 0 - 200 km/h',
    'Acceleration 0 - 300 km/h',
    'Acceleration 0 - 60 mph',
    'Acceleration 0 - 60 mph (Calculated by Auto-Data.net)',
    'Engine oil specification',
    '200 km/h - 0',
    'Combined fuel consumption (WLTP)', 
    'Combined fuel consumption (WLTP) (CNG)', 
    'Combined fuel consumption (WLTP) (LPG)'
]

df = df.drop(columns_to_drop, axis=1, errors='ignore')

In [4]:
print(sorted(list(df.columns)))

['100 km/h - 0', 'Acceleration 0 - 100 km/h', 'Acceleration 0 - 100 km/h (CNG)', 'Acceleration 0 - 100 km/h (Ethanol - E100)', 'Acceleration 0 - 100 km/h (Ethanol - E85)', 'Acceleration 0 - 100 km/h (LPG)', 'AdBlue tank', 'All-electric range', 'All-electric range (CLTC)', 'All-electric range (EPA)', 'All-electric range (NEDC)', 'All-electric range (NEDC, WLTP equivalent)', 'All-electric range (WLTC)', 'All-electric range (WLTP)', 'Approach angle', 'Assisting systems', 'Average Energy consumption', 'Average Energy consumption (CLTC)', 'Average Energy consumption (EPA)', 'Average Energy consumption (NEDC)', 'Average Energy consumption (NEDC, WLTP equivalent)', 'Average Energy consumption (WLTC)', 'Average Energy consumption (WLTP)', 'Battery location', 'Battery technology', 'Battery voltage', 'Battery weight', 'Body type', 'Brand', 'CNG cylinder capacity', 'CO emissions', 'CO emissions (CNG)', 'CO emissions (CNG) (NEDC)', 'CO emissions (CNG) (NEDC, WLTP equivalent)', 'CO emissions (CNG) 

In [5]:
def parse_number_or_range(val, hmean=False):
    """
    Parse a number or range from a string, ignoring units at the end.
    Returns the arithmetic mean if a range, or the number itself.
    """
    if pd.isna(val):
        return np.nan
    
    s = str(val).strip().lower()
    
    # Single regex to capture:
    #  - optional leading spaces
    #  - first number (integer or decimal)
    #  - optional range separator and second number
    #  - ignore any text after numbers (units)
    match = re.match(r"^\s*(\d+(?:\.\d+)?)\s*(?:[-–]\s*(\d+(?:\.\d+)?))?", s)
    if match:
        num1 = float(match.group(1))
        num2 = match.group(2)
        if num2:
            num2 = float(num2)
            if hmean:
                return 2.0 * num1 * num2 / (num1+num2)
            else:
                return (num1 + num2) / 2  # arithmetic mean of range
        else:
            return num1
    
    return np.nan

def hmean_across(data):
    return (data.notna().sum(axis=1)) / (1 / data).sum(axis=1, skipna=True)

In [6]:
columns = [
    'All-electric range', 
    'All-electric range (CLTC)', 
    'All-electric range (EPA)', 
    'All-electric range (NEDC)', 
    'All-electric range (NEDC, WLTP equivalent)', 
    'All-electric range (WLTC)', 
    'All-electric range (WLTP)'
]

data = df[columns].map(parse_number_or_range)
df["All-electric average range (km)"] = data.mean(axis=1)
df = df.drop(columns, axis=1)
#data[data[columns].notna().sum(axis=1)>=2]
df[df["All-electric average range (km)"].notna()]

Unnamed: 0,Brand,Model,Generation,Start of production,End of production,Modification (Engine),Powertrain Architecture,Body type,Fuel Type,Max. weight,...,Drag coefficient (C),Doors,Engine displacement,Acceleration 0 - 100 km/h (CNG),System torque,Torque (Ethanol - E85),Number of valves per cylinder,Fuel consumption (economy) - urban (Ethanol - E85),Fuel consumption (economy) - extra urban (Ethanol - E85) (NEDC),All-electric average range (km)
8,BYD,e6,e6,2017 year,,80 kWh (122 Hp) 4WD Electric,BEV (Electric Vehicle),MPV,Electricity,,...,,5,,,,,,,,400.0
9,BYD,e2,e2,"September, 2019 year","April, 2021 year",47.3 kWh (95 Hp) BEV,BEV (Electric Vehicle),Hatchback,Electricity,,...,,5,,,180 Nm,,,,,405.0
10,BYD,e2,e2,"September, 2019 year","April, 2021 year",35.2 kWh (95 Hp) BEV,BEV (Electric Vehicle),Hatchback,Electricity,,...,,5,,,180 Nm,,,,,305.0
17,BYD,e2,e2,"April, 2021 year",,33.2 kWh (95 Hp) BEV,BEV (Electric Vehicle),Hatchback,Electricity,,...,,5,,,180 Nm,,,,,301.0
21,BYD,ETP3,ETP3,"October, 2023 year",,44.9 kWh (136 Hp) Electric,BEV (Electric Vehicle),Van,Electricity,2420 kg,...,,5,,,180 Nm,,,,,233.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49634,Renault,Twingo,Twingo III (facelift 2019),"August, 2020 year","July, 2024 year",Z.E. 22 kWh (82 Hp),BEV (Electric Vehicle),Hatchback,Electricity,1518 kg,...,,5,,,160 Nm @ 500-3590 rpm.,,,,,180.0
50014,Renault,Megane,"Megane IV (Phase II, 2020) Grandtour","April, 2020 year","July, 2023 year",1.6 E-TECH (158 Hp) Plug-in Hybrid Multimode,PHEV (Plug-in Hybrid Electric Vehicle),Station wagon (estate),Petrol / electricity,2131 kg,...,,5,1598 cm,,,,4.0,,,50.0
50031,Renault,Megane,Megane V E-Tech Electric,"February, 2022 year",,EV40 (130 Hp),BEV (Electric Vehicle),Hatchback,Electricity,2045 kg,...,,5,,,250 Nm,,,,,300.0
50059,Renault,Megane,Megane V E-Tech Electric,"February, 2022 year",,EV60 (220 Hp),BEV (Electric Vehicle),Hatchback,Electricity,2158 kg,...,,5,,,300 Nm,,,,,470.0


In [7]:
columns = [
    'Average Energy consumption', 
    'Average Energy consumption (CLTC)', 
    'Average Energy consumption (EPA)', 
    'Average Energy consumption (NEDC)', 
    'Average Energy consumption (NEDC, WLTP equivalent)', 
    'Average Energy consumption (WLTC)', 
    'Average Energy consumption (WLTP)'
]
data = df[columns].map(lambda x: parse_number_or_range(x, hmean=True))
data = data.replace(0, np.nan)
df["Average Energy consumption (kWh/100km)"] = hmean_across(data) # (data.notna().sum(axis=1)) / (1 / data).sum(axis=1, skipna=True)
df = df.drop(columns, axis=1)
#data[data[columns].notna().any(axis=1)][columns]
#df[df[columns].notna().any(axis=1)][columns]

In [8]:
df[df['Average Energy consumption (kWh/100km)'].notna()]

Unnamed: 0,Brand,Model,Generation,Start of production,End of production,Modification (Engine),Powertrain Architecture,Body type,Fuel Type,Max. weight,...,Doors,Engine displacement,Acceleration 0 - 100 km/h (CNG),System torque,Torque (Ethanol - E85),Number of valves per cylinder,Fuel consumption (economy) - urban (Ethanol - E85),Fuel consumption (economy) - extra urban (Ethanol - E85) (NEDC),All-electric average range (km),Average Energy consumption (kWh/100km)
8,BYD,e6,e6,2017 year,,80 kWh (122 Hp) 4WD Electric,BEV (Electric Vehicle),MPV,Electricity,,...,5,,,,,,,,400.0,19.500000
126,BYD,M3e,M3e,"December, 2020 year",,50.3 kWh (95 Hp) BEV,BEV (Electric Vehicle),MPV,Electricity,2360 kg,...,5,,,180 Nm,,,,,300.0,19.400000
162,BYD,Sealion 6,Sealion 6,2024 year,,DM-i 1.5L (324 Hp) Plug-in Hybrid AWD E-CVT,PHEV (Plug-in Hybrid Electric Vehicle),SUV,Petrol / electricity,2510 kg,...,5,1497 cm,,550 Nm,,4.0,,,81.0,17.900000
165,BYD,Sealion 6,Sealion 6,2024 year,,DM-i 1.5L (218 Hp) Plug-in Hybrid E-CVT,PHEV (Plug-in Hybrid Electric Vehicle),SUV,Petrol / electricity,2350 kg,...,5,1498 cm,,300 Nm,,4.0,,,92.0,16.900000
186,BYD,Seal 6,Seal 6 Touring,"September, 2025 year",,DM-i 1.5L 19 kWh (212 Hp) Plug-in Hybrid E-CVT,PHEV (Plug-in Hybrid Electric Vehicle),Station wagon (estate),Petrol / electricity,2240 kg,...,5,1498 cm,,,,4.0,,,100.0,16.400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49634,Renault,Twingo,Twingo III (facelift 2019),"August, 2020 year","July, 2024 year",Z.E. 22 kWh (82 Hp),BEV (Electric Vehicle),Hatchback,Electricity,1518 kg,...,5,,,160 Nm @ 500-3590 rpm.,,,,,180.0,16.148607
50014,Renault,Megane,"Megane IV (Phase II, 2020) Grandtour","April, 2020 year","July, 2023 year",1.6 E-TECH (158 Hp) Plug-in Hybrid Multimode,PHEV (Plug-in Hybrid Electric Vehicle),Station wagon (estate),Petrol / electricity,2131 kg,...,5,1598 cm,,,,4.0,,,50.0,13.928571
50031,Renault,Megane,Megane V E-Tech Electric,"February, 2022 year",,EV40 (130 Hp),BEV (Electric Vehicle),Hatchback,Electricity,2045 kg,...,5,,,250 Nm,,,,,300.0,15.800000
50059,Renault,Megane,Megane V E-Tech Electric,"February, 2022 year",,EV60 (220 Hp),BEV (Electric Vehicle),Hatchback,Electricity,2158 kg,...,5,,,300 Nm,,,,,470.0,16.100000


In [9]:
extra_urban_fuel_cols = [
    'Fuel consumption (economy) - extra urban', 
    'Fuel consumption (economy) - extra urban (EPA)', 
    'Fuel consumption (economy) - extra urban (NEDC)', 
    'Fuel consumption (economy) - extra urban (NEDC, WLTP equivalent)', 
    'Fuel consumption (economy) - extra urban (WLTC)', 
]

extra_urban_cng_cols = [
    'Fuel consumption (economy) - extra urban (CNG)', 
    'Fuel consumption (economy) - extra urban (CNG) (NEDC)', 
    'Fuel consumption (economy) - extra urban (CNG) (NEDC, WLTP equivalent)', 
]

extra_urban_lpg_cols = [
    'Fuel consumption (economy) - extra urban (LPG)', 
    'Fuel consumption (economy) - extra urban (LPG) (NEDC)', 
    'Fuel consumption (economy) - extra urban (LPG) (NEDC, WLTP equivalent)', 
]

extra_urban_ethanol_cols = [
    'Fuel consumption (economy) - extra urban (Ethanol - E100)', 
    'Fuel consumption (economy) - extra urban (Ethanol - E85)', 
    'Fuel consumption (economy) - extra urban (Ethanol - E85) (NEDC)', 
]

urban_fuel_cols = [
    'Fuel consumption (economy) - urban', 
    'Fuel consumption (economy) - urban (EPA)', 
    'Fuel consumption (economy) - urban (NEDC)', 
    'Fuel consumption (economy) - urban (NEDC, WLTP equivalent)', 
    'Fuel consumption (economy) - urban (WLTC)', 
]

urban_cng_cols = [
    'Fuel consumption (economy) - urban (CNG)', 
    'Fuel consumption (economy) - urban (CNG) (NEDC)', 
    'Fuel consumption (economy) - urban (CNG) (NEDC, WLTP equivalent)', 
]

urban_lpg_cols = [
    'Fuel consumption (economy) - urban (LPG)', 
    'Fuel consumption (economy) - urban (LPG) (NEDC)', 
    'Fuel consumption (economy) - urban (LPG) (NEDC, WLTP equivalent)', 
]

urban_ethanol_cols = [
    'Fuel consumption (economy) - urban (Ethanol - E100)', 
    'Fuel consumption (economy) - urban (Ethanol - E85)', 
    'Fuel consumption (economy) - urban (Ethanol - E85) (NEDC)', 
]

fuel_columns = [
    'Fuel consumption (economy) - extra urban', 
    'Fuel consumption (economy) - extra urban (CNG)', 
    'Fuel consumption (economy) - extra urban (CNG) (NEDC)', 
    'Fuel consumption (economy) - extra urban (CNG) (NEDC, WLTP equivalent)', 
    'Fuel consumption (economy) - extra urban (EPA)', 
    'Fuel consumption (economy) - extra urban (Ethanol - E100)', 
    'Fuel consumption (economy) - extra urban (Ethanol - E85)', 
    'Fuel consumption (economy) - extra urban (Ethanol - E85) (NEDC)', 
    'Fuel consumption (economy) - extra urban (LPG)', 
    'Fuel consumption (economy) - extra urban (LPG) (NEDC)', 
    'Fuel consumption (economy) - extra urban (LPG) (NEDC, WLTP equivalent)', 
    'Fuel consumption (economy) - extra urban (NEDC)', 
    'Fuel consumption (economy) - extra urban (NEDC, WLTP equivalent)', 
    'Fuel consumption (economy) - extra urban (WLTC)', 
    'Fuel consumption (economy) - urban', 
    'Fuel consumption (economy) - urban (CNG)', 
    'Fuel consumption (economy) - urban (CNG) (NEDC)', 
    'Fuel consumption (economy) - urban (CNG) (NEDC, WLTP equivalent)', 
    'Fuel consumption (economy) - urban (EPA)', 
    'Fuel consumption (economy) - urban (Ethanol - E100)', 
    'Fuel consumption (economy) - urban (Ethanol - E85)', 
    'Fuel consumption (economy) - urban (Ethanol - E85) (NEDC)', 
    'Fuel consumption (economy) - urban (LPG)', 
    'Fuel consumption (economy) - urban (LPG) (NEDC)', 
    'Fuel consumption (economy) - urban (LPG) (NEDC, WLTP equivalent)', 
    'Fuel consumption (economy) - urban (NEDC)', 
    'Fuel consumption (economy) - urban (NEDC, WLTP equivalent)', 
    'Fuel consumption (economy) - urban (WLTC)', 
    'Fuel consumption at Low speed (WLTP)', 
    'Fuel consumption at Low speed (WLTP) (CNG)', 
    'Fuel consumption at Low speed (WLTP) (LPG)', 
    'Fuel consumption at Medium speed (WLTP)', 
    'Fuel consumption at Medium speed (WLTP) (CNG)', 
    'Fuel consumption at Medium speed (WLTP) (LPG)', 
    'Fuel consumption at high speed (WLTP)', 
    'Fuel consumption at high speed (WLTP) (CNG)', 
    'Fuel consumption at high speed (WLTP) (LPG)', 
    'Fuel consumption at very high speed (WLTP)', 
    'Fuel consumption at very high speed (WLTP) (CNG)', 
    'Fuel consumption at very high speed (WLTP) (LPG)'
]

df[fuel_columns] = df[fuel_columns].map(lambda x: parse_number_or_range(x, hmean=True))
df['Fuel consumption highway (L/100km)'] = hmean_across(df[extra_urban_fuel_cols])
df['Fuel consumption highway CNG (L/100km)'] = hmean_across(df[extra_urban_cng_cols])
df['Fuel consumption highway LPG (L/100km)'] = hmean_across(df[extra_urban_lpg_cols])
df['Fuel consumption highway Ethanol (L/100km)'] = hmean_across(df[extra_urban_ethanol_cols])
df['Fuel consumption city (L/100km)'] = hmean_across(df[urban_fuel_cols])
df['Fuel consumption city CNG (L/100km)'] = hmean_across(df[urban_cng_cols])
df['Fuel consumption city LPG (L/100km)'] = hmean_across(df[urban_lpg_cols])
df['Fuel consumption city Ethanol (L/100km)'] = hmean_across(df[urban_ethanol_cols])
df = df.drop(fuel_columns, axis=1)

In [10]:
new_cols = [
    'Fuel consumption highway (L/100km)',
    'Fuel consumption highway CNG (L/100km)',
    'Fuel consumption highway LPG (L/100km)',
    'Fuel consumption highway Ethanol (L/100km)',
    'Fuel consumption city (L/100km)',
    'Fuel consumption city CNG (L/100km)',
    'Fuel consumption city LPG (L/100km)',
    'Fuel consumption city Ethanol (L/100km)'
]
df[df[new_cols].notna().any(axis=1)][new_cols]

Unnamed: 0,Fuel consumption highway (L/100km),Fuel consumption highway CNG (L/100km),Fuel consumption highway LPG (L/100km),Fuel consumption highway Ethanol (L/100km),Fuel consumption city (L/100km),Fuel consumption city CNG (L/100km),Fuel consumption city LPG (L/100km),Fuel consumption city Ethanol (L/100km)
4,6.7,,,,12.8,,,
11,6.6,,,,,,,
12,6.9,,,,,,,
16,6.1,,,,13.9,,,
23,8.8,,,,19.4,,,
...,...,...,...,...,...,...,...,...
50076,4.6,,,,6.9,,,
50077,3.6,,,,4.7,,,
50080,4.4,,,,7.0,,,
50081,3.4,,,,4.2,,,


In [11]:
print(sorted(list(df.columns)))

['100 km/h - 0', 'Acceleration 0 - 100 km/h', 'Acceleration 0 - 100 km/h (CNG)', 'Acceleration 0 - 100 km/h (Ethanol - E100)', 'Acceleration 0 - 100 km/h (Ethanol - E85)', 'Acceleration 0 - 100 km/h (LPG)', 'AdBlue tank', 'All-electric average range (km)', 'Approach angle', 'Assisting systems', 'Average Energy consumption (kWh/100km)', 'Battery location', 'Battery technology', 'Battery voltage', 'Battery weight', 'Body type', 'Brand', 'CNG cylinder capacity', 'CO emissions', 'CO emissions (CNG)', 'CO emissions (CNG) (NEDC)', 'CO emissions (CNG) (NEDC, WLTP equivalent)', 'CO emissions (CNG) (WLTP)', 'CO emissions (EPA)', 'CO emissions (Ethanol - E100)', 'CO emissions (Ethanol - E85)', 'CO emissions (Ethanol - E85) (NEDC)', 'CO emissions (LPG)', 'CO emissions (LPG) (NEDC)', 'CO emissions (LPG) (NEDC, WLTP equivalent)', 'CO emissions (LPG) (WLTP)', 'CO emissions (NEDC)', 'CO emissions (NEDC, WLTP equivalent)', 'CO emissions (WLTC)', 'CO emissions (WLTP)', 'Climb angle', 'Compression ratio

In [12]:
df.head()

Unnamed: 0,Brand,Model,Generation,Start of production,End of production,Modification (Engine),Powertrain Architecture,Body type,Fuel Type,Max. weight,...,All-electric average range (km),Average Energy consumption (kWh/100km),Fuel consumption highway (L/100km),Fuel consumption highway CNG (L/100km),Fuel consumption highway LPG (L/100km),Fuel consumption highway Ethanol (L/100km),Fuel consumption city (L/100km),Fuel consumption city CNG (L/100km),Fuel consumption city LPG (L/100km),Fuel consumption city Ethanol (L/100km)
0,Alpina,B9,B9 Coupe (E24),1982 year,1985 year,3.5 (245 Hp),Internal Combustion engine,Coupe,Petrol (Gasoline),,...,,,,,,,,,,
1,Alpina,B11,B11 (E32),1987 year,1987 year,3.5 (250 Hp),Internal Combustion engine,Sedan,Petrol (Gasoline),,...,,,,,,,,,,
2,Alpina,B9,B9 (E28),1981 year,1985 year,3.0 (245 Hp),Internal Combustion engine,Sedan,Petrol (Gasoline),,...,,,,,,,,,,
3,Alpina,B11,B11 (E32),1987 year,1993 year,3.5 (254 Hp),Internal Combustion engine,Sedan,Petrol (Gasoline),,...,,,,,,,,,,
4,Alpina,C2,C2 Cabrio (E30),"February, 1986 year","July, 1987 year",2.7 (209 Hp),Internal Combustion engine,Cabriolet,Petrol (Gasoline),,...,,,6.7,,,,12.8,,,


In [None]:
co2_columns = [
    'CO emissions', 
    'CO emissions (CNG)', 
    'CO emissions (CNG) (NEDC)', 
    'CO emissions (CNG) (NEDC, WLTP equivalent)', 
    'CO emissions (CNG) (WLTP)', 
    'CO emissions (EPA)', 
    'CO emissions (Ethanol - E100)', 
    'CO emissions (Ethanol - E85)', 
    'CO emissions (Ethanol - E85) (NEDC)', 
    'CO emissions (LPG)', 
    'CO emissions (LPG) (NEDC)', 
    'CO emissions (LPG) (NEDC, WLTP equivalent)', 
    'CO emissions (LPG) (WLTP)', 
    'CO emissions (NEDC)', 
    'CO emissions (NEDC, WLTP equivalent)', 
    'CO emissions (WLTC)', 
    'CO emissions (WLTP)'
]

co2_fuel_cols = [
    'CO emissions', 
    'CO emissions (EPA)', 
    'CO emissions (NEDC)', 
    'CO emissions (NEDC, WLTP equivalent)', 
    'CO emissions (WLTC)', 
    'CO emissions (WLTP)'
]

In [108]:
for col, dtype in df.dtypes.items():
    print(col, '[' + str(dtype) + ']')

Brand [category]
Model [category]
Generation [category]
Start of production [datetime64[ns]]
End of production [datetime64[ns]]
Modification (Engine) [category]
Powertrain Architecture [category]
Body type [category]
Fuel Type [category]
Max. weight (kg) [uint16]
Length (mm) [uint16]
Width (mm) [uint16]
Height (mm) [uint16]
Front suspension [category]
Battery weight (kg) [uint16]
Engine aspiration [category]
Recuperation output (kW) [uint16]
Gross battery capacity (kWh) [float16]
Power (kW) [float16]
Emission standard [category]
Compression ratio [category]
Permitted trailer load with brakes (12%) (kg) [uint16]
Wheelbase (mm) [uint16]
Power per litre (Ethanol - E100) [object]
Acceleration 0 - 100 km/h [float16]
Maximum speed (CNG) [object]
Power per litre (Ethanol - E85) [object]
Maximum speed (Ethanol - E100) [object]
Ramp-over (brakeover) angle [float16]
Power (CNG) [object]
Power (Ethanol - E100) [object]
AdBlue tank (l) [float16]
CNG cylinder capacity [object]
CO emissions (LPG) (N

In [14]:
df['Brand'] = df['Brand'].astype('category')
print(df['Brand'].cat.categories)

Index(['Acura', 'Alfa Romeo', 'Alpina', 'Aston Martin', 'Audi', 'BMW', 'BYD',
       'Bentley', 'Bugatti', 'Cadillac', 'Chevrolet', 'Chrysler', 'Citroen',
       'Cupra', 'DS', 'Dacia', 'Daewoo', 'Daihatsu', 'Dodge', 'Ferrari',
       'Fiat', 'Ford', 'GMC', 'Genesis', 'Great Wall', 'Haval', 'Honda',
       'Hongqi', 'Hummer', 'Hyundai', 'Infiniti', 'Jaguar', 'Jeep', 'Kia',
       'Koenigsegg', 'Lada', 'Lamborghini', 'Lancia', 'Land Rover', 'Lexus',
       'Lotus', 'MG', 'Maserati', 'Mazda', 'McLaren', 'Mercedes-Benz', 'Mini',
       'Mitsubishi', 'NIO', 'Nissan', 'Opel', 'Pagani', 'Peugeot', 'Porsche',
       'RAM', 'Renault', 'Rolls-Royce', 'Rover', 'Saab', 'Seat', 'Skoda',
       'Smart', 'Subaru', 'Suzuki', 'Tesla', 'Toyota', 'Vauxhall',
       'Volkswagen', 'Volvo'],
      dtype='object')


In [15]:
df['Model'] = df['Model'].astype('category')
print(df['Model'].cat.categories)

Index(['#1', '#3', '#5', '/8', '02', '1 Series', '10', '100', '100 NX', '1000',
       ...
       'iQ', 'iX', 'iX1', 'iX2', 'iX3', 'ix20', 'ix25', 'ix35', 'ix55',
       'nanuk quattro concept'],
      dtype='object', length=2110)


In [16]:
df['Generation'] = df['Generation'].astype('category')
print(df['Generation'].cat.categories)

Index(['#1', '#3', '#5', '/8 (W114)', '/8 (W114, facelift 1973)', '/8 (W115)',
       '/8 (W115, facelift 1973)', '/8 Coupe (W114)',
       '/8 Coupe (W114, facelift 1973)', '02 (E10)',
       ...
       'iX3 (G08, facelift 2021)', 'iX3 (NA5)', 'ix20', 'ix20 (facelift 2015)',
       'ix25', 'ix35', 'ix35 (Facelift 2013)', 'ix35 FCEV', 'ix55',
       'nanuk quattro concept'],
      dtype='object', length=7926)


In [17]:
dates = df['Start of production']
dates_clean = dates.str.replace('year', '', regex=False).str.strip()
dates_parsed = pd.to_datetime(dates_clean, format='mixed')
df['Start of production'] = dates_parsed

In [18]:
dates = df['End of production']
dates_clean = dates.str.replace('year', '', regex=False).str.strip()
dates_parsed = pd.to_datetime(dates_clean, format='mixed')
df['End of production'] = dates_parsed

In [19]:
df['Modification (Engine)'] = df['Modification (Engine)'].astype('category')
print(df['Modification (Engine)'].cat.categories)

Index(['(134 Hp) CVT', '(136 Hp) Fuel Cell Automatic',
       '(1360 Hp) 4WD Electric', '(174 Hp) Fuel Cell Automatic', '(226 Hp)',
       '(226 Hp) Electric', '(306 Hp) Electric', '(435 Hp) AWD',
       '(600 Hp) AWD', '0.4 (16 Hp)',
       ...
       'iV 1.5 TSI (204 Hp) Plug-in Hybrid DSG', 'iV 36.8 kWh (83 Hp)',
       'performance 100 kWh (326 Hp)', 'performance 100 kWh (381 Hp)',
       'performance 105 kWh (925 Hp) quattro',
       'performance 2.9 TFSI V6 (470 Hp) quattro tiptronic',
       'performance 4.0 TFSI V8 (630 Hp) Mild Hybrid quattro tiptronic',
       'performance 4.0 TFSI V8 (640 Hp) Mild Hybrid quattro tiptronic',
       'vRS 2.0 TSI (245 Hp)', 'vRS 2.0 TSI (245 Hp) DSG'],
      dtype='object', length=25269)


In [20]:
df['Powertrain Architecture'] = df['Powertrain Architecture'].astype('category')
print(df['Powertrain Architecture'].cat.categories)

Index(['BEV (Electric Vehicle)', 'FCEV (Fuel Cell Electric Vehicle)',
       'FHEV (Full Hybrid Electric Vehicle)', 'Internal Combustion engine',
       'MHEV (Mild Hybrid Electric Vehicle, power-assist hybrid, battery-assisted hybrid vehicles, BAHV)',
       'PFCEV (Plug-in Fuel Cell Electric Vehicle)',
       'PHEV (Plug-in Hybrid Electric Vehicle)'],
      dtype='object')


In [21]:
df['Body type'] = df['Body type'].astype('category')
print(df['Body type'].cat.categories)

Index(['CUV', 'Cabriolet', 'Cabriolet, Coupe', 'Cabriolet, Hatchback',
       'Cabriolet, SUV', 'Coupe', 'Coupe - Cabriolet',
       'Coupe - Cabriolet, Roadster', 'Coupe, CUV', 'Coupe, Crossover',
       'Coupe, Fastback', 'Coupe, Hatchback', 'Coupe, Liftback', 'Coupe, SUV',
       'Coupe, SUV, Crossover', 'Coupe, SUV, Fastback', 'Crossover',
       'Crossover, MPV', 'Fastback', 'Grand Tourer', 'Hatchback',
       'Hatchback, Crossover', 'Hatchback, Fastback', 'Liftback', 'MPV',
       'MPV, Van', 'Minivan', 'Minivan, Crossover', 'Minivan, MPV',
       'Off-road vehicle', 'Off-road vehicle, Cabriolet',
       'Off-road vehicle, Cabriolet, SUV', 'Off-road vehicle, Coupe',
       'Off-road vehicle, Pick-up', 'Off-road vehicle, SUV',
       'Off-road vehicle, Station wagon (estate)', 'Pick-up', 'Pick-up, Targa',
       'Quadricycle', 'Roadster', 'SAC', 'SAV', 'SUV', 'SUV, Crossover',
       'SUV, Crossover, Fastback', 'SUV, Fastback', 'SUV, MPV', 'SUV, Targa',
       'Sedan', 'Sedan, Cro

In [22]:
df['Fuel Type'] = df['Fuel Type'].astype('category')
print(df['Fuel Type'].cat.categories)

Index(['Diesel', 'Diesel / electricity', 'Electricity', 'Ethanol - E85',
       'Hydrogen', 'Hydrogen / electricity', 'LPG',
       'Mixture of two stroke engine', 'Petrol (Gasoline)', 'Petrol / CNG',
       'Petrol / Ethanol - E100', 'Petrol / Ethanol - E85',
       'Petrol / Ethanol - E85 / electricity', 'Petrol / LPG',
       'Petrol / electricity', 'Synthetic gasoline'],
      dtype='object')


In [23]:
df['Max. weight'] = df['Max. weight'].map(parse_number_or_range)
df['Max. weight'] = df['Max. weight'].fillna(0).astype('uint16')
df = df.rename(columns={'Max. weight': 'Max. weight (kg)'})
df['Max. weight (kg)']

0           0
1           0
2           0
3           0
4           0
         ... 
50079    2116
50080    1806
50081    1841
50082    1806
50083    1839
Name: Max. weight (kg), Length: 50084, dtype: uint16

In [24]:
df['Length'] = df['Length'].map(parse_number_or_range)
df['Length'] = df['Length'].fillna(0).astype('uint16')
df = df.rename(columns={'Length': 'Length (mm)'})
df['Length (mm)']

0           0
1           0
2           0
3           0
4           0
         ... 
50079    4200
50080    4359
50081    4359
50082    4359
50083    4359
Name: Length (mm), Length: 50084, dtype: uint16

In [25]:
df['Width'] = df['Width'].map(parse_number_or_range)
df['Width'] = df['Width'].fillna(0).astype('uint16')
df = df.rename(columns={'Width': 'Width (mm)'})
df['Width (mm)']

0           0
1           0
2           0
3           0
4           0
         ... 
50079    1768
50080    1814
50081    1814
50082    1814
50083    1814
Name: Width (mm), Length: 50084, dtype: uint16

In [26]:
df['Height'] = df['Height'].map(parse_number_or_range)
df['Height'] = df['Height'].fillna(0).astype('uint16')
df = df.rename(columns={'Height': 'Height (mm)'})
df['Height (mm)']

0           0
1           0
2           0
3           0
4           0
         ... 
50079    1505
50080    1447
50081    1447
50082    1447
50083    1447
Name: Height (mm), Length: 50084, dtype: uint16

In [27]:
df['Front suspension']
df['Front suspension'] = df['Front suspension'].astype('category')
print(df['Front suspension'].cat.categories)

Index(['Air suspension', 'Coil spring',
       'Coil spring, Air Suspension - Optional',
       'Coil spring, Double wishbone',
       'Coil spring, Double wishbone, Transverse stabilizer',
       'Coil spring, Double wishbone, Transverse stabilizer, Air Suspension - Optional',
       'Coil spring, Double wishbone, Wishbone',
       'Coil spring, Hydro-pneumatic element, Wishbone, Transverse stabilizer',
       'Coil spring, Independent coil spring, Wishbone',
       'Coil spring, Independent multi-link suspension',
       ...
       'Spring-loaded rack', 'Torsion', 'Trailing arm',
       'Trailing arm, Torsion', 'Transverse stabilizer',
       'Transverse stabilizer, Torsion', 'Wishbone',
       'Wishbone, Air suspension', 'Wishbone, Torsion',
       'Wishbone, Transverse stabilizer'],
      dtype='object', length=122)


In [28]:
df['Battery weight'] = df['Battery weight'].map(parse_number_or_range)
df['Battery weight'] = df['Battery weight'].fillna(0).astype('uint16')
df = df.rename(columns={'Battery weight': 'Battery weight (kg)'})
df['Battery weight (kg)']

0          0
1          0
2          0
3          0
4          0
        ... 
50079    394
50080      0
50081      0
50082      0
50083      0
Name: Battery weight (kg), Length: 50084, dtype: uint16

In [29]:
df['Engine aspiration']
df['Engine aspiration'] = df['Engine aspiration'].astype('category')
print(df['Engine aspiration'].cat.categories)

Index(['2 x Electric Assisted Turbocharger, Intercooler',
       '2 x Twin-Turbo, Intercooler', '2 x Twin-scroll turbo, Intercooler',
       '4 Turbochargers, Intercooler', 'BiTurbo', 'BiTurbo, Intercooler',
       'Electric Assisted Turbocharger, Intercooler',
       'Naturally aspirated engine', 'Naturally aspirated engine, Intercooler',
       'Supercharger', 'Supercharger, Intercooler', 'Turbocharger',
       'Turbocharger and Electric Powered Compressor, Intercooler',
       'Turbocharger, Intercooler',
       'Turbocharging and Supercharger, Intercooler', 'Twin-Turbo',
       'Twin-Turbo and Electric Powered Compressor, Intercooler',
       'Twin-Turbo, Intercooler', 'Twin-power turbo, Intercooler',
       'Twin-scroll turbo, Intercooler'],
      dtype='object')


In [30]:
#df[df['Recuperation output'].notna()]['Recuperation output'].value_counts()
df['Recuperation output'] = df['Recuperation output'].map(parse_number_or_range)
df['Recuperation output'] = df['Recuperation output'].fillna(0).astype('uint16')
df = df.rename(columns={'Recuperation output': 'Recuperation output (kW)'})
df['Recuperation output (kW)']

0        0
1        0
2        0
3        0
4        0
        ..
50079    0
50080    0
50081    0
50082    0
50083    0
Name: Recuperation output (kW), Length: 50084, dtype: uint16

In [31]:
# kWh
df['Gross battery capacity'] = df['Gross battery capacity'].map(parse_number_or_range)
df['Gross battery capacity'] = df['Gross battery capacity'].astype('float16')
df = df.rename(columns={'Gross battery capacity': 'Gross battery capacity (kWh)'})

df[df['Gross battery capacity (kWh)'].notna()]['Gross battery capacity (kWh)']

  has_large_values = (abs_vals > 1e6).any()


8        80.000000
9        47.312500
10       35.187500
17       33.187500
21       44.906250
           ...    
49483    44.093750
49500    25.906250
49615    27.500000
49634    22.000000
50014    10.460938
Name: Gross battery capacity (kWh), Length: 2461, dtype: float16

In [47]:
# Hp
df['Power'] = df['Power'].map(parse_number_or_range)
df['Power'] = df['Power'] * 0.7355
df['Power'] = df['Power'].astype('float16')
df = df.rename(columns={'Power': 'Power (kW)'})
df['Power (kW)']

  has_large_values = (abs_vals > 1e6).any()


0        180.2500
1        183.8750
2        180.2500
3        186.8750
4        153.7500
           ...   
50079         NaN
50080     73.5625
50081     66.1875
50082     95.6250
50083    117.6875
Name: Power (kW), Length: 50084, dtype: float16

In [50]:
df['Emission standard'] = df['Emission standard'].astype('category')
print(df['Emission standard'].cat.categories)

Index(['AT-PZEV', 'BIN 125', 'BS 6', 'BS VI', 'BS VI 2.0', 'BS VI Phase 2',
       'BS-IV', 'California LEV III', 'China 6', 'China 6b+RDE',
       ...
       'ULEV2', 'ULEV50', 'ULEV70', 'ULEV70 SULEV30', 'WCC', 'WCC + UCC',
       'WCC + UCC / SULEV 30', 'WCC+UCC', 'ZEV', 'euro 4'],
      dtype='object', length=294)


In [53]:
df['Compression ratio'] = df['Compression ratio'].astype('category')
print(df['Compression ratio'].cat.categories)

Index(['10.01:1', '10.1:1', '10.25:1', '10.2:1', '10.3:1', '10.478:1',
       '10.4:1', '10.55:1', '10.5:1', '10.67:1',
       ...
       '9.65:1', '9.6:1', '9.75:1', '9.7:1', '9.85:1', '9.8:1', '9.91:1',
       '9.9:1', '90:1', '9:1'],
      dtype='object', length=203)


In [56]:
df['Permitted trailer load with brakes (12%)'].value_counts()
df['Permitted trailer load with brakes (12%)'] = df['Permitted trailer load with brakes (12%)'].map(parse_number_or_range)
df['Permitted trailer load with brakes (12%)'] = df['Permitted trailer load with brakes (12%)'].fillna(0).astype('uint16')
df = df.rename(columns={'Permitted trailer load with brakes (12%)': 'Permitted trailer load with brakes (12%) (kg)'})
df['Permitted trailer load with brakes (12%) (kg)']

0           0
1           0
2           0
3           0
4           0
         ... 
50079     500
50080    1300
50081    1300
50082    1300
50083    1650
Name: Permitted trailer load with brakes (12%) (kg), Length: 50084, dtype: uint16

In [58]:
df['Wheelbase']
df['Wheelbase'] = df['Wheelbase'].map(parse_number_or_range)
df['Wheelbase'] = df['Wheelbase'].fillna(0).astype('uint16')
df = df.rename(columns={'Wheelbase': 'Wheelbase (mm)'})
df['Wheelbase (mm)']

0           0
1           0
2           0
3           0
4           0
         ... 
50079    2685
50080    2669
50081    2669
50082    2669
50083    2669
Name: Wheelbase (mm), Length: 50084, dtype: uint16

In [62]:
df['Acceleration 0 - 100 km/h']
df['Acceleration 0 - 100 km/h'] = df['Acceleration 0 - 100 km/h'].map(parse_number_or_range)
df['Acceleration 0 - 100 km/h'] = df['Acceleration 0 - 100 km/h'].astype('float16')
df['Acceleration 0 - 100 km/h']

  has_large_values = (abs_vals > 1e6).any()


0              NaN
1              NaN
2              NaN
3              NaN
4         6.898438
           ...    
50079    10.500000
50080    12.296875
50081    13.398438
50082    10.601562
50083     8.203125
Name: Acceleration 0 - 100 km/h, Length: 50084, dtype: float16

In [65]:
df['Ramp-over (brakeover) angle'] = df['Ramp-over (brakeover) angle'].map(parse_number_or_range)
df['Ramp-over (brakeover) angle'] = df['Ramp-over (brakeover) angle'].astype('float16')
df[df['Ramp-over (brakeover) angle'].notna()]['Ramp-over (brakeover) angle']

  has_large_values = (abs_vals > 1e6).any()


54       19.406250
219      17.000000
2298     14.796875
2300     14.796875
2380     15.898438
           ...    
49195    20.000000
49200    20.000000
49216    20.000000
49229    20.000000
49235    20.000000
Name: Ramp-over (brakeover) angle, Length: 5361, dtype: float16

In [68]:
df['AdBlue tank'] = df['AdBlue tank'].map(parse_number_or_range)
df['AdBlue tank'] = df['AdBlue tank'].astype('float16')
df = df.rename(columns={'AdBlue tank': 'AdBlue tank (l)'})
df[df['AdBlue tank (l)'].notna()]['AdBlue tank (l)']

  has_large_values = (abs_vals > 1e6).any()


54       20.0
395      18.5
408      18.5
418      18.5
956      21.5
         ... 
50063    16.0
50064    16.0
50070    16.0
50071    16.0
50074    16.0
Name: AdBlue tank (l), Length: 1859, dtype: float16

In [71]:
df['Number of cylinders'] = df['Number of cylinders'].fillna(0).astype('uint8')
df['Number of cylinders']

0        0
1        0
2        0
3        0
4        6
        ..
50079    0
50080    4
50081    4
50082    4
50083    4
Name: Number of cylinders, Length: 50084, dtype: uint8

In [73]:
df['Drivetrain Architecture'] = df['Drivetrain Architecture'].astype('category')
print(df['Drivetrain Architecture'].cat.categories)

Index(['An Internal combustion engine (ICE) drives the front wheels, one electric motor drives the front wheels, one electric motor drives the rear wheels. There is an ability for running in full electric or mixed mode.',
       'An Internal combustion engine (ICE) drives the front wheels, one electric motor drives the front wheels, one electric motor drives the rear wheels. There is an ability for running in full electric or mixed mode. There are parallel and serial hybrid modes.',
       'An Internal combustion engine (ICE) drives the rear wheels, one electric motor drives the front wheels, one electric motor drives the rear wheels. There is an ability for running in full electric or mixed mode.',
       'Four electric motors drive each wheel individually.',
       'One electric motor drives the front wheels, one electric motor drives the rear wheels.',
       'One electric motor drives the front wheels.',
       'One electric motor drives the front wheels. One electric motor drives 

In [75]:
df['Permitted trailer load with brakes (8%)']
df['Permitted trailer load with brakes (8%)'] = df['Permitted trailer load with brakes (8%)'].map(parse_number_or_range)
df['Permitted trailer load with brakes (8%)'] = df['Permitted trailer load with brakes (8%)'].fillna(0).astype('uint16')
df = df.rename(columns={'Permitted trailer load with brakes (8%)': 'Permitted trailer load with brakes (8%) (kg)'})
df['Permitted trailer load with brakes (8%) (kg)']

0        0
1        0
2        0
3        0
4        0
        ..
50079    0
50080    0
50081    0
50082    0
50083    0
Name: Permitted trailer load with brakes (8%) (kg), Length: 50084, dtype: uint16

In [78]:
df['Torque'] = df['Torque'].map(parse_number_or_range)
df['Torque'] = df['Torque'].fillna(0).astype('uint16')
df = df.rename(columns={'Torque': 'Torque (Nm)'})
df['Torque (Nm)']

0          0
1          0
2          0
3          0
4        267
        ... 
50079      0
50080    175
50081    220
50082    205
50083    270
Name: Torque (Nm), Length: 50084, dtype: uint16

In [80]:
df['System power']
df['System power'] = df['System power'].map(parse_number_or_range)
df['System power'] = df['System power'] * 0.7355
df['System power'] = df['System power'].astype('float16')
df = df.rename(columns={'System power': 'System power (kW)'})
df['System power (kW)']

  has_large_values = (abs_vals > 1e6).any()


0           NaN
1           NaN
2           NaN
3           NaN
4           NaN
          ...  
50079    95.625
50080       NaN
50081       NaN
50082       NaN
50083       NaN
Name: System power (kW), Length: 50084, dtype: float16

In [83]:
df['Front track']
df['Front track'] = df['Front track'].map(parse_number_or_range)
df['Front track'] = df['Front track'].fillna(0).astype('uint16')
df = df.rename(columns={'Front track': 'Front track (mm)'})
df['Front track (mm)']

0           0
1           0
2           0
3           0
4           0
         ... 
50079       0
50080    1591
50081    1591
50082    1591
50083    1591
Name: Front track (mm), Length: 50084, dtype: uint16

In [85]:
df['Fuel injection system'] = df['Fuel injection system'].astype('category')
print(df['Fuel injection system'].cat.categories)

Index(['Carburettor', 'Diesel Commonrail', 'Direct injection',
       'Direct injection and Multi-port manifold injection',
       'Dual-point throttle body fuel injection',
       'Indirect injection with two injectors on each intake port',
       'Multi-port manifold injection', 'Precombustion chamber injection',
       'Pump-nozzle (Unit Injector)', 'Single-point injection'],
      dtype='object')


In [88]:
df['Engine systems'] = df['Engine systems'].astype('category')
print(df['Engine systems'].cat.categories)

Index(['Cylinder deactivation system', 'Particulate filter',
       'Start & Stop System',
       'Start & Stop SystemCylinder deactivation systemParticulate filter',
       'Start & Stop SystemParticulate filter'],
      dtype='object')


In [91]:
df['Battery voltage'] = df['Battery voltage'].map(parse_number_or_range)
df['Battery voltage'] = df['Battery voltage'].fillna(0).astype('uint16')
df = df.rename(columns={'Battery voltage': 'Battery voltage (V)'})
df['Battery voltage (V)']

0          0
1          0
2          0
3          0
4          0
        ... 
50079    400
50080      0
50081      0
50082      0
50083      0
Name: Battery voltage (V), Length: 50084, dtype: uint16

In [93]:
df['Rear suspension'] = df['Rear suspension'].astype('category')
print(df['Rear suspension'].cat.categories)

Index(['Air suspension', 'Air suspension, Trailing arm',
       'Air suspension, Transverse stabilizer',
       'Air suspension, Transverse stabilizer, Leaf spring',
       'Air suspension, Transverse stabilizer, Trailing arm', 'Coil spring',
       'Coil spring, Air Suspension - Optional', 'Coil spring, Elastic beam',
       'Coil spring, Torsion',
       'Dependent spring suspension with transverse stabilizer',
       ...
       'Wishbone, Transverse stabilizer, Trailing arm',
       'Wishbone, Transverse stabilizer, Trailing arm, Coil spring',
       'dependent spring suspension',
       'dependent spring suspension, Leaf spring',
       'dependent spring suspension, Trailing arm',
       'dependent spring suspension, Transverse stabilizer',
       'dependent spring suspension, Transverse stabilizer, Trailing arm',
       'independent torsion suspension',
       'independent torsion suspension, Trailing arm',
       'independent torsion suspension, Wishbone'],
      dtype='object', 

In [98]:
df['Max. roof load']
df['Max. roof load'] = df['Max. roof load'].map(parse_number_or_range)
df['Max. roof load'] = df['Max. roof load'].fillna(0).astype('uint16')
df = df.rename(columns={'Max. roof load': 'Max. roof load (kg)'})
df['Max. roof load (kg)']

0         0
1         0
2         0
3         0
4         0
         ..
50079     0
50080     0
50081     0
50082     0
50083    80
Name: Max. roof load (kg), Length: 50084, dtype: uint16

In [100]:
df['Piston Stroke']
df['Piston Stroke'] = df['Piston Stroke'].map(parse_number_or_range)
df['Piston Stroke'] = df['Piston Stroke'].astype('float16')
df = df.rename(columns={'Piston Stroke': 'Piston Stroke (mm)'})
df['Piston Stroke (mm)']

  has_large_values = (abs_vals > 1e6).any()


0           NaN
1           NaN
2           NaN
3           NaN
4           NaN
          ...  
50079       NaN
50080    73.125
50081    80.500
50082    73.125
50083       NaN
Name: Piston Stroke (mm), Length: 50084, dtype: float16

In [102]:
df['Max load']
df['Max load'] = df['Max load'].map(parse_number_or_range)
df['Max load'] = df['Max load'].fillna(0).astype('uint16')
df = df.rename(columns={'Max load': 'Max load (kg)'})
df['Max load (kg)']

0          0
1          0
2          0
3          0
4          0
        ... 
50079    492
50080    601
50081    636
50082    601
50083    560
Name: Max load (kg), Length: 50084, dtype: uint16

In [105]:
df['Engine configuration'] = df['Engine configuration'].astype('category')
print(df['Engine configuration'].cat.categories)

Index(['180° flat V-engine', 'Boxer', 'Inline', 'Rotary (Wankel)', 'V-engine',
       'VR-engine', 'W-engine'],
      dtype='object')


In [107]:
df['Engine layout'] = df['Engine layout'].astype('category')
print(df['Engine layout'].cat.categories)

Index(['Front axle, Longitudinal', 'Front, Longitudinal', 'Front, Transverse',
       'Middle, Longitudinal', 'Middle, Transverse', 'Rear axle, Longitudinal',
       'Rear, Longitudinal', 'Rear, Transverse'],
      dtype='object')
