In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
df = pd.read_csv('theCarSpec_cars.csv', low_memory=False)

In [3]:
def analyse_instances(df, low_fill_threshold=1.0, max_to_show=30, show_first_n=5):
    low_fill_columns = []
    
    print(f"Dataset: {len(df):,} rows × {len(df.columns)} columns\n")
    print("-" * 100)
    
    for i, col in enumerate(df.columns, 1):
        non_null_count = df[col].notna().sum()
        fill_percentage = non_null_count / len(df) * 100
        
        value_counts = df[col].value_counts(dropna=False).sort_values(ascending=False)
        total_unique = len(value_counts)
        
        flag = "⚠" if fill_percentage < low_fill_threshold else " "
        if flag == "⚠":
            low_fill_columns.append(col)
        
        print(f"{i:3d}. {flag} [{col}]")
        print(f"     Fill: {non_null_count:,} / {len(df):,} ({fill_percentage:.2f}%) | Unique: {total_unique:,}")
        
        if total_unique <= max_to_show:
            print("     Values:")
            for val, count in value_counts.items():
                if pd.isna(val):
                    print(f"       - <NaN>: {count:,}")
                elif str(val).strip() == "":
                    print(f"       - <empty string>: {count:,}")
                else:
                    print(f"       - {val}: {count:,}")
        else:
            print(f"     Too many unique values ({total_unique:,}), showing top {show_first_n}:")
            for val, count in value_counts.head(show_first_n).items():
                if pd.isna(val):
                    print(f"         - <NaN>: {count:,}")
                elif str(val).strip() == "":
                    print(f"         - <empty string>: {count:,}")
                else:
                    print(f"         - {val}: {count:,}")
            print(f"         ... and {total_unique - show_first_n:,} more unique values")
        print()

    if low_fill_columns:
        print(f"⚠ Columns with < {low_fill_threshold}% fill ({len(low_fill_columns)}):")
        print("    " + ", ".join(f"'{col}'" for col in low_fill_columns))
    else:
        print(f"All columns have greater than or equal to {low_fill_threshold}% fill rate.")
    
    return low_fill_columns

In [4]:
# executing search
low_fill_cols = analyse_instances(df, low_fill_threshold=0.1)

print("\nAlmost empty columns which need to be examined:")
print(low_fill_cols)

Dataset: 37,010 rows × 154 columns

----------------------------------------------------------------------------------------------------
  1.   [Brand]
     Fill: 37,010 / 37,010 (100.00%) | Unique: 88
     Too many unique values (88), showing top 5:
         - Volkswagen: 2,720
         - Mercedes-Benz: 2,694
         - Ford: 2,240
         - BMW: 2,202
         - Audi: 2,117
         ... and 83 more unique values

  2.   [Model]
     Fill: 37,010 / 37,010 (100.00%) | Unique: 2,497
     Too many unique values (2,497), showing top 5:
         - Transporter (Van): 400
         - Golf (Hatchback): 281
         - A4 (Station wagon (estate)): 255
         - F-Series F-100/F-150 (Pick-up): 252
         - E-class (Sedan): 248
         ... and 2,492 more unique values

  3.   [Version]
     Fill: 37,010 / 37,010 (100.00%) | Unique: 12,342
     Too many unique values (12,342), showing top 5:
         - Transporter (T6) Kombi / Van 2015,2016,2017,2018: 40
         - MPV) Master III (Phase III, 

In [5]:
# dropping invalid columns
df_copy = df # making copy of the dataset

columns_drop = [
    'Dimensions-170', 'Exterior 128', 'Safety and security-140', 'Brakes-80',
    'Oil viscosity', 'Head room rear', 'Weights-180', 'Steering-90', 'Leg room rear',
    'Front', 'Safety and security-145', 'Weights-179', 'Wheels tyres 120',
    'Transmission and Drive system-67', 'Hip room rear', 'Head room front',
    'Front passenger’s seat', 'Driver’s seat', 'Exterior 125', 'Suspension-97',
    'Suspension-100', 'Steering-87', 'Battery', 'Curb weight (pounds) rear',
    'Turbocharger type', 'Brake assist', 'Parking brake', 'Safety and security-146',
    'Turns (lock-to-lock)', 'Navigation', 'Safety and security-147', 'Front seat type',
    'Steering-85', 'Driveshafts', 'Model summary-16', 'Exterior 123', 'Type',
    'Weight distribution (%) Front/rear', 'Wheels tyres 119', 'Alternator',
    'Exterior 126', 'Eng 60', 'Suspension-99', 'Dimensions-169', 'Brake cooling air guide',
    'Electronic brake force distribution', 'Transmission and Drive system-68',
    'Body / Chassis-106', 'Wheels tyres 113', 'Transmission and Drive system-66',
    'Ignition system', 'Curb weight (pounds) front', 'Wheels tyres 114', 'Eng 59',
    'Exterior 127', 'Taillights', 'Leg room front', 'Suspension-98', 'Headlights',
    'Heated seats', 'Radio', 'Hands free bluetooth Apple CarPlay® integration',
    'Brakes-79', 'Materials', 'Hip room front', 'Suspension-96', 'Additional body bonding',
    'Leather/synthetic suede front seat appointments', 'Safety and security-144',
    'Recommended fuel', 'Exterior 124', 'Weights-178', 'Model summary-15',
    'Platform', 'URL', 'Version'  # lisaks ka URL ja Version – absoluutselt unikaalsed, ei kanna analüütilist väärtust
]

df_clean_1 = df.drop(columns=columns_drop)

# valuable columns with small amount of values
precious_columns = [
    'Final drive ratio', 'Overall ratio', 'Gear ratio 1st 2nd 3rd 4th 5th 6th reverse',
    'Electric motor power', 'Electric motor torque', 'Total available power',
    'Total available torque', 'Autonomy km (combined use)', 'Combined fuel consumption (WLTP)'
]

low_fill_cols_2 = analyse_instances(df_clean_1, low_fill_threshold=5)

print("\nAlmost empty columns which need to be examined:")
print(low_fill_cols_2)


Dataset: 37,010 rows × 78 columns

----------------------------------------------------------------------------------------------------
  1.   [Brand]
     Fill: 37,010 / 37,010 (100.00%) | Unique: 88
     Too many unique values (88), showing top 5:
         - Volkswagen: 2,720
         - Mercedes-Benz: 2,694
         - Ford: 2,240
         - BMW: 2,202
         - Audi: 2,117
         ... and 83 more unique values

  2.   [Model]
     Fill: 37,010 / 37,010 (100.00%) | Unique: 2,497
     Too many unique values (2,497), showing top 5:
         - Transporter (Van): 400
         - Golf (Hatchback): 281
         - A4 (Station wagon (estate)): 255
         - F-Series F-100/F-150 (Pick-up): 252
         - E-class (Sedan): 248
         ... and 2,492 more unique values

  3.   [Year production start]
     Fill: 37,010 / 37,010 (100.00%) | Unique: 58
     Too many unique values (58), showing top 5:
         - 2015: 1,990
         - 2018: 1,808
         - 2019: 1,750
         - 2020: 1,623
      

In [6]:
# analysing the secondary dataset
df2 = pd.read_csv('carsdirectory.csv', low_memory=False)

low_fill_cols = analyse_instances(df2, low_fill_threshold=5)

print("\nAlmost empty columns which need to be examined:")
print(low_fill_cols)

Dataset: 37,695 rows × 90 columns

----------------------------------------------------------------------------------------------------
  1.   [Brand]
     Fill: 37,695 / 37,695 (100.00%) | Unique: 245
     Too many unique values (245), showing top 5:
         - Mercedes-Benz: 2,732
         - Audi: 2,659
         - Ford: 2,379
         - Volkswagen: 2,115
         - Opel: 2,109
         ... and 240 more unique values

  2.   [BrandModel]
     Fill: 37,695 / 37,695 (100.00%) | Unique: 2,126
     Too many unique values (2,126), showing top 5:
         - A4: 549
         - Golf: 542
         - E-class: 495
         - Astra: 495
         - F-Series F-100/F-150: 430
         ... and 2,121 more unique values

  3.   [Generation]
     Fill: 37,695 / 37,695 (100.00%) | Unique: 6,383
     Too many unique values (6,383), showing top 5:
         - V70 III: 45
         - W124 (facelift 1989): 45
         - A4 (B8 8K, facelift 2011): 44
         - Hatch (F55; F56 facelift 2018): 44
         - Golf

In [7]:
# dropping insignificant ones
columns_to_drop = [
    'Max speed (electric)', '200 km/h - 0', 'Power (LPG)', 
    'Power (CNG)', 'Average Energy consumption',
    'Power (Ethanol - E85)', 'AdBlue tank',
    'CNG cylinder capacity', 'Wading depth', 'System torque', 'All-electric range'
]

df2 = df2.drop(columns=columns_to_drop)
print(f"After erasing: {df2.shape}")

After erasing: (37695, 79)


In [8]:
# analysing the secondary dataset again
low_fill_cols = analyse_instances(df2, low_fill_threshold=5)

print("\nAlmost empty columns which need to be examined:")
print(low_fill_cols)

Dataset: 37,695 rows × 79 columns

----------------------------------------------------------------------------------------------------
  1.   [Brand]
     Fill: 37,695 / 37,695 (100.00%) | Unique: 245
     Too many unique values (245), showing top 5:
         - Mercedes-Benz: 2,732
         - Audi: 2,659
         - Ford: 2,379
         - Volkswagen: 2,115
         - Opel: 2,109
         ... and 240 more unique values

  2.   [BrandModel]
     Fill: 37,695 / 37,695 (100.00%) | Unique: 2,126
     Too many unique values (2,126), showing top 5:
         - A4: 549
         - Golf: 542
         - E-class: 495
         - Astra: 495
         - F-Series F-100/F-150: 430
         ... and 2,121 more unique values

  3.   [Generation]
     Fill: 37,695 / 37,695 (100.00%) | Unique: 6,383
     Too many unique values (6,383), showing top 5:
         - V70 III: 45
         - W124 (facelift 1989): 45
         - A4 (B8 8K, facelift 2011): 44
         - Hatch (F55; F56 facelift 2018): 44
         - Golf

In [9]:
pd.set_option('display.max_columns', None)    # näitab KÕIKI veerge
pd.set_option('display.max_rows', 10)         # näitab täpselt 10 rida (pool enne ja pool pärast)
pd.set_option('display.width', None)          # automaatne laius
pd.set_option('display.expand_frame_repr', False)  # ei murragi ridasid
pd.set_option('display.max_colwidth', 50)     # veergude sisu ei lõigata liiga lühikeseks

df2.head(10)

Unnamed: 0,Brand,BrandModel,Generation,Start of production,End of production,Modification,Powertrain Architecture,Body type,Fuel Type,Max. weight,Length,Width,Height,Number of Gears (manual transmission),Front track,Emission standard,Model,Steering type,Seats,Battery capacity,Position of cylinders,Power per litre,Max. roof load,Cylinder Bore,Drivetrain Architecture,Rear brakes,Assisting systems,Front suspension,100 km/h - 0,Doors,Maximum engine speed,Engine location,Kerb Weight,Front overhang,Ramp angle,Engine systems,Number of Gears (automatic transmission),Fuel tank capacity,Acceleration,Piston Stroke,Power,Number of valves per cylinder,Climb angle,Drag coefficient (Cd),Fuel consumption - extra urban,Compression ratio,Valvetrain,Weight-to-power ratio,Trunk (boot) space - maximum,Wheelbase,Front brakes,Coolant,Engine oil capacity,CO2 emissions,Wheel rims size,Engine aspiration,Departure angle,Fuel System,Permitted towbar download,Approach angle,Torque,Permitted trailer load without brakes,Max load,Trunk (boot) space - minimum,Number of cylinders,Permitted trailer load with brakes (12%),Fuel consumption - combined,Drive wheel,Ride height,Minimum turning circle (turning diameter),Permitted trailer load with brakes (8%),Rear suspension,Maximum speed,Rear (Back) track,Tires size,Acceleration 0 - 100 km/h (0 - 62 mph),Engine displacement,Fuel consumption - urban,Rear overhang
0,Alpine,A110,A110 Berlinette,1968 year,1970 year,1100 (L) (95 Hp),Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,95 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,
1,DAF,66,66 Combi,1972 year,1975 year,1.1 MARATHON (54 Hp),Internal Combustion engine,Station wagon (estate),Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,54 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,
2,Alpine,A110,A110 Berlinette,1966 year,1970 year,1500 (L) (90 Hp),Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,90 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,
3,DAF,66,66 Combi,1973 year,1975 year,1300 MARATHON (57 Hp),Internal Combustion engine,Station wagon (estate),Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,57 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,
4,Alpine,A110,A110 Berlinette,1968 year,1970 year,1300 (L) (110 Hp),Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,110 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,
5,Alpine,A110,A110 Berlinette,1966 year,1970 year,1500 (L) (70 Hp),Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,70 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,
6,AC,Aceca,Aceca III,1998 year,2000 year,4.6 i V8 32V (326 Hp),Internal Combustion engine,Coupe,Petrol (Gasoline),,4660 mm,1890 mm,1340 mm,5.0,1590 mm,,,Steering rack and pinion,4.0,,V-engine,70.9 Hp/l,,90.2 mm,The Internal combustion engine (ICE) drives th...,Ventilated discs,ABS (Anti-lock braking system),Coil spring,,3.0,,,1615 kg,,,,,70 l,,90 mm,326 Hp @ 5800 rpm.,4.0,,,10.5 l/100 km,9.85,DOHC,"5 kg/Hp, 201.9 Hp/tonne",300 l,2720 mm,Ventilated discs,,,,,Naturally aspirated engine,,Multi-point indirect injection,,,427 Nm @ 4600 rpm.,,,185 l,8.0,,,Rear wheel drive,,,,Helical spring,250 km/h,1570 mm,255/55 R18,6.1 sec,4601 cm3,15.1 l/100 km,
7,Daewoo,Tosca,Tosca,2006 year,2011 year,2.0i R6 24V (142 Hp) Automatic,Internal Combustion engine,Sedan,Petrol (Gasoline),,4805 mm,1810 mm,1450 mm,,1550 mm,,,Steering rack and pinion,5.0,,Inline,71.3 Hp/l,,,The Internal combustion engine (ICE) drives th...,Disc,ABS (Anti-lock braking system),Spring Strut,,4.0,,"Front, Transverse",,,,,5.0,65 l,,,142 Hp @ 5800 rpm.,4.0,,,,10.0,,,,2700 mm,Ventilated discs,,,,6J x 15,Naturally aspirated engine,,Multi-point indirect injection,,,187 Nm @ 3800 rpm.,,,480 l,6.0,,,Front wheel drive,,,,Helical spring,,1545 mm,205/65 R15,,1992 cm3,,
8,Daewoo,Racer,Racer Hatchback,1986 year,1995 year,1.6i (75 Hp) Automatic,Internal Combustion engine,Hatchback,Petrol (Gasoline),,4158 mm,1663 mm,1360 mm,,1400 mm,,,Steering rack and pinion,5.0,,Inline,46.9 Hp/l,,79 mm,The Internal combustion engine (ICE) drives th...,Drum,ABS (Anti-lock braking system),Wishbone,,3.0,,"Front, Transverse",990 kg,,,,3.0,50 l,,81.5 mm,75 Hp @ 5600 rpm.,2.0,,,,8.6,OHC,"13.2 kg/Hp, 75.8 Hp/tonne",,2520 mm,Disc,,,,,Naturally aspirated engine,,Mono-point injection,,,122 Nm @ 2800 rpm.,,,,4.0,,,Front wheel drive,,,,Helical spring,,1406 mm,175/70 R13,,1598 cm3,,
9,Daewoo,Racer,Racer Sedan,1993 year,1995 year,1.5 i (75 Hp),Internal Combustion engine,Sedan,Petrol (Gasoline),1460 kg,4482 mm,1662 mm,1394 mm,5.0,1400 mm,,,Steering rack and pinion,5.0,,Inline,50.1 Hp/l,,76.5 mm,The Internal combustion engine (ICE) drives th...,Drum,,Wishbone,,4.0,,"Front, Transverse",969 kg,,,,,50 l,,81.5 mm,75 Hp @ 5400 rpm.,2.0,,,5.1 l/100 km,9.5,OHC,"12.9 kg/Hp, 77.4 Hp/tonne",,2520 mm,Ventilated discs,,,,R13,Naturally aspirated engine,,Mono-point injection,,,123 Nm @ 3200 rpm.,,491 kg,530 l,4.0,,6.3 l/100 km,Front wheel drive,,,,Suspension with traction connecting levers,163 km/h,1406 mm,175/70 R13,12.5 sec,1498 cm3,9.3 l/100 km,


In [10]:
# celaning up first 5 attributes
from sklearn.preprocessing import MinMaxScaler

df = df2 # rebasing attribute

# =============================================
# 1. Cleaning year columns
df['Year'] = df['Start of production'].str.extract(r'(\d{4})').astype(float)

df['End_year'] = df['End of production'].str.extract(r'(\d{4})').astype(float)
df['Production_years'] = df['End_year'] - df['Year']

# If the model is still in production, then we add duration to NaN
df.loc[df['End_year'].isna(), 'Production_years'] = np.nan

# =============================================
# 2. Best categorical feature: Brand + Model
df['Brand_Model'] = df['Brand'].str.strip() + ' ' + df['BrandModel'].str.strip()

# =============================================
# 3. Frequency Encoding (because the target is missing)
# Brand frequency
brand_freq = df['Brand'].value_counts(normalize=True)
df['Brand_freq'] = df['Brand'].map(brand_freq)

# Brand_Model frequency
model_freq = df['Brand_Model'].value_counts(normalize=True)
df['Brand_Model_freq'] = df['Brand_Model'].map(model_freq)

# =============================================
# 4. Normalizing numeric ones (Year and Production_years)
scaler = MinMaxScaler()
df['Year_norm'] = scaler.fit_transform(df[['Year']])

# Production_years võib olla NaN → täida keskmisega või jäta NaN (puuduv info)
df['Production_years'] = df['Production_years'].fillna(df['Production_years'].median())
df['Production_years_norm'] = scaler.fit_transform(df[['Production_years']])

# =============================================
# 5. Deleting originals and other trash columns
df_clean = df.drop(columns=[
    'Brand', 'BrandModel', 'Generation', 
    'Start of production', 'End of production', 'End_year'
])

# validating
pd.set_option('display.max_columns', None)    # näitab KÕIKI veerge
pd.set_option('display.max_rows', 10)         # näitab täpselt 10 rida (pool enne ja pool pärast)
pd.set_option('display.width', None)          # automaatne laius
pd.set_option('display.expand_frame_repr', False)  # ei murragi ridasid
pd.set_option('display.max_colwidth', 50)     # veergude sisu ei lõigata liiga lühikeseks

df_clean.head(10)

Unnamed: 0,Modification,Powertrain Architecture,Body type,Fuel Type,Max. weight,Length,Width,Height,Number of Gears (manual transmission),Front track,Emission standard,Model,Steering type,Seats,Battery capacity,Position of cylinders,Power per litre,Max. roof load,Cylinder Bore,Drivetrain Architecture,Rear brakes,Assisting systems,Front suspension,100 km/h - 0,Doors,Maximum engine speed,Engine location,Kerb Weight,Front overhang,Ramp angle,Engine systems,Number of Gears (automatic transmission),Fuel tank capacity,Acceleration,Piston Stroke,Power,Number of valves per cylinder,Climb angle,Drag coefficient (Cd),Fuel consumption - extra urban,Compression ratio,Valvetrain,Weight-to-power ratio,Trunk (boot) space - maximum,Wheelbase,Front brakes,Coolant,Engine oil capacity,CO2 emissions,Wheel rims size,Engine aspiration,Departure angle,Fuel System,Permitted towbar download,Approach angle,Torque,Permitted trailer load without brakes,Max load,Trunk (boot) space - minimum,Number of cylinders,Permitted trailer load with brakes (12%),Fuel consumption - combined,Drive wheel,Ride height,Minimum turning circle (turning diameter),Permitted trailer load with brakes (8%),Rear suspension,Maximum speed,Rear (Back) track,Tires size,Acceleration 0 - 100 km/h (0 - 62 mph),Engine displacement,Fuel consumption - urban,Rear overhang,Year,Production_years,Brand_Model,Brand_freq,Brand_Model_freq,Year_norm,Production_years_norm
0,1100 (L) (95 Hp),Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,95 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1968.0,2.0,Alpine A110,0.000557,0.000186,0.289474,0.081081
1,1.1 MARATHON (54 Hp),Internal Combustion engine,Station wagon (estate),Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,54 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1972.0,3.0,DAF 66,0.000186,0.000186,0.342105,0.108108
2,1500 (L) (90 Hp),Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,90 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1966.0,4.0,Alpine A110,0.000557,0.000186,0.263158,0.135135
3,1300 MARATHON (57 Hp),Internal Combustion engine,Station wagon (estate),Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,57 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1973.0,2.0,DAF 66,0.000186,0.000186,0.355263,0.081081
4,1300 (L) (110 Hp),Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,110 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1968.0,2.0,Alpine A110,0.000557,0.000186,0.289474,0.081081
5,1500 (L) (70 Hp),Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,70 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1966.0,4.0,Alpine A110,0.000557,0.000186,0.263158,0.135135
6,4.6 i V8 32V (326 Hp),Internal Combustion engine,Coupe,Petrol (Gasoline),,4660 mm,1890 mm,1340 mm,5.0,1590 mm,,,Steering rack and pinion,4.0,,V-engine,70.9 Hp/l,,90.2 mm,The Internal combustion engine (ICE) drives th...,Ventilated discs,ABS (Anti-lock braking system),Coil spring,,3.0,,,1615 kg,,,,,70 l,,90 mm,326 Hp @ 5800 rpm.,4.0,,,10.5 l/100 km,9.85,DOHC,"5 kg/Hp, 201.9 Hp/tonne",300 l,2720 mm,Ventilated discs,,,,,Naturally aspirated engine,,Multi-point indirect injection,,,427 Nm @ 4600 rpm.,,,185 l,8.0,,,Rear wheel drive,,,,Helical spring,250 km/h,1570 mm,255/55 R18,6.1 sec,4601 cm3,15.1 l/100 km,,1998.0,2.0,AC Aceca,0.000186,5.3e-05,0.684211,0.081081
7,2.0i R6 24V (142 Hp) Automatic,Internal Combustion engine,Sedan,Petrol (Gasoline),,4805 mm,1810 mm,1450 mm,,1550 mm,,,Steering rack and pinion,5.0,,Inline,71.3 Hp/l,,,The Internal combustion engine (ICE) drives th...,Disc,ABS (Anti-lock braking system),Spring Strut,,4.0,,"Front, Transverse",,,,,5.0,65 l,,,142 Hp @ 5800 rpm.,4.0,,,,10.0,,,,2700 mm,Ventilated discs,,,,6J x 15,Naturally aspirated engine,,Multi-point indirect injection,,,187 Nm @ 3800 rpm.,,,480 l,6.0,,,Front wheel drive,,,,Helical spring,,1545 mm,205/65 R15,,1992 cm3,,,2006.0,5.0,Daewoo Tosca,0.004245,0.000106,0.789474,0.162162
8,1.6i (75 Hp) Automatic,Internal Combustion engine,Hatchback,Petrol (Gasoline),,4158 mm,1663 mm,1360 mm,,1400 mm,,,Steering rack and pinion,5.0,,Inline,46.9 Hp/l,,79 mm,The Internal combustion engine (ICE) drives th...,Drum,ABS (Anti-lock braking system),Wishbone,,3.0,,"Front, Transverse",990 kg,,,,3.0,50 l,,81.5 mm,75 Hp @ 5600 rpm.,2.0,,,,8.6,OHC,"13.2 kg/Hp, 75.8 Hp/tonne",,2520 mm,Disc,,,,,Naturally aspirated engine,,Mono-point injection,,,122 Nm @ 2800 rpm.,,,,4.0,,,Front wheel drive,,,,Helical spring,,1406 mm,175/70 R13,,1598 cm3,,,1986.0,9.0,Daewoo Racer,0.004245,0.000212,0.526316,0.27027
9,1.5 i (75 Hp),Internal Combustion engine,Sedan,Petrol (Gasoline),1460 kg,4482 mm,1662 mm,1394 mm,5.0,1400 mm,,,Steering rack and pinion,5.0,,Inline,50.1 Hp/l,,76.5 mm,The Internal combustion engine (ICE) drives th...,Drum,,Wishbone,,4.0,,"Front, Transverse",969 kg,,,,,50 l,,81.5 mm,75 Hp @ 5400 rpm.,2.0,,,5.1 l/100 km,9.5,OHC,"12.9 kg/Hp, 77.4 Hp/tonne",,2520 mm,Ventilated discs,,,,R13,Naturally aspirated engine,,Mono-point injection,,,123 Nm @ 3200 rpm.,,491 kg,530 l,4.0,,6.3 l/100 km,Front wheel drive,,,,Suspension with traction connecting levers,163 km/h,1406 mm,175/70 R13,12.5 sec,1498 cm3,9.3 l/100 km,,1993.0,2.0,Daewoo Racer,0.004245,0.000212,0.618421,0.081081


In [11]:
df = df_clean  # rebasing

# deleting modifications column
df = df.drop('Modification', axis=1, errors='ignore')


# ===========================================================================
# 2. Powertrain architecture → simplifying and adding indicators
df['is_ev']        = df['Powertrain Architecture'].str.contains('Electric Vehicle|FCEV', case=False, na=False).astype(int)
df['is_phev']      = df['Powertrain Architecture'].str.contains('PHEV', case=False, na=False).astype(int)
df['is_mhev_fhev'] = df['Powertrain Architecture'].str.contains('MHEV|FHEV', case=False, na=False).astype(int)
df['is_ice']       = (df['Powertrain Architecture'].str.contains('Internal Combustion', case=False, na=False) | 
                      df['Powertrain Architecture'].isna()).astype(int)

# Lihtne kategooria stratifitseerimiseks vms
df['Powertrain_simple'] = df['Powertrain Architecture'].map({
    'Internal Combustion engine':                                      'ICE',
    'MHEV (Mild Hybrid Electric Vehicle, Battery-assisted Hybrid Vehicle)': 'Mild_Hybrid',
    'PHEV (Plug-in Hybrid Electric Vehicle, Plug-in Hybrid)':         'Plug-in_Hybrid',
    'FHEV (Full Hybrid Electric Vehicle)':                           'Full_Hybrid',
    'Electric Vehicle':                                               'EV',
    'FCEV (Fuel Cell Electric Vehicle)':                              'FCEV'
}).fillna('ICE')


# ===========================================================================
# 3. Body type → simplifying and one-hot
top_bodies = ['Sedan', 'Hatchback', 'Station wagon', 'SUV', 'Coupe',
              'Convertible', 'MPV', 'Pick-up', 'Cabriolet', 'Roadster', 'Limousine']

df['Body_simple'] = (df['Body type']
                     .str.split(',')
                     .str[0]
                     .str.split('(')
                     .str[0]
                     .str.strip())

df['Body_simple'] = df['Body_simple'].where(df['Body_simple'].isin(top_bodies), 'Other')

# One-hot encoding
df = pd.get_dummies(df, columns=['Body_simple'], prefix='Body')


# ===========================================================================
# 4.Fuel type → simplifying and adding binary indicators
df['Fuel_simple'] = df['Fuel Type'].replace({
    'Petrol (Gasoline)':           'Petrol',
    'Diesel':                      'Diesel',
    'petrol / electricity':        'Hybrid',
    'Electricity':                 'EV',
    'Petrol / CNG':                'CNG/LPG',
    'Petrol / LPG':                'CNG/LPG',
    'Petrol / Ethanol - E85':      'Flex_Fuel',
    'diesel / electricity':        'Hybrid',
    'Hydrogen':                    'Other',
    'LPG':                         'CNG/LPG',
    'Mixture of two stroke engine':'Other',
    'CNG':                         'CNG/LPG'
}).fillna('Petrol')

df['is_diesel']     = (df['Fuel_simple'] == 'Diesel').astype(int)
df['is_petrol']     = (df['Fuel_simple'] == 'Petrol').astype(int)
df['is_hybrid_ev']  = df['Fuel_simple'].isin(['Hybrid', 'EV']).astype(int)
df['is_lpg_cng']    = (df['Fuel_simple'] == 'CNG/LPG').astype(int)


# ===========================================================================
# 5. Normalizing binary columns → int (0/1) – Variant A
# pd.get_dummies annab alates pandas 1.5 vaikimisi bool tüüpi → muudame int-iks
df[df.select_dtypes('bool').columns] = df.select_dtypes('bool').astype(int)


print("Binary indicators (result = how many 1-s):")
new_cols = ['is_ev','is_phev','is_mhev_fhev','is_ice',
            'is_diesel','is_petrol','is_hybrid_ev','is_lpg_cng']
print(df[new_cols].sum())

print("\nBody one-hot tulbad:")
print([col for col in df.columns if col.startswith('Body_')])

print(f"\nNow columns in total: {df.shape[1]}")
print(f"shape: {df.shape}")

Binary indicators (result = how many 1-s):
is_ev            1330
is_phev           281
is_mhev_fhev      797
is_ice          36365
is_diesel        9585
is_petrol       27251
is_hybrid_ev      535
is_lpg_cng        204
dtype: int64

Body one-hot tulbad:
['Body_Cabriolet', 'Body_Coupe', 'Body_Hatchback', 'Body_MPV', 'Body_Other', 'Body_Pick-up', 'Body_Roadster', 'Body_SUV', 'Body_Sedan', 'Body_Station wagon']

Now columns in total: 100
shape: (37695, 100)


In [12]:
pd.set_option('display.max_columns', None)    # näitab KÕIKI veerge
pd.set_option('display.max_rows', 10)         # näitab täpselt 10 rida (pool enne ja pool pärast)
pd.set_option('display.width', None)          # automaatne laius
pd.set_option('display.expand_frame_repr', False)  # ei murragi ridasid
pd.set_option('display.max_colwidth', 50)     # veergude sisu ei lõigata liiga lühikeseks

df.head(10)

Unnamed: 0,Powertrain Architecture,Body type,Fuel Type,Max. weight,Length,Width,Height,Number of Gears (manual transmission),Front track,Emission standard,Model,Steering type,Seats,Battery capacity,Position of cylinders,Power per litre,Max. roof load,Cylinder Bore,Drivetrain Architecture,Rear brakes,Assisting systems,Front suspension,100 km/h - 0,Doors,Maximum engine speed,Engine location,Kerb Weight,Front overhang,Ramp angle,Engine systems,Number of Gears (automatic transmission),Fuel tank capacity,Acceleration,Piston Stroke,Power,Number of valves per cylinder,Climb angle,Drag coefficient (Cd),Fuel consumption - extra urban,Compression ratio,Valvetrain,Weight-to-power ratio,Trunk (boot) space - maximum,Wheelbase,Front brakes,Coolant,Engine oil capacity,CO2 emissions,Wheel rims size,Engine aspiration,Departure angle,Fuel System,Permitted towbar download,Approach angle,Torque,Permitted trailer load without brakes,Max load,Trunk (boot) space - minimum,Number of cylinders,Permitted trailer load with brakes (12%),Fuel consumption - combined,Drive wheel,Ride height,Minimum turning circle (turning diameter),Permitted trailer load with brakes (8%),Rear suspension,Maximum speed,Rear (Back) track,Tires size,Acceleration 0 - 100 km/h (0 - 62 mph),Engine displacement,Fuel consumption - urban,Rear overhang,Year,Production_years,Brand_Model,Brand_freq,Brand_Model_freq,Year_norm,Production_years_norm,is_ev,is_phev,is_mhev_fhev,is_ice,Powertrain_simple,Body_Cabriolet,Body_Coupe,Body_Hatchback,Body_MPV,Body_Other,Body_Pick-up,Body_Roadster,Body_SUV,Body_Sedan,Body_Station wagon,Fuel_simple,is_diesel,is_petrol,is_hybrid_ev,is_lpg_cng
0,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,95 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1968.0,2.0,Alpine A110,0.000557,0.000186,0.289474,0.081081,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0
1,Internal Combustion engine,Station wagon (estate),Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,54 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1972.0,3.0,DAF 66,0.000186,0.000186,0.342105,0.108108,0,0,0,1,ICE,0,0,0,0,0,0,0,0,0,1,Petrol,0,1,0,0
2,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,90 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1966.0,4.0,Alpine A110,0.000557,0.000186,0.263158,0.135135,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0
3,Internal Combustion engine,Station wagon (estate),Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,57 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1973.0,2.0,DAF 66,0.000186,0.000186,0.355263,0.081081,0,0,0,1,ICE,0,0,0,0,0,0,0,0,0,1,Petrol,0,1,0,0
4,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,110 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1968.0,2.0,Alpine A110,0.000557,0.000186,0.289474,0.081081,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0
5,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,70 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1966.0,4.0,Alpine A110,0.000557,0.000186,0.263158,0.135135,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0
6,Internal Combustion engine,Coupe,Petrol (Gasoline),,4660 mm,1890 mm,1340 mm,5.0,1590 mm,,,Steering rack and pinion,4.0,,V-engine,70.9 Hp/l,,90.2 mm,The Internal combustion engine (ICE) drives th...,Ventilated discs,ABS (Anti-lock braking system),Coil spring,,3.0,,,1615 kg,,,,,70 l,,90 mm,326 Hp @ 5800 rpm.,4.0,,,10.5 l/100 km,9.85,DOHC,"5 kg/Hp, 201.9 Hp/tonne",300 l,2720 mm,Ventilated discs,,,,,Naturally aspirated engine,,Multi-point indirect injection,,,427 Nm @ 4600 rpm.,,,185 l,8.0,,,Rear wheel drive,,,,Helical spring,250 km/h,1570 mm,255/55 R18,6.1 sec,4601 cm3,15.1 l/100 km,,1998.0,2.0,AC Aceca,0.000186,5.3e-05,0.684211,0.081081,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0
7,Internal Combustion engine,Sedan,Petrol (Gasoline),,4805 mm,1810 mm,1450 mm,,1550 mm,,,Steering rack and pinion,5.0,,Inline,71.3 Hp/l,,,The Internal combustion engine (ICE) drives th...,Disc,ABS (Anti-lock braking system),Spring Strut,,4.0,,"Front, Transverse",,,,,5.0,65 l,,,142 Hp @ 5800 rpm.,4.0,,,,10.0,,,,2700 mm,Ventilated discs,,,,6J x 15,Naturally aspirated engine,,Multi-point indirect injection,,,187 Nm @ 3800 rpm.,,,480 l,6.0,,,Front wheel drive,,,,Helical spring,,1545 mm,205/65 R15,,1992 cm3,,,2006.0,5.0,Daewoo Tosca,0.004245,0.000106,0.789474,0.162162,0,0,0,1,ICE,0,0,0,0,0,0,0,0,1,0,Petrol,0,1,0,0
8,Internal Combustion engine,Hatchback,Petrol (Gasoline),,4158 mm,1663 mm,1360 mm,,1400 mm,,,Steering rack and pinion,5.0,,Inline,46.9 Hp/l,,79 mm,The Internal combustion engine (ICE) drives th...,Drum,ABS (Anti-lock braking system),Wishbone,,3.0,,"Front, Transverse",990 kg,,,,3.0,50 l,,81.5 mm,75 Hp @ 5600 rpm.,2.0,,,,8.6,OHC,"13.2 kg/Hp, 75.8 Hp/tonne",,2520 mm,Disc,,,,,Naturally aspirated engine,,Mono-point injection,,,122 Nm @ 2800 rpm.,,,,4.0,,,Front wheel drive,,,,Helical spring,,1406 mm,175/70 R13,,1598 cm3,,,1986.0,9.0,Daewoo Racer,0.004245,0.000212,0.526316,0.27027,0,0,0,1,ICE,0,0,1,0,0,0,0,0,0,0,Petrol,0,1,0,0
9,Internal Combustion engine,Sedan,Petrol (Gasoline),1460 kg,4482 mm,1662 mm,1394 mm,5.0,1400 mm,,,Steering rack and pinion,5.0,,Inline,50.1 Hp/l,,76.5 mm,The Internal combustion engine (ICE) drives th...,Drum,,Wishbone,,4.0,,"Front, Transverse",969 kg,,,,,50 l,,81.5 mm,75 Hp @ 5400 rpm.,2.0,,,5.1 l/100 km,9.5,OHC,"12.9 kg/Hp, 77.4 Hp/tonne",,2520 mm,Ventilated discs,,,,R13,Naturally aspirated engine,,Mono-point injection,,,123 Nm @ 3200 rpm.,,491 kg,530 l,4.0,,6.3 l/100 km,Front wheel drive,,,,Suspension with traction connecting levers,163 km/h,1406 mm,175/70 R13,12.5 sec,1498 cm3,9.3 l/100 km,,1993.0,2.0,Daewoo Racer,0.004245,0.000212,0.618421,0.081081,0,0,0,1,ICE,0,0,0,0,0,0,0,0,1,0,Petrol,0,1,0,0


In [13]:
from sklearn.preprocessing import StandardScaler

# columns to be worked on:
dim_cols = ['Length', 'Width', 'Height', 'Max. weight', 'Front track']
gear_col = 'Number of Gears (manual transmission)'

# ===========================================================================
# 1. cleaning original columns – mm/kg values
for col in dim_cols:
    if col in df.columns:
        cleaned = df[col].astype(str).str.replace(r'\D+', '', regex=True)
        cleaned = pd.to_numeric(cleaned, errors='coerce')
        df[col] = cleaned.astype('Int64')  # ilus täisarv + NaN tugi

# ===========================================================================
# 2. gears – in series
if gear_col in df.columns:
    # Võtame ainult esimese numbri (nt "6 iMT" → 6)
    gears_num = df[gear_col].astype(str).str.extract(r'(\d+)')[0]  # [0] = võtame Seriesi
    gears_num = pd.to_numeric(gears_num, errors='coerce')
    
    # Nüüd between() töötab ideaalselt
    df['gears_manual'] = gears_num.where(gears_num.between(3, 8), np.nan).astype('Int64')
    
    df = df.drop(gear_col, axis=1)  # vana tulp ära


# ===========================================================================
# 3. Creating normalized versions for z-model
cols_for_model = ['Length', 'Width', 'Height', 'Max. weight', 'Front track', 'gears_manual']
cols_for_model = [c for c in cols_for_model if c in df.columns]

if cols_for_model:  # if there is something to normalize
    scaler = StandardScaler()
    normalized = scaler.fit_transform(df[cols_for_model])
    
    for i, col in enumerate(cols_for_model):
        df[f'{col}_z'] = normalized[:, i]


print("Cleaned human readable:")
print(df[['Length', 'Width', 'Height', 'Max. weight', 'gears_manual']].head(10))

print("\nStatistics for normalized model columns:")
z_cols = [c for c in df.columns if c.endswith('_z')]
for col in z_cols:
    clean_name = col.replace('_z', '')
    missing = df[col].isna().sum()
    print(f"  → {col:20} | NaN: {missing:5} | mean ≈ {df[col].mean():.4f} | std ≈ {df[col].std():.4f}")

print(f"\nCreated normalized columns in total: {len(z_cols)}")

Cleaned human readable:
   Length  Width  Height  Max. weight  gears_manual
0    <NA>   <NA>    <NA>         <NA>          <NA>
1    <NA>   <NA>    <NA>         <NA>          <NA>
2    <NA>   <NA>    <NA>         <NA>          <NA>
3    <NA>   <NA>    <NA>         <NA>          <NA>
4    <NA>   <NA>    <NA>         <NA>          <NA>
5    <NA>   <NA>    <NA>         <NA>          <NA>
6    4660   1890    1340         <NA>             5
7    4805   1810    1450         <NA>          <NA>
8    4158   1663    1360         <NA>          <NA>
9    4482   1662    1394         1460             5

Statistics for normalized model columns:
  → Length_z             | NaN:  3171 | mean ≈ -0.0000 | std ≈ 1.0000
  → Width_z              | NaN:  3372 | mean ≈ 0.0000 | std ≈ 1.0000
  → Height_z             | NaN:  3391 | mean ≈ -0.0000 | std ≈ 1.0000
  → Max. weight_z        | NaN: 10892 | mean ≈ 0.0000 | std ≈ 1.0000
  → Front track_z        | NaN:  6574 | mean ≈ -0.0000 | std ≈ 1.0000
  → gears_manu

In [14]:
pd.set_option('display.max_columns', None)    # näitab KÕIKI veerge
pd.set_option('display.max_rows', 10)         # näitab täpselt 10 rida (pool enne ja pool pärast)
pd.set_option('display.width', None)          # automaatne laius
pd.set_option('display.expand_frame_repr', False)  # ei murragi ridasid
pd.set_option('display.max_colwidth', 50)     # veergude sisu ei lõigata liiga lühikeseks

df.head(10)

Unnamed: 0,Powertrain Architecture,Body type,Fuel Type,Max. weight,Length,Width,Height,Front track,Emission standard,Model,Steering type,Seats,Battery capacity,Position of cylinders,Power per litre,Max. roof load,Cylinder Bore,Drivetrain Architecture,Rear brakes,Assisting systems,Front suspension,100 km/h - 0,Doors,Maximum engine speed,Engine location,Kerb Weight,Front overhang,Ramp angle,Engine systems,Number of Gears (automatic transmission),Fuel tank capacity,Acceleration,Piston Stroke,Power,Number of valves per cylinder,Climb angle,Drag coefficient (Cd),Fuel consumption - extra urban,Compression ratio,Valvetrain,Weight-to-power ratio,Trunk (boot) space - maximum,Wheelbase,Front brakes,Coolant,Engine oil capacity,CO2 emissions,Wheel rims size,Engine aspiration,Departure angle,Fuel System,Permitted towbar download,Approach angle,Torque,Permitted trailer load without brakes,Max load,Trunk (boot) space - minimum,Number of cylinders,Permitted trailer load with brakes (12%),Fuel consumption - combined,Drive wheel,Ride height,Minimum turning circle (turning diameter),Permitted trailer load with brakes (8%),Rear suspension,Maximum speed,Rear (Back) track,Tires size,Acceleration 0 - 100 km/h (0 - 62 mph),Engine displacement,Fuel consumption - urban,Rear overhang,Year,Production_years,Brand_Model,Brand_freq,Brand_Model_freq,Year_norm,Production_years_norm,is_ev,is_phev,is_mhev_fhev,is_ice,Powertrain_simple,Body_Cabriolet,Body_Coupe,Body_Hatchback,Body_MPV,Body_Other,Body_Pick-up,Body_Roadster,Body_SUV,Body_Sedan,Body_Station wagon,Fuel_simple,is_diesel,is_petrol,is_hybrid_ev,is_lpg_cng,gears_manual,Length_z,Width_z,Height_z,Max. weight_z,Front track_z,gears_manual_z
0,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,95 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1968.0,2.0,Alpine A110,0.000557,0.000186,0.289474,0.081081,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,
1,Internal Combustion engine,Station wagon (estate),Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,54 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1972.0,3.0,DAF 66,0.000186,0.000186,0.342105,0.108108,0,0,0,1,ICE,0,0,0,0,0,0,0,0,0,1,Petrol,0,1,0,0,,,,,,,
2,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,90 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1966.0,4.0,Alpine A110,0.000557,0.000186,0.263158,0.135135,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,
3,Internal Combustion engine,Station wagon (estate),Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,57 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1973.0,2.0,DAF 66,0.000186,0.000186,0.355263,0.081081,0,0,0,1,ICE,0,0,0,0,0,0,0,0,0,1,Petrol,0,1,0,0,,,,,,,
4,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,110 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1968.0,2.0,Alpine A110,0.000557,0.000186,0.289474,0.081081,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,
5,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,70 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1966.0,4.0,Alpine A110,0.000557,0.000186,0.263158,0.135135,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,
6,Internal Combustion engine,Coupe,Petrol (Gasoline),,4660.0,1890.0,1340.0,1590.0,,,Steering rack and pinion,4.0,,V-engine,70.9 Hp/l,,90.2 mm,The Internal combustion engine (ICE) drives th...,Ventilated discs,ABS (Anti-lock braking system),Coil spring,,3.0,,,1615 kg,,,,,70 l,,90 mm,326 Hp @ 5800 rpm.,4.0,,,10.5 l/100 km,9.85,DOHC,"5 kg/Hp, 201.9 Hp/tonne",300 l,2720 mm,Ventilated discs,,,,,Naturally aspirated engine,,Multi-point indirect injection,,,427 Nm @ 4600 rpm.,,,185 l,8.0,,,Rear wheel drive,,,,Helical spring,250 km/h,1570 mm,255/55 R18,6.1 sec,4601 cm3,15.1 l/100 km,,1998.0,2.0,AC Aceca,0.000186,5.3e-05,0.684211,0.081081,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,5.0,-0.125848,-0.075452,-0.199609,,-0.246883,-0.522185
7,Internal Combustion engine,Sedan,Petrol (Gasoline),,4805.0,1810.0,1450.0,1550.0,,,Steering rack and pinion,5.0,,Inline,71.3 Hp/l,,,The Internal combustion engine (ICE) drives th...,Disc,ABS (Anti-lock braking system),Spring Strut,,4.0,,"Front, Transverse",,,,,5.0,65 l,,,142 Hp @ 5800 rpm.,4.0,,,,10.0,,,,2700 mm,Ventilated discs,,,,6J x 15,Naturally aspirated engine,,Multi-point indirect injection,,,187 Nm @ 3800 rpm.,,,480 l,6.0,,,Front wheel drive,,,,Helical spring,,1545 mm,205/65 R15,,1992 cm3,,,2006.0,5.0,Daewoo Tosca,0.004245,0.000106,0.789474,0.162162,0,0,0,1,ICE,0,0,0,0,0,0,0,0,1,0,Petrol,0,1,0,0,,-0.125825,-0.075512,-0.199575,,-0.246894,
8,Internal Combustion engine,Hatchback,Petrol (Gasoline),,4158.0,1663.0,1360.0,1400.0,,,Steering rack and pinion,5.0,,Inline,46.9 Hp/l,,79 mm,The Internal combustion engine (ICE) drives th...,Drum,ABS (Anti-lock braking system),Wishbone,,3.0,,"Front, Transverse",990 kg,,,,3.0,50 l,,81.5 mm,75 Hp @ 5600 rpm.,2.0,,,,8.6,OHC,"13.2 kg/Hp, 75.8 Hp/tonne",,2520 mm,Disc,,,,,Naturally aspirated engine,,Mono-point injection,,,122 Nm @ 2800 rpm.,,,,4.0,,,Front wheel drive,,,,Helical spring,,1406 mm,175/70 R13,,1598 cm3,,,1986.0,9.0,Daewoo Racer,0.004245,0.000212,0.526316,0.27027,0,0,0,1,ICE,0,0,1,0,0,0,0,0,0,0,Petrol,0,1,0,0,,-0.125927,-0.075623,-0.199603,,-0.246935,
9,Internal Combustion engine,Sedan,Petrol (Gasoline),1460.0,4482.0,1662.0,1394.0,1400.0,,,Steering rack and pinion,5.0,,Inline,50.1 Hp/l,,76.5 mm,The Internal combustion engine (ICE) drives th...,Drum,,Wishbone,,4.0,,"Front, Transverse",969 kg,,,,,50 l,,81.5 mm,75 Hp @ 5400 rpm.,2.0,,,5.1 l/100 km,9.5,OHC,"12.9 kg/Hp, 77.4 Hp/tonne",,2520 mm,Ventilated discs,,,,R13,Naturally aspirated engine,,Mono-point injection,,,123 Nm @ 3200 rpm.,,491 kg,530 l,4.0,,6.3 l/100 km,Front wheel drive,,,,Suspension with traction connecting levers,163 km/h,1406 mm,175/70 R13,12.5 sec,1498 cm3,9.3 l/100 km,,1993.0,2.0,Daewoo Racer,0.004245,0.000212,0.618421,0.081081,0,0,0,1,ICE,0,0,0,0,0,0,0,0,1,0,Petrol,0,1,0,0,5.0,-0.125876,-0.075624,-0.199592,-0.084383,-0.246935,-0.522185


In [15]:
# more normalizing
from sklearn.preprocessing import StandardScaler

# ------------------------------------------------------------------
# 1. Emission standard → ordered categories (more effective than one-hot)
emission_order = {
    'Euro 1': 1, 'Euro 2': 2, 'Euro 3': 3, 'Euro 4': 4,
    'Euro 5': 5, 'Euro 5a': 5.1, 'Euro 5b': 5.2,
    'Euro 6': 6, 'Euro 6d-TEMP': 6.5, 'Euro 6d': 6.7,
    'Euro 6c': 6.3, 'Euro 6b': 6.2, 'Euro 6a': 6.1
}

if 'Emission standard' in df.columns:
    df['emission_level'] = df['Emission standard'].map(emission_order)
    # Normalizing
    scaler_em = StandardScaler()
    df['emission_level_z'] = scaler_em.fit_transform(df[['emission_level']])
    df['Emission standard'] = df['Emission standard'].astype('category')
    

# ------------------------------------------------------------------
# 2. Steering type → one-hot (4 values)
if 'Steering type' in df.columns:
    df = pd.get_dummies(df, columns=['Steering type'], prefix='steering', dummy_na=False)


# ------------------------------------------------------------------
# 3. Seats → cleaning + normalizing
if 'Seats' in df.columns:
    df['Seats'] = pd.to_numeric(df['Seats'], errors='coerce').astype('Int64')
    scaler_s = StandardScaler()
    df['Seats_z'] = scaler_s.fit_transform(df[['Seats']])


# ------------------------------------------------------------------
# 4. Position of cylinders → one-hot (8 values)
if 'Position of cylinders' in df.columns:
    df = pd.get_dummies(df, columns=['Position of cylinders'], prefix='cyl', dummy_na=False)


# ------------------------------------------------------------------
# 5. Power per litre → cleaning + normalized
if 'Power per litre' in df.columns:
    df['Power_per_litre'] = df['Power per litre'].astype(str).str.extract(r'(\d+\.?\d*)').astype(float)
    scaler_ppl = StandardScaler()
    df['Power_per_litre_z'] = scaler_ppl.fit_transform(df[['Power_per_litre']])


# ------------------------------------------------------------------
# 6. Cylinder Bore → mm → numeric + normalized
if 'Cylinder Bore' in df.columns:
    df['Cylinder_Bore_mm'] = df['Cylinder Bore'].astype(str).str.replace(r'\D+', '', regex=True)
    df['Cylinder_Bore_mm'] = pd.to_numeric(df['Cylinder_Bore_mm'], errors='coerce').astype('Int64')
    scaler_bore = StandardScaler()
    df['Cylinder_Bore_z'] = scaler_bore.fit_transform(df[['Cylinder_Bore_mm']])


# ------------------------------------------------------------------
# 7. Max. roof load → kg → numeric + normalized
if 'Max. roof load' in df.columns:
    df['Max_roof_load_kg'] = df['Max. roof load'].astype(str).str.replace(r'\D+', '', regex=True)
    df['Max_roof_load_kg'] = pd.to_numeric(df['Max_roof_load_kg'], errors='coerce').astype('Int64')
    scaler_roof = StandardScaler()
    df['Max_roof_load_z'] = scaler_roof.fit_transform(df[['Max_roof_load_kg']])


# ------------------------------------------------------------------
# 8. Battery capacity → kWh → numeric + normalized
if 'Battery capacity' in df.columns:
    df['Battery_kWh'] = df['Battery capacity'].astype(str).str.extract(r'(\d+\.?\d*)').astype(float)
    scaler_bat = StandardScaler()
    df['Battery_kWh_z'] = scaler_bat.fit_transform(df[['Battery_kWh']])


# ------------------------------------------------------------------
# 9. Model (engine code (K9K, B47D20 etc) → high cardinality → Target Encoding later or drop
# mostly throwing away or doing target encoding
if 'Model' in df.columns:
    df = df.drop('Model', axis=1)  # 4220 unique – to many for one-hot encoding



new_numeric_z = [c for c in df.columns if c.endswith('_z')]
new_cat_cols  = [c for c in df.columns if c.startswith(('steering_', 'cyl_'))]

print("Normalized numeric attributes (_z):")
for c in new_numeric_z:
    print(f"   → {c:22} | NaN: {df[c].isna().sum():5} | mean ≈ {df[c].mean():.3f}")

print(f"\nOne-hot categories: {len(new_cat_cols)} columns")
print(f"Emission standard → emission_level_z (ordered)")
print(f"Model (engine code) was deleted beacuse too many unique values)")

print(f"\nKokku tulpi nüüd: {df.shape[1]}")

Normalized numeric attributes (_z):
   → Length_z               | NaN:  3171 | mean ≈ -0.000
   → Width_z                | NaN:  3372 | mean ≈ 0.000
   → Height_z               | NaN:  3391 | mean ≈ -0.000
   → Max. weight_z          | NaN: 10892 | mean ≈ 0.000
   → Front track_z          | NaN:  6574 | mean ≈ -0.000
   → gears_manual_z         | NaN: 20236 | mean ≈ 0.000
   → emission_level_z       | NaN: 23607 | mean ≈ 0.000
   → Seats_z                | NaN:  3747 | mean ≈ 0.000
   → Power_per_litre_z      | NaN:  3545 | mean ≈ -0.000
   → Cylinder_Bore_z        | NaN:  9576 | mean ≈ 0.000
   → Max_roof_load_z        | NaN: 29771 | mean ≈ 0.000
   → Battery_kWh_z          | NaN: 37072 | mean ≈ -0.000

One-hot categories: 10 columns
Emission standard → emission_level_z (ordered)
Model (engine code) was deleted beacuse too many unique values)

Kokku tulpi nüüd: 124


In [16]:
pd.set_option('display.max_columns', None)    # näitab KÕIKI veerge
pd.set_option('display.max_rows', 10)         # näitab täpselt 10 rida (pool enne ja pool pärast)
pd.set_option('display.width', None)          # automaatne laius
pd.set_option('display.expand_frame_repr', False)  # ei murragi ridasid
pd.set_option('display.max_colwidth', 50)     # veergude sisu ei lõigata liiga lühikeseks

df.head(10)

Unnamed: 0,Powertrain Architecture,Body type,Fuel Type,Max. weight,Length,Width,Height,Front track,Emission standard,Seats,Battery capacity,Power per litre,Max. roof load,Cylinder Bore,Drivetrain Architecture,Rear brakes,Assisting systems,Front suspension,100 km/h - 0,Doors,Maximum engine speed,Engine location,Kerb Weight,Front overhang,Ramp angle,Engine systems,Number of Gears (automatic transmission),Fuel tank capacity,Acceleration,Piston Stroke,Power,Number of valves per cylinder,Climb angle,Drag coefficient (Cd),Fuel consumption - extra urban,Compression ratio,Valvetrain,Weight-to-power ratio,Trunk (boot) space - maximum,Wheelbase,Front brakes,Coolant,Engine oil capacity,CO2 emissions,Wheel rims size,Engine aspiration,Departure angle,Fuel System,Permitted towbar download,Approach angle,Torque,Permitted trailer load without brakes,Max load,Trunk (boot) space - minimum,Number of cylinders,Permitted trailer load with brakes (12%),Fuel consumption - combined,Drive wheel,Ride height,Minimum turning circle (turning diameter),Permitted trailer load with brakes (8%),Rear suspension,Maximum speed,Rear (Back) track,Tires size,Acceleration 0 - 100 km/h (0 - 62 mph),Engine displacement,Fuel consumption - urban,Rear overhang,Year,Production_years,Brand_Model,Brand_freq,Brand_Model_freq,Year_norm,Production_years_norm,is_ev,is_phev,is_mhev_fhev,is_ice,Powertrain_simple,Body_Cabriolet,Body_Coupe,Body_Hatchback,Body_MPV,Body_Other,Body_Pick-up,Body_Roadster,Body_SUV,Body_Sedan,Body_Station wagon,Fuel_simple,is_diesel,is_petrol,is_hybrid_ev,is_lpg_cng,gears_manual,Length_z,Width_z,Height_z,Max. weight_z,Front track_z,gears_manual_z,emission_level,emission_level_z,steering_Cone worm with recirculation balls,steering_Steering rack and pinion,steering_Worm-reduction unit,Seats_z,cyl_180° flat V-engine,cyl_Boxer,cyl_Inline,cyl_Rotary (Wankel),cyl_V-engine,cyl_VR-engine,cyl_W-engine,Power_per_litre,Power_per_litre_z,Cylinder_Bore_mm,Cylinder_Bore_z,Max_roof_load_kg,Max_roof_load_z,Battery_kWh,Battery_kWh_z
0,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,95 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1968.0,2.0,Alpine A110,0.000557,0.000186,0.289474,0.081081,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,
1,Internal Combustion engine,Station wagon (estate),Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,54 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1972.0,3.0,DAF 66,0.000186,0.000186,0.342105,0.108108,0,0,0,1,ICE,0,0,0,0,0,0,0,0,0,1,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,
2,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,90 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1966.0,4.0,Alpine A110,0.000557,0.000186,0.263158,0.135135,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,
3,Internal Combustion engine,Station wagon (estate),Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,57 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1973.0,2.0,DAF 66,0.000186,0.000186,0.355263,0.081081,0,0,0,1,ICE,0,0,0,0,0,0,0,0,0,1,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,
4,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,110 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1968.0,2.0,Alpine A110,0.000557,0.000186,0.289474,0.081081,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,
5,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,,,,,,70 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1966.0,4.0,Alpine A110,0.000557,0.000186,0.263158,0.135135,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,
6,Internal Combustion engine,Coupe,Petrol (Gasoline),,4660.0,1890.0,1340.0,1590.0,,4.0,,70.9 Hp/l,,90.2 mm,The Internal combustion engine (ICE) drives th...,Ventilated discs,ABS (Anti-lock braking system),Coil spring,,3.0,,,1615 kg,,,,,70 l,,90 mm,326 Hp @ 5800 rpm.,4.0,,,10.5 l/100 km,9.85,DOHC,"5 kg/Hp, 201.9 Hp/tonne",300 l,2720 mm,Ventilated discs,,,,,Naturally aspirated engine,,Multi-point indirect injection,,,427 Nm @ 4600 rpm.,,,185 l,8.0,,,Rear wheel drive,,,,Helical spring,250 km/h,1570 mm,255/55 R18,6.1 sec,4601 cm3,15.1 l/100 km,,1998.0,2.0,AC Aceca,0.000186,5.3e-05,0.684211,0.081081,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,5.0,-0.125848,-0.075452,-0.199609,,-0.246883,-0.522185,,,False,True,False,-0.970701,False,False,False,False,True,False,False,70.9,-0.237852,902.0,0.197687,,,,
7,Internal Combustion engine,Sedan,Petrol (Gasoline),,4805.0,1810.0,1450.0,1550.0,,5.0,,71.3 Hp/l,,,The Internal combustion engine (ICE) drives th...,Disc,ABS (Anti-lock braking system),Spring Strut,,4.0,,"Front, Transverse",,,,,5.0,65 l,,,142 Hp @ 5800 rpm.,4.0,,,,10.0,,,,2700 mm,Ventilated discs,,,,6J x 15,Naturally aspirated engine,,Multi-point indirect injection,,,187 Nm @ 3800 rpm.,,,480 l,6.0,,,Front wheel drive,,,,Helical spring,,1545 mm,205/65 R15,,1992 cm3,,,2006.0,5.0,Daewoo Tosca,0.004245,0.000106,0.789474,0.162162,0,0,0,1,ICE,0,0,0,0,0,0,0,0,1,0,Petrol,0,1,0,0,,-0.125825,-0.075512,-0.199575,,-0.246894,,,,False,True,False,0.104765,False,False,True,False,False,False,False,71.3,-0.221777,,,,,,
8,Internal Combustion engine,Hatchback,Petrol (Gasoline),,4158.0,1663.0,1360.0,1400.0,,5.0,,46.9 Hp/l,,79 mm,The Internal combustion engine (ICE) drives th...,Drum,ABS (Anti-lock braking system),Wishbone,,3.0,,"Front, Transverse",990 kg,,,,3.0,50 l,,81.5 mm,75 Hp @ 5600 rpm.,2.0,,,,8.6,OHC,"13.2 kg/Hp, 75.8 Hp/tonne",,2520 mm,Disc,,,,,Naturally aspirated engine,,Mono-point injection,,,122 Nm @ 2800 rpm.,,,,4.0,,,Front wheel drive,,,,Helical spring,,1406 mm,175/70 R13,,1598 cm3,,,1986.0,9.0,Daewoo Racer,0.004245,0.000212,0.526316,0.27027,0,0,0,1,ICE,0,0,1,0,0,0,0,0,0,0,Petrol,0,1,0,0,,-0.125927,-0.075623,-0.199603,,-0.246935,,,,False,True,False,0.104765,False,False,True,False,False,False,False,46.9,-1.202342,79.0,-0.305259,,,,
9,Internal Combustion engine,Sedan,Petrol (Gasoline),1460.0,4482.0,1662.0,1394.0,1400.0,,5.0,,50.1 Hp/l,,76.5 mm,The Internal combustion engine (ICE) drives th...,Drum,,Wishbone,,4.0,,"Front, Transverse",969 kg,,,,,50 l,,81.5 mm,75 Hp @ 5400 rpm.,2.0,,,5.1 l/100 km,9.5,OHC,"12.9 kg/Hp, 77.4 Hp/tonne",,2520 mm,Ventilated discs,,,,R13,Naturally aspirated engine,,Mono-point injection,,,123 Nm @ 3200 rpm.,,491 kg,530 l,4.0,,6.3 l/100 km,Front wheel drive,,,,Suspension with traction connecting levers,163 km/h,1406 mm,175/70 R13,12.5 sec,1498 cm3,9.3 l/100 km,,1993.0,2.0,Daewoo Racer,0.004245,0.000212,0.618421,0.081081,0,0,0,1,ICE,0,0,0,0,0,0,0,0,1,0,Petrol,0,1,0,0,5.0,-0.125876,-0.075624,-0.199592,-0.084383,-0.246935,-0.522185,,,False,True,False,0.104765,False,False,True,False,False,False,False,50.1,-1.073743,765.0,0.113965,,,,


In [17]:
from sklearn.preprocessing import StandardScaler

# ------------------------------------------------------------------
# 1. Drivetrain Architecture → FWD/RWD/AWD
if 'Drivetrain Architecture' in df.columns:
    text = df['Drivetrain Architecture'].astype(str).str.lower()
    df['Drivetrain_simple'] = np.select(
        [text.str.contains('front wheels', na=False),
         text.str.contains('rear wheels', na=False),
         text.str.contains('four wheels|all wheels|awd|4x4', na=False)],
        ['FWD', 'RWD', 'AWD'],
        default='Other'
    )
    df = pd.get_dummies(df, columns=['Drivetrain_simple'], prefix='drive', dummy_na=False)


# ------------------------------------------------------------------
# 2. Rear brakes → Disc / Drum / Other
if 'Rear brakes' in df.columns:
    df['rear_brake_type'] = df['Rear brakes'].astype(str)
    df['rear_brake_type'] = df['rear_brake_type'].str.replace(r'.*disc.*|.*ventilated.*', 'Disc', case=False, regex=True)
    df['rear_brake_type'] = df['rear_brake_type'].str.replace(r'.*drum.*', 'Drum', case=False, regex=True)
    df['rear_brake_type'] = df['rear_brake_type'].replace('nan', 'Unknown')
    df['rear_brake_type'] = df['rear_brake_type'].where(df['rear_brake_type'].isin(['Disc','Drum','Unknown']), 'Other')
    df = pd.get_dummies(df, columns=['rear_brake_type'], prefix='rear_brake', dummy_na=False)


# ------------------------------------------------------------------
# 3. Assisting systems → has_abs
if 'Assisting systems' in df.columns:
    df['has_abs'] = df['Assisting systems'].str.contains('ABS', case=False, na=False).astype(int)
    df.drop('Assisting systems', axis=1, inplace=True)


# ------------------------------------------------------------------
# 4. Front suspension → simplified
if 'Front suspension' in df.columns:
    df['front_susp'] = 'Other'
    cond_mcph = df['Front suspension'].str.contains('McPherson|MacPherson|Spring Strut', na=False, case=False)
    cond_dbl  = df['Front suspension'].str.contains('Double wishbone', na=False, case=False)
    cond_ml   = df['Front suspension'].str.contains('Multi-link|multi link', na=False, case=False)
    df.loc[cond_mcph, 'front_susp'] = 'McPherson'
    df.loc[cond_dbl,  'front_susp'] = 'Double_wishbone'
    df.loc[cond_ml,   'front_susp'] = 'Multi_link'
    df = pd.get_dummies(df, columns=['front_susp'], prefix='susp', dummy_na=False)


# ------------------------------------------------------------------
# 5. numeric columns – using regex
numeric_cleaning = {
    'Doors':                {'new': 'Doors_clean',       'pat': r'(\d)',           'type': 'Int64'},
    'Kerb Weight':          {'new': 'Kerb_Weight_kg',    'pat': r'(\d+)',          'type': 'Int64'},
    '100 km/h - 0':         {'new': 'Braking_100to0_m',  'pat': r'(\d+\.?\d*)',    'type': float},
    'Maximum engine speed': {'new': 'Max_RPM',          'pat': r'(\d+)',          'type': float},
    'Front overhang':       {'new': 'Front_overhang_mm','pat': r'(\d+)',          'type': 'Int64'},
    'Ramp angle':           {'new': 'Ramp_angle_deg',   'pat': r'(\d+\.?\d*)',    'type': float}
}

for old_col, cfg in numeric_cleaning.items():
    if old_col in df.columns:
        extracted = df[old_col].astype(str).str.extract(cfg['pat'])[0]
        df[cfg['new']] = pd.to_numeric(extracted, errors='coerce')
        if cfg['type'] == 'Int64':
            df[cfg['new']] = df[cfg['new']].astype('Int64')
        else:
            df[cfg['new']] = df[cfg['new']].astype(float)

        # normalizing
        scaler = StandardScaler()
        df[f"{cfg['new']}_z"] = scaler.fit_transform(df[[cfg['new']]])


# ------------------------------------------------------------------
# 6. Engine location → one-hot
if 'Engine location' in df.columns:
    df = pd.get_dummies(df, columns=['Engine location'], prefix='engine_loc', dummy_na=False)


# ------------------------------------------------------------------
# 7. Erasing only truly messy text-based columns
df.drop(columns=['Drivetrain Architecture', 'Rear brakes', 'Front suspension'], 
        inplace=True, errors='ignore')


z_cols = [c for c in df.columns if c.endswith('_z')]
print("Normalized numeric attributes (_z):")
for c in z_cols:
    print(f"  → {c}")

print(f"\nNormalized in total: {len(z_cols)}")
print(f"One-hot attributes in total: {len([c for c in df.columns if any(c.startswith(p) for p in ['drive_','rear_brake_','susp_','engine_loc_'])])}")

print(f"\nHuman readable:")
human = ['Kerb_Weight_kg','Doors_clean','Front_overhang_mm','Ramp_angle_deg','Braking_100to0_m','Max_RPM']
print([c for c in human if c in df.columns])

print(f"\nshape: {df.shape[0]:,} rows × {df.shape[1]} columns")

Normalized numeric attributes (_z):
  → Length_z
  → Width_z
  → Height_z
  → Max. weight_z
  → Front track_z
  → gears_manual_z
  → emission_level_z
  → Seats_z
  → Power_per_litre_z
  → Cylinder_Bore_z
  → Max_roof_load_z
  → Battery_kWh_z
  → Doors_clean_z
  → Kerb_Weight_kg_z
  → Braking_100to0_m_z
  → Max_RPM_z
  → Front_overhang_mm_z
  → Ramp_angle_deg_z

Normalized in total: 18
One-hot attributes in total: 18

Human readable:
['Kerb_Weight_kg', 'Doors_clean', 'Front_overhang_mm', 'Ramp_angle_deg', 'Braking_100to0_m', 'Max_RPM']

shape: 37,695 rows × 150 columns


In [18]:
pd.set_option('display.max_columns', None)    # näitab KÕIKI veerge
pd.set_option('display.max_rows', 10)         # näitab täpselt 10 rida (pool enne ja pool pärast)
pd.set_option('display.width', None)          # automaatne laius
pd.set_option('display.expand_frame_repr', False)  # ei murragi ridasid
pd.set_option('display.max_colwidth', 50)     # veergude sisu ei lõigata liiga lühikeseks

df.head(10)

Unnamed: 0,Powertrain Architecture,Body type,Fuel Type,Max. weight,Length,Width,Height,Front track,Emission standard,Seats,Battery capacity,Power per litre,Max. roof load,Cylinder Bore,100 km/h - 0,Doors,Maximum engine speed,Kerb Weight,Front overhang,Ramp angle,Engine systems,Number of Gears (automatic transmission),Fuel tank capacity,Acceleration,Piston Stroke,Power,Number of valves per cylinder,Climb angle,Drag coefficient (Cd),Fuel consumption - extra urban,Compression ratio,Valvetrain,Weight-to-power ratio,Trunk (boot) space - maximum,Wheelbase,Front brakes,Coolant,Engine oil capacity,CO2 emissions,Wheel rims size,Engine aspiration,Departure angle,Fuel System,Permitted towbar download,Approach angle,Torque,Permitted trailer load without brakes,Max load,Trunk (boot) space - minimum,Number of cylinders,Permitted trailer load with brakes (12%),Fuel consumption - combined,Drive wheel,Ride height,Minimum turning circle (turning diameter),Permitted trailer load with brakes (8%),Rear suspension,Maximum speed,Rear (Back) track,Tires size,Acceleration 0 - 100 km/h (0 - 62 mph),Engine displacement,Fuel consumption - urban,Rear overhang,Year,Production_years,Brand_Model,Brand_freq,Brand_Model_freq,Year_norm,Production_years_norm,is_ev,is_phev,is_mhev_fhev,is_ice,Powertrain_simple,Body_Cabriolet,Body_Coupe,Body_Hatchback,Body_MPV,Body_Other,Body_Pick-up,Body_Roadster,Body_SUV,Body_Sedan,Body_Station wagon,Fuel_simple,is_diesel,is_petrol,is_hybrid_ev,is_lpg_cng,gears_manual,Length_z,Width_z,Height_z,Max. weight_z,Front track_z,gears_manual_z,emission_level,emission_level_z,steering_Cone worm with recirculation balls,steering_Steering rack and pinion,steering_Worm-reduction unit,Seats_z,cyl_180° flat V-engine,cyl_Boxer,cyl_Inline,cyl_Rotary (Wankel),cyl_V-engine,cyl_VR-engine,cyl_W-engine,Power_per_litre,Power_per_litre_z,Cylinder_Bore_mm,Cylinder_Bore_z,Max_roof_load_kg,Max_roof_load_z,Battery_kWh,Battery_kWh_z,drive_AWD,drive_FWD,drive_Other,drive_RWD,rear_brake_Disc,rear_brake_Drum,rear_brake_Unknown,has_abs,susp_Double_wishbone,susp_McPherson,susp_Multi_link,susp_Other,Doors_clean,Doors_clean_z,Kerb_Weight_kg,Kerb_Weight_kg_z,Braking_100to0_m,Braking_100to0_m_z,Max_RPM,Max_RPM_z,Front_overhang_mm,Front_overhang_mm_z,Ramp_angle_deg,Ramp_angle_deg_z,"engine_loc_Front axle, Transverse","engine_loc_Front, Longitudinal","engine_loc_Front, Transverse","engine_loc_Middle, Longitudinal","engine_loc_Middle, Transverse","engine_loc_Rear, Longitudinal","engine_loc_Rear, Transverse"
0,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,95 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1968.0,2.0,Alpine A110,0.000557,0.000186,0.289474,0.081081,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False
1,Internal Combustion engine,Station wagon (estate),Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,54 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1972.0,3.0,DAF 66,0.000186,0.000186,0.342105,0.108108,0,0,0,1,ICE,0,0,0,0,0,0,0,0,0,1,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False
2,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,90 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1966.0,4.0,Alpine A110,0.000557,0.000186,0.263158,0.135135,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False
3,Internal Combustion engine,Station wagon (estate),Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,57 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1973.0,2.0,DAF 66,0.000186,0.000186,0.355263,0.081081,0,0,0,1,ICE,0,0,0,0,0,0,0,0,0,1,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False
4,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,110 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1968.0,2.0,Alpine A110,0.000557,0.000186,0.289474,0.081081,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False
5,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,,70 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1966.0,4.0,Alpine A110,0.000557,0.000186,0.263158,0.135135,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False
6,Internal Combustion engine,Coupe,Petrol (Gasoline),,4660.0,1890.0,1340.0,1590.0,,4.0,,70.9 Hp/l,,90.2 mm,,3.0,,1615 kg,,,,,70 l,,90 mm,326 Hp @ 5800 rpm.,4.0,,,10.5 l/100 km,9.85,DOHC,"5 kg/Hp, 201.9 Hp/tonne",300 l,2720 mm,Ventilated discs,,,,,Naturally aspirated engine,,Multi-point indirect injection,,,427 Nm @ 4600 rpm.,,,185 l,8.0,,,Rear wheel drive,,,,Helical spring,250 km/h,1570 mm,255/55 R18,6.1 sec,4601 cm3,15.1 l/100 km,,1998.0,2.0,AC Aceca,0.000186,5.3e-05,0.684211,0.081081,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,5.0,-0.125848,-0.075452,-0.199609,,-0.246883,-0.522185,,,False,True,False,-0.970701,False,False,False,False,True,False,False,70.9,-0.237852,902.0,0.197687,,,,,False,False,False,True,True,False,False,1,False,False,False,True,3.0,-0.995372,1615.0,0.372344,,,,,,,,,False,False,False,False,False,False,False
7,Internal Combustion engine,Sedan,Petrol (Gasoline),,4805.0,1810.0,1450.0,1550.0,,5.0,,71.3 Hp/l,,,,4.0,,,,,,5.0,65 l,,,142 Hp @ 5800 rpm.,4.0,,,,10.0,,,,2700 mm,Ventilated discs,,,,6J x 15,Naturally aspirated engine,,Multi-point indirect injection,,,187 Nm @ 3800 rpm.,,,480 l,6.0,,,Front wheel drive,,,,Helical spring,,1545 mm,205/65 R15,,1992 cm3,,,2006.0,5.0,Daewoo Tosca,0.004245,0.000106,0.789474,0.162162,0,0,0,1,ICE,0,0,0,0,0,0,0,0,1,0,Petrol,0,1,0,0,,-0.125825,-0.075512,-0.199575,,-0.246894,,,,False,True,False,0.104765,False,False,True,False,False,False,False,71.3,-0.221777,,,,,,,False,True,False,False,True,False,False,1,False,True,False,False,4.0,-0.075803,,,,,,,,,,,False,False,True,False,False,False,False
8,Internal Combustion engine,Hatchback,Petrol (Gasoline),,4158.0,1663.0,1360.0,1400.0,,5.0,,46.9 Hp/l,,79 mm,,3.0,,990 kg,,,,3.0,50 l,,81.5 mm,75 Hp @ 5600 rpm.,2.0,,,,8.6,OHC,"13.2 kg/Hp, 75.8 Hp/tonne",,2520 mm,Disc,,,,,Naturally aspirated engine,,Mono-point injection,,,122 Nm @ 2800 rpm.,,,,4.0,,,Front wheel drive,,,,Helical spring,,1406 mm,175/70 R13,,1598 cm3,,,1986.0,9.0,Daewoo Racer,0.004245,0.000212,0.526316,0.27027,0,0,0,1,ICE,0,0,1,0,0,0,0,0,0,0,Petrol,0,1,0,0,,-0.125927,-0.075623,-0.199603,,-0.246935,,,,False,True,False,0.104765,False,False,True,False,False,False,False,46.9,-1.202342,79.0,-0.305259,,,,,False,True,False,False,False,True,False,1,False,False,False,True,3.0,-0.995372,990.0,-1.334203,,,,,,,,,False,False,True,False,False,False,False
9,Internal Combustion engine,Sedan,Petrol (Gasoline),1460.0,4482.0,1662.0,1394.0,1400.0,,5.0,,50.1 Hp/l,,76.5 mm,,4.0,,969 kg,,,,,50 l,,81.5 mm,75 Hp @ 5400 rpm.,2.0,,,5.1 l/100 km,9.5,OHC,"12.9 kg/Hp, 77.4 Hp/tonne",,2520 mm,Ventilated discs,,,,R13,Naturally aspirated engine,,Mono-point injection,,,123 Nm @ 3200 rpm.,,491 kg,530 l,4.0,,6.3 l/100 km,Front wheel drive,,,,Suspension with traction connecting levers,163 km/h,1406 mm,175/70 R13,12.5 sec,1498 cm3,9.3 l/100 km,,1993.0,2.0,Daewoo Racer,0.004245,0.000212,0.618421,0.081081,0,0,0,1,ICE,0,0,0,0,0,0,0,0,1,0,Petrol,0,1,0,0,5.0,-0.125876,-0.075624,-0.199592,-0.084383,-0.246935,-0.522185,,,False,True,False,0.104765,False,False,True,False,False,False,False,50.1,-1.073743,765.0,0.113965,,,,,False,True,False,False,False,True,False,0,False,False,False,True,4.0,-0.075803,969.0,-1.391543,,,,,,,,,False,False,True,False,False,False,False


In [19]:
from sklearn.preprocessing import StandardScaler

# ------------------------------------------------------------------
# 1. Engine systems → 3 binary attributes (most wide-known)
if 'Engine systems' in df.columns:
    txt = df['Engine systems'].astype(str).str.lower()
    df['has_particulate_filter'] = txt.str.contains('particulate', na=False).astype(int)
    df['has_start_stop']         = txt.str.contains('start & stop|start/stop', na=False).astype(int)
    df['has_cylinder_deact']    = txt.str.contains('cylinder deactivation', na=False).astype(int)
    df.drop('Engine systems', axis=1, inplace=True)


# ------------------------------------------------------------------
# 2. Number of Gears (automatic transmission) → values + z-score
if 'Number of Gears (automatic transmission)' in df.columns:
    df['gears_auto'] = df['Number of Gears (automatic transmission)'].astype(str).str.extract(r'(\d+)')[0]
    df['gears_auto'] = pd.to_numeric(df['gears_auto'], errors='coerce').astype('Int64')
    scaler = StandardScaler()
    df['gears_auto_z'] = scaler.fit_transform(df[['gears_auto']])


# ------------------------------------------------------------------
# 3. Fuel tank capacity → L + z-score
if 'Fuel tank capacity' in df.columns:
    df['Fuel_tank_L'] = df['Fuel tank capacity'].astype(str).str.extract(r'(\d+\.?\d*)')[0].astype(float)
    scaler = StandardScaler()
    df['Fuel_tank_L_z'] = scaler.fit_transform(df[['Fuel_tank_L']])


# ------------------------------------------------------------------
# 4. Acceleration (0–100 km/h) → s + z-score
if 'Acceleration' in df.columns:
    df['Accel_0_100_sec'] = df['Acceleration'].astype(str).str.extract(r'(\d+\.?\d*)')[0].astype(float)
    scaler = StandardScaler()
    df['Accel_0_100_z'] = scaler.fit_transform(df[['Accel_0_100_sec']])


# ------------------------------------------------------------------
# 5. Piston Stroke → mm + z-score
if 'Piston Stroke' in df.columns:
    df['Piston_Stroke_mm'] = df['Piston Stroke'].astype(str).str.extract(r'(\d+\.?\d*)')[0].astype(float)
    scaler = StandardScaler()
    df['Piston_Stroke_z'] = scaler.fit_transform(df[['Piston_Stroke_mm']])


# ------------------------------------------------------------------
# 6. Power → Hp + z-score
if 'Power' in df.columns:
    txt = df['Power'].astype(str).str.replace(' ', '')  # eemaldame tühikud
    
    # 1.HP
    df['Power_hp'] = txt.str.extract(r'(\d+)Hp', flags=re.IGNORECASE)[0].astype(float)

    # 2. HP at RPM
    # For the range we take the first value
    #    Nt: "150Hp@4000rpm" → 4000
    #        "150Hp@5000-6000rpm" → 5000
    #        "150Hp" → NaN
    df['Power_rpm'] = txt.str.extract(r'@(\d+)')[0].astype(float)
    
    # Normalizing separately
    scaler_hp  = StandardScaler()
    scaler_rpm = StandardScaler()
    
    df['Power_hp_z']  = scaler_hp.fit_transform(df[['Power_hp']])
    df['Power_rpm_z'] = scaler_rpm.fit_transform(df[['Power_rpm']])


# ------------------------------------------------------------------
# 7. Number of valves per cylinder → 2,3,4,5 → number + z-score
if 'Number of valves per cylinder' in df.columns:
    df['Valves_per_cyl'] = pd.to_numeric(df['Number of valves per cylinder'], errors='coerce').astype('Int64')
    scaler = StandardScaler()
    df['Valves_per_cyl_z'] = scaler.fit_transform(df[['Valves_per_cyl']])


# ------------------------------------------------------------------
# 8. Climb angle → degrees + z-score
if 'Climb angle' in df.columns:
    df['Climb_angle_deg'] = df['Climb angle'].astype(str).str.extract(r'(\d+\.?\d*)')[0].astype(float)
    scaler = StandardScaler()
    df['Climb_angle_z'] = scaler.fit_transform(df[['Climb_angle_deg']])


# ------------------------------------------------------------------
# 9. Drag coefficient (Cd) → z-score
if 'Drag coefficient (Cd)' in df.columns:
    df['Drag_Cd'] = df['Drag coefficient (Cd)'].astype(str).str.extract(r'0\.(\d+\.?\d*)')[0]
    df['Drag_Cd'] = ('0.' + df['Drag_Cd']).astype(float)
    scaler = StandardScaler()
    df['Drag_Cd_z'] = scaler.fit_transform(df[['Drag_Cd']])


# ------------------------------------------------------------------
# 10. Fuel consumption - extra urban → l/100km + z-score
if 'Fuel consumption - extra urban' in df.columns:
    df['Fuel_extra_urban_L100km'] = df['Fuel consumption - extra urban'].astype(str).str.extract(r'(\d+\.?\d*)')[0].astype(float)
    scaler = StandardScaler()
    df['Fuel_extra_urban_z'] = scaler.fit_transform(df[['Fuel_extra_urban_L100km']])


z_cols = [c for c in df.columns if c.endswith('_z') and c not in ['Power_hp_z']]  # väldime duplikaati
new_binary = ['has_particulate_filter','has_start_stop','has_cylinder_deact']

print("New binary attributes:")
for c in new_binary:
    print(f"  → {c:30} | 1-ide arv: {df[c].sum():5}")

print("\nNew normalized numeric z-attributes:")
for c in z_cols:
    orig = c.replace('_z','').replace('_L100km','').replace('_sec','').replace('_mm','').replace('_deg','')
    print(f"  → {c:35} ← {orig}")

print(f"\nNormalized attributes in total: {len(z_cols)}")

print(f"\nshape: {df.shape[0]:,} rows × {df.shape[1]} columns")

New binary attributes:
  → has_particulate_filter         | 1-ide arv:  6299
  → has_start_stop                 | 1-ide arv:  2966
  → has_cylinder_deact             | 1-ide arv:   179

New normalized numeric z-attributes:
  → Length_z                            ← Length
  → Width_z                             ← Width
  → Height_z                            ← Height
  → Max. weight_z                       ← Max. weight
  → Front track_z                       ← Front track
  → gears_manual_z                      ← gears_manual
  → emission_level_z                    ← emission_level
  → Seats_z                             ← Seats
  → Power_per_litre_z                   ← Power_per_litre
  → Cylinder_Bore_z                     ← Cylinder_Bore
  → Max_roof_load_z                     ← Max_roof_load
  → Battery_kWh_z                       ← Battery_kWh
  → Doors_clean_z                       ← Doors_clean
  → Kerb_Weight_kg_z                    ← Kerb_Weight_kg
  → Braking_100to0_m_z      

In [20]:
pd.set_option('display.max_columns', None)    # näitab KÕIKI veerge
pd.set_option('display.max_rows', 10)         # näitab täpselt 10 rida (pool enne ja pool pärast)
pd.set_option('display.width', None)          # automaatne laius
pd.set_option('display.expand_frame_repr', False)  # ei murragi ridasid
pd.set_option('display.max_colwidth', 50)     # veergude sisu ei lõigata liiga lühikeseks

df.head(10)

Unnamed: 0,Powertrain Architecture,Body type,Fuel Type,Max. weight,Length,Width,Height,Front track,Emission standard,Seats,Battery capacity,Power per litre,Max. roof load,Cylinder Bore,100 km/h - 0,Doors,Maximum engine speed,Kerb Weight,Front overhang,Ramp angle,Number of Gears (automatic transmission),Fuel tank capacity,Acceleration,Piston Stroke,Power,Number of valves per cylinder,Climb angle,Drag coefficient (Cd),Fuel consumption - extra urban,Compression ratio,Valvetrain,Weight-to-power ratio,Trunk (boot) space - maximum,Wheelbase,Front brakes,Coolant,Engine oil capacity,CO2 emissions,Wheel rims size,Engine aspiration,Departure angle,Fuel System,Permitted towbar download,Approach angle,Torque,Permitted trailer load without brakes,Max load,Trunk (boot) space - minimum,Number of cylinders,Permitted trailer load with brakes (12%),Fuel consumption - combined,Drive wheel,Ride height,Minimum turning circle (turning diameter),Permitted trailer load with brakes (8%),Rear suspension,Maximum speed,Rear (Back) track,Tires size,Acceleration 0 - 100 km/h (0 - 62 mph),Engine displacement,Fuel consumption - urban,Rear overhang,Year,Production_years,Brand_Model,Brand_freq,Brand_Model_freq,Year_norm,Production_years_norm,is_ev,is_phev,is_mhev_fhev,is_ice,Powertrain_simple,Body_Cabriolet,Body_Coupe,Body_Hatchback,Body_MPV,Body_Other,Body_Pick-up,Body_Roadster,Body_SUV,Body_Sedan,Body_Station wagon,Fuel_simple,is_diesel,is_petrol,is_hybrid_ev,is_lpg_cng,gears_manual,Length_z,Width_z,Height_z,Max. weight_z,Front track_z,gears_manual_z,emission_level,emission_level_z,steering_Cone worm with recirculation balls,steering_Steering rack and pinion,steering_Worm-reduction unit,Seats_z,cyl_180° flat V-engine,cyl_Boxer,cyl_Inline,cyl_Rotary (Wankel),cyl_V-engine,cyl_VR-engine,cyl_W-engine,Power_per_litre,Power_per_litre_z,Cylinder_Bore_mm,Cylinder_Bore_z,Max_roof_load_kg,Max_roof_load_z,Battery_kWh,Battery_kWh_z,drive_AWD,drive_FWD,drive_Other,drive_RWD,rear_brake_Disc,rear_brake_Drum,rear_brake_Unknown,has_abs,susp_Double_wishbone,susp_McPherson,susp_Multi_link,susp_Other,Doors_clean,Doors_clean_z,Kerb_Weight_kg,Kerb_Weight_kg_z,Braking_100to0_m,Braking_100to0_m_z,Max_RPM,Max_RPM_z,Front_overhang_mm,Front_overhang_mm_z,Ramp_angle_deg,Ramp_angle_deg_z,"engine_loc_Front axle, Transverse","engine_loc_Front, Longitudinal","engine_loc_Front, Transverse","engine_loc_Middle, Longitudinal","engine_loc_Middle, Transverse","engine_loc_Rear, Longitudinal","engine_loc_Rear, Transverse",has_particulate_filter,has_start_stop,has_cylinder_deact,gears_auto,gears_auto_z,Fuel_tank_L,Fuel_tank_L_z,Accel_0_100_sec,Accel_0_100_z,Piston_Stroke_mm,Piston_Stroke_z,Power_hp,Power_rpm,Power_hp_z,Power_rpm_z,Valves_per_cyl,Valves_per_cyl_z,Climb_angle_deg,Climb_angle_z,Drag_Cd,Drag_Cd_z,Fuel_extra_urban_L100km,Fuel_extra_urban_z
0,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,95 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1968.0,2.0,Alpine A110,0.000557,0.000186,0.289474,0.081081,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,95.0,,-0.728227,,,,,,,,,
1,Internal Combustion engine,Station wagon (estate),Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,54 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1972.0,3.0,DAF 66,0.000186,0.000186,0.342105,0.108108,0,0,0,1,ICE,0,0,0,0,0,0,0,0,0,1,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,54.0,,-1.09111,,,,,,,,,
2,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,90 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1966.0,4.0,Alpine A110,0.000557,0.000186,0.263158,0.135135,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,90.0,,-0.772481,,,,,,,,,
3,Internal Combustion engine,Station wagon (estate),Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,57 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1973.0,2.0,DAF 66,0.000186,0.000186,0.355263,0.081081,0,0,0,1,ICE,0,0,0,0,0,0,0,0,0,1,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,57.0,,-1.064557,,,,,,,,,
4,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,110 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1968.0,2.0,Alpine A110,0.000557,0.000186,0.289474,0.081081,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,110.0,,-0.595466,,,,,,,,,
5,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,70 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1966.0,4.0,Alpine A110,0.000557,0.000186,0.263158,0.135135,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,70.0,,-0.949497,,,,,,,,,
6,Internal Combustion engine,Coupe,Petrol (Gasoline),,4660.0,1890.0,1340.0,1590.0,,4.0,,70.9 Hp/l,,90.2 mm,,3.0,,1615 kg,,,,70 l,,90 mm,326 Hp @ 5800 rpm.,4.0,,,10.5 l/100 km,9.85,DOHC,"5 kg/Hp, 201.9 Hp/tonne",300 l,2720 mm,Ventilated discs,,,,,Naturally aspirated engine,,Multi-point indirect injection,,,427 Nm @ 4600 rpm.,,,185 l,8.0,,,Rear wheel drive,,,,Helical spring,250 km/h,1570 mm,255/55 R18,6.1 sec,4601 cm3,15.1 l/100 km,,1998.0,2.0,AC Aceca,0.000186,5.3e-05,0.684211,0.081081,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,5.0,-0.125848,-0.075452,-0.199609,,-0.246883,-0.522185,,,False,True,False,-0.970701,False,False,False,False,True,False,False,70.9,-0.237852,902.0,0.197687,,,,,False,False,False,True,True,False,False,1,False,False,False,True,3.0,-0.995372,1615.0,0.372344,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,70.0,0.491792,,,90.0,0.486582,326.0,5800.0,1.316305,0.640161,4.0,0.55783,,,,,10.5,2.01895
7,Internal Combustion engine,Sedan,Petrol (Gasoline),,4805.0,1810.0,1450.0,1550.0,,5.0,,71.3 Hp/l,,,,4.0,,,,,5.0,65 l,,,142 Hp @ 5800 rpm.,4.0,,,,10.0,,,,2700 mm,Ventilated discs,,,,6J x 15,Naturally aspirated engine,,Multi-point indirect injection,,,187 Nm @ 3800 rpm.,,,480 l,6.0,,,Front wheel drive,,,,Helical spring,,1545 mm,205/65 R15,,1992 cm3,,,2006.0,5.0,Daewoo Tosca,0.004245,0.000106,0.789474,0.162162,0,0,0,1,ICE,0,0,0,0,0,0,0,0,1,0,Petrol,0,1,0,0,,-0.125825,-0.075512,-0.199575,,-0.246894,,,,False,True,False,0.104765,False,False,True,False,False,False,False,71.3,-0.221777,,,,,,,False,True,False,False,True,False,False,1,False,True,False,False,4.0,-0.075803,,,,,,,,,,,False,False,True,False,False,False,False,0,0,0,5.0,-0.470964,65.0,0.168961,,,,,142.0,5800.0,-0.31224,0.640161,4.0,0.55783,,,,,,
8,Internal Combustion engine,Hatchback,Petrol (Gasoline),,4158.0,1663.0,1360.0,1400.0,,5.0,,46.9 Hp/l,,79 mm,,3.0,,990 kg,,,3.0,50 l,,81.5 mm,75 Hp @ 5600 rpm.,2.0,,,,8.6,OHC,"13.2 kg/Hp, 75.8 Hp/tonne",,2520 mm,Disc,,,,,Naturally aspirated engine,,Mono-point injection,,,122 Nm @ 2800 rpm.,,,,4.0,,,Front wheel drive,,,,Helical spring,,1406 mm,175/70 R13,,1598 cm3,,,1986.0,9.0,Daewoo Racer,0.004245,0.000212,0.526316,0.27027,0,0,0,1,ICE,0,0,1,0,0,0,0,0,0,0,Petrol,0,1,0,0,,-0.125927,-0.075623,-0.199603,,-0.246935,,,,False,True,False,0.104765,False,False,True,False,False,False,False,46.9,-1.202342,79.0,-0.305259,,,,,False,True,False,False,False,True,False,1,False,False,False,True,3.0,-0.995372,990.0,-1.334203,,,,,,,,,False,False,True,False,False,False,False,0,0,0,3.0,-1.598456,50.0,-0.799534,,,81.5,-0.591406,75.0,5600.0,-0.905243,0.439987,2.0,-1.729519,,,,,,
9,Internal Combustion engine,Sedan,Petrol (Gasoline),1460.0,4482.0,1662.0,1394.0,1400.0,,5.0,,50.1 Hp/l,,76.5 mm,,4.0,,969 kg,,,,50 l,,81.5 mm,75 Hp @ 5400 rpm.,2.0,,,5.1 l/100 km,9.5,OHC,"12.9 kg/Hp, 77.4 Hp/tonne",,2520 mm,Ventilated discs,,,,R13,Naturally aspirated engine,,Mono-point injection,,,123 Nm @ 3200 rpm.,,491 kg,530 l,4.0,,6.3 l/100 km,Front wheel drive,,,,Suspension with traction connecting levers,163 km/h,1406 mm,175/70 R13,12.5 sec,1498 cm3,9.3 l/100 km,,1993.0,2.0,Daewoo Racer,0.004245,0.000212,0.618421,0.081081,0,0,0,1,ICE,0,0,0,0,0,0,0,0,1,0,Petrol,0,1,0,0,5.0,-0.125876,-0.075624,-0.199592,-0.084383,-0.246935,-0.522185,,,False,True,False,0.104765,False,False,True,False,False,False,False,50.1,-1.073743,765.0,0.113965,,,,,False,True,False,False,False,True,False,0,False,False,False,True,4.0,-0.075803,969.0,-1.391543,,,,,,,,,False,False,True,False,False,False,False,0,0,0,,,50.0,-0.799534,,,81.5,-0.591406,75.0,5400.0,-0.905243,0.239812,2.0,-1.729519,,,,,5.1,-0.568392


In [21]:
from sklearn.preprocessing import StandardScaler

# ------------------------------------------------------------------
# 1. Fuel consumption - extra urban → original + clean value
if 'Fuel consumption - extra urban' in df.columns:
    # Inimloetav jääb alles!
    df['Fuel_extra_L100'] = df['Fuel consumption - extra urban'].astype(str).str.extract(r'(\d+\.?\d*)')[0].astype(float)
    scaler = StandardScaler()
    df['Fuel_extra_L100_z'] = scaler.fit_transform(df[['Fuel_extra_L100']])


# ------------------------------------------------------------------
# 2. Compression ratio → original + celan value
if 'Compression ratio' in df.columns:
    df['Compression_ratio'] = df['Compression ratio'].astype(str).str.extract(r'(\d*\.?\d+)')[0].astype(float)
    scaler = StandardScaler()
    df['Compression_ratio_z'] = scaler.fit_transform(df[['Compression_ratio']])


# ------------------------------------------------------------------
# 3. Valvetrain → original + one-hot
if 'Valvetrain' in df.columns:
    df['Valvetrain_simple'] = df['Valvetrain'].astype(str).str.upper()
    df['Valvetrain_simple'] = df['Valvetrain_simple'].replace(['NAN','<NA>','NONE',''], 'Unknown')
    top = ['DOHC','SOHC','OHC','OHV']
    df['Valvetrain_simple'] = df['Valvetrain_simple'].where(df['Valvetrain_simple'].isin(top), 'Other')
    df = pd.get_dummies(df, columns=['Valvetrain_simple'], prefix='valve', dummy_na=False)


# ------------------------------------------------------------------
# 4. Weight-to-power ratio → original + kg/Hp
if 'Weight-to-power ratio' in df.columns:
    df['WtoP_kg_per_hp'] = df['Weight-to-power ratio'].astype(str).str.extract(r'(\d+\.?\d*)')[0].astype(float)
    scaler = StandardScaler()
    df['WtoP_kg_per_hp_z'] = scaler.fit_transform(df[['WtoP_kg_per_hp']])


# ------------------------------------------------------------------
# 5. Trunk (boot) space - maximum → original + L
if 'Trunk (boot) space - maximum' in df.columns:
    df['Trunk_max_L'] = df['Trunk (boot) space - maximum'].astype(str).str.extract(r'(\d+)')[0].astype(float)
    scaler = StandardScaler()
    df['Trunk_max_L_z'] = scaler.fit_transform(df[['Trunk_max_L']])


# ------------------------------------------------------------------
# 6. Wheelbase → original + mm
if 'Wheelbase' in df.columns:
    df['Wheelbase_mm'] = df['Wheelbase'].astype(str).str.replace(r'\D+', '', regex=True)
    df['Wheelbase_mm'] = pd.to_numeric(df['Wheelbase_mm'], errors='coerce').astype('Int64')
    scaler = StandardScaler()
    df['Wheelbase_z'] = scaler.fit_transform(df[['Wheelbase_mm']])


# ------------------------------------------------------------------
# 7. Front brakes → original + simplified one-hot
if 'Front brakes' in df.columns:
    df['front_brake_type'] = df['Front brakes'].astype(str)
    df['front_brake_type'] = df['front_brake_type'].str.replace(r'.*Ventilated.*|.*Disc.*', 'Disc', case=False, regex=True)
    df['front_brake_type'] = df['front_brake_type'].str.replace(r'.*Drum.*', 'Drum', case=False, regex=True)
    df['front_brake_type'] = df['front_brake_type'].replace('nan', 'Unknown')
    df['front_brake_type'] = df['front_brake_type'].where(df['front_brake_type'].isin(['Disc','Drum','Unknown']), 'Other')
    df = pd.get_dummies(df, columns=['front_brake_type'], prefix='front_brake', dummy_na=False)


# ------------------------------------------------------------------
# 8. Coolant → original + L
if 'Coolant' in df.columns:
    df['Coolant_L'] = df['Coolant'].astype(str).str.extract(r'(\d+\.?\d*)')[0].astype(float)
    scaler = StandardScaler()
    df['Coolant_L_z'] = scaler.fit_transform(df[['Coolant_L']])


# ------------------------------------------------------------------
# 9. Engine oil capacity → original + L
if 'Engine oil capacity' in df.columns:
    df['Oil_capacity_L'] = df['Engine oil capacity'].astype(str).str.extract(r'(\d+\.?\d*)')[0].astype(float)
    scaler = StandardScaler()
    df['Oil_capacity_L_z'] = scaler.fit_transform(df[['Oil_capacity_L']])


# ------------------------------------------------------------------
# 10. CO2 emissions → original + g/km
if 'CO2 emissions' in df.columns:
    df['CO2_gkm'] = df['CO2 emissions'].astype(str).str.extract(r'(\d+)')[0].astype(float)
    scaler = StandardScaler()
    df['CO2_gkm_z'] = scaler.fit_transform(df[['CO2_gkm']])


# ------------------------------------------------------------------
# 11. Wheel rims size → originals + inches (R17 → 17)
if 'Wheel rims size' in df.columns:
    df['Rim_size_inch'] = df['Wheel rims size'].astype(str).str.extract(r'R(\d+)')[0].astype(float)
    scaler = StandardScaler()
    df['Rim_size_inch_z'] = scaler.fit_transform(df[['Rim_size_inch']])
    
human_readable = [
    'Fuel consumption - extra urban', 'Compression ratio', 'Valvetrain',
    'Weight-to-power ratio', 'Trunk (boot) space - maximum', 'Wheelbase',
    'Front brakes', 'Coolant', 'Engine oil capacity', 'CO2 emissions', 'Wheel rims size',
    'Fuel_extra_L100', 'Wheelbase_mm', 'CO2_gkm', 'Rim_size_inch'
]

new_z = ['Fuel_extra_L100_z','Compression_ratio_z','WtoP_kg_per_hp_z','Trunk_max_L_z',
         'Wheelbase_z','Coolant_L_z','Oil_capacity_L_z','CO2_gkm_z','Rim_size_inch_z']

print("human readable columns:")
for c in human_readable:
    if c in df.columns:
        ex = df[c].dropna().iloc[0] if df[c].notna().any() else "NaN"
        print(f"  → {c:40} | example: {ex}")

print(f"\nNew normalized attributes:")
for c in new_z:
    if c in df.columns:
        print(f"  → {c}")

print(f"\nIn total: {len(new_z)} new _z attributes")
print(f"data: {df.shape[0]:,} rows × {df.shape[1]} columns")

human readable columns:
  → Fuel consumption - extra urban           | example: 10.5 l/100 km
  → Compression ratio                        | example: 9.85
  → Valvetrain                               | example: DOHC
  → Weight-to-power ratio                    | example: 5 kg/Hp, 201.9 Hp/tonne
  → Trunk (boot) space - maximum             | example: 300 l
  → Wheelbase                                | example: 2720 mm
  → Front brakes                             | example: Ventilated discs
  → Coolant                                  | example: 6.3 l
  → Engine oil capacity                      | example: 4.2 l
  → CO2 emissions                            | example: 207 g/km
  → Wheel rims size                          | example: 6J x 15
  → Fuel_extra_L100                          | example: 10.5
  → Wheelbase_mm                             | example: 2720
  → CO2_gkm                                  | example: 207.0
  → Rim_size_inch                            | example: 13.0

New no

In [22]:
pd.set_option('display.max_columns', None)    # näitab KÕIKI veerge
pd.set_option('display.max_rows', 10)         # näitab täpselt 10 rida (pool enne ja pool pärast)
pd.set_option('display.width', None)          # automaatne laius
pd.set_option('display.expand_frame_repr', False)  # ei murragi ridasid
pd.set_option('display.max_colwidth', 50)     # veergude sisu ei lõigata liiga lühikeseks

df.head(10)

Unnamed: 0,Powertrain Architecture,Body type,Fuel Type,Max. weight,Length,Width,Height,Front track,Emission standard,Seats,Battery capacity,Power per litre,Max. roof load,Cylinder Bore,100 km/h - 0,Doors,Maximum engine speed,Kerb Weight,Front overhang,Ramp angle,Number of Gears (automatic transmission),Fuel tank capacity,Acceleration,Piston Stroke,Power,Number of valves per cylinder,Climb angle,Drag coefficient (Cd),Fuel consumption - extra urban,Compression ratio,Valvetrain,Weight-to-power ratio,Trunk (boot) space - maximum,Wheelbase,Front brakes,Coolant,Engine oil capacity,CO2 emissions,Wheel rims size,Engine aspiration,Departure angle,Fuel System,Permitted towbar download,Approach angle,Torque,Permitted trailer load without brakes,Max load,Trunk (boot) space - minimum,Number of cylinders,Permitted trailer load with brakes (12%),Fuel consumption - combined,Drive wheel,Ride height,Minimum turning circle (turning diameter),Permitted trailer load with brakes (8%),Rear suspension,Maximum speed,Rear (Back) track,Tires size,Acceleration 0 - 100 km/h (0 - 62 mph),Engine displacement,Fuel consumption - urban,Rear overhang,Year,Production_years,Brand_Model,Brand_freq,Brand_Model_freq,Year_norm,Production_years_norm,is_ev,is_phev,is_mhev_fhev,is_ice,Powertrain_simple,Body_Cabriolet,Body_Coupe,Body_Hatchback,Body_MPV,Body_Other,Body_Pick-up,Body_Roadster,Body_SUV,Body_Sedan,Body_Station wagon,Fuel_simple,is_diesel,is_petrol,is_hybrid_ev,is_lpg_cng,gears_manual,Length_z,Width_z,Height_z,Max. weight_z,Front track_z,gears_manual_z,emission_level,emission_level_z,steering_Cone worm with recirculation balls,steering_Steering rack and pinion,steering_Worm-reduction unit,Seats_z,cyl_180° flat V-engine,cyl_Boxer,cyl_Inline,cyl_Rotary (Wankel),cyl_V-engine,cyl_VR-engine,cyl_W-engine,Power_per_litre,Power_per_litre_z,Cylinder_Bore_mm,Cylinder_Bore_z,Max_roof_load_kg,Max_roof_load_z,Battery_kWh,Battery_kWh_z,drive_AWD,drive_FWD,drive_Other,drive_RWD,rear_brake_Disc,rear_brake_Drum,rear_brake_Unknown,has_abs,susp_Double_wishbone,susp_McPherson,susp_Multi_link,susp_Other,Doors_clean,Doors_clean_z,Kerb_Weight_kg,Kerb_Weight_kg_z,Braking_100to0_m,Braking_100to0_m_z,Max_RPM,Max_RPM_z,Front_overhang_mm,Front_overhang_mm_z,Ramp_angle_deg,Ramp_angle_deg_z,"engine_loc_Front axle, Transverse","engine_loc_Front, Longitudinal","engine_loc_Front, Transverse","engine_loc_Middle, Longitudinal","engine_loc_Middle, Transverse","engine_loc_Rear, Longitudinal","engine_loc_Rear, Transverse",has_particulate_filter,has_start_stop,has_cylinder_deact,gears_auto,gears_auto_z,Fuel_tank_L,Fuel_tank_L_z,Accel_0_100_sec,Accel_0_100_z,Piston_Stroke_mm,Piston_Stroke_z,Power_hp,Power_rpm,Power_hp_z,Power_rpm_z,Valves_per_cyl,Valves_per_cyl_z,Climb_angle_deg,Climb_angle_z,Drag_Cd,Drag_Cd_z,Fuel_extra_urban_L100km,Fuel_extra_urban_z,Fuel_extra_L100,Fuel_extra_L100_z,Compression_ratio,Compression_ratio_z,valve_DOHC,valve_OHC,valve_OHV,valve_Other,valve_SOHC,WtoP_kg_per_hp,WtoP_kg_per_hp_z,Trunk_max_L,Trunk_max_L_z,Wheelbase_mm,Wheelbase_z,front_brake_Disc,front_brake_Drum,front_brake_Unknown,Coolant_L,Coolant_L_z,Oil_capacity_L,Oil_capacity_L_z,CO2_gkm,CO2_gkm_z,Rim_size_inch,Rim_size_inch_z
0,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,95 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1968.0,2.0,Alpine A110,0.000557,0.000186,0.289474,0.081081,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,95.0,,-0.728227,,,,,,,,,,,,,,False,False,False,True,False,,,,,,,False,False,True,,,,,,,,
1,Internal Combustion engine,Station wagon (estate),Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,54 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1972.0,3.0,DAF 66,0.000186,0.000186,0.342105,0.108108,0,0,0,1,ICE,0,0,0,0,0,0,0,0,0,1,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,54.0,,-1.09111,,,,,,,,,,,,,,False,False,False,True,False,,,,,,,False,False,True,,,,,,,,
2,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,90 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1966.0,4.0,Alpine A110,0.000557,0.000186,0.263158,0.135135,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,90.0,,-0.772481,,,,,,,,,,,,,,False,False,False,True,False,,,,,,,False,False,True,,,,,,,,
3,Internal Combustion engine,Station wagon (estate),Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,57 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1973.0,2.0,DAF 66,0.000186,0.000186,0.355263,0.081081,0,0,0,1,ICE,0,0,0,0,0,0,0,0,0,1,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,57.0,,-1.064557,,,,,,,,,,,,,,False,False,False,True,False,,,,,,,False,False,True,,,,,,,,
4,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,110 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1968.0,2.0,Alpine A110,0.000557,0.000186,0.289474,0.081081,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,110.0,,-0.595466,,,,,,,,,,,,,,False,False,False,True,False,,,,,,,False,False,True,,,,,,,,
5,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,70 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,,1966.0,4.0,Alpine A110,0.000557,0.000186,0.263158,0.135135,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,70.0,,-0.949497,,,,,,,,,,,,,,False,False,False,True,False,,,,,,,False,False,True,,,,,,,,
6,Internal Combustion engine,Coupe,Petrol (Gasoline),,4660.0,1890.0,1340.0,1590.0,,4.0,,70.9 Hp/l,,90.2 mm,,3.0,,1615 kg,,,,70 l,,90 mm,326 Hp @ 5800 rpm.,4.0,,,10.5 l/100 km,9.85,DOHC,"5 kg/Hp, 201.9 Hp/tonne",300 l,2720 mm,Ventilated discs,,,,,Naturally aspirated engine,,Multi-point indirect injection,,,427 Nm @ 4600 rpm.,,,185 l,8.0,,,Rear wheel drive,,,,Helical spring,250 km/h,1570 mm,255/55 R18,6.1 sec,4601 cm3,15.1 l/100 km,,1998.0,2.0,AC Aceca,0.000186,5.3e-05,0.684211,0.081081,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,5.0,-0.125848,-0.075452,-0.199609,,-0.246883,-0.522185,,,False,True,False,-0.970701,False,False,False,False,True,False,False,70.9,-0.237852,902.0,0.197687,,,,,False,False,False,True,True,False,False,1,False,False,False,True,3.0,-0.995372,1615.0,0.372344,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,70.0,0.491792,,,90.0,0.486582,326.0,5800.0,1.316305,0.640161,4.0,0.55783,,,,,10.5,2.01895,10.5,2.01895,9.85,-0.600363,True,False,False,False,False,5.0,-1.410896,300.0,-1.62922,2720.0,-0.103616,True,False,False,,,,,,,,
7,Internal Combustion engine,Sedan,Petrol (Gasoline),,4805.0,1810.0,1450.0,1550.0,,5.0,,71.3 Hp/l,,,,4.0,,,,,5.0,65 l,,,142 Hp @ 5800 rpm.,4.0,,,,10.0,,,,2700 mm,Ventilated discs,,,,6J x 15,Naturally aspirated engine,,Multi-point indirect injection,,,187 Nm @ 3800 rpm.,,,480 l,6.0,,,Front wheel drive,,,,Helical spring,,1545 mm,205/65 R15,,1992 cm3,,,2006.0,5.0,Daewoo Tosca,0.004245,0.000106,0.789474,0.162162,0,0,0,1,ICE,0,0,0,0,0,0,0,0,1,0,Petrol,0,1,0,0,,-0.125825,-0.075512,-0.199575,,-0.246894,,,,False,True,False,0.104765,False,False,True,False,False,False,False,71.3,-0.221777,,,,,,,False,True,False,False,True,False,False,1,False,True,False,False,4.0,-0.075803,,,,,,,,,,,False,False,True,False,False,False,False,0,0,0,5.0,-0.470964,65.0,0.168961,,,,,142.0,5800.0,-0.31224,0.640161,4.0,0.55783,,,,,,,,,10.0,-0.558925,False,False,False,True,False,,,,,2700.0,-0.103622,True,False,False,,,,,,,,
8,Internal Combustion engine,Hatchback,Petrol (Gasoline),,4158.0,1663.0,1360.0,1400.0,,5.0,,46.9 Hp/l,,79 mm,,3.0,,990 kg,,,3.0,50 l,,81.5 mm,75 Hp @ 5600 rpm.,2.0,,,,8.6,OHC,"13.2 kg/Hp, 75.8 Hp/tonne",,2520 mm,Disc,,,,,Naturally aspirated engine,,Mono-point injection,,,122 Nm @ 2800 rpm.,,,,4.0,,,Front wheel drive,,,,Helical spring,,1406 mm,175/70 R13,,1598 cm3,,,1986.0,9.0,Daewoo Racer,0.004245,0.000212,0.526316,0.27027,0,0,0,1,ICE,0,0,1,0,0,0,0,0,0,0,Petrol,0,1,0,0,,-0.125927,-0.075623,-0.199603,,-0.246935,,,,False,True,False,0.104765,False,False,True,False,False,False,False,46.9,-1.202342,79.0,-0.305259,,,,,False,True,False,False,False,True,False,1,False,False,False,True,3.0,-0.995372,990.0,-1.334203,,,,,,,,,False,False,True,False,False,False,False,0,0,0,3.0,-1.598456,50.0,-0.799534,,,81.5,-0.591406,75.0,5600.0,-0.905243,0.439987,2.0,-1.729519,,,,,,,,,8.6,-0.945687,False,True,False,False,False,13.2,1.062185,,,2520.0,-0.103675,True,False,False,,,,,,,,
9,Internal Combustion engine,Sedan,Petrol (Gasoline),1460.0,4482.0,1662.0,1394.0,1400.0,,5.0,,50.1 Hp/l,,76.5 mm,,4.0,,969 kg,,,,50 l,,81.5 mm,75 Hp @ 5400 rpm.,2.0,,,5.1 l/100 km,9.5,OHC,"12.9 kg/Hp, 77.4 Hp/tonne",,2520 mm,Ventilated discs,,,,R13,Naturally aspirated engine,,Mono-point injection,,,123 Nm @ 3200 rpm.,,491 kg,530 l,4.0,,6.3 l/100 km,Front wheel drive,,,,Suspension with traction connecting levers,163 km/h,1406 mm,175/70 R13,12.5 sec,1498 cm3,9.3 l/100 km,,1993.0,2.0,Daewoo Racer,0.004245,0.000212,0.618421,0.081081,0,0,0,1,ICE,0,0,0,0,0,0,0,0,1,0,Petrol,0,1,0,0,5.0,-0.125876,-0.075624,-0.199592,-0.084383,-0.246935,-0.522185,,,False,True,False,0.104765,False,False,True,False,False,False,False,50.1,-1.073743,765.0,0.113965,,,,,False,True,False,False,False,True,False,0,False,False,False,True,4.0,-0.075803,969.0,-1.391543,,,,,,,,,False,False,True,False,False,False,False,0,0,0,,,50.0,-0.799534,,,81.5,-0.591406,75.0,5400.0,-0.905243,0.239812,2.0,-1.729519,,,,,5.1,-0.568392,5.1,-0.568392,9.5,-0.697054,False,True,False,False,False,12.9,0.971706,,,2520.0,-0.103675,True,False,False,,,,,,,13.0,-1.201539


In [23]:
# validating brakes info
print(df['Front brakes'].to_string(index=False))   

                               NaN
                               NaN
                               NaN
                               NaN
                               NaN
                               NaN
                  Ventilated discs
                  Ventilated discs
                              Disc
                  Ventilated discs
                  Ventilated discs
                  Ventilated discs
                              Disc
                               NaN
                               NaN
                               NaN
                  Ventilated discs
                              Disc
                  Ventilated discs
                  Ventilated discs
                  Ventilated discs
                  Ventilated discs
                              Disc
                  Ventilated discs
                              Disc
                  Ventilated discs
                              Disc
                  Ventilated discs
                    

In [24]:
from sklearn.preprocessing import StandardScaler

# ------------------------------------------------------------------
# 1. Engine aspiration → simplifying to 5 main ones + one-hot
if 'Engine aspiration' in df.columns:
    txt = df['Engine aspiration'].astype(str)
    df['aspiration'] = 'Naturally_aspirated'
    df.loc[txt.str.contains('Turbo', case=False, na=False), 'aspiration'] = 'Turbo'
    df.loc[txt.str.contains('Twin.*turbo|BiTurbo', case=False, na=False), 'aspiration'] = 'Twin_turbo'
    df.loc[txt.str.contains('Supercharg|Compressor', case=False, na=False), 'aspiration'] = 'Supercharged'
    df.loc[txt.str.contains('4 Turbochargers', case=False, na=False), 'aspiration'] = 'Quad_turbo'
    
    # dummy_na=False → NaN jääb NaN-iks → saab väärtuseks 0 kõigis tulbades
    df = pd.get_dummies(df, columns=['aspiration'], prefix='asp', dummy_na=False)


# ------------------------------------------------------------------
# 2. Departure angle → degrees + z-score
if 'Departure angle' in df.columns:
    df['Departure_angle_deg'] = df['Departure angle'].astype(str).str.extract(r'(\d+\.?\d*)')[0].astype(float)
    scaler = StandardScaler()
    df['Departure_angle_z'] = scaler.fit_transform(df[['Departure_angle_deg']])


# ------------------------------------------------------------------
# 3. Fuel System → one-hot (10 values)
if 'Fuel System' in df.columns:
    df = pd.get_dummies(df, columns=['Fuel System'], prefix='fuel_sys', dummy_na=False)


# ------------------------------------------------------------------
# 4. Permitted towbar download → kg + z-score
if 'Permitted towbar download' in df.columns:
    df['Towbar_download_kg'] = df['Permitted towbar download'].astype(str).str.extract(r'(\d+)')[0].astype(float)
    scaler = StandardScaler()
    df['Towbar_download_z'] = scaler.fit_transform(df[['Towbar_download_kg']])


# ------------------------------------------------------------------
# 5. Approach angle → degrees + z-score
if 'Approach angle' in df.columns:
    df['Approach_angle_deg'] = df['Approach angle'].astype(str).str.extract(r'(\d+\.?\d*)')[0].astype(float)
    scaler = StandardScaler()
    df['Approach_angle_z'] = scaler.fit_transform(df[['Approach_angle_deg']])


# ------------------------------------------------------------------
# 6. Torque → Nm + z-score
if 'Torque' in df.columns:
    txt = df['Torque'].astype(str).str.replace(' ', '')

    # 1. Only Nm value
    df['Torque_Nm'] = txt.str.extract(r'(\d+)Nm')[0].astype(float)

    # 2. Only RPM value
    #    (nt "400Nm@1750-3000rpm" → 1750 | "250Nm@1500rpm" → 1500)
    df['Torque_rpm'] = txt.str.extract(r'@(\d+)')[0].astype(float)

    # normalizing both separately
    scaler_nm  = StandardScaler()
    scaler_rpm = StandardScaler()

    df['Torque_Nm_z']  = scaler_nm.fit_transform(df[['Torque_Nm']])
    df['Torque_rpm_z'] = scaler_rpm.fit_transform(df[['Torque_rpm']])

# ------------------------------------------------------------------
# 7. Permitted trailer load without brakes → kg + z-score
if 'Permitted trailer load without brakes' in df.columns:
    df['Trailer_no_brakes_kg'] = df['Permitted trailer load without brakes'].astype(str).str.extract(r'(\d+)')[0].astype(float)
    scaler = StandardScaler()
    df['Trailer_no_brakes_z'] = scaler.fit_transform(df[['Trailer_no_brakes_kg']])


# ------------------------------------------------------------------
# 8. Max load → kg + z-score
if 'Max load' in df.columns:
    df['Max_load_kg'] = df['Max load'].astype(str).str.extract(r'(\d+)')[0].astype(float)
    scaler = StandardScaler()
    df['Max_load_z'] = scaler.fit_transform(df[['Max_load_kg']])


# ------------------------------------------------------------------
# 9. Trunk (boot) space - minimum → L + z-score
if 'Trunk (boot) space - minimum' in df.columns:
    df['Trunk_min_L'] = df['Trunk (boot) space - minimum'].astype(str).str.extract(r'(\d+)')[0].astype(float)
    scaler = StandardScaler()
    df['Trunk_min_L_z'] = scaler.fit_transform(df[['Trunk_min_L']])


# ------------------------------------------------------------------
# 10. Number of cylinders → number + z-score
if 'Number of cylinders' in df.columns:
    df['Cylinders'] = pd.to_numeric(df['Number of cylinders'], errors='coerce').astype('Int64')
    scaler = StandardScaler()
    df['Cylinders_z'] = scaler.fit_transform(df[['Cylinders']])


# ------------------------------------------------------------------
# 11. Permitted trailer load with brakes (12%) → kg + z-score
if 'Permitted trailer load with brakes (12%)' in df.columns:
    df['Trailer_with_brakes_kg'] = df['Permitted trailer load with brakes (12%)'].astype(str).str.extract(r'(\d+)')[0].astype(float)
    scaler = StandardScaler()
    df['Trailer_with_brakes_z'] = scaler.fit_transform(df[['Trailer_with_brakes_kg']])

new_z_cols = ['Departure_angle_z','Towbar_download_z','Approach_angle_z','Torque_Nm_z',
              'Trailer_no_brakes_z','Max_load_z','Trunk_min_L_z','Cylinders_z','Trailer_with_brakes_z']

print("New normalized numeric attributes (_z):")
for c in new_z_cols:
    if c in df.columns:
        print(f"  → {c:30} ← {c.replace('_z','')}")

print(f"\nHuman readable:")
examples = ['Engine aspiration','Torque','Wheelbase','Trunk (boot) space - minimum','Number of cylinders']
for c in examples:
    if c in df.columns:
        print(f"  → {c:45} | example: {df[c].dropna().iloc[0] if df[c].notna().any() else 'NaN'}")

print(f"\nAdded {len(new_z_cols)} normalized attributes")
print(f"Data: {df.shape[0]:,} rows × {df.shape[1]} columns")

New normalized numeric attributes (_z):
  → Departure_angle_z              ← Departure_angle
  → Towbar_download_z              ← Towbar_download
  → Approach_angle_z               ← Approach_angle
  → Torque_Nm_z                    ← Torque_Nm
  → Trailer_no_brakes_z            ← Trailer_no_brakes
  → Max_load_z                     ← Max_load
  → Trunk_min_L_z                  ← Trunk_min_L
  → Cylinders_z                    ← Cylinders
  → Trailer_with_brakes_z          ← Trailer_with_brakes

Human readable:
  → Engine aspiration                             | example: Naturally aspirated engine
  → Torque                                        | example: 427 Nm @ 4600 rpm.
  → Wheelbase                                     | example: 2720 mm
  → Trunk (boot) space - minimum                  | example: 185 l
  → Number of cylinders                           | example: 8.0

Added 9 normalized attributes
Data: 37,695 rows × 231 columns


In [25]:
pd.set_option('display.max_columns', None)    # näitab KÕIKI veerge
pd.set_option('display.max_rows', 10)         # näitab täpselt 10 rida (pool enne ja pool pärast)
pd.set_option('display.width', None)          # automaatne laius
pd.set_option('display.expand_frame_repr', False)  # ei murragi ridasid
pd.set_option('display.max_colwidth', 50)     # veergude sisu ei lõigata liiga lühikeseks

df.head(10)

Unnamed: 0,Powertrain Architecture,Body type,Fuel Type,Max. weight,Length,Width,Height,Front track,Emission standard,Seats,Battery capacity,Power per litre,Max. roof load,Cylinder Bore,100 km/h - 0,Doors,Maximum engine speed,Kerb Weight,Front overhang,Ramp angle,Number of Gears (automatic transmission),Fuel tank capacity,Acceleration,Piston Stroke,Power,Number of valves per cylinder,Climb angle,Drag coefficient (Cd),Fuel consumption - extra urban,Compression ratio,Valvetrain,Weight-to-power ratio,Trunk (boot) space - maximum,Wheelbase,Front brakes,Coolant,Engine oil capacity,CO2 emissions,Wheel rims size,Engine aspiration,Departure angle,Permitted towbar download,Approach angle,Torque,Permitted trailer load without brakes,Max load,Trunk (boot) space - minimum,Number of cylinders,Permitted trailer load with brakes (12%),Fuel consumption - combined,Drive wheel,Ride height,Minimum turning circle (turning diameter),Permitted trailer load with brakes (8%),Rear suspension,Maximum speed,Rear (Back) track,Tires size,Acceleration 0 - 100 km/h (0 - 62 mph),Engine displacement,Fuel consumption - urban,Rear overhang,Year,Production_years,Brand_Model,Brand_freq,Brand_Model_freq,Year_norm,Production_years_norm,is_ev,is_phev,is_mhev_fhev,is_ice,Powertrain_simple,Body_Cabriolet,Body_Coupe,Body_Hatchback,Body_MPV,Body_Other,Body_Pick-up,Body_Roadster,Body_SUV,Body_Sedan,Body_Station wagon,Fuel_simple,is_diesel,is_petrol,is_hybrid_ev,is_lpg_cng,gears_manual,Length_z,Width_z,Height_z,Max. weight_z,Front track_z,gears_manual_z,emission_level,emission_level_z,steering_Cone worm with recirculation balls,steering_Steering rack and pinion,steering_Worm-reduction unit,Seats_z,cyl_180° flat V-engine,cyl_Boxer,cyl_Inline,cyl_Rotary (Wankel),cyl_V-engine,cyl_VR-engine,cyl_W-engine,Power_per_litre,Power_per_litre_z,Cylinder_Bore_mm,Cylinder_Bore_z,Max_roof_load_kg,Max_roof_load_z,Battery_kWh,Battery_kWh_z,drive_AWD,drive_FWD,drive_Other,drive_RWD,rear_brake_Disc,rear_brake_Drum,rear_brake_Unknown,has_abs,susp_Double_wishbone,susp_McPherson,susp_Multi_link,susp_Other,Doors_clean,Doors_clean_z,Kerb_Weight_kg,Kerb_Weight_kg_z,Braking_100to0_m,Braking_100to0_m_z,Max_RPM,Max_RPM_z,Front_overhang_mm,Front_overhang_mm_z,Ramp_angle_deg,Ramp_angle_deg_z,"engine_loc_Front axle, Transverse","engine_loc_Front, Longitudinal","engine_loc_Front, Transverse","engine_loc_Middle, Longitudinal","engine_loc_Middle, Transverse","engine_loc_Rear, Longitudinal","engine_loc_Rear, Transverse",has_particulate_filter,has_start_stop,has_cylinder_deact,gears_auto,gears_auto_z,Fuel_tank_L,Fuel_tank_L_z,Accel_0_100_sec,Accel_0_100_z,Piston_Stroke_mm,Piston_Stroke_z,Power_hp,Power_rpm,Power_hp_z,Power_rpm_z,Valves_per_cyl,Valves_per_cyl_z,Climb_angle_deg,Climb_angle_z,Drag_Cd,Drag_Cd_z,Fuel_extra_urban_L100km,Fuel_extra_urban_z,Fuel_extra_L100,Fuel_extra_L100_z,Compression_ratio,Compression_ratio_z,valve_DOHC,valve_OHC,valve_OHV,valve_Other,valve_SOHC,WtoP_kg_per_hp,WtoP_kg_per_hp_z,Trunk_max_L,Trunk_max_L_z,Wheelbase_mm,Wheelbase_z,front_brake_Disc,front_brake_Drum,front_brake_Unknown,Coolant_L,Coolant_L_z,Oil_capacity_L,Oil_capacity_L_z,CO2_gkm,CO2_gkm_z,Rim_size_inch,Rim_size_inch_z,asp_Naturally_aspirated,asp_Quad_turbo,asp_Supercharged,asp_Turbo,asp_Twin_turbo,Departure_angle_deg,Departure_angle_z,fuel_sys_Carburettor,fuel_sys_Diesel - Standard diesel injection (SDI),fuel_sys_Diesel Commonrail,fuel_sys_Direct injection,fuel_sys_Direct injection / Multi-point indirect injection,fuel_sys_Dual point injection,fuel_sys_Mono-point injection,fuel_sys_Multi-point indirect injection,fuel_sys_Pump-nozzle (Unit Injector),Towbar_download_kg,Towbar_download_z,Approach_angle_deg,Approach_angle_z,Torque_Nm,Torque_rpm,Torque_Nm_z,Torque_rpm_z,Trailer_no_brakes_kg,Trailer_no_brakes_z,Max_load_kg,Max_load_z,Trunk_min_L,Trunk_min_L_z,Cylinders,Cylinders_z,Trailer_with_brakes_kg,Trailer_with_brakes_z
0,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,95 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,1968.0,2.0,Alpine A110,0.000557,0.000186,0.289474,0.081081,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,95.0,,-0.728227,,,,,,,,,,,,,,False,False,False,True,False,,,,,,,False,False,True,,,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,False,False,False,,,,,,,,,,,,,,,,,,
1,Internal Combustion engine,Station wagon (estate),Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,54 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,1972.0,3.0,DAF 66,0.000186,0.000186,0.342105,0.108108,0,0,0,1,ICE,0,0,0,0,0,0,0,0,0,1,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,54.0,,-1.09111,,,,,,,,,,,,,,False,False,False,True,False,,,,,,,False,False,True,,,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,False,False,False,,,,,,,,,,,,,,,,,,
2,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,90 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,1966.0,4.0,Alpine A110,0.000557,0.000186,0.263158,0.135135,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,90.0,,-0.772481,,,,,,,,,,,,,,False,False,False,True,False,,,,,,,False,False,True,,,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,False,False,False,,,,,,,,,,,,,,,,,,
3,Internal Combustion engine,Station wagon (estate),Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,57 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,1973.0,2.0,DAF 66,0.000186,0.000186,0.355263,0.081081,0,0,0,1,ICE,0,0,0,0,0,0,0,0,0,1,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,57.0,,-1.064557,,,,,,,,,,,,,,False,False,False,True,False,,,,,,,False,False,True,,,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,False,False,False,,,,,,,,,,,,,,,,,,
4,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,110 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,1968.0,2.0,Alpine A110,0.000557,0.000186,0.289474,0.081081,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,110.0,,-0.595466,,,,,,,,,,,,,,False,False,False,True,False,,,,,,,False,False,True,,,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,False,False,False,,,,,,,,,,,,,,,,,,
5,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,70 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,,1966.0,4.0,Alpine A110,0.000557,0.000186,0.263158,0.135135,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,70.0,,-0.949497,,,,,,,,,,,,,,False,False,False,True,False,,,,,,,False,False,True,,,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,False,False,False,,,,,,,,,,,,,,,,,,
6,Internal Combustion engine,Coupe,Petrol (Gasoline),,4660.0,1890.0,1340.0,1590.0,,4.0,,70.9 Hp/l,,90.2 mm,,3.0,,1615 kg,,,,70 l,,90 mm,326 Hp @ 5800 rpm.,4.0,,,10.5 l/100 km,9.85,DOHC,"5 kg/Hp, 201.9 Hp/tonne",300 l,2720 mm,Ventilated discs,,,,,Naturally aspirated engine,,,,427 Nm @ 4600 rpm.,,,185 l,8.0,,,Rear wheel drive,,,,Helical spring,250 km/h,1570 mm,255/55 R18,6.1 sec,4601 cm3,15.1 l/100 km,,1998.0,2.0,AC Aceca,0.000186,5.3e-05,0.684211,0.081081,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,5.0,-0.125848,-0.075452,-0.199609,,-0.246883,-0.522185,,,False,True,False,-0.970701,False,False,False,False,True,False,False,70.9,-0.237852,902.0,0.197687,,,,,False,False,False,True,True,False,False,1,False,False,False,True,3.0,-0.995372,1615.0,0.372344,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,70.0,0.491792,,,90.0,0.486582,326.0,5800.0,1.316305,0.640161,4.0,0.55783,,,,,10.5,2.01895,10.5,2.01895,9.85,-0.600363,True,False,False,False,False,5.0,-1.410896,300.0,-1.62922,2720.0,-0.103616,True,False,False,,,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,False,True,False,,,,,427.0,4600.0,0.897111,1.463985,,,,,185.0,-1.149808,8.0,2.097549,,
7,Internal Combustion engine,Sedan,Petrol (Gasoline),,4805.0,1810.0,1450.0,1550.0,,5.0,,71.3 Hp/l,,,,4.0,,,,,5.0,65 l,,,142 Hp @ 5800 rpm.,4.0,,,,10.0,,,,2700 mm,Ventilated discs,,,,6J x 15,Naturally aspirated engine,,,,187 Nm @ 3800 rpm.,,,480 l,6.0,,,Front wheel drive,,,,Helical spring,,1545 mm,205/65 R15,,1992 cm3,,,2006.0,5.0,Daewoo Tosca,0.004245,0.000106,0.789474,0.162162,0,0,0,1,ICE,0,0,0,0,0,0,0,0,1,0,Petrol,0,1,0,0,,-0.125825,-0.075512,-0.199575,,-0.246894,,,,False,True,False,0.104765,False,False,True,False,False,False,False,71.3,-0.221777,,,,,,,False,True,False,False,True,False,False,1,False,True,False,False,4.0,-0.075803,,,,,,,,,,,False,False,True,False,False,False,False,0,0,0,5.0,-0.470964,65.0,0.168961,,,,,142.0,5800.0,-0.31224,0.640161,4.0,0.55783,,,,,,,,,10.0,-0.558925,False,False,False,True,False,,,,,2700.0,-0.103622,True,False,False,,,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,False,True,False,,,,,187.0,3800.0,-0.66226,0.792664,,,,,480.0,0.050834,6.0,0.785834,,
8,Internal Combustion engine,Hatchback,Petrol (Gasoline),,4158.0,1663.0,1360.0,1400.0,,5.0,,46.9 Hp/l,,79 mm,,3.0,,990 kg,,,3.0,50 l,,81.5 mm,75 Hp @ 5600 rpm.,2.0,,,,8.6,OHC,"13.2 kg/Hp, 75.8 Hp/tonne",,2520 mm,Disc,,,,,Naturally aspirated engine,,,,122 Nm @ 2800 rpm.,,,,4.0,,,Front wheel drive,,,,Helical spring,,1406 mm,175/70 R13,,1598 cm3,,,1986.0,9.0,Daewoo Racer,0.004245,0.000212,0.526316,0.27027,0,0,0,1,ICE,0,0,1,0,0,0,0,0,0,0,Petrol,0,1,0,0,,-0.125927,-0.075623,-0.199603,,-0.246935,,,,False,True,False,0.104765,False,False,True,False,False,False,False,46.9,-1.202342,79.0,-0.305259,,,,,False,True,False,False,False,True,False,1,False,False,False,True,3.0,-0.995372,990.0,-1.334203,,,,,,,,,False,False,True,False,False,False,False,0,0,0,3.0,-1.598456,50.0,-0.799534,,,81.5,-0.591406,75.0,5600.0,-0.905243,0.439987,2.0,-1.729519,,,,,,,,,8.6,-0.945687,False,True,False,False,False,13.2,1.062185,,,2520.0,-0.103675,True,False,False,,,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,True,False,False,,,,,122.0,2800.0,-1.08459,-0.046486,,,,,,,4.0,-0.52588,,
9,Internal Combustion engine,Sedan,Petrol (Gasoline),1460.0,4482.0,1662.0,1394.0,1400.0,,5.0,,50.1 Hp/l,,76.5 mm,,4.0,,969 kg,,,,50 l,,81.5 mm,75 Hp @ 5400 rpm.,2.0,,,5.1 l/100 km,9.5,OHC,"12.9 kg/Hp, 77.4 Hp/tonne",,2520 mm,Ventilated discs,,,,R13,Naturally aspirated engine,,,,123 Nm @ 3200 rpm.,,491 kg,530 l,4.0,,6.3 l/100 km,Front wheel drive,,,,Suspension with traction connecting levers,163 km/h,1406 mm,175/70 R13,12.5 sec,1498 cm3,9.3 l/100 km,,1993.0,2.0,Daewoo Racer,0.004245,0.000212,0.618421,0.081081,0,0,0,1,ICE,0,0,0,0,0,0,0,0,1,0,Petrol,0,1,0,0,5.0,-0.125876,-0.075624,-0.199592,-0.084383,-0.246935,-0.522185,,,False,True,False,0.104765,False,False,True,False,False,False,False,50.1,-1.073743,765.0,0.113965,,,,,False,True,False,False,False,True,False,0,False,False,False,True,4.0,-0.075803,969.0,-1.391543,,,,,,,,,False,False,True,False,False,False,False,0,0,0,,,50.0,-0.799534,,,81.5,-0.591406,75.0,5400.0,-0.905243,0.239812,2.0,-1.729519,,,,,5.1,-0.568392,5.1,-0.568392,9.5,-0.697054,False,True,False,False,False,12.9,0.971706,,,2520.0,-0.103675,True,False,False,,,,,,,13.0,-1.201539,True,False,False,False,False,,,False,False,False,False,False,False,True,False,False,,,,,123.0,3200.0,-1.078092,0.289174,,,491.0,-0.338552,530.0,0.254333,4.0,-0.52588,,


In [26]:
# validating changes
pd.set_option('display.max_rows', 100)
print(df[['Engine aspiration', 'asp_Naturally_aspirated', 'asp_Quad_turbo', 
          'asp_Supercharged', 'asp_Turbo', 'asp_Twin_turbo']].head(100))

             Engine aspiration  asp_Naturally_aspirated  asp_Quad_turbo  asp_Supercharged  asp_Turbo  asp_Twin_turbo
0   Naturally aspirated engine                     True           False             False      False           False
1   Naturally aspirated engine                     True           False             False      False           False
2   Naturally aspirated engine                     True           False             False      False           False
3   Naturally aspirated engine                     True           False             False      False           False
4   Naturally aspirated engine                     True           False             False      False           False
5   Naturally aspirated engine                     True           False             False      False           False
6   Naturally aspirated engine                     True           False             False      False           False
7   Naturally aspirated engine                     True         

In [27]:
from sklearn.preprocessing import StandardScaler

# ------------------------------------------------------------------
# 67. Fuel consumption - combined → l/100km + z-score
if 'Fuel consumption - combined' in df.columns:
    df['Fuel_combined_L100'] = df['Fuel consumption - combined'].astype(str).str.extract(r'(\d+\.?\d*)')[0].astype(float)
    scaler = StandardScaler()
    df['Fuel_combined_L100_z'] = scaler.fit_transform(df[['Fuel_combined_L100']])

# ------------------------------------------------------------------
# 68. Drive wheel → one-hot (3 peamist)
if 'Drive wheel' in df.columns:
    df = pd.get_dummies(df, columns=['Drive wheel'], prefix='drive', dummy_na=False)

# ------------------------------------------------------------------
# 69. Ride height → mm + z-score
if 'Ride height' in df.columns:
    df['Ride_height_mm'] = df['Ride height'].astype(str).str.extract(r'(\d+)')[0].astype(float)
    scaler = StandardScaler()
    df['Ride_height_z'] = scaler.fit_transform(df[['Ride_height_mm']])

# ------------------------------------------------------------------
# 70. Minimum turning circle → m + z-score
if 'Minimum turning circle (turning diameter)' in df.columns:
    df['Turning_circle_m'] = df['Minimum turning circle (turning diameter)'].astype(str).str.extract(r'(\d+\.?\d*)')[0].astype(float)
    scaler = StandardScaler()
    df['Turning_circle_z'] = scaler.fit_transform(df[['Turning_circle_m']])

# ------------------------------------------------------------------
# 71. Permitted trailer load with brakes (8%) → kg + z-score
if 'Permitted trailer load with brakes (8%)' in df.columns:
    df['Trailer_brakes8_kg'] = df['Permitted trailer load with brakes (8%)'].astype(str).str.extract(r'(\d+)')[0].astype(float)
    scaler = StandardScaler()
    df['Trailer_brakes8_z'] = scaler.fit_transform(df[['Trailer_brakes8_kg']])

# ------------------------------------------------------------------
# 72. Rear suspension → simplifiying to 5 main ones + one-hot
if 'Rear suspension' in df.columns:
    txt = df['Rear suspension'].astype(str)
    df['rear_susp'] = 'Other'
    df.loc[txt.str.contains('Multi.link|multi.link', case=False, na=False), 'rear_susp'] = 'Multi_link'
    df.loc[txt.str.contains('Helical.spring|coil.spring', case=False, na=False), 'rear_susp'] = 'Coil_spring'
    df.loc[txt.str.contains('Semi.independent|semi.independent', case=False, na=False), 'rear_susp'] = 'Torsion_beam'
    df.loc[txt.str.contains('Independent.*spring', case=False, na=False), 'rear_susp'] = 'Independent'
    df.loc[txt.str.contains('Leaf.spring|leaf spring', case=False, na=False), 'rear_susp'] = 'Leaf_spring'
    df = pd.get_dummies(df, columns=['rear_susp'], prefix='susp_rear', dummy_na=False)

# ------------------------------------------------------------------
# 73. Maximum speed → km/h + z-score (väga võimas tunnus!)
if 'Maximum speed' in df.columns:
    df['Max_speed_kmh'] = df['Maximum speed'].astype(str).str.extract(r'(\d+)')[0].astype(float)
    scaler = StandardScaler()
    df['Max_speed_z'] = scaler.fit_transform(df[['Max_speed_kmh']])

# ------------------------------------------------------------------
# 74. Rear (Back) track → mm + z-score
if 'Rear (Back) track' in df.columns:
    df['Rear_track_mm'] = df['Rear (Back) track'].astype(str).str.extract(r'(\d+)')[0].astype(float)
    scaler = StandardScaler()
    df['Rear_track_z'] = scaler.fit_transform(df[['Rear_track_mm']])

# ------------------------------------------------------------------
# 75. Tires size → taking 3 most important parts (205/55 R16 → 205, 55, 16)
if 'Tires size' in df.columns:
    txt = df['Tires size'].astype(str)
    df['Tire_width']   = txt.str.extract(r'^(\d+)').astype(float)
    df['Tire_profile'] = txt.str.extract(r'/(\d+)').astype(float)
    df['Rim_diameter'] = txt.str.extract(r'R(\d+)').astype(float)
    
    scaler = StandardScaler()
    df[['Tire_width_z', 'Tire_profile_z', 'Rim_diameter_z']] = scaler.fit_transform(
        df[['Tire_width', 'Tire_profile', 'Rim_diameter']]
    )

# ------------------------------------------------------------------
# 76. Acceleration 0–100 km/h → in seconds + z-score
if 'Acceleration 0 - 100 km/h (0 - 62 mph)' in df.columns:
    df['Accel_0_100_sec'] = df['Acceleration 0 - 100 km/h (0 - 62 mph)'].astype(str).str.extract(r'(\d+\.?\d*)')[0].astype(float)
    scaler = StandardScaler()
    df['Accel_0_100_z'] = scaler.fit_transform(df[['Accel_0_100_sec']])

# ------------------------------------------------------------------
# 77. Engine displacement → cm³ + z-score
if 'Engine displacement' in df.columns:
    df['Displacement_cm3'] = df['Engine displacement'].astype(str).str.extract(r'(\d+)')[0].astype(float)
    scaler = StandardScaler()
    df['Displacement_z'] = scaler.fit_transform(df[['Displacement_cm3']])

# ------------------------------------------------------------------
# 78. Fuel consumption - urban → l/100km + z-score
if 'Fuel consumption - urban' in df.columns:
    df['Fuel_urban_L100'] = df['Fuel consumption - urban'].astype(str).str.extract(r'(\d+\.?\d*)')[0].astype(float)
    scaler = StandardScaler()
    df['Fuel_urban_L100_z'] = scaler.fit_transform(df[['Fuel_urban_L100']])

# ------------------------------------------------------------------
# 79. Rear overhang → mm + z-score
if 'Rear overhang' in df.columns:
    df['Rear_overhang_mm'] = df['Rear overhang'].astype(str).str.extract(r'(\d+)')[0].astype(float)
    scaler = StandardScaler()
    df['Rear_overhang_z'] = scaler.fit_transform(df[['Rear_overhang_mm']])

new_z = ['Fuel_combined_L100_z','Ride_height_z','Turning_circle_z','Max_speed_z',
         'Accel_0_100_z','Displacement_z','Tire_width_z','Tire_profile_z','Rim_diameter_z']

print("New z-attributes:")
for c in new_z:
    if c in df.columns:
        base = c.replace('_z','').replace('Tire_','Rehvi ').replace('Rim_diameter','Velje läbimõõt')
        print(f"  → {c:30} ← {base}")

print(f"\nAdded {len([c for c in new_z if c in df.columns])} _z-attributes")
print(f"Data: {df.shape[0]:,} rows × {df.shape[1]} columns")

New z-attributes:
  → Fuel_combined_L100_z           ← Fuel_combined_L100
  → Ride_height_z                  ← Ride_height
  → Turning_circle_z               ← Turning_circle
  → Max_speed_z                    ← Max_speed
  → Accel_0_100_z                  ← Accel_0_100
  → Displacement_z                 ← Displacement
  → Tire_width_z                   ← Rehvi width
  → Tire_profile_z                 ← Rehvi profile
  → Rim_diameter_z                 ← Velje läbimõõt

Added 9 _z-attributes
Data: 37,695 rows × 262 columns


In [28]:
pd.set_option('display.max_columns', None)    # näitab KÕIKI veerge
pd.set_option('display.max_rows', 10)         # näitab täpselt 10 rida (pool enne ja pool pärast)
pd.set_option('display.width', None)          # automaatne laius
pd.set_option('display.expand_frame_repr', False)  # ei murragi ridasid
pd.set_option('display.max_colwidth', 50)     # veergude sisu ei lõigata liiga lühikeseks

df.head(10)

Unnamed: 0,Powertrain Architecture,Body type,Fuel Type,Max. weight,Length,Width,Height,Front track,Emission standard,Seats,Battery capacity,Power per litre,Max. roof load,Cylinder Bore,100 km/h - 0,Doors,Maximum engine speed,Kerb Weight,Front overhang,Ramp angle,Number of Gears (automatic transmission),Fuel tank capacity,Acceleration,Piston Stroke,Power,Number of valves per cylinder,Climb angle,Drag coefficient (Cd),Fuel consumption - extra urban,Compression ratio,Valvetrain,Weight-to-power ratio,Trunk (boot) space - maximum,Wheelbase,Front brakes,Coolant,Engine oil capacity,CO2 emissions,Wheel rims size,Engine aspiration,Departure angle,Permitted towbar download,Approach angle,Torque,Permitted trailer load without brakes,Max load,Trunk (boot) space - minimum,Number of cylinders,Permitted trailer load with brakes (12%),Fuel consumption - combined,Ride height,Minimum turning circle (turning diameter),Permitted trailer load with brakes (8%),Rear suspension,Maximum speed,Rear (Back) track,Tires size,Acceleration 0 - 100 km/h (0 - 62 mph),Engine displacement,Fuel consumption - urban,Rear overhang,Year,Production_years,Brand_Model,Brand_freq,Brand_Model_freq,Year_norm,Production_years_norm,is_ev,is_phev,is_mhev_fhev,is_ice,Powertrain_simple,Body_Cabriolet,Body_Coupe,Body_Hatchback,Body_MPV,Body_Other,Body_Pick-up,Body_Roadster,Body_SUV,Body_Sedan,Body_Station wagon,Fuel_simple,is_diesel,is_petrol,is_hybrid_ev,is_lpg_cng,gears_manual,Length_z,Width_z,Height_z,Max. weight_z,Front track_z,gears_manual_z,emission_level,emission_level_z,steering_Cone worm with recirculation balls,steering_Steering rack and pinion,steering_Worm-reduction unit,Seats_z,cyl_180° flat V-engine,cyl_Boxer,cyl_Inline,cyl_Rotary (Wankel),cyl_V-engine,cyl_VR-engine,cyl_W-engine,Power_per_litre,Power_per_litre_z,Cylinder_Bore_mm,Cylinder_Bore_z,Max_roof_load_kg,Max_roof_load_z,Battery_kWh,Battery_kWh_z,drive_AWD,drive_FWD,drive_Other,drive_RWD,rear_brake_Disc,rear_brake_Drum,rear_brake_Unknown,has_abs,susp_Double_wishbone,susp_McPherson,susp_Multi_link,susp_Other,Doors_clean,Doors_clean_z,Kerb_Weight_kg,Kerb_Weight_kg_z,Braking_100to0_m,Braking_100to0_m_z,Max_RPM,Max_RPM_z,Front_overhang_mm,Front_overhang_mm_z,Ramp_angle_deg,Ramp_angle_deg_z,"engine_loc_Front axle, Transverse","engine_loc_Front, Longitudinal","engine_loc_Front, Transverse","engine_loc_Middle, Longitudinal","engine_loc_Middle, Transverse","engine_loc_Rear, Longitudinal","engine_loc_Rear, Transverse",has_particulate_filter,has_start_stop,has_cylinder_deact,gears_auto,gears_auto_z,Fuel_tank_L,Fuel_tank_L_z,Accel_0_100_sec,Accel_0_100_z,Piston_Stroke_mm,Piston_Stroke_z,Power_hp,Power_rpm,Power_hp_z,Power_rpm_z,Valves_per_cyl,Valves_per_cyl_z,Climb_angle_deg,Climb_angle_z,Drag_Cd,Drag_Cd_z,Fuel_extra_urban_L100km,Fuel_extra_urban_z,Fuel_extra_L100,Fuel_extra_L100_z,Compression_ratio,Compression_ratio_z,valve_DOHC,valve_OHC,valve_OHV,valve_Other,valve_SOHC,WtoP_kg_per_hp,WtoP_kg_per_hp_z,Trunk_max_L,Trunk_max_L_z,Wheelbase_mm,Wheelbase_z,front_brake_Disc,front_brake_Drum,front_brake_Unknown,Coolant_L,Coolant_L_z,Oil_capacity_L,Oil_capacity_L_z,CO2_gkm,CO2_gkm_z,Rim_size_inch,Rim_size_inch_z,asp_Naturally_aspirated,asp_Quad_turbo,asp_Supercharged,asp_Turbo,asp_Twin_turbo,Departure_angle_deg,Departure_angle_z,fuel_sys_Carburettor,fuel_sys_Diesel - Standard diesel injection (SDI),fuel_sys_Diesel Commonrail,fuel_sys_Direct injection,fuel_sys_Direct injection / Multi-point indirect injection,fuel_sys_Dual point injection,fuel_sys_Mono-point injection,fuel_sys_Multi-point indirect injection,fuel_sys_Pump-nozzle (Unit Injector),Towbar_download_kg,Towbar_download_z,Approach_angle_deg,Approach_angle_z,Torque_Nm,Torque_rpm,Torque_Nm_z,Torque_rpm_z,Trailer_no_brakes_kg,Trailer_no_brakes_z,Max_load_kg,Max_load_z,Trunk_min_L,Trunk_min_L_z,Cylinders,Cylinders_z,Trailer_with_brakes_kg,Trailer_with_brakes_z,Fuel_combined_L100,Fuel_combined_L100_z,drive_All wheel drive (4x4),drive_Front wheel drive,drive_Rear wheel drive,Ride_height_mm,Ride_height_z,Turning_circle_m,Turning_circle_z,Trailer_brakes8_kg,Trailer_brakes8_z,susp_rear_Coil_spring,susp_rear_Independent,susp_rear_Leaf_spring,susp_rear_Multi_link,susp_rear_Other,Max_speed_kmh,Max_speed_z,Rear_track_mm,Rear_track_z,Tire_width,Tire_profile,Rim_diameter,Tire_width_z,Tire_profile_z,Rim_diameter_z,Displacement_cm3,Displacement_z,Fuel_urban_L100,Fuel_urban_L100_z,Rear_overhang_mm,Rear_overhang_z
0,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,95 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,1968.0,2.0,Alpine A110,0.000557,0.000186,0.289474,0.081081,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,95.0,,-0.728227,,,,,,,,,,,,,,False,False,False,True,False,,,,,,,False,False,True,,,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,False,False,False,,,,,,,,,,,,,,,,,,,,,False,False,False,,,,,,,False,False,False,False,True,,,,,,,,,,,,,,,,
1,Internal Combustion engine,Station wagon (estate),Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,54 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,1972.0,3.0,DAF 66,0.000186,0.000186,0.342105,0.108108,0,0,0,1,ICE,0,0,0,0,0,0,0,0,0,1,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,54.0,,-1.09111,,,,,,,,,,,,,,False,False,False,True,False,,,,,,,False,False,True,,,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,False,False,False,,,,,,,,,,,,,,,,,,,,,False,False,False,,,,,,,False,False,False,False,True,,,,,,,,,,,,,,,,
2,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,90 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,1966.0,4.0,Alpine A110,0.000557,0.000186,0.263158,0.135135,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,90.0,,-0.772481,,,,,,,,,,,,,,False,False,False,True,False,,,,,,,False,False,True,,,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,False,False,False,,,,,,,,,,,,,,,,,,,,,False,False,False,,,,,,,False,False,False,False,True,,,,,,,,,,,,,,,,
3,Internal Combustion engine,Station wagon (estate),Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,57 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,1973.0,2.0,DAF 66,0.000186,0.000186,0.355263,0.081081,0,0,0,1,ICE,0,0,0,0,0,0,0,0,0,1,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,57.0,,-1.064557,,,,,,,,,,,,,,False,False,False,True,False,,,,,,,False,False,True,,,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,False,False,False,,,,,,,,,,,,,,,,,,,,,False,False,False,,,,,,,False,False,False,False,True,,,,,,,,,,,,,,,,
4,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,110 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,1968.0,2.0,Alpine A110,0.000557,0.000186,0.289474,0.081081,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,110.0,,-0.595466,,,,,,,,,,,,,,False,False,False,True,False,,,,,,,False,False,True,,,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,False,False,False,,,,,,,,,,,,,,,,,,,,,False,False,False,,,,,,,False,False,False,False,True,,,,,,,,,,,,,,,,
5,Internal Combustion engine,Coupe,Petrol (Gasoline),,,,,,,,,,,,,,,,,,,,,,70 Hp,,,,,,,,,,,,,,,Naturally aspirated engine,,,,,,,,,,,,,,,,,,,,,,1966.0,4.0,Alpine A110,0.000557,0.000186,0.263158,0.135135,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,70.0,,-0.949497,,,,,,,,,,,,,,False,False,False,True,False,,,,,,,False,False,True,,,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,False,False,False,,,,,,,,,,,,,,,,,,,,,False,False,False,,,,,,,False,False,False,False,True,,,,,,,,,,,,,,,,
6,Internal Combustion engine,Coupe,Petrol (Gasoline),,4660.0,1890.0,1340.0,1590.0,,4.0,,70.9 Hp/l,,90.2 mm,,3.0,,1615 kg,,,,70 l,,90 mm,326 Hp @ 5800 rpm.,4.0,,,10.5 l/100 km,9.85,DOHC,"5 kg/Hp, 201.9 Hp/tonne",300 l,2720 mm,Ventilated discs,,,,,Naturally aspirated engine,,,,427 Nm @ 4600 rpm.,,,185 l,8.0,,,,,,Helical spring,250 km/h,1570 mm,255/55 R18,6.1 sec,4601 cm3,15.1 l/100 km,,1998.0,2.0,AC Aceca,0.000186,5.3e-05,0.684211,0.081081,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,5.0,-0.125848,-0.075452,-0.199609,,-0.246883,-0.522185,,,False,True,False,-0.970701,False,False,False,False,True,False,False,70.9,-0.237852,902.0,0.197687,,,,,False,False,False,True,True,False,False,1,False,False,False,True,3.0,-0.995372,1615.0,0.372344,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,70.0,0.491792,6.1,-1.143729,90.0,0.486582,326.0,5800.0,1.316305,0.640161,4.0,0.55783,,,,,10.5,2.01895,10.5,2.01895,9.85,-0.600363,True,False,False,False,False,5.0,-1.410896,300.0,-1.62922,2720.0,-0.103616,True,False,False,,,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,False,True,False,,,,,427.0,4600.0,0.897111,1.463985,,,,,185.0,-1.149808,8.0,2.097549,,,,,False,False,True,,,,,,,True,False,False,False,False,250.0,1.273126,1570.0,0.551901,255.0,55.0,18.0,0.241723,-0.278537,0.011969,4601.0,1.876639,15.1,1.253766,,
7,Internal Combustion engine,Sedan,Petrol (Gasoline),,4805.0,1810.0,1450.0,1550.0,,5.0,,71.3 Hp/l,,,,4.0,,,,,5.0,65 l,,,142 Hp @ 5800 rpm.,4.0,,,,10.0,,,,2700 mm,Ventilated discs,,,,6J x 15,Naturally aspirated engine,,,,187 Nm @ 3800 rpm.,,,480 l,6.0,,,,,,Helical spring,,1545 mm,205/65 R15,,1992 cm3,,,2006.0,5.0,Daewoo Tosca,0.004245,0.000106,0.789474,0.162162,0,0,0,1,ICE,0,0,0,0,0,0,0,0,1,0,Petrol,0,1,0,0,,-0.125825,-0.075512,-0.199575,,-0.246894,,,,False,True,False,0.104765,False,False,True,False,False,False,False,71.3,-0.221777,,,,,,,False,True,False,False,True,False,False,1,False,True,False,False,4.0,-0.075803,,,,,,,,,,,False,False,True,False,False,False,False,0,0,0,5.0,-0.470964,65.0,0.168961,,,,,142.0,5800.0,-0.31224,0.640161,4.0,0.55783,,,,,,,,,10.0,-0.558925,False,False,False,True,False,,,,,2700.0,-0.103622,True,False,False,,,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,False,True,False,,,,,187.0,3800.0,-0.66226,0.792664,,,,,480.0,0.050834,6.0,0.785834,,,,,False,True,False,,,,,,,True,False,False,False,False,,,1545.0,0.249944,205.0,65.0,15.0,-0.038054,0.347557,-0.014582,1992.0,-0.32806,,,,
8,Internal Combustion engine,Hatchback,Petrol (Gasoline),,4158.0,1663.0,1360.0,1400.0,,5.0,,46.9 Hp/l,,79 mm,,3.0,,990 kg,,,3.0,50 l,,81.5 mm,75 Hp @ 5600 rpm.,2.0,,,,8.6,OHC,"13.2 kg/Hp, 75.8 Hp/tonne",,2520 mm,Disc,,,,,Naturally aspirated engine,,,,122 Nm @ 2800 rpm.,,,,4.0,,,,,,Helical spring,,1406 mm,175/70 R13,,1598 cm3,,,1986.0,9.0,Daewoo Racer,0.004245,0.000212,0.526316,0.27027,0,0,0,1,ICE,0,0,1,0,0,0,0,0,0,0,Petrol,0,1,0,0,,-0.125927,-0.075623,-0.199603,,-0.246935,,,,False,True,False,0.104765,False,False,True,False,False,False,False,46.9,-1.202342,79.0,-0.305259,,,,,False,True,False,False,False,True,False,1,False,False,False,True,3.0,-0.995372,990.0,-1.334203,,,,,,,,,False,False,True,False,False,False,False,0,0,0,3.0,-1.598456,50.0,-0.799534,,,81.5,-0.591406,75.0,5600.0,-0.905243,0.439987,2.0,-1.729519,,,,,,,,,8.6,-0.945687,False,True,False,False,False,13.2,1.062185,,,2520.0,-0.103675,True,False,False,,,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,True,False,False,,,,,122.0,2800.0,-1.08459,-0.046486,,,,,,,4.0,-0.52588,,,,,False,True,False,,,,,,,True,False,False,False,False,,,1406.0,-1.428942,175.0,70.0,13.0,-0.20592,0.660604,-0.032283,1598.0,-0.661004,,,,
9,Internal Combustion engine,Sedan,Petrol (Gasoline),1460.0,4482.0,1662.0,1394.0,1400.0,,5.0,,50.1 Hp/l,,76.5 mm,,4.0,,969 kg,,,,50 l,,81.5 mm,75 Hp @ 5400 rpm.,2.0,,,5.1 l/100 km,9.5,OHC,"12.9 kg/Hp, 77.4 Hp/tonne",,2520 mm,Ventilated discs,,,,R13,Naturally aspirated engine,,,,123 Nm @ 3200 rpm.,,491 kg,530 l,4.0,,6.3 l/100 km,,,,Suspension with traction connecting levers,163 km/h,1406 mm,175/70 R13,12.5 sec,1498 cm3,9.3 l/100 km,,1993.0,2.0,Daewoo Racer,0.004245,0.000212,0.618421,0.081081,0,0,0,1,ICE,0,0,0,0,0,0,0,0,1,0,Petrol,0,1,0,0,5.0,-0.125876,-0.075624,-0.199592,-0.084383,-0.246935,-0.522185,,,False,True,False,0.104765,False,False,True,False,False,False,False,50.1,-1.073743,765.0,0.113965,,,,,False,True,False,False,False,True,False,0,False,False,False,True,4.0,-0.075803,969.0,-1.391543,,,,,,,,,False,False,True,False,False,False,False,0,0,0,,,50.0,-0.799534,12.5,0.791844,81.5,-0.591406,75.0,5400.0,-0.905243,0.239812,2.0,-1.729519,,,,,5.1,-0.568392,5.1,-0.568392,9.5,-0.697054,False,True,False,False,False,12.9,0.971706,,,2520.0,-0.103675,True,False,False,,,,,,,13.0,-1.201539,True,False,False,False,False,,,False,False,False,False,False,False,True,False,False,,,,,123.0,3200.0,-1.078092,0.289174,,,491.0,-0.338552,530.0,0.254333,4.0,-0.52588,,,6.3,-0.426788,False,True,False,,,,,,,False,False,False,False,True,163.0,-1.165669,1406.0,-1.428942,175.0,70.0,13.0,-0.20592,0.660604,-0.032283,1498.0,-0.745508,9.3,-0.179866,,


In [29]:
# exporting cleaned CSV
#df.to_csv('carsdirectory_cleaned.csv', index=False, encoding='utf-8')

In [30]:
print(df.head(10))

      Powertrain Architecture               Body type          Fuel Type  Max. weight  Length  Width  Height  Front track Emission standard  Seats Battery capacity Power per litre Max. roof load Cylinder Bore 100 km/h - 0 Doors Maximum engine speed Kerb Weight Front overhang Ramp angle Number of Gears (automatic transmission) Fuel tank capacity Acceleration Piston Stroke               Power  Number of valves per cylinder Climb angle Drag coefficient (Cd) Fuel consumption - extra urban  Compression ratio Valvetrain      Weight-to-power ratio Trunk (boot) space - maximum Wheelbase      Front brakes Coolant Engine oil capacity CO2 emissions Wheel rims size           Engine aspiration Departure angle Permitted towbar download Approach angle              Torque Permitted trailer load without brakes Max load Trunk (boot) space - minimum  Number of cylinders Permitted trailer load with brakes (12%) Fuel consumption - combined Ride height Minimum turning circle (turning diameter) Permitted tra

In [31]:
# final cleanup

# combining attributes:
df['Fuel_avg_L100'] = df[['Fuel_combined_L100', 'Fuel_urban_L100', 'Fuel_extra_urban_L100km']].mean(axis=1, skipna=True)
df['Fuel_avg_z'] = StandardScaler().fit_transform(df[['Fuel_avg_L100']])

df['Trunk_avg_L'] = df[['Trunk_min_L', 'Trunk_max_L']].mean(axis=1, skipna=True)
df['Trunk_avg_z'] = StandardScaler().fit_transform(df[['Trunk_avg_L']])


# erasing unnecessary original columns
original_columns = [
    'Powertrain Architecture', 'Body type', 'Fuel Type', 'Battery capacity',
    'Power per litre', 'Max. roof load', 'Cylinder Bore', '100 km/h - 0',
    'Doors', 'Maximum engine speed', 'Kerb Weight', 'Front overhang', 'Ramp angle', 'Number of Gears (automatic transmission)',
    'Fuel tank capacity', 'Acceleration', 'Piston Stroke', 'Power', 'Number of vales per cylinder', 'Climb angle', 'Drag coefficient(Cd)',
    'Fuel consumption - extra urban', 'Compression ratio', 'Weight-to-power ratio', 'Trunk (boot) space - maximum',
    'Wheelbase', 'Coolant', 'Engine oil capacity', 'CO2 emissions', 'Departure angle',
    'Permitted towbar download', 'Approach angle', 'Torque', 'Permitted trailer load without brakes', 
    'Max load', 'Trunk (boot) space - minimum', 'Number on cylinders', 'Permitted trailer load with brakes (12%)', 
    'Fuel consumption - combined', 'Ride height', 'Minimum turning circle (turning diameter)', 'Permitted trailer load with brakes (8%)', 
    'Maximum speed', 'Rear (Back) track', 'Tires size', 'Acceleration 0 - 100 km/h (0 - 62 mph)', 'Engine displacement', 
    'Fuel consumption - urban', 'Rear overhang', 'Fuel_extra_L100', 'Fuel_extra_L100_z'
]

# renaming some columns
df = df.rename(columns={
    'Trailer_with_brakes_kg': 'Trailer_brakes12_kg',
    'Trailer_with_brakes_z': 'Trailer_brakes12_z',
    'cyl_180° flat V-engine': 'cyl_180deg flat V-engine'
})

# removing redundant columns that add pointless noise
redundant_columns = [
    'Power_per_litre', 'Power_per_litre_z',
    'Emission standard', 'emission_level', 'emission_level_z',
    'Wheel rims size', 'Rim_size_inch', 'Rim_size_inch_z'
]

context_columns = [
    'Valvetrain', 'Front brakes', 'Engine aspiration', 'Rear suspension'
]

cols_to_drop = original_columns + redundant_columns + context_columns
cols_to_drop = [col for col in cols_to_drop if col in df.columns]

print(f" {len(cols_to_drop)} columns are being deleted..")

df.drop(columns=cols_to_drop, inplace=True)
print(f"Remaining {df.shape[1]} columns")

df.head(10)

 60 columns are being deleted..
Remaining 206 columns


Unnamed: 0,Max. weight,Length,Width,Height,Front track,Seats,Number of valves per cylinder,Drag coefficient (Cd),Number of cylinders,Year,Production_years,Brand_Model,Brand_freq,Brand_Model_freq,Year_norm,Production_years_norm,is_ev,is_phev,is_mhev_fhev,is_ice,Powertrain_simple,Body_Cabriolet,Body_Coupe,Body_Hatchback,Body_MPV,Body_Other,Body_Pick-up,Body_Roadster,Body_SUV,Body_Sedan,Body_Station wagon,Fuel_simple,is_diesel,is_petrol,is_hybrid_ev,is_lpg_cng,gears_manual,Length_z,Width_z,Height_z,Max. weight_z,Front track_z,gears_manual_z,steering_Cone worm with recirculation balls,steering_Steering rack and pinion,steering_Worm-reduction unit,Seats_z,cyl_180deg flat V-engine,cyl_Boxer,cyl_Inline,cyl_Rotary (Wankel),cyl_V-engine,cyl_VR-engine,cyl_W-engine,Cylinder_Bore_mm,Cylinder_Bore_z,Max_roof_load_kg,Max_roof_load_z,Battery_kWh,Battery_kWh_z,drive_AWD,drive_FWD,drive_Other,drive_RWD,rear_brake_Disc,rear_brake_Drum,rear_brake_Unknown,has_abs,susp_Double_wishbone,susp_McPherson,susp_Multi_link,susp_Other,Doors_clean,Doors_clean_z,Kerb_Weight_kg,Kerb_Weight_kg_z,Braking_100to0_m,Braking_100to0_m_z,Max_RPM,Max_RPM_z,Front_overhang_mm,Front_overhang_mm_z,Ramp_angle_deg,Ramp_angle_deg_z,"engine_loc_Front axle, Transverse","engine_loc_Front, Longitudinal","engine_loc_Front, Transverse","engine_loc_Middle, Longitudinal","engine_loc_Middle, Transverse","engine_loc_Rear, Longitudinal","engine_loc_Rear, Transverse",has_particulate_filter,has_start_stop,has_cylinder_deact,gears_auto,gears_auto_z,Fuel_tank_L,Fuel_tank_L_z,Accel_0_100_sec,Accel_0_100_z,Piston_Stroke_mm,Piston_Stroke_z,Power_hp,Power_rpm,Power_hp_z,Power_rpm_z,Valves_per_cyl,Valves_per_cyl_z,Climb_angle_deg,Climb_angle_z,Drag_Cd,Drag_Cd_z,Fuel_extra_urban_L100km,Fuel_extra_urban_z,Compression_ratio,Compression_ratio_z,valve_DOHC,valve_OHC,valve_OHV,valve_Other,valve_SOHC,WtoP_kg_per_hp,WtoP_kg_per_hp_z,Trunk_max_L,Trunk_max_L_z,Wheelbase_mm,Wheelbase_z,front_brake_Disc,front_brake_Drum,front_brake_Unknown,Coolant_L,Coolant_L_z,Oil_capacity_L,Oil_capacity_L_z,CO2_gkm,CO2_gkm_z,asp_Naturally_aspirated,asp_Quad_turbo,asp_Supercharged,asp_Turbo,asp_Twin_turbo,Departure_angle_deg,Departure_angle_z,fuel_sys_Carburettor,fuel_sys_Diesel - Standard diesel injection (SDI),fuel_sys_Diesel Commonrail,fuel_sys_Direct injection,fuel_sys_Direct injection / Multi-point indirect injection,fuel_sys_Dual point injection,fuel_sys_Mono-point injection,fuel_sys_Multi-point indirect injection,fuel_sys_Pump-nozzle (Unit Injector),Towbar_download_kg,Towbar_download_z,Approach_angle_deg,Approach_angle_z,Torque_Nm,Torque_rpm,Torque_Nm_z,Torque_rpm_z,Trailer_no_brakes_kg,Trailer_no_brakes_z,Max_load_kg,Max_load_z,Trunk_min_L,Trunk_min_L_z,Cylinders,Cylinders_z,Trailer_brakes12_kg,Trailer_brakes12_z,Fuel_combined_L100,Fuel_combined_L100_z,drive_All wheel drive (4x4),drive_Front wheel drive,drive_Rear wheel drive,Ride_height_mm,Ride_height_z,Turning_circle_m,Turning_circle_z,Trailer_brakes8_kg,Trailer_brakes8_z,susp_rear_Coil_spring,susp_rear_Independent,susp_rear_Leaf_spring,susp_rear_Multi_link,susp_rear_Other,Max_speed_kmh,Max_speed_z,Rear_track_mm,Rear_track_z,Tire_width,Tire_profile,Rim_diameter,Tire_width_z,Tire_profile_z,Rim_diameter_z,Displacement_cm3,Displacement_z,Fuel_urban_L100,Fuel_urban_L100_z,Rear_overhang_mm,Rear_overhang_z,Fuel_avg_L100,Fuel_avg_z,Trunk_avg_L,Trunk_avg_z
0,,,,,,,,,,1968.0,2.0,Alpine A110,0.000557,0.000186,0.289474,0.081081,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,95.0,,-0.728227,,,,,,,,,,,,False,False,False,True,False,,,,,,,False,False,True,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,False,False,False,,,,,,,,,,,,,,,,,,,,,False,False,False,,,,,,,False,False,False,False,True,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,1972.0,3.0,DAF 66,0.000186,0.000186,0.342105,0.108108,0,0,0,1,ICE,0,0,0,0,0,0,0,0,0,1,Petrol,0,1,0,0,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,54.0,,-1.09111,,,,,,,,,,,,False,False,False,True,False,,,,,,,False,False,True,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,False,False,False,,,,,,,,,,,,,,,,,,,,,False,False,False,,,,,,,False,False,False,False,True,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,1966.0,4.0,Alpine A110,0.000557,0.000186,0.263158,0.135135,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,90.0,,-0.772481,,,,,,,,,,,,False,False,False,True,False,,,,,,,False,False,True,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,False,False,False,,,,,,,,,,,,,,,,,,,,,False,False,False,,,,,,,False,False,False,False,True,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,1973.0,2.0,DAF 66,0.000186,0.000186,0.355263,0.081081,0,0,0,1,ICE,0,0,0,0,0,0,0,0,0,1,Petrol,0,1,0,0,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,57.0,,-1.064557,,,,,,,,,,,,False,False,False,True,False,,,,,,,False,False,True,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,False,False,False,,,,,,,,,,,,,,,,,,,,,False,False,False,,,,,,,False,False,False,False,True,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,1968.0,2.0,Alpine A110,0.000557,0.000186,0.289474,0.081081,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,110.0,,-0.595466,,,,,,,,,,,,False,False,False,True,False,,,,,,,False,False,True,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,False,False,False,,,,,,,,,,,,,,,,,,,,,False,False,False,,,,,,,False,False,False,False,True,,,,,,,,,,,,,,,,,,,,
5,,,,,,,,,,1966.0,4.0,Alpine A110,0.000557,0.000186,0.263158,0.135135,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,,,,,,,,False,False,False,,False,False,False,False,False,False,False,,,,,,,False,False,True,False,False,False,True,0,False,False,False,True,,,,,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,,,,,,,70.0,,-0.949497,,,,,,,,,,,,False,False,False,True,False,,,,,,,False,False,True,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,False,False,False,,,,,,,,,,,,,,,,,,,,,False,False,False,,,,,,,False,False,False,False,True,,,,,,,,,,,,,,,,,,,,
6,,4660.0,1890.0,1340.0,1590.0,4.0,4.0,,8.0,1998.0,2.0,AC Aceca,0.000186,5.3e-05,0.684211,0.081081,0,0,0,1,ICE,0,1,0,0,0,0,0,0,0,0,Petrol,0,1,0,0,5.0,-0.125848,-0.075452,-0.199609,,-0.246883,-0.522185,False,True,False,-0.970701,False,False,False,False,True,False,False,902.0,0.197687,,,,,False,False,False,True,True,False,False,1,False,False,False,True,3.0,-0.995372,1615.0,0.372344,,,,,,,,,False,False,False,False,False,False,False,0,0,0,,,70.0,0.491792,6.1,-1.143729,90.0,0.486582,326.0,5800.0,1.316305,0.640161,4.0,0.55783,,,,,10.5,2.01895,9.85,-0.600363,True,False,False,False,False,5.0,-1.410896,300.0,-1.62922,2720.0,-0.103616,True,False,False,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,False,True,False,,,,,427.0,4600.0,0.897111,1.463985,,,,,185.0,-1.149808,8.0,2.097549,,,,,False,False,True,,,,,,,True,False,False,False,False,250.0,1.273126,1570.0,0.551901,255.0,55.0,18.0,0.241723,-0.278537,0.011969,4601.0,1.876639,15.1,1.253766,,,12.8,1.591205,242.5,-1.171396
7,,4805.0,1810.0,1450.0,1550.0,5.0,4.0,,6.0,2006.0,5.0,Daewoo Tosca,0.004245,0.000106,0.789474,0.162162,0,0,0,1,ICE,0,0,0,0,0,0,0,0,1,0,Petrol,0,1,0,0,,-0.125825,-0.075512,-0.199575,,-0.246894,,False,True,False,0.104765,False,False,True,False,False,False,False,,,,,,,False,True,False,False,True,False,False,1,False,True,False,False,4.0,-0.075803,,,,,,,,,,,False,False,True,False,False,False,False,0,0,0,5.0,-0.470964,65.0,0.168961,,,,,142.0,5800.0,-0.31224,0.640161,4.0,0.55783,,,,,,,10.0,-0.558925,False,False,False,True,False,,,,,2700.0,-0.103622,True,False,False,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,False,True,False,,,,,187.0,3800.0,-0.66226,0.792664,,,,,480.0,0.050834,6.0,0.785834,,,,,False,True,False,,,,,,,True,False,False,False,False,,,1545.0,0.249944,205.0,65.0,15.0,-0.038054,0.347557,-0.014582,1992.0,-0.32806,,,,,,,480.0,-0.663952
8,,4158.0,1663.0,1360.0,1400.0,5.0,2.0,,4.0,1986.0,9.0,Daewoo Racer,0.004245,0.000212,0.526316,0.27027,0,0,0,1,ICE,0,0,1,0,0,0,0,0,0,0,Petrol,0,1,0,0,,-0.125927,-0.075623,-0.199603,,-0.246935,,False,True,False,0.104765,False,False,True,False,False,False,False,79.0,-0.305259,,,,,False,True,False,False,False,True,False,1,False,False,False,True,3.0,-0.995372,990.0,-1.334203,,,,,,,,,False,False,True,False,False,False,False,0,0,0,3.0,-1.598456,50.0,-0.799534,,,81.5,-0.591406,75.0,5600.0,-0.905243,0.439987,2.0,-1.729519,,,,,,,8.6,-0.945687,False,True,False,False,False,13.2,1.062185,,,2520.0,-0.103675,True,False,False,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,True,False,False,,,,,122.0,2800.0,-1.08459,-0.046486,,,,,,,4.0,-0.52588,,,,,False,True,False,,,,,,,True,False,False,False,False,,,1406.0,-1.428942,175.0,70.0,13.0,-0.20592,0.660604,-0.032283,1598.0,-0.661004,,,,,,,,
9,1460.0,4482.0,1662.0,1394.0,1400.0,5.0,2.0,,4.0,1993.0,2.0,Daewoo Racer,0.004245,0.000212,0.618421,0.081081,0,0,0,1,ICE,0,0,0,0,0,0,0,0,1,0,Petrol,0,1,0,0,5.0,-0.125876,-0.075624,-0.199592,-0.084383,-0.246935,-0.522185,False,True,False,0.104765,False,False,True,False,False,False,False,765.0,0.113965,,,,,False,True,False,False,False,True,False,0,False,False,False,True,4.0,-0.075803,969.0,-1.391543,,,,,,,,,False,False,True,False,False,False,False,0,0,0,,,50.0,-0.799534,12.5,0.791844,81.5,-0.591406,75.0,5400.0,-0.905243,0.239812,2.0,-1.729519,,,,,5.1,-0.568392,9.5,-0.697054,False,True,False,False,False,12.9,0.971706,,,2520.0,-0.103675,True,False,False,,,,,,,True,False,False,False,False,,,False,False,False,False,False,False,True,False,False,,,,,123.0,3200.0,-1.078092,0.289174,,,491.0,-0.338552,530.0,0.254333,4.0,-0.52588,,,6.3,-0.426788,False,True,False,,,,,,,False,False,False,False,True,163.0,-1.165669,1406.0,-1.428942,175.0,70.0,13.0,-0.20592,0.660604,-0.032283,1498.0,-0.745508,9.3,-0.179866,,,6.9,-0.387638,530.0,-0.557122


In [32]:
# splitting human readable columns that are not used while training
"""
carsdirectory_context.csv = 

carsdirectory_data.csv = 
"""

'\ncarsdirectory_context.csv = \n\ncarsdirectory_data.csv = \n'

In [33]:
# Nomralizing column names
# ===========================================================================
df.columns = (df.columns
              .str.replace(r'[()\)|]', '', regex=True)   # removes ( ) [ ]
              .str.replace(r'[-\s.]+', '_', regex=True)   # - and spaces → _
              .str.replace(r'_+', '_', regex=True)
              .str.strip('_')
              .str.lower())

low_fill_cols_3 = analyse_instances(df, low_fill_threshold=5)
print(low_fill_cols_3)

Dataset: 37,695 rows × 206 columns

----------------------------------------------------------------------------------------------------
  1.   [max_weight]
     Fill: 26,803 / 37,695 (71.10%) | Unique: 1,356
     Too many unique values (1,356), showing top 5:
         - <NaN>: 10,892
         - 2000: 288
         - 1870: 256
         - 1800: 246
         - 1900: 231
         ... and 1,351 more unique values

  2.   [length]
     Fill: 34,524 / 37,695 (91.59%) | Unique: 1,628
     Too many unique values (1,628), showing top 5:
         - <NaN>: 3,171
         - 4635: 200
         - 4765: 196
         - 4740: 195
         - 4796: 160
         ... and 1,623 more unique values

  3.   [width]
     Fill: 34,323 / 37,695 (91.05%) | Unique: 581
     Too many unique values (581), showing top 5:
         - <NaN>: 3,372
         - 1695: 1,096
         - 1810: 645
         - 1740: 570
         - 1690: 543
         ... and 576 more unique values

  4.   [height]
     Fill: 34,304 / 37,695 (91.00%

In [34]:
# Exporting dataset

df.to_csv('carsdirectory_data.csv', index=False)