In [1]:
import polars as pl

In [2]:
# Load the car fuel-efficiency dataset directly from its GitHub-hosted CSV.
data = pl.read_csv(
    source='https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv',
)

In [3]:
# Preview the first five rows (rich display as the cell's last expression).
data.head(n=5)

engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
i64,i64,i64,f64,f64,i64,str,str,str,i64,f64
170,3.0,159.0,3413.433759,17.7,2003,"""Europe""","""Gasoline""","""All-wheel drive""",0,13.231729
130,5.0,97.0,3149.664934,17.8,2007,"""USA""","""Gasoline""","""Front-wheel drive""",0,13.688217
170,,78.0,3079.038997,15.1,2018,"""Europe""","""Gasoline""","""Front-wheel drive""",0,14.246341
220,4.0,,2542.392402,20.2,2009,"""USA""","""Diesel""","""All-wheel drive""",2,16.912736
210,1.0,140.0,3460.87099,14.4,2009,"""Europe""","""Gasoline""","""All-wheel drive""",2,12.488369


In [4]:
# Plain-text overview: dimensions, first rows, then summary statistics,
# each on its own line(s).
print(data.shape, data.head(), data.describe(), sep="\n")

(9704, 11)
shape: (5, 11)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ engine_di ┆ num_cylin ┆ horsepowe ┆ vehicle_w ┆ … ┆ fuel_type ┆ drivetrai ┆ num_doors ┆ fuel_eff │
│ splacemen ┆ ders      ┆ r         ┆ eight     ┆   ┆ ---       ┆ n         ┆ ---       ┆ iciency_ │
│ t         ┆ ---       ┆ ---       ┆ ---       ┆   ┆ str       ┆ ---       ┆ i64       ┆ mpg      │
│ ---       ┆ i64       ┆ i64       ┆ f64       ┆   ┆           ┆ str       ┆           ┆ ---      │
│ i64       ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ f64      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 170       ┆ 3         ┆ 159       ┆ 3413.4337 ┆ … ┆ Gasoline  ┆ All-wheel ┆ 0         ┆ 13.23172 │
│           ┆           ┆           ┆ 59        ┆   ┆           ┆ drive     ┆           ┆ 9        │
│ 130       ┆ 5         ┆ 97        ┆ 3149.6649 ┆ … ┆ Gasoline  ┆

In [7]:
# Per-column null counts before imputation.
print("Missing values before:")
print(data.null_count())

# Replace missing values with 0.
# NOTE: with an integer fill value Polars only fills numeric columns; nulls in
# string columns (if any) are left untouched, so "after" is checked explicitly.
df = data.fill_null(0)

# Verify the result. The one-row null-count frame is truncated ("…") when
# printed for wide tables, so also report the total and name every column
# that still holds nulls.
print("\nMissing values after:")
nulls_after = df.null_count()
print(nulls_after)

remaining_nulls = {
    column: count
    for column, count in zip(nulls_after.columns, nulls_after.row(0))
    if count
}
print(f"Total nulls remaining: {sum(remaining_nulls.values())}")
print(f"Columns still containing nulls: {remaining_nulls}")

Missing values before:
shape: (1, 11)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ engine_di ┆ num_cylin ┆ horsepowe ┆ vehicle_w ┆ … ┆ fuel_type ┆ drivetrai ┆ num_doors ┆ fuel_eff │
│ splacemen ┆ ders      ┆ r         ┆ eight     ┆   ┆ ---       ┆ n         ┆ ---       ┆ iciency_ │
│ t         ┆ ---       ┆ ---       ┆ ---       ┆   ┆ u32       ┆ ---       ┆ u32       ┆ mpg      │
│ ---       ┆ u32       ┆ u32       ┆ u32       ┆   ┆           ┆ u32       ┆           ┆ ---      │
│ u32       ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ u32      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 0         ┆ 0         ┆ 0         ┆ 0         ┆ … ┆ 0         ┆ 0         ┆ 0         ┆ 0        │
└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘

Missing values after:
shape: (1, 11)
┌───────────┬──

In [8]:
from sklearn.model_selection import train_test_split
import numpy as np

# Split configuration, kept in one place so both split calls stay in sync.
SEED = 1                      # fixed seed -> same partition on every run
HOLDOUT_FRACTION = 0.4        # share of rows held out for validation + test
TEST_SHARE_OF_HOLDOUT = 0.5   # half of the holdout goes to test, half to validation

# Split row positions rather than the frame itself; the positions are then
# used to select rows from the Polars DataFrame.
indices = np.arange(len(df))

# First split: 60% train, 40% temp (validation + test)
train_idx, temp_idx = train_test_split(
    indices, test_size=HOLDOUT_FRACTION, random_state=SEED
)

# Second split: divide the temp rows 50/50 (20% validation, 20% test overall)
val_idx, test_idx = train_test_split(
    temp_idx, test_size=TEST_SHARE_OF_HOLDOUT, random_state=SEED
)

# Invariant: the three index sets form a partition of all rows
# (no row lost, no row in two splits).
assert np.array_equal(
    np.sort(np.concatenate([train_idx, val_idx, test_idx])), indices
), "train/val/test indices do not partition the dataset"

# Use indices to select rows in Polars
df_train = df[train_idx]
df_val = df[val_idx]
df_test = df[test_idx]

# Report the split sizes and their share of the full dataset
total = len(df)
print(f"Total: {total}")
for label, part in (("Train", df_train), ("Validation", df_val), ("Test", df_test)):
    print(f"{label}: {len(part)} ({len(part)/total*100:.1f}%)")

Total: 9704
Train: 5822 (60.0%)
Validation: 1941 (20.0%)
Test: 1941 (20.0%)


In [9]:
# Separate the target variable from the features for each split.
#
# Each split is derived from `df` and the index arrays instead of from the
# current `df_train` / `df_val` / `df_test`. This keeps the cell idempotent:
# reading the target from a frame that was already rebound to its target-less
# version would fail on a second run because the column is gone.
TARGET = 'fuel_efficiency_mpg'

rows_train, rows_val, rows_test = df[train_idx], df[val_idx], df[test_idx]

# Targets as NumPy arrays
y_train = rows_train[TARGET].to_numpy()
y_val = rows_val[TARGET].to_numpy()
y_test = rows_test[TARGET].to_numpy()

# Feature frames: everything except the target column
df_train = rows_train.drop(TARGET)
df_val = rows_val.drop(TARGET)
df_test = rows_test.drop(TARGET)

# Features and targets must stay row-aligned, and the target must not leak
# into the feature frames.
for features, target in ((df_train, y_train), (df_val, y_val), (df_test, y_test)):
    assert len(features) == len(target), "features/target row count mismatch"
    assert TARGET not in features.columns, "target column leaked into features"

print(f"Train features shape: {df_train.shape}")
print(f"Train target shape: {y_train.shape}")

Train features shape: (5822, 10)
Train target shape: (5822,)
