In [2]:
import polars as pl

In [3]:
# Read the car fuel-efficiency CSV directly from its GitHub URL into a Polars DataFrame.
data = pl.read_csv(
    source='https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv',
)

In [4]:
# Preview the first five rows (five is also what head() returns by default).
data.head(5)

engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
i64,i64,i64,f64,f64,i64,str,str,str,i64,f64
170,3.0,159.0,3413.433759,17.7,2003,"""Europe""","""Gasoline""","""All-wheel drive""",0,13.231729
130,5.0,97.0,3149.664934,17.8,2007,"""USA""","""Gasoline""","""Front-wheel drive""",0,13.688217
170,,78.0,3079.038997,15.1,2018,"""Europe""","""Gasoline""","""Front-wheel drive""",0,14.246341
220,4.0,,2542.392402,20.2,2009,"""USA""","""Diesel""","""All-wheel drive""",2,16.912736
210,1.0,140.0,3460.87099,14.4,2009,"""Europe""","""Gasoline""","""All-wheel drive""",2,12.488369


In [5]:
# Print, in order: the (rows, columns) shape, a five-row preview, and summary statistics.
for overview in (data.shape, data.head(), data.describe()):
    print(overview)

(9704, 11)
shape: (5, 11)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ engine_di ┆ num_cylin ┆ horsepowe ┆ vehicle_w ┆ … ┆ fuel_type ┆ drivetrai ┆ num_doors ┆ fuel_eff │
│ splacemen ┆ ders      ┆ r         ┆ eight     ┆   ┆ ---       ┆ n         ┆ ---       ┆ iciency_ │
│ t         ┆ ---       ┆ ---       ┆ ---       ┆   ┆ str       ┆ ---       ┆ i64       ┆ mpg      │
│ ---       ┆ i64       ┆ i64       ┆ f64       ┆   ┆           ┆ str       ┆           ┆ ---      │
│ i64       ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ f64      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 170       ┆ 3         ┆ 159       ┆ 3413.4337 ┆ … ┆ Gasoline  ┆ All-wheel ┆ 0         ┆ 13.23172 │
│           ┆           ┆           ┆ 59        ┆   ┆           ┆ drive     ┆           ┆ 9        │
│ 130       ┆ 5         ┆ 97        ┆ 3149.6649 ┆ … ┆ Gasoline  ┆

In [6]:
# Report the per-column null counts of the raw frame.
print("Missing values before:", data.null_count(), sep="\n")

# Build a new frame with nulls filled by the literal 0; `data` itself is not modified.
df = data.fill_null(value=0)

# Print the null counts of the filled frame so the result can be checked by eye.
print("\nMissing values after:", df.null_count(), sep="\n")

Missing values before:
shape: (1, 11)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ engine_di ┆ num_cylin ┆ horsepowe ┆ vehicle_w ┆ … ┆ fuel_type ┆ drivetrai ┆ num_doors ┆ fuel_eff │
│ splacemen ┆ ders      ┆ r         ┆ eight     ┆   ┆ ---       ┆ n         ┆ ---       ┆ iciency_ │
│ t         ┆ ---       ┆ ---       ┆ ---       ┆   ┆ u32       ┆ ---       ┆ u32       ┆ mpg      │
│ ---       ┆ u32       ┆ u32       ┆ u32       ┆   ┆           ┆ u32       ┆           ┆ ---      │
│ u32       ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ u32      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 0         ┆ 482       ┆ 708       ┆ 0         ┆ … ┆ 0         ┆ 0         ┆ 502       ┆ 0        │
└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘

Missing values after:
shape: (1, 11)
┌───────────┬──

In [7]:
from sklearn.model_selection import train_test_split
import numpy as np

# The split is performed on row positions (0 .. n-1) and applied to the frame afterwards.
indices = np.arange(df.height)

# Stage 1: keep 60% of the rows for training, hold out the remaining 40%.
train_idx, temp_idx = train_test_split(indices, test_size=0.4, random_state=1)

# Stage 2: halve the held-out rows, giving 20% validation and 20% test of the full data.
val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=1)

# Materialise each subset by indexing the Polars frame with its array of row positions.
df_train, df_val, df_test = (df[idx] for idx in (train_idx, val_idx, test_idx))

# Report the size of every subset and its share of the full data.
total = df.height
print(f"Total: {total}")
print(f"Train: {len(df_train)} ({len(df_train)/total*100:.1f}%)")
print(f"Validation: {len(df_val)} ({len(df_val)/total*100:.1f}%)")
print(f"Test: {len(df_test)} ({len(df_test)/total*100:.1f}%)")

Total: 9704
Train: 5822 (60.0%)
Validation: 1941 (20.0%)
Test: 1941 (20.0%)


In [8]:
# Extract the target column of each split as a NumPy array.
# NOTE: this cell rebinds df_train/df_val/df_test without the target column, so it can only
# be run once per split; re-running it fails because the column is already gone.
y_train, y_val, y_test = (
    part.get_column('fuel_efficiency_mpg').to_numpy()
    for part in (df_train, df_val, df_test)
)

# Keep only the predictor columns in the feature frames.
df_train, df_val, df_test = (
    part.drop('fuel_efficiency_mpg')
    for part in (df_train, df_val, df_test)
)

print(f"Train features shape: {df_train.shape}")
print(f"Train target shape: {y_train.shape}")

Train features shape: (5822, 10)
Train target shape: (5822,)


In [15]:
# Step 3: Convert Polars DataFrames to dictionaries
from sklearn.feature_extraction import DictVectorizer

# One dict per row for every split: the record format DictVectorizer takes as input.
train_dicts, val_dicts, test_dicts = (
    part.to_dicts() for part in (df_train, df_val, df_test)
)

# Fit the feature mapping on the training rows only, then apply that same mapping
# to the validation and test rows so all three matrices share one column layout.
dv = DictVectorizer(sparse=True)
X_train_matrix = dv.fit_transform(train_dicts)
X_val_matrix, X_test_matrix = dv.transform(val_dicts), dv.transform(test_dicts)

print(f"\nMatrix shapes:")
print(f"X_train: {X_train_matrix.shape}")
print(f"X_val: {X_val_matrix.shape}")
print(f"X_test: {X_test_matrix.shape}")


Matrix shapes:
X_train: (5822, 14)
X_val: (1941, 14)
X_test: (1941, 14)


In [19]:
from sklearn.tree import DecisionTreeRegressor

# Fit a depth-1 decision tree (a single split) on the vectorised training data.
# random_state is fixed (to 1, like every other model in this notebook) because
# scikit-learn permutes the candidate features at each split; without a seed, ties
# between equally good splits may be resolved differently from one run to the next.
dt = DecisionTreeRegressor(max_depth=1, random_state=1)
dt.fit(X_train_matrix, y_train)

print("Decision Tree model trained!")
print(f"Max depth: {dt.max_depth}")

Decision Tree model trained!
Max depth: 1


In [20]:
# Column names produced by the DictVectorizer that built the training matrix.
feature_names = dv.get_feature_names_out()

# Node 0 of the fitted tree is the root: read which column it splits on and at what value.
feature_idx = dt.tree_.feature[0]
threshold = dt.tree_.threshold[0]
splitting_feature = feature_names[feature_idx]

print(f"Feature used for splitting: {splitting_feature}")
print(f"Feature index: {feature_idx}")
print(f"Split threshold: {threshold}")

Feature used for splitting: vehicle_weight
Feature index: 13
Split threshold: 3028.8232421875


In [23]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Baseline forest: 10 trees with a fixed seed.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train_matrix, y_train)

print("Random Forest model trained!")

# Score on the validation split; RMSE is the square root of the mean squared error.
y_val_pred = rf.predict(X_val_matrix)
mse = mean_squared_error(y_val, y_val_pred)
rmse = np.sqrt(mse)

print(f"Validation RMSE: {rmse:.4f}")
print(f"Rounded RMSE: {rmse:.1f}")

Random Forest model trained!
Validation RMSE: 0.4616
Rounded RMSE: 0.5


In [24]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Validation RMSE for forests of 10, 20, ..., 200 trees; one record per setting.
results = []

for n in range(10, 201, 10):
    # A fresh forest is trained from scratch for every tree count (same seed each time).
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train_matrix, y_train)

    y_val_pred = rf.predict(X_val_matrix)
    mse = mean_squared_error(y_val, y_val_pred)
    rmse = np.sqrt(mse)

    results.append({'n_estimators': n, 'rmse': rmse})
    print(f"n_estimators={n:3d}, RMSE={rmse:.4f}")

print("\n" + "="*50)

# Walk the results in order, comparing RMSE rounded to 3 decimals against the best so far.
# Only a strict decrease counts as an improvement, so `stopped_at` ends up being the first
# n_estimators value that reaches the lowest rounded RMSE seen anywhere in the sweep.
best_rmse = float('inf')
stopped_at = None

for result in results:
    current_rmse = round(result['rmse'], 3)

    if not current_rmse < best_rmse:
        continue

    best_rmse = current_rmse
    stopped_at = result['n_estimators']
    print(f"Improvement at n_estimators={result['n_estimators']}, RMSE={current_rmse:.3f}")

print(f"\nRMSE stopped improving after n_estimators = {stopped_at}")
print(f"Best RMSE (3 decimals): {best_rmse:.3f}")

n_estimators= 10, RMSE=0.4616
n_estimators= 20, RMSE=0.4460
n_estimators= 30, RMSE=0.4405
n_estimators= 40, RMSE=0.4390
n_estimators= 50, RMSE=0.4373
n_estimators= 60, RMSE=0.4360
n_estimators= 70, RMSE=0.4366
n_estimators= 80, RMSE=0.4369
n_estimators= 90, RMSE=0.4363
n_estimators=100, RMSE=0.4361
n_estimators=110, RMSE=0.4358
n_estimators=120, RMSE=0.4362
n_estimators=130, RMSE=0.4359
n_estimators=140, RMSE=0.4359
n_estimators=150, RMSE=0.4359
n_estimators=160, RMSE=0.4358
n_estimators=170, RMSE=0.4357
n_estimators=180, RMSE=0.4356
n_estimators=190, RMSE=0.4358
n_estimators=200, RMSE=0.4354

Improvement at n_estimators=10, RMSE=0.462
Improvement at n_estimators=20, RMSE=0.446
Improvement at n_estimators=30, RMSE=0.441
Improvement at n_estimators=40, RMSE=0.439
Improvement at n_estimators=50, RMSE=0.437
Improvement at n_estimators=60, RMSE=0.436
Improvement at n_estimators=200, RMSE=0.435

RMSE stopped improving after n_estimators = 200
Best RMSE (3 decimals): 0.435


In [25]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Grid: every max_depth below is evaluated with 10, 20, ..., 200 trees.
max_depths = [10, 15, 20, 25]
n_estimators_range = range(10, 201, 10)

# max_depth -> mean validation RMSE over all tree counts tried for that depth.
depth_results = {}

for depth in max_depths:
    print(f"\nTesting max_depth = {depth}")
    print("-" * 50)

    rmse_scores = []

    for n in n_estimators_range:
        rf = RandomForestRegressor(
            n_estimators=n, max_depth=depth, random_state=1, n_jobs=-1
        )
        rf.fit(X_train_matrix, y_train)

        # Validation RMSE for this (depth, n) combination.
        y_val_pred = rf.predict(X_val_matrix)
        mse = mean_squared_error(y_val, y_val_pred)
        rmse = np.sqrt(mse)

        rmse_scores.append(rmse)
        print(f"  n_estimators={n:3d}, RMSE={rmse:.4f}")

    # Collapse the per-n scores into a single figure for this depth.
    mean_rmse = np.mean(rmse_scores)
    depth_results[depth] = mean_rmse
    print(f"Mean RMSE for max_depth={depth}: {mean_rmse:.4f}")

# Summary table, ordered by depth.
print("\n" + "="*50)
print("SUMMARY:")
print("="*50)
for depth, mean_rmse in sorted(depth_results.items()):
    print(f"max_depth={depth:2d}: Mean RMSE = {mean_rmse:.4f}")

# The winning depth is the one with the smallest mean RMSE.
best_depth = min(depth_results, key=depth_results.get)
print(f"\nBest max_depth: {best_depth}")
print(f"Best mean RMSE: {depth_results[best_depth]:.4f}")


Testing max_depth = 10
--------------------------------------------------
  n_estimators= 10, RMSE=0.4514
  n_estimators= 20, RMSE=0.4429
  n_estimators= 30, RMSE=0.4387
  n_estimators= 40, RMSE=0.4375
  n_estimators= 50, RMSE=0.4364
  n_estimators= 60, RMSE=0.4353
  n_estimators= 70, RMSE=0.4354
  n_estimators= 80, RMSE=0.4352
  n_estimators= 90, RMSE=0.4351
  n_estimators=100, RMSE=0.4348
  n_estimators=110, RMSE=0.4343
  n_estimators=120, RMSE=0.4346
  n_estimators=130, RMSE=0.4344
  n_estimators=140, RMSE=0.4345
  n_estimators=150, RMSE=0.4348
  n_estimators=160, RMSE=0.4348
  n_estimators=170, RMSE=0.4347
  n_estimators=180, RMSE=0.4348
  n_estimators=190, RMSE=0.4347
  n_estimators=200, RMSE=0.4344
Mean RMSE for max_depth=10: 0.4364

Testing max_depth = 15
--------------------------------------------------
  n_estimators= 10, RMSE=0.4598
  n_estimators= 20, RMSE=0.4463
  n_estimators= 30, RMSE=0.4407
  n_estimators= 40, RMSE=0.4394
  n_estimators= 50, RMSE=0.4376
  n_estimators=

In [26]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Refit a 10-tree forest capped at depth 20 (same seed as the earlier runs) and
# inspect which input columns it relies on most.
rf = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1,
    n_jobs=-1
)

rf.fit(X_train_matrix, y_train)

print("Model trained!")

# Importances of the fitted forest, paired below with the DictVectorizer column names
# of the matrix it was trained on.
feature_importances = rf.feature_importances_
feature_names = dv.get_feature_names_out()

# (name, importance) pairs in column order, plus a copy sorted from most to least important.
importance_pairs = list(zip(feature_names, feature_importances))
importance_pairs_sorted = sorted(importance_pairs, key=lambda x: x[1], reverse=True)

# Display top 10 most important features
print("\nTop 10 Most Important Features:")
print("="*60)
for i, (feature, importance) in enumerate(importance_pairs_sorted[:10], 1):
    print(f"{i:2d}. {feature:30s}: {importance:.6f}")

# The four features of interest.
target_features = ['vehicle_weight', 'horsepower', 'acceleration', 'engine_displacement']

print("\n" + "="*60)
print("Importance of the 4 Target Features:")
print("="*60)

# Look the targets up by exact column name. Substring matching would also accept any
# column whose name merely contains a target name (for example a one-hot column) and
# would silently overwrite the real value with it.
target_importances = {}
for feature_name, importance in importance_pairs:
    if feature_name in target_features:
        target_importances[feature_name] = importance
        print(f"{feature_name:25s}: {importance:.6f}")

# Pick the target with the largest importance; skipped if none of the targets was found.
if target_importances:
    most_important = max(target_importances, key=target_importances.get)
    print(f"\nMost important feature: {most_important}")
    print(f"Importance value: {target_importances[most_important]:.6f}")

Model trained!

Top 10 Most Important Features:
 1. vehicle_weight                : 0.959847
 2. horsepower                    : 0.015927
 3. acceleration                  : 0.011442
 4. engine_displacement           : 0.003167
 5. model_year                    : 0.003105
 6. num_cylinders                 : 0.002352
 7. num_doors                     : 0.001562
 8. origin=USA                    : 0.000481
 9. origin=Asia                   : 0.000444
10. origin=Europe                 : 0.000414

Importance of the 4 Target Features:
acceleration             : 0.011442
engine_displacement      : 0.003167
horsepower               : 0.015927
vehicle_weight           : 0.959847

Most important feature: vehicle_weight
Importance value: 0.959847


In [27]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.1-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.28.7-py3-none-manylinux_2_18_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.1.1-py3-none-manylinux_2_28_x86_64.whl (115.9 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.9/115.9 MB[0m [31m47.8 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m02[0m
[?25hDownloading nvidia_nccl_cu12-2.28.7-py3-none-manylinux_2_18_x86_64.whl (296.8 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.8/296.8 MB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [xgboost]━━━[0m [32m1/2[0m [xgboost]
[1A[2KSuccessfully installed nvidia-nccl-cu12-2.28.7 xgboost-3.1.1

[1m[[0m

In [30]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np

# Wrap the train/validation matrices together with their targets in DMatrix objects,
# the data container that xgb.train takes as input.
dtrain = xgb.DMatrix(X_train_matrix, label=y_train)
dval = xgb.DMatrix(X_val_matrix, label=y_val)

# Datasets scored during boosting; the second element of each pair labels its log column.
watchlist = [(dtrain, 'train'), (dval, 'validation')]

# Booster settings: learning rate and tree shape, then objective/threads, then seed/logging.
xgb_params = {
    'eta': 0.1,
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,

    'seed': 1,
    'verbosity': 1,
}

# Boost for 100 rounds, logging the watchlist metrics every 10th round.
model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=100,
    evals=watchlist,
    verbose_eval=10,
)

print("\nModel training complete!")

# Score the final booster on the validation split.
y_val_pred = model.predict(dval)
mse = mean_squared_error(y_val, y_val_pred)
rmse = np.sqrt(mse)
print(f"\nValidation RMSE: {rmse:.4f}")

[0]	train-rmse:2.31334	validation-rmse:2.30592
[10]	train-rmse:0.91846	validation-rmse:0.92183
[20]	train-rmse:0.49316	validation-rmse:0.52039
[30]	train-rmse:0.38552	validation-rmse:0.43528
[40]	train-rmse:0.35546	validation-rmse:0.41889
[50]	train-rmse:0.34115	validation-rmse:0.41644
[60]	train-rmse:0.33254	validation-rmse:0.41610
[70]	train-rmse:0.32553	validation-rmse:0.41617
[80]	train-rmse:0.32012	validation-rmse:0.41654
[90]	train-rmse:0.31718	validation-rmse:0.41654
[99]	train-rmse:0.31183	validation-rmse:0.41674

Model training complete!

Validation RMSE: 0.4167
