In [1]:
import numpy as np
import scipy.stats as stats
import math
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from itertools import combinations

In [2]:
# Pull the CSV and check out the layout.

raw_cars_df = pd.read_csv("../data/cars-dataset.csv")

raw_cars_df.sample(10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car_name
38,14.0,8,350.0,165.0,4209.0,12.0,71,1,chevrolet impala
52,30.0,4,88.0,76.0,2065.0,14.5,71,2,fiat 124b
261,18.1,6,258.0,120.0,3410.0,15.1,78,1,amc concord d/l
393,27.0,4,140.0,86.0,2790.0,15.6,82,1,ford mustang gl
387,38.0,6,262.0,85.0,3015.0,17.0,82,1,oldsmobile cutlass ciera (diesel)
229,16.0,8,400.0,180.0,4220.0,11.1,77,1,pontiac grand prix lj
165,20.0,8,262.0,110.0,3221.0,13.5,75,1,chevrolet monza 2+2
119,20.0,4,114.0,91.0,2582.0,14.0,73,2,audi 100ls
345,35.1,4,81.0,60.0,1760.0,16.1,81,3,honda civic 1300
315,24.3,4,151.0,90.0,3003.0,20.1,80,1,amc concord


In [4]:
"""
The data is mostly good, but some of the horsepowers are missing. The dataset is small enough
that I don't want to just drop the whole row, but leaving the value set for "?" will cause problems.
So we're going to impute the missing values by replacing the "?" with mean/median horsepower values.

To reduce margin of error I'll use the mean values from the same origin and number of cylinders.
"""

# Replace "?" with NaN so we can use fillna() later
raw_cars_df['horsepower'].replace('?', pd.NA, inplace=True)

# Convert horsepower to numeric, forcing errors to NaN
raw_cars_df['horsepower'] = pd.to_numeric(
    raw_cars_df['horsepower'], errors='coerce')

# Function to impute missing horsepower with the mean value of the same cylinders and origin group
def impute_missing_hp(row, grouped_means):
    if pd.isna(row['horsepower']):
        return grouped_means.loc[(row['cylinders'], row['origin'])]
    return row['horsepower']

# Calculate the mean horsepower for each cylinders-origin group
grouped_means = raw_cars_df.groupby(['cylinders', 'origin'])[
    'horsepower'].mean()

# Apply the imputation function to each row
raw_cars_df['horsepower'] = raw_cars_df.apply(
    lambda row: impute_missing_hp(row, grouped_means), axis=1)

all_cars_df = raw_cars_df

all_cars_df.sample(10)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  raw_cars_df['horsepower'].replace('?', pd.NA, inplace=True)


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car_name
81,28.0,4,97.0,92.0,2288.0,17.0,72,3,datsun 510 (sw)
202,17.5,6,258.0,95.0,3193.0,17.8,76,1,amc pacer d/l
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
154,15.0,6,250.0,72.0,3432.0,21.0,75,1,mercury monarch
111,18.0,3,70.0,90.0,2124.0,13.5,73,3,maxda rx3
68,13.0,8,350.0,155.0,4502.0,13.5,72,1,buick lesabre custom
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
341,23.5,6,173.0,110.0,2725.0,12.6,81,1,chevrolet citation
298,23.0,8,350.0,125.0,3900.0,17.4,79,1,cadillac eldorado
261,18.1,6,258.0,120.0,3410.0,15.1,78,1,amc concord d/l


In [5]:
# Here's a separate dataframe for each origin.

american_cars_df = all_cars_df[all_cars_df['origin']== 1]

european_cars_df = all_cars_df[all_cars_df['origin'] == 2]

japanese_cars_df = all_cars_df[all_cars_df['origin'] == 3]

american_cars_df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
392,27.0,4,151.0,90.0,2950.0,17.3,82,1,chevrolet camaro
393,27.0,4,140.0,86.0,2790.0,15.6,82,1,ford mustang gl
395,32.0,4,135.0,84.0,2295.0,11.6,82,1,dodge rampage
396,28.0,4,120.0,79.0,2625.0,18.6,82,1,ford ranger


In [6]:
# These should be gone now.
missing_hp_df = all_cars_df[all_cars_df['horsepower']=='?']

missing_hp_df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car_name
