In [2]:
import numpy as np
import scipy.stats as stats
import math
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from itertools import combinations

In [3]:
# Read the cars dataset from disk, then preview ten randomly chosen rows
# to get a feel for the available columns.

raw_cars_df = pd.read_csv(filepath_or_buffer="../data/cars-dataset.csv")

raw_cars_df.sample(n=10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car_name
137,13.0,8,350.0,150.0,4699.0,14.5,74,1,buick century luxus (sw)
211,16.5,6,168.0,120.0,3820.0,16.7,76,2,mercedes-benz 280s
360,30.7,6,145.0,76.0,3160.0,19.6,81,2,volvo diesel
251,20.2,8,302.0,139.0,3570.0,12.8,78,1,mercury monarch ghia
312,37.2,4,86.0,65.0,2019.0,16.4,80,3,datsun 310
242,21.5,4,121.0,110.0,2600.0,12.8,77,2,bmw 320i
17,21.0,6,200.0,85.0,2587.0,16.0,70,1,ford maverick
208,13.0,8,318.0,150.0,3940.0,13.2,76,1,plymouth volare premier v8
138,14.0,8,318.0,150.0,4457.0,13.5,74,1,dodge coronet custom (sw)
348,37.7,4,89.0,62.0,2050.0,17.3,81,3,toyota tercel


In [5]:
"""
The data is mostly good, but some of the horsepower values are missing (recorded as "?").
The dataset is small enough that I don't want to drop those rows entirely, but leaving the
value as "?" will cause problems. So we're going to impute the missing values by replacing
each "?" with a mean horsepower value.

To reduce the imputation error, I'll use the mean horsepower of the cars that share the same
origin and number of cylinders.
"""

# Make the horsepower column numeric in a single expression: the "?"
# placeholder is first mapped to a missing-value marker, then the column is
# coerced to numbers so anything still unparseable ends up as NaN
# (which is what fillna()-style imputation later expects).
raw_cars_df['horsepower'] = pd.to_numeric(
    raw_cars_df['horsepower'].replace('?', pd.NA),
    errors='coerce',
)

# Row-wise imputer: a missing horsepower is filled with the mean horsepower
# of the row's (cylinders, origin) group; existing values pass through.
def impute_missing_hp(row, grouped_means):
    """Return the horsepower value to use for ``row``.

    Parameters
    ----------
    row : mapping-like (e.g. one DataFrame row)
        Must provide the keys 'horsepower', 'cylinders' and 'origin'.
    grouped_means : pandas.Series
        Mean horsepower, looked up with ``.loc[(cylinders, origin)]``.

    Returns
    -------
    The row's own horsepower when it is present; otherwise the mean stored
    in ``grouped_means`` for the row's (cylinders, origin) pair.
    """
    current_hp = row['horsepower']
    if not pd.isna(current_hp):
        return current_hp
    group_key = (row['cylinders'], row['origin'])
    return grouped_means.loc[group_key]

# Mean horsepower for each (cylinders, origin) group. mean() skips NaN, so
# the rows waiting to be imputed do not distort their own group's average.
grouped_means = raw_cars_df.groupby(['cylinders', 'origin'])[
    'horsepower'].mean()

# Fill every missing horsepower with the mean of its (cylinders, origin)
# group. transform('mean') broadcasts each group's mean back onto its rows,
# so this is one vectorized fillna rather than a Python-level function call
# per row through DataFrame.apply(axis=1). Rows that already have a value
# are left unchanged.
raw_cars_df['horsepower'] = raw_cars_df['horsepower'].fillna(
    raw_cars_df.groupby(['cylinders', 'origin'])['horsepower'].transform('mean'))

# NOTE: all_cars_df is a second name for the same DataFrame object, not a copy.
all_cars_df = raw_cars_df

all_cars_df.sample(10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car_name
66,17.0,8,304.0,150.0,3672.0,11.5,72,1,amc ambassador sst
276,21.6,4,121.0,115.0,2795.0,15.7,78,2,saab 99gle
51,30.0,4,79.0,70.0,2074.0,19.5,71,2,peugeot 304
265,17.5,8,318.0,140.0,4080.0,13.7,78,1,dodge magnum xe
85,13.0,8,350.0,175.0,4100.0,13.0,73,1,buick century 350
75,14.0,8,318.0,150.0,4077.0,14.0,72,1,plymouth satellite custom (sw)
273,23.9,4,119.0,97.0,2405.0,14.9,78,3,datsun 200-sx
81,28.0,4,97.0,92.0,2288.0,17.0,72,3,datsun 510 (sw)
10,15.0,8,383.0,170.0,3563.0,10.0,70,1,dodge challenger se
317,34.3,4,97.0,78.0,2188.0,15.8,80,2,audi 4000


In [6]:
# Split the cars into one DataFrame per origin code
# (1 -> American, 2 -> European, 3 -> Japanese, going by the variable names).

american_cars_df = all_cars_df.loc[all_cars_df['origin'].eq(1)]

european_cars_df = all_cars_df.loc[all_cars_df['origin'].eq(2)]

japanese_cars_df = all_cars_df.loc[all_cars_df['origin'].eq(3)]

american_cars_df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
392,27.0,4,151.0,90.0,2950.0,17.3,82,1,chevrolet camaro
393,27.0,4,140.0,86.0,2790.0,15.6,82,1,ford mustang gl
395,32.0,4,135.0,84.0,2295.0,11.6,82,1,dodge rampage
396,28.0,4,120.0,79.0,2625.0,18.6,82,1,ford ranger


In [7]:
# Sanity check: no horsepower value should still be missing after imputation.
# The column has been converted to numeric, so comparing it against the
# string '?' can never match and would always yield an empty frame. Look for
# NaN instead, so the frame is empty only if every gap was actually filled.
missing_hp_df = all_cars_df[all_cars_df['horsepower'].isna()]

missing_hp_df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car_name
