In [1]:
import numpy as np
import scipy.stats as stats
import math
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from itertools import combinations

In [2]:
# Pull the CSV and check out the layout.

raw_cars_df = pd.read_csv("../data/cars-dataset.csv")

raw_cars_df.sample(10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car_name
373,24.0,4,140.0,92.0,2865.0,16.4,82,1,ford fairmont futura
263,17.7,6,231.0,165.0,3445.0,13.4,78,1,buick regal sport coupe (turbo)
253,20.5,6,200.0,95.0,3155.0,18.2,78,1,chevrolet malibu
183,25.0,4,116.0,81.0,2220.0,16.9,76,2,opel 1900
237,30.5,4,98.0,63.0,2051.0,17.0,77,1,chevrolet chevette
219,25.5,4,122.0,96.0,2300.0,15.5,77,1,plymouth arrow gs
199,20.0,6,225.0,100.0,3651.0,17.7,76,1,dodge aspen se
35,17.0,6,250.0,100.0,3329.0,15.5,71,1,chevrolet chevelle malibu
328,30.0,4,146.0,67.0,3250.0,21.8,80,2,mercedes-benz 240d
229,16.0,8,400.0,180.0,4220.0,11.1,77,1,pontiac grand prix lj


In [6]:
"""
The data is mostly good, but some of the horsepowers are missing. The dataset is small enough
that I don't want to just drop the whole row, but leaving the value set for "?" will cause problems.
So we're going to impute the missing values by replacing the "?" with mean/median horsepower values.
"""

# Replace "?" with NaN
raw_cars_df['horsepower'].replace('?', pd.NA, inplace=True)

# Convert horsepower to numeric, forcing errors to NaN
raw_cars_df['horsepower'] = pd.to_numeric(
    raw_cars_df['horsepower'], errors='coerce')

# Impute missing values with the mean
median_hp = raw_cars_df['horsepower'].mean()
raw_cars_df['horsepower'].fillna(median_hp, inplace=True)

# Output the dataframe to a new variable called all_cars_df
all_cars_df = raw_cars_df

all_cars_df.sample(10)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  raw_cars_df['horsepower'].replace('?', pd.NA, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  raw_cars_df['horsepower'].fillna(median_hp, inplace=True)


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car_name
211,16.5,6,168.0,120.0,3820.0,16.7,76,2,mercedes-benz 280s
397,31.0,4,119.0,82.0,2720.0,19.4,82,1,chevy s-10
94,13.0,8,440.0,215.0,4735.0,11.0,73,1,chrysler new yorker brougham
307,26.8,6,173.0,115.0,2700.0,12.9,79,1,oldsmobile omega brougham
147,24.0,4,90.0,75.0,2108.0,15.5,74,2,fiat 128
326,43.4,4,90.0,48.0,2335.0,23.7,80,2,vw dasher (diesel)
355,33.7,4,107.0,75.0,2210.0,14.4,81,3,honda prelude
27,11.0,8,318.0,210.0,4382.0,13.5,70,1,dodge d200
387,38.0,6,262.0,85.0,3015.0,17.0,82,1,oldsmobile cutlass ciera (diesel)
168,23.0,4,140.0,83.0,2639.0,17.0,75,1,ford pinto


In [7]:
# Here's a separate dataframe for each origin.

american_cars_df = all_cars_df[all_cars_df['origin']== 1]

european_cars_df = all_cars_df[all_cars_df['origin'] == 2]

japanese_cars_df = all_cars_df[all_cars_df['origin'] == 3]

american_cars_df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
392,27.0,4,151.0,90.0,2950.0,17.3,82,1,chevrolet camaro
393,27.0,4,140.0,86.0,2790.0,15.6,82,1,ford mustang gl
395,32.0,4,135.0,84.0,2295.0,11.6,82,1,dodge rampage
396,28.0,4,120.0,79.0,2625.0,18.6,82,1,ford ranger


In [8]:
# These should be gone now.
missing_hp_df = all_cars_df[all_cars_df['horsepower']=='?']

missing_hp_df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car_name
