In [1]:
import io
import pandas as pd
import numpy as np

In [2]:
# Column labels for the data file, listed in the order the fields appear.
cols = [
    'symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
    'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
    'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
    'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
    'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg',
    'price',
]

# The labels are supplied through `names`, so the first line of the file is
# read as data rather than as a header (spelled out here with header=None).
data = pd.read_csv('imports-85.data', header=None, names=cols)

In [3]:
# Preview the first rows to check that the column labels line up with the values.
data.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-rate,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [4]:
# Inspect dtypes and non-null counts; an `object` dtype on a numeric-looking
# column means it contains non-numeric strings (here the '?' placeholder).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

A close inspection indicates that the following columns are conceptually numeric and useful to our algorithm, even though some of them are stored as `object` because missing values are encoded as `?`:

'normalized-losses', 'wheel-base', 
'length', 'width', 
'height', 'curb-weight', 
'bore', 'stroke', 
'compression-rate', 'horsepower', 
'peak-rpm', 'city-mpg', 
'highway-mpg', 'price'

In the following steps we will clean these columns

# 1. Data Cleaning

In [5]:
# Conceptually numeric columns kept for modelling; 'price' is the target.
num_cols = [
    'normalized-losses', 'wheel-base', 'length', 'width', 'height',
    'curb-weight', 'bore', 'stroke', 'compression-rate', 'horsepower',
    'peak-rpm', 'city-mpg', 'highway-mpg', 'price',
]

# Work on the numeric subset only; the raw `data` frame is left untouched.
numeric_cars = data.loc[:, num_cols]

numeric_cars.head()

Unnamed: 0,normalized-losses,wheel-base,length,width,height,curb-weight,bore,stroke,compression-rate,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,?,88.6,168.8,64.1,48.8,2548,3.47,2.68,9.0,111,5000,21,27,13495
1,?,88.6,168.8,64.1,48.8,2548,3.47,2.68,9.0,111,5000,21,27,16500
2,?,94.5,171.2,65.5,52.4,2823,2.68,3.47,9.0,154,5000,19,26,16500
3,164,99.8,176.6,66.2,54.3,2337,3.19,3.4,10.0,102,5500,24,30,13950
4,164,99.4,176.6,66.4,54.3,2824,3.19,3.4,8.0,115,5500,18,22,17450


In [6]:
# '?' marks a missing value in the raw file; turn it into NaN so pandas
# recognises it as missing.
numeric_cars = numeric_cars.replace(to_replace='?', value=np.nan)

numeric_cars.head()

Unnamed: 0,normalized-losses,wheel-base,length,width,height,curb-weight,bore,stroke,compression-rate,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,,88.6,168.8,64.1,48.8,2548,3.47,2.68,9.0,111,5000,21,27,13495
1,,88.6,168.8,64.1,48.8,2548,3.47,2.68,9.0,111,5000,21,27,16500
2,,94.5,171.2,65.5,52.4,2823,2.68,3.47,9.0,154,5000,19,26,16500
3,164.0,99.8,176.6,66.2,54.3,2337,3.19,3.4,10.0,102,5500,24,30,13950
4,164.0,99.4,176.6,66.4,54.3,2824,3.19,3.4,8.0,115,5500,18,22,17450


In [7]:
# With the '?' placeholders gone, every remaining value parses as a number.
numeric_cars = numeric_cars.astype('float64')

numeric_cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   normalized-losses  164 non-null    float64
 1   wheel-base         205 non-null    float64
 2   length             205 non-null    float64
 3   width              205 non-null    float64
 4   height             205 non-null    float64
 5   curb-weight        205 non-null    float64
 6   bore               201 non-null    float64
 7   stroke             201 non-null    float64
 8   compression-rate   205 non-null    float64
 9   horsepower         203 non-null    float64
 10  peak-rpm           203 non-null    float64
 11  city-mpg           205 non-null    float64
 12  highway-mpg        205 non-null    float64
 13  price              201 non-null    float64
dtypes: float64(14)
memory usage: 22.5 KB


In [8]:
# Count the missing values per column to decide how each one should be handled.
numeric_cars.isnull().sum()

normalized-losses    41
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
bore                  4
stroke                4
compression-rate      0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

"Price" is the dependent variable in our analysis, so rows where it is missing can be used neither for training nor for evaluation. Therefore, we must eliminate them.

In [9]:
# Keep only the rows that have a price; the original index labels are retained.
has_price = numeric_cars['price'].notna()
numeric_cars = numeric_cars[has_price]

numeric_cars.isnull().sum()

normalized-losses    37
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
bore                  4
stroke                4
compression-rate      0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 0
dtype: int64

In [10]:
# Impute the remaining gaps with the mean of their own column.
# NOTE(review): the means are computed over all rows, i.e. before the
# train/test split that the evaluation functions perform later.
column_means = numeric_cars.mean()
numeric_cars = numeric_cars.fillna(value=column_means)

numeric_cars.isnull().sum()

normalized-losses    0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
bore                 0
stroke               0
compression-rate     0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
dtype: int64

In [11]:
# Set the target aside so it can be restored in its original units afterwards.
price_col = numeric_cars['price']

# Min-max scale every column to the 0-1 range so that no feature dominates the
# k-NN distance because of its units.
# NOTE(review): min/max are taken over all rows, i.e. before the train/test
# split that the evaluation functions perform later.
col_min = numeric_cars.min()
col_span = numeric_cars.max() - col_min
numeric_cars = (numeric_cars - col_min) / col_span

# Overwrite the scaled price with the untouched original values.
numeric_cars['price'] = price_col

# 2. Univariate model

In [14]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

def knn_train_test(train_col, target_col, df):
    """Return the hold-out RMSE of a default k-NN regressor using one feature.

    The rows of ``df`` are shuffled with the global NumPy seed fixed to 1, the
    first half (rounded down) is used for fitting and the rest for scoring.

    Parameters
    ----------
    train_col : label
        Column of ``df`` used as the single predictor.
    target_col : label
        Column of ``df`` to predict.
    df : pandas.DataFrame
        Frame containing both columns.

    Returns
    -------
    float
        Root-mean-squared error of the predictions on the second half.
    """
    model = KNeighborsRegressor()

    # Fixed seed so every call splits the rows the same way.
    np.random.seed(1)
    shuffled = df.reindex(np.random.permutation(df.index))

    # First half -> training set, remainder -> test set.
    split_at = int(len(df) / 2)
    train_part, test_part = shuffled.iloc[:split_at], shuffled.iloc[split_at:]

    # The estimator expects a 2-D feature matrix, hence the one-element list.
    features = [train_col]
    model.fit(train_part[features], train_part[target_col])
    predicted = model.predict(test_part[features])

    return mean_squared_error(test_part[target_col], predicted) ** 0.5

In [16]:
# Every column except the dependent variable is tried as a lone predictor.
train_cols = numeric_cars.columns.drop('price')

# Map each feature to the RMSE of a model trained on that feature only.
rmse_results = {
    col: knn_train_test(col, 'price', numeric_cars)
    for col in train_cols
}

# A Series is easier to sort and read than a dict; lower RMSE = better result.
rmse_results_series = pd.Series(rmse_results)
ranked_rmse_results = rmse_results_series.sort_values()

ranked_rmse_results

horsepower           4037.037713
curb-weight          4401.118255
highway-mpg          4630.026799
width                4704.482590
city-mpg             4766.422505
length               5427.200961
wheel-base           5461.553998
compression-rate     6610.812153
bore                 6780.627785
normalized-losses    7330.197653
peak-rpm             7697.459696
stroke               8006.529545
height               8144.441043
dtype: float64

The list above ranks the features by how well each one, used alone, predicts the price of a car (lowest RMSE first).

The best predictors, according to the model, are "horsepower", "curb-weight", and "highway-mpg"

# 3. Multivariate model

In the next step we will calculate the RMSE using "horsepower" together with, cumulatively, the 2nd, 3rd, 4th, and 5th best-performing features from the ranking above (i.e. the best 2, 3, 4, and 5 features).

In [18]:
def knn_train_test_multi(target_col, df, feature_sets=None):
    """Return the hold-out RMSE of a default k-NN regressor per feature set.

    The rows of ``df`` are shuffled with the global NumPy seed fixed to 1, the
    first half (rounded down) is used for fitting and the rest for scoring.
    One model is fitted for every entry of ``feature_sets``.

    Parameters
    ----------
    target_col : label
        Column of ``df`` to predict.
    df : pandas.DataFrame
        Frame containing the target and all feature columns.
    feature_sets : iterable of lists of labels, optional
        Column combinations to evaluate. Defaults to the best 2, 3, 4 and 5
        features of the univariate ranking (horsepower, curb-weight,
        highway-mpg, width, city-mpg).

    Returns
    -------
    dict
        Maps ``'<n> best features'`` (``n`` = number of columns in the set) to
        the RMSE on the test half, in evaluation order. Two sets of the same
        size share a key, so the later one overwrites the earlier one.
    """
    if feature_sets is None:
        feature_sets = [
            ['horsepower', 'curb-weight'],
            ['horsepower', 'curb-weight', 'highway-mpg'],
            ['horsepower', 'curb-weight', 'highway-mpg', 'width'],
            ['horsepower', 'curb-weight', 'highway-mpg', 'width', 'city-mpg'],
        ]

    # Fixed seed so every call splits the rows the same way.
    np.random.seed(1)

    # Randomize row order
    shuffled_index = np.random.permutation(df.index)
    rand_df = df.reindex(shuffled_index)

    # First half -> training set, remainder -> test set
    last_train_row = len(rand_df) // 2
    train_df = rand_df.iloc[:last_train_row]
    test_df = rand_df.iloc[last_train_row:]

    # Store RMSE results
    multi_rmse_results = {}

    for train_cols in feature_sets:
        # Accept tuples / Index objects as well as lists
        train_cols = list(train_cols)

        knn = KNeighborsRegressor()
        knn.fit(train_df[train_cols], train_df[target_col])

        # Predict using model
        prediction = knn.predict(test_df[train_cols])

        # Calculate and store RMSE
        label = '{} best features'.format(len(train_cols))
        multi_rmse_results[label] = mean_squared_error(test_df[target_col], prediction) ** 0.5

    return multi_rmse_results

# Score the best 2, 3, 4 and 5 features against the price (default 5 neighbors).
knn_train_test_multi('price', numeric_cars)


{'2 best features': 3257.849049435976,
 '3 best features': 3365.9110004529675,
 '4 best features': 3358.6915801682458,
 '5 best features': 3341.6024539726504}

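The multivariate model indicates that '2 best features' results in the lowest RMSE (about 3258), followed by '5 best features' (about 3342); with the default number of neighbors, adding features beyond the first two does not improve the score.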

# 4. Hyperparameter tuning

We will try to change the number of neighbors used in the algorithm to reduce the RMSE

In [30]:
def knn_train_test_multi_k(target_col, df, feature_sets=None, k_values=range(1, 25), top_n=10):
    """Grid-search feature sets and neighbor counts; return the best RMSEs.

    The rows of ``df`` are shuffled with the global NumPy seed fixed to 1, the
    first half (rounded down) is used for fitting and the rest for scoring.
    One k-NN regressor is fitted for every (feature set, k) combination.

    Parameters
    ----------
    target_col : label
        Column of ``df`` to predict.
    df : pandas.DataFrame
        Frame containing the target and all feature columns.
    feature_sets : iterable of lists of labels, optional
        Column combinations to evaluate. Defaults to the best 2, 3, 4 and 5
        features of the univariate ranking (horsepower, curb-weight,
        highway-mpg, width, city-mpg).
    k_values : iterable of int, optional
        Values passed as ``n_neighbors``. Defaults to 1 through 24.
    top_n : int, optional
        Number of best-scoring configurations to return. Defaults to 10.

    Returns
    -------
    pandas.Series
        The ``top_n`` lowest RMSE values in ascending order, indexed by
        ``'<n> best features and <k> neighbors'`` (``n`` = number of columns in
        the set). Two sets of the same size share labels, so the later one
        overwrites the earlier one.
    """
    if feature_sets is None:
        feature_sets = [
            ['horsepower', 'curb-weight'],
            ['horsepower', 'curb-weight', 'highway-mpg'],
            ['horsepower', 'curb-weight', 'highway-mpg', 'width'],
            ['horsepower', 'curb-weight', 'highway-mpg', 'width', 'city-mpg'],
        ]

    # Materialise once: a generator would be exhausted after the first feature set
    k_values = list(k_values)

    # Fixed seed so every call splits the rows the same way.
    np.random.seed(1)

    # Randomize row order
    shuffled_index = np.random.permutation(df.index)
    rand_df = df.reindex(shuffled_index)

    # First half -> training set, remainder -> test set
    last_train_row = len(rand_df) // 2
    train_df = rand_df.iloc[:last_train_row]
    test_df = rand_df.iloc[last_train_row:]

    # Store RMSE results
    multi_k_rmse_results = {}

    for train_cols in feature_sets:
        # Accept tuples / Index objects as well as lists
        train_cols = list(train_cols)

        # The column selection does not depend on k, so do it once per set
        train_features = train_df[train_cols]
        test_features = test_df[train_cols]

        for n in k_values:
            knn = KNeighborsRegressor(n_neighbors=n)
            knn.fit(train_features, train_df[target_col])

            # Predict using model
            prediction = knn.predict(test_features)

            # Calculate and store RMSE
            label = '{} best features and {} neighbors'.format(len(train_cols), n)
            multi_k_rmse_results[label] = mean_squared_error(test_df[target_col], prediction) ** 0.5

    # Lowest RMSE first; keep only the best configurations
    result_series = pd.Series(multi_k_rmse_results)
    result_series_final = result_series.sort_values()
    return result_series_final.head(top_n)

# Ten best (feature set, number of neighbors) combinations, lowest RMSE first.
results = knn_train_test_multi_k('price', numeric_cars)

results

5 best features and 1 neighbors    2530.055408
4 best features and 1 neighbors    2600.746384
2 best features and 2 neighbors    2700.747235
4 best features and 2 neighbors    2725.432507
3 best features and 2 neighbors    2748.397511
3 best features and 1 neighbors    2777.396404
2 best features and 1 neighbors    2790.107143
5 best features and 2 neighbors    2897.175797
3 best features and 3 neighbors    2974.222577
2 best features and 3 neighbors    3003.748806
dtype: float64

# 5. Conclusion

The top 5 configurations of the "K-Nearest Neighbors" algorithm for this dataset:

1. 5 best features and 1 neighbors
2. 4 best features and 1 neighbors
3. 2 best features and 2 neighbors
4. 4 best features and 2 neighbors
5. 3 best features and 2 neighbors

The lowest RMSE achieved was 2530.055408, which can be understood as roughly $2500



In [32]:
# Average price, used as a reference scale for judging the size of the RMSE.
numeric_cars['price'].mean()

13207.129353233831

Considering that the average car price in this dataset is about $13,200, the error corresponds to approximately 19% of the average price.

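If necessary, the following steps could be taken to reduce the RMSE and to make its estimate more reliable:

1) Add data cleaning and processing to the function
2) Perform k-fold cross validation. The configurations above were selected and scored on the same single 50/50 split, so the best score (especially with only 1 neighbor) is likely optimistic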