In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Load the cleaned vehicle listings into `vehicles_data_set`, the frame the
# following cells split into features and the `price` target.
# NOTE(review): this is an absolute, environment-specific path — consider
# deriving it from a configurable data directory.
vehicles_data_set = pd.read_csv('/home/jovyan/work/datasets/vehicles_cleaned.csv')

# Preview the first rows as the cell output to sanity-check the load.
vehicles_data_set.head()

Unnamed: 0,price,year,manufacturer,fuel,odometer,title_status,transmission,type,state
0,6000,,,,,,,,az
1,11900,,,,,,,,ar
2,21000,,,,,,,,fl
3,1500,,,,,,,,ma
4,4900,,,,,,,,nc


In [2]:
# Target is the listing price; features are every other column.
# `drop` returns a new frame, so `vehicles_data_set` itself is left untouched.
y = vehicles_data_set['price']
X = vehicles_data_set.drop(columns=['price'])

# First cut: 60% of the rows go to training, the remaining 40% are held back.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.6, random_state=42
)

# Second cut: halve the held-back rows into test and validation sets
# (20% of the full data each). The same seed keeps the split reproducible.
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, train_size=0.5, random_state=42
)

# Cell output: shapes of all six pieces.
X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_val.shape, y_val.shape

((256128, 8), (256128,), (85376, 8), (85376,), (85376, 8), (85376,))

In [3]:
# Names of the training columns that contain at least one missing value,
# in their original column order.
cols_with_null_values = X_train.columns[X_train.isnull().any()].tolist()

# For each such column, print its value frequencies with NaN counted as its
# own category, followed by a blank separator line.
for col_with_null_values in cols_with_null_values:
    print(X_train[col_with_null_values].value_counts(dropna=False), '\n')

year
2018.0    21799
2017.0    21747
2015.0    19016
2013.0    18486
2016.0    18367
          ...  
1920.0        1
1913.0        1
1918.0        1
1901.0        1
1943.0        1
Name: count, Length: 112, dtype: int64 

manufacturer
ford               42587
chevrolet          33119
toyota             20426
honda              12784
jeep               11603
nissan             11405
ram                10902
NaN                10624
gmc                 9975
bmw                 8756
dodge               8255
mercedes-benz       7066
hyundai             6149
subaru              5702
volkswagen          5598
kia                 5097
lexus               4911
audi                4630
cadillac            4172
chrysler            3560
acura               3547
buick               3290
mazda               3281
infiniti            2911
lincoln             2509
volvo               2069
mitsubishi          1968
mini                1416
pontiac             1384
rover               1284
jaguar         

In [4]:
# Give the training features a fresh 0..n-1 index, discarding the row labels
# carried over from the split (drop=True keeps them out of the columns).
# NOTE(review): y_train is not re-indexed here, so X_train and y_train no
# longer share index labels — confirm nothing downstream aligns them by label.
X_train = X_train.reset_index(drop=True)

In [15]:
# Cell output: summary statistics of the training odometer readings together
# with their median, shown as a (describe, median) tuple.
X_train['odometer'].describe(), X_train['odometer'].median()

(count    2.534550e+05
 mean     9.774669e+04
 std      2.076887e+05
 min      0.000000e+00
 25%      3.772400e+04
 50%      8.566200e+04
 75%      1.337470e+05
 max      1.000000e+07
 Name: odometer, dtype: float64,
 85662.0)

In [33]:
def simple_impute_odometer(data_frame, simple_imputer, fit=True):
    """Impute the ``odometer`` column of ``data_frame`` with ``simple_imputer``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame containing an ``odometer`` column. It is not modified.
    simple_imputer : object
        Imputer exposing ``fit_transform`` and ``transform``
        (for example ``sklearn.impute.SimpleImputer``).
    fit : bool, default True
        When True (the original behaviour) the imputer is fitted on this
        data and then applied to it. Pass False to apply an imputer that was
        already fitted, e.g. on validation or test data, so their values do
        not influence the learned fill value.

    Returns
    -------
    The imputer's output for an ``(n_rows, 1)`` input. With
    ``add_indicator=True`` this typically has the imputed values in column 0
    and the missing-value flag in column 1.
    """
    # Hand the imputer a plain (n_rows, 1) ndarray. Converting explicitly
    # avoids depending on how np.reshape treats a pandas Series.
    odometer_column = data_frame['odometer'].to_numpy().reshape(-1, 1)

    if fit:
        return simple_imputer.fit_transform(odometer_column)
    return simple_imputer.transform(odometer_column)

def concat_odometer_imputed_values(data_frame, odometer_imputed_values):
    """Append imputed odometer values and their missing flag to ``data_frame``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to extend. It is not modified; a new frame is returned.
    odometer_imputed_values : 2-D array-like
        One row per row of ``data_frame``. Column 0 holds the imputed
        odometer values; column 1, when present, holds the missing-value
        indicator.

    Returns
    -------
    pandas.DataFrame
        ``data_frame`` followed by two new columns, ``odometer`` and
        ``odometer_missing``, carrying the same index as ``data_frame``.

    Raises
    ------
    ValueError
        If the number of imputed rows differs from ``len(data_frame)``.
    """
    imputed = np.asarray(odometer_imputed_values)

    if imputed.shape[1] > 1:
        missing_flags = imputed[:, 1]
    else:
        # An imputer fitted on data without missing values may emit no
        # indicator column; treat every row as "not missing" in that case.
        missing_flags = np.zeros(imputed.shape[0], dtype=imputed.dtype)

    # Build the new columns on data_frame's own index. With a default
    # RangeIndex on the Series, concat(axis=1) would align by label and
    # silently produce NaN-padded rows whenever the frame's index is not
    # exactly 0..n-1.
    return pd.concat(
        [
            data_frame,
            pd.Series(imputed[:, 0], index=data_frame.index, name='odometer'),
            pd.Series(missing_flags, index=data_frame.index, name='odometer_missing'),
        ],
        axis=1,
    )

# Index labels of the training rows whose odometer reading is missing; used
# at the end of the cell to inspect what each strategy filled in.
rows_with_odometer_null_values = X_train.index[X_train['odometer'].isnull()].tolist()

# Independent working copies, one per imputation strategy.
X_train_odometer_mean_imputed = X_train.copy()
X_train_odometer_median_imputed = X_train.copy()

# Before imputation: does each copy hold missing odometer values?
print(
    X_train_odometer_mean_imputed['odometer'].isnull().any(),
    X_train_odometer_median_imputed['odometer'].isnull().any()
)

# add_indicator=True makes each imputer return a missing-value flag column
# next to the imputed values.
mean_imputer = SimpleImputer(strategy='mean', add_indicator=True)
median_imputer = SimpleImputer(strategy='median', add_indicator=True)

mean_imputed_values = simple_impute_odometer(X_train_odometer_mean_imputed, mean_imputer)
median_imputed_values = simple_impute_odometer(X_train_odometer_median_imputed, median_imputer)

# Swap the raw odometer column for the imputed values plus the flag column.
X_train_odometer_mean_imputed = concat_odometer_imputed_values(
    X_train_odometer_mean_imputed.drop(columns=['odometer']),
    mean_imputed_values
)
X_train_odometer_median_imputed = concat_odometer_imputed_values(
    X_train_odometer_median_imputed.drop(columns=['odometer']),
    median_imputed_values
)

# After imputation: the same check should now report no missing values.
print(
    X_train_odometer_mean_imputed['odometer'].isnull().any(),
    X_train_odometer_median_imputed['odometer'].isnull().any()
)

# Store odometer as whole numbers; the cast truncates any fractional part
# of the imputed mean.
X_train_odometer_mean_imputed = X_train_odometer_mean_imputed.astype({'odometer': 'int'})
X_train_odometer_median_imputed = X_train_odometer_median_imputed.astype({'odometer': 'int'})

# For every originally-missing row, print: original value, mean-imputed value
# and flag, median-imputed value and flag.
# NOTE(review): this emits one line per missing row — consider displaying a
# small sample instead if the output becomes unwieldy.
for row_with_odometer_null_values in rows_with_odometer_null_values:
    print(
        X_train.at[row_with_odometer_null_values, 'odometer'],
        '-----------------',
        X_train_odometer_mean_imputed.at[row_with_odometer_null_values, 'odometer'],
        '----------------',
        X_train_odometer_mean_imputed.at[row_with_odometer_null_values, 'odometer_missing'],
        '----------------',
        X_train_odometer_median_imputed.at[row_with_odometer_null_values, 'odometer'],
        '-----------------',
        X_train_odometer_median_imputed.at[row_with_odometer_null_values, 'odometer_missing']
    )

True True
False False
nan ----------------- 97746 ---------------- 1.0 ---------------- 85662 ----------------- 1.0
nan ----------------- 97746 ---------------- 1.0 ---------------- 85662 ----------------- 1.0
nan ----------------- 97746 ---------------- 1.0 ---------------- 85662 ----------------- 1.0
nan ----------------- 97746 ---------------- 1.0 ---------------- 85662 ----------------- 1.0
nan ----------------- 97746 ---------------- 1.0 ---------------- 85662 ----------------- 1.0
nan ----------------- 97746 ---------------- 1.0 ---------------- 85662 ----------------- 1.0
nan ----------------- 97746 ---------------- 1.0 ---------------- 85662 ----------------- 1.0
nan ----------------- 97746 ---------------- 1.0 ---------------- 85662 ----------------- 1.0
nan ----------------- 97746 ---------------- 1.0 ---------------- 85662 ----------------- 1.0
nan ----------------- 97746 ---------------- 1.0 ---------------- 85662 ----------------- 1.0
nan ----------------- 97746 ----------