In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the cleaned vehicle listings from CSV into a DataFrame; the bare
# expression on the last line makes the notebook render a preview of it.
vehicles_dataset = pd.read_csv(
    filepath_or_buffer='/home/jovyan/work/datasets/vehicles_cleaned.csv'
)
vehicles_dataset

Unnamed: 0,region,price,year,manufacturer,model,fuel,odometer,title_status,transmission,type,state
0,prescott,6000,,,,,,,,,az
1,fayetteville,11900,,,,,,,,,ar
2,florida keys,21000,,,,,,,,,fl
3,worcester / central MA,1500,,,,,,,,,ma
4,greensboro,4900,,,,,,,,,nc
...,...,...,...,...,...,...,...,...,...,...,...
426875,wyoming,23590,2019.0,nissan,maxima s sedan 4d,gas,32226.0,clean,other,sedan,wy
426876,wyoming,30590,2020.0,volvo,s60 t5 momentum sedan 4d,gas,12029.0,clean,other,sedan,wy
426877,wyoming,34990,2020.0,cadillac,xt4 sport suv 4d,diesel,4174.0,clean,other,hatchback,wy
426878,wyoming,28990,2018.0,lexus,es 350 sedan 4d,gas,30112.0,clean,other,sedan,wy


In [2]:
# Target is the `price` column; features are every other column.
y = vehicles_dataset['price']
X = vehicles_dataset.drop(columns=['price'])

# First cut: 60% of the rows go to training, the remaining 40% are held out.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, train_size=0.6, random_state=42
)

# Second cut: the held-out rows are halved into test and validation sets,
# which gives a 60/20/20 split overall.
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, train_size=0.5, random_state=42
)

# Show the shape of each partition as a sanity check.
(
    X_train.shape, y_train.shape,
    X_test.shape, y_test.shape,
    X_val.shape, y_val.shape,
)

((256128, 10), (256128,), (85376, 10), (85376,), (85376, 10), (85376,))

In [3]:
# Renumber the training rows 0..n-1. The target is renumbered together with the
# features: resetting only X_train would leave y_train carrying the shuffled
# original labels, so the two would no longer line up in any label-aligned
# pandas operation (boolean masks, concat, arithmetic). Row order is unchanged,
# so positional consumers see the same data as before.
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

# Names of the feature columns that contain at least one missing value.
cols_with_null = X_train.columns[X_train.isnull().any()].tolist()
cols_with_null, X_train.columns

(['year',
  'manufacturer',
  'model',
  'fuel',
  'odometer',
  'title_status',
  'transmission',
  'type'],
 Index(['region', 'year', 'manufacturer', 'model', 'fuel', 'odometer',
        'title_status', 'transmission', 'type', 'state'],
       dtype='object'))

In [4]:
# For each column that has missing values, print how often every value occurs;
# dropna=False keeps NaN as its own row so the amount of missing data is visible.
for col_with_null in cols_with_null:
    counts_including_nan = X_train[col_with_null].value_counts(dropna=False)
    print(counts_including_nan)

year
2018.0    21799
2017.0    21747
2015.0    19016
2013.0    18486
2016.0    18367
          ...  
1920.0        1
1913.0        1
1918.0        1
1901.0        1
1943.0        1
Name: count, Length: 112, dtype: int64
manufacturer
ford               42587
chevrolet          33119
toyota             20426
honda              12784
jeep               11603
nissan             11405
ram                10902
NaN                10624
gmc                 9975
bmw                 8756
dodge               8255
mercedes-benz       7066
hyundai             6149
subaru              5702
volkswagen          5598
kia                 5097
lexus               4911
audi                4630
cadillac            4172
chrysler            3560
acura               3547
buick               3290
mazda               3281
infiniti            2911
lincoln             2509
volvo               2069
mitsubishi          1968
mini                1416
pontiac             1384
rover               1284
jaguar           

In [10]:
# Every distinct value of the `model` column (NaN included), shown as a plain
# Python list. The full list is displayed, so the output can be long.
distinct_models = X_train['model'].unique()
distinct_models.tolist()

['optima',
 'q45',
 '4runner sr5 suv',
 'malibu',
 'f-150 xlt',
 'gto',
 nan,
 'Junk Cars',
 'Hudson',
 'compass',
 '2003',
 'fiesta se',
 'a6 2.0t premium sedan 4d',
 'transit',
 'i8',
 'super duty f-250 srw',
 'accent gs',
 'silverado 2500 ltz lifted crew 4wd',
 'ranger',
 'grand caravan',
 'silverado diesels 3500',
 'silverado 2500hd',
 'xts',
 'x6 xdrive35i sport utility',
 '5 series',
 'town & country',
 'sonic',
 'sequoia',
 'tlx technology fwd',
 '4runner limited',
 'explorer',
 '1500 classic regular cab',
 'terrain',
 'silverado',
 'es300',
 '124 spider classica',
 'cr-v',
 'sonata',
 'corvette',
 'mdx advance pkg sport',
 'f-350sd',
 '4 series 440i convertible 2d',
 'impala ss',
 'santa fe sesuv 6a',
 'ANY',
 'edge limited',
 'mustang convertible',
 'altima',
 'allroad 2.7t',
 'f-250',
 'cls-class',
 'es 350',
 'jetta 2.5',
 'prius four touring',
 'a5 premium coupe 2d',
 'hs 250 premium',
 'juke',
 'optima lx sedan 4d',
 'e 400',
 'tahoe',
 'jetta',
 '3 series 330i xdrive seda

In [5]:
# Compare three ways of imputing missing `model` values: backward fill,
# forward fill, and the most frequent value. Each strategy works on its own
# copy of X_train so the training frame itself is never modified.
X_train_model_bfill_imputed = X_train.copy()
X_train_model_ffill_imputed = X_train.copy()
X_train_model_mode_imputed = X_train.copy()

# Before imputation: does each copy contain missing models?
print(
    X_train_model_bfill_imputed['model'].isnull().any(),
    X_train_model_ffill_imputed['model'].isnull().any(),
    X_train_model_mode_imputed['model'].isnull().any()
)

# Remember which rows were missing so their imputed values can be compared below.
row_model_with_null_indexes = X_train[X_train['model'].isnull()].index.tolist()

# Assign the filled column back rather than calling an inplace method on the
# column selection. `df['model'].bfill(inplace=True)` is chained assignment:
# recent pandas versions warn about it, and under Copy-on-Write it leaves the
# DataFrame unchanged.
# Note: bfill cannot fill a gap in the last row and ffill cannot fill one in the
# first row; the check printed below would report True if that happened.
X_train_model_bfill_imputed['model'] = X_train_model_bfill_imputed['model'].bfill()
X_train_model_ffill_imputed['model'] = X_train_model_ffill_imputed['model'].ffill()
X_train_model_mode_imputed['model'] = X_train_model_mode_imputed['model'].fillna(
    X_train_model_mode_imputed['model'].mode()[0]
)

# After imputation: the same check should now print False for every strategy.
print(
    X_train_model_bfill_imputed['model'].isnull().any(),
    X_train_model_ffill_imputed['model'].isnull().any(),
    X_train_model_mode_imputed['model'].isnull().any()
)

# Side-by-side view of what each strategy put into every originally missing row
# (bfill value, ffill value, mode value). `.at` does an explicit label lookup.
for row_model_with_null_index in row_model_with_null_indexes:
    print(
        X_train_model_bfill_imputed.at[row_model_with_null_index, 'model'],
        '------------',
        X_train_model_ffill_imputed.at[row_model_with_null_index, 'model'],
        '------------',
        X_train_model_mode_imputed.at[row_model_with_null_index, 'model']
    )


True True True
False False False
Junk Cars ------------ gto ------------ f-150
srx luxury fwd ------------ ilx ------------ f-150
trailblazer ls ------------ crosstrek 2.0i sport ------------ f-150
mdx sh-awd w/advance pkg ------------ cr-v ------------ f-150
s10 ------------ MS250 Mack Midliner ------------ f-150
silverado 1500 ------------ lr4 hse ------------ f-150
e-250 ------------ silverado 1500 ------------ f-150
cooper countryman ------------ sierra 1500 ------------ f-150
civic ------------ corolla ------------ f-150
genesis 3.8 sedan 4d ------------ sierra 1500 ------------ f-150
f-250 sd ------------ silverado ------------ f-150
tundra crewmax pickup 4d ------------ challenger ------------ f-150
odyssey ------------ tacoma ------------ f-150
civic lx ------------ fusion ------------ f-150
titan platinum reserve ------------ rsx ------------ f-150
ats premium luxury ------------ benz e320 wagon ------------ f-150
taurus ------------ charger ------------ f-150
x5 ------------ 