In [131]:
import pandas as pd
from sklearn.model_selection import train_test_split 

In [132]:
data = pd.read_csv('car-data/car_price_prediction.csv')

data.head(10)

'''
Data Cleaning:
Levy - Many values use '-' which should be replaced with the mean or median value of the numerical values of the column
Mileage - Remove km and make it numerical 
Engine Volume - Is not numeric and contains strings 'Turbo'
Doors has following values and must be changed to 2, 4, or 6 (ordinal scale)

04-May    18332
02-Mar      777
>5          128
'''

"\nData Cleaning:\nLevy - Many values use '-' which should be replaced with the mean or median value of the numerical values of the column\nMileage - Remove km and make it numerical \nEngine Volume - Is not numeric and contains strings 'Turbo'\nDoors has following values and must be changed to 2, 4, or 6 (ordinal scale)\n\n04-May    18332\n02-Mar      777\n>5          128\n"

In [133]:
data['Manufacturer'].value_counts()

Manufacturer
HYUNDAI          3769
TOYOTA           3662
MERCEDES-BENZ    2076
FORD             1111
CHEVROLET        1069
                 ... 
TESLA               1
PONTIAC             1
SATURN              1
ASTON MARTIN        1
GREATWALL           1
Name: count, Length: 65, dtype: int64

In [134]:
def preprocess(data: pd.DataFrame):
   # Levy has many missing values
   data['Levy'] = pd.to_numeric(data['Levy'], errors='coerce')
   levy_mean_val = data['Levy'].mean()
   data['Levy'].fillna(levy_mean_val, inplace=True)
   
   # Mileage
   data['Mileage'] = pd.to_numeric((data['Mileage']).astype(str).str.replace(' km', '', regex=True), errors='coerce')

   # Doors
   data['Doors'] = data['Doors'].astype(str).str.extract(r'(\d+)', expand=False).astype(int).map({4: 4, 2: 2, 5: 6})  # Map values

   # Engine Volume
   data['Engine volume'] = pd.to_numeric(data['Engine volume'].astype(str).str.replace(' Trubo', '', regex=True), errors='coerce')

   #OHE Categorical Data
   data = pd.get_dummies(data, columns=['Manufacturer', 'Model', 'Category', 'Leather interior', 'Fuel type', 'Gear box type', 'Drive wheels', 'Wheel', 'Color'])
   
   # Split data 
   train, test = train_test_split(data, test_size=0.2, random_state=123)
   y_train = train.pop('Price')
   X_train = train
   y_test = test.pop('Price')
   X_test = test

   return X_train, y_train, X_test, y_test


X_train, y_train, X_test, y_test = preprocess(data)

In [135]:
X_train.head(5)

Unnamed: 0,ID,Levy,Prod. year,Engine volume,Mileage,Cylinders,Doors,Airbags,Manufacturer_ACURA,Manufacturer_ALFA ROMEO,...,Color_Green,Color_Grey,Color_Orange,Color_Pink,Color_Purple,Color_Red,Color_Silver,Color_Sky blue,Color_White,Color_Yellow
3292,45770574,528.0,2014,1.6,100800,4.0,4,12,False,False,...,False,False,False,False,False,False,False,False,True,False
6169,45810912,579.0,2017,2.5,88336,4.0,4,12,False,False,...,False,False,False,False,False,False,False,False,True,False
16835,45642802,490.0,2011,1.3,198891,4.0,4,0,False,False,...,False,True,False,False,False,False,False,False,False,False
9829,45804398,1017.0,2017,2.0,79877,4.0,4,4,False,False,...,False,False,False,False,False,False,False,False,False,False
7209,45814951,765.0,2015,2.0,118848,4.0,4,12,False,False,...,False,False,False,False,False,True,False,False,False,False


In [136]:
X_train.dtypes

ID                  int64
Levy              float64
Prod. year          int64
Engine volume     float64
Mileage             int64
                   ...   
Color_Red            bool
Color_Silver         bool
Color_Sky blue       bool
Color_White          bool
Color_Yellow         bool
Length: 1708, dtype: object

In [None]:
''' Data is cleaned and prepped'''

' Data is cleaned'