In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [6]:
# Read the extended car-sales CSV into a DataFrame and preview the first five rows.
car_sales = pd.read_csv(filepath_or_buffer='car-sales-extended.csv')
car_sales.head(n=5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [7]:
# Number of rows in the dataset.
car_sales.shape[0]

1000

In [9]:
# Separate the features (everything except Price) from the target (Price)
X = car_sales.drop(columns='Price')
y = car_sales.loc[:, 'Price']

# Hold out 20% of the rows as a test set; the rest is used for training
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)

In [10]:
# Build a machine learning model on the raw, un-encoded features.
# X still contains string columns ('Make', 'Colour'), so fitting is expected
# to fail. The failure is the point of this cell (it motivates the encoding
# step below), so the error is caught and displayed instead of being left to
# abort a "Restart Kernel -> Run All".
model = RandomForestRegressor()
try:
    model.fit(X_train, y_train)
except ValueError as err:
    # Show the error in the same "<ErrorType>: <message>" form a traceback ends with.
    print(f"{type(err).__name__}: {err}")
else:
    # Only reached if the fit unexpectedly succeeds.
    print(model.score(X_test, y_test))

ValueError: could not convert string to float: 'Nissan'

In [26]:
# Turn the categorical columns into numbers with one-hot encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

categorical_features = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder(drop='first')

# Encode only the categorical columns; all remaining columns are passed
# through unchanged.
transformer = ColumnTransformer(
    transformers=[('one_hot', one_hot, categorical_features)],
    remainder='passthrough',
)

# Fit the transformer on X and encode X in a single step.
transformed_X = transformer.fit_transform(X)

In [27]:
# Preview the original (un-encoded) feature frame.
X.head(n=5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3


In [28]:
# Wrap the encoded array in a DataFrame so it renders as a table.
pd.DataFrame(data=transformed_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,35431.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,84714.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,154365.0
4,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,155144.0
997,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,215883.0


In [29]:
# pandas alternative: get_dummies one-hot encodes the text columns.
# Note that the numeric Doors column is not expanded; it stays a single
# integer column in the result.
dummies = pd.get_dummies(data=car_sales.loc[:, ['Make', 'Colour', 'Doors']])
dummies.head(n=5)

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,0,1,0,0,0,0,0,0,1
1,5,1,0,0,0,0,1,0,0,0
2,4,0,1,0,0,0,0,0,0,1
3,4,0,0,0,1,0,0,0,0,1
4,3,0,0,1,0,0,1,0,0,0


In [30]:
# Refit on the one-hot encoded features.
# NumPy's global RNG is seeded first, right before splitting and fitting.
np.random.seed(69)
X_train, X_test, y_train, y_test = train_test_split(
    transformed_X, y, test_size=0.2
)
# Reuses the existing `model` object; fit() returns the estimator, so the
# score call can be chained and its value becomes the cell output.
model.fit(X_train, y_train).score(X_test, y_test)

0.27998628766306966

In [31]:
# Alternative experiment: swap the text categories for plain integer codes.
# Start by looking at the raw frame again.
car_sales.head(n=5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [35]:
# Build a {column: {category: integer code}} lookup for the text columns.
# Codes are assigned in the order the categories first appear in the column.
conv_to_int = {
    col_to_conv: {
        cat: i for i, cat in enumerate(car_sales[col_to_conv].unique())
    }
    for col_to_conv in ['Make', 'Colour']
}
conv_to_int

{'Make': {'Honda': 0, 'BMW': 1, 'Toyota': 2, 'Nissan': 3},
 'Colour': {'White': 0, 'Blue': 1, 'Red': 2, 'Green': 3, 'Black': 4}}

In [36]:
# Swap each category for its integer code using the nested
# {column: {value: code}} mapping; the result is a new frame.
test_car = car_sales.replace(to_replace=conv_to_int)

In [37]:
# Check that Make and Colour now hold integer codes.
test_car.head(n=5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,0,0,35431,4,15323
1,1,1,192714,5,19943
2,0,0,84714,4,28343
3,2,0,154365,4,13434
4,3,1,181577,3,14043


In [47]:
# Fit a fresh random forest on the integer-coded frame.
# Re-seed NumPy's global RNG with the same seed used for the one-hot
# experiment so this cell gives the same result every time it is re-run and
# the two encodings are evaluated under the same random conditions.
np.random.seed(69)

# Features are every column except Price; Price is the target.
X = test_car.drop('Price', axis=1)
y = test_car['Price']
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2)
model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.1495083313914468

In [48]:
from sklearn.linear_model import Lasso

# Lasso regression with default settings on the integer-coded split.
# fit() returns the estimator, so construction and fitting are chained.
model = Lasso().fit(X_train, y_train)
model.score(X_test, y_test)

0.21233785339679245

In [49]:
from sklearn.linear_model import LinearRegression

# Ordinary linear regression on the integer-coded split.
# fit() returns the estimator, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)
model.score(X_test, y_test)

0.21235398689993923

In [50]:
from sklearn.linear_model import ElasticNet

# ElasticNet regression with default settings on the integer-coded split.
# fit() returns the estimator, so construction and fitting are chained.
model = ElasticNet().fit(X_train, y_train)
model.score(X_test, y_test)

0.167142436287396

### What if there are some missing values?
1. Fill them with a substitute value (this is called imputation).
2. Remove the rows that contain missing values (for example, rows where the label is missing).