In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from mlfromscratch import utils
from mlfromscratch.Supervised import multivariate_regression as regressor
from mlfromscratch.Supervised import univariate_regression as lregressor

In [45]:
data = pd.read_csv("mlfromscratch/data/vw.csv")
print(data.shape)
data.head()

(15157, 9)


Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,T-Roc,2019,25000,Automatic,13904,Diesel,145,49.6,2.0
1,T-Roc,2019,26883,Automatic,4562,Diesel,145,49.6,2.0
2,T-Roc,2019,20000,Manual,7414,Diesel,145,50.4,2.0
3,T-Roc,2019,33492,Automatic,4825,Petrol,145,32.5,2.0
4,T-Roc,2019,22900,Semi-Auto,6500,Petrol,150,39.8,1.5


In [46]:
data['transmission'].value_counts(), data['fuelType'].value_counts()

(Manual       9417
 Semi-Auto    3780
 Automatic    1960
 Name: transmission, dtype: int64,
 Petrol    8553
 Diesel    6372
 Hybrid     145
 Other       87
 Name: fuelType, dtype: int64)

In [47]:
# Data Pre-processing
data = pd.get_dummies(data, columns=['transmission', 'fuelType'])
data['age'] = 2020 - data['year']
data.drop(['model', 'year'], axis=1, inplace=True)
data.head()

Unnamed: 0,price,mileage,tax,mpg,engineSize,transmission_Automatic,transmission_Manual,transmission_Semi-Auto,fuelType_Diesel,fuelType_Hybrid,fuelType_Other,fuelType_Petrol,age
0,25000,13904,145,49.6,2.0,1,0,0,1,0,0,0,1
1,26883,4562,145,49.6,2.0,1,0,0,1,0,0,0,1
2,20000,7414,145,50.4,2.0,0,1,0,1,0,0,0,1
3,33492,4825,145,32.5,2.0,1,0,0,0,0,0,1,1
4,22900,6500,150,39.8,1.5,0,0,1,0,0,0,1,1


In [48]:
X = data['mpg'].values
y = data['price'].values
X = X.reshape(X.shape[0], 1)
y = y.reshape(y.shape[0], 1)
X, y

(array([[49.6],
        [49.6],
        [50.4],
        ...,
        [42. ],
        [46.3],
        [46.3]]),
 array([[25000],
        [26883],
        [20000],
        ...,
        [ 1590],
        [ 1250],
        [ 2295]], dtype=int64))

In [49]:
X = utils.standardize(X)
y = utils.standardize(y)
X, y

(array([[-0.30445949],
        [-0.30445949],
        [-0.24581591],
        ...,
        [-0.86157349],
        [-0.54636425],
        [-0.54636425]]),
 array([[ 1],
        [ 1],
        [ 0],
        ...,
        [-1],
        [-2],
        [-1]], dtype=int64))

In [50]:
X_train, X_test, y_train, y_test = utils.train_test_split(X, y, test_size=0.3, shuffle=True)

In [53]:
regressor.linear_regression_model(X_train, y_train, X_test, y_test, learning_rate=0.001, epochs=100)

Epoch: 0, loss: 0.501
Epoch: 10, loss: 0.483
Epoch: 20, loss: 0.467
Epoch: 30, loss: 0.454
Epoch: 40, loss: 0.442
Epoch: 50, loss: 0.433
Epoch: 60, loss: 0.425
Epoch: 70, loss: 0.419
Epoch: 80, loss: 0.415
Epoch: 90, loss: 0.413
Train MSE: 0.41132613731153633 Weight:  [[-0.27998138]]
Test MSE: 0.42134097934040105 Weight:  [[-0.27998138]]


In [55]:
lregressor.linear_regression_model(X_train, X_test, y_train, y_test, grad_desc=False, least_sqr=True)

(array([[ 0.36488278],
        [ 0.21144146],
        [-0.06181019],
        ...,
        [-0.11856246],
        [ 0.28711115],
        [ 0.08112144]]),
 0.41735836385183644)

In [56]:
lregressor.linear_regression_model(X_train, X_test, y_train, y_test, grad_desc=True, least_sqr=False)

(array([[ 0.31498342],
        [ 0.18267554],
        [-0.05294124],
        ...,
        [-0.10187703],
        [ 0.24792326],
        [ 0.07030446]]),
 0.4181010271391076)