# Imports


In [222]:
import numpy as np
import pandas as pd
import plotly as px
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.utils import shuffle
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures

# Data

In [223]:
# Load the cars dataset: try the Kaggle input mount first and fall back to a
# CSV in the current working directory when that path does not exist.
try:
    data = pd.read_csv('/kaggle/input/cars-dataset-audi-bmw-ford-hyundai-skoda-vw/cars_dataset.csv')
except FileNotFoundError:
    data = pd.read_csv('cars_dataset.csv')
data  # last expression of the cell -> rendered as its output

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,Make
0,A1,2017,12500,Manual,15735,Petrol,150.0,55.4,1.4,audi
1,A6,2016,16500,Automatic,36203,Diesel,20.0,64.2,2.0,audi
2,A1,2016,11000,Manual,29946,Petrol,30.0,55.4,1.4,audi
3,A4,2017,16800,Automatic,25952,Diesel,145.0,67.3,2.0,audi
4,A3,2019,17300,Manual,1998,Petrol,145.0,49.6,1.0,audi
...,...,...,...,...,...,...,...,...,...,...
72430,I30,2016,8680,Manual,25906,Diesel,0.0,78.4,1.6,Hyundai
72431,I40,2015,7830,Manual,59508,Diesel,30.0,65.7,1.7,Hyundai
72432,I10,2017,6830,Manual,13810,Petrol,20.0,60.1,1.0,Hyundai
72433,Tucson,2018,13994,Manual,23313,Petrol,145.0,44.8,1.6,Hyundai


In [224]:
# Column dtypes, non-null counts and memory footprint of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72435 entries, 0 to 72434
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         72435 non-null  object 
 1   year          72435 non-null  int64  
 2   price         72435 non-null  int64  
 3   transmission  72435 non-null  object 
 4   mileage       72435 non-null  int64  
 5   fuelType      72435 non-null  object 
 6   tax           72435 non-null  float64
 7   mpg           72435 non-null  float64
 8   engineSize    72435 non-null  float64
 9   Make          72435 non-null  object 
dtypes: float64(3), int64(3), object(4)
memory usage: 5.5+ MB


In [225]:
# Summary statistics (count, mean, std, min/max, quartiles) of the numeric columns.
data.describe()

Unnamed: 0,year,price,mileage,tax,mpg,engineSize
count,72435.0,72435.0,72435.0,72435.0,72435.0,72435.0
mean,2017.073666,16580.158708,23176.517057,116.953407,55.85248,1.63565
std,2.101252,9299.028754,21331.515562,64.045533,17.114391,0.561535
min,1996.0,495.0,1.0,0.0,0.3,0.0
25%,2016.0,10175.0,7202.5,30.0,47.9,1.2
50%,2017.0,14495.0,17531.0,145.0,55.4,1.6
75%,2019.0,20361.0,32449.0,145.0,62.8,2.0
max,2020.0,145000.0,323000.0,580.0,470.8,6.6


## Nulls

In [226]:
# Number of missing values in each column.
data.isnull().sum()

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
Make            0
dtype: int64

In [227]:
# Column labels of the frame.
data.columns

Index(['model', 'year', 'price', 'transmission', 'mileage', 'fuelType', 'tax',
       'mpg', 'engineSize', 'Make'],
      dtype='object')

# Draw

In [228]:
# NOTE(review): disabled exploratory cell -- when uncommented it draws one
# 30x5 seaborn histogram per column of `data`. Presumably switched off because
# it is slow or noisy; confirm, then either re-enable it or delete the cell.
# for col in data.columns:
#     fig, ax = plt.subplots(figsize=(30, 5))
#     ax.tick_params(axis='x', rotation=90)
#     plt.title(f'{col} histogram')
#     sns.histplot(data=data, x=col, ax=ax)
#     plt.show()

In [229]:
def standardize(x):
    """Return the z-scores of ``x``: ``(x - mean(x)) / std(x)``.

    The mean and standard deviation are taken with ``np.mean`` and ``np.std``
    (population standard deviation, ``ddof=0``).

    Parameters
    ----------
    x : array-like
        Numeric values, e.g. a NumPy array or a pandas Series.

    Returns
    -------
    Same container type as ``x - np.mean(x)``, holding the standardized
    values. When the standard deviation is exactly zero (all values equal)
    the centred values -- all zeros -- are returned instead of dividing by
    zero and producing NaN.
    """
    centered = x - np.mean(x)
    scale = np.std(x)
    # Guard only the scalar case; a non-scalar `scale` (e.g. per-column result
    # for a DataFrame) keeps the plain element-wise division.
    if np.ndim(scale) == 0 and scale == 0:
        return centered
    return centered / scale

In [230]:
# Integer-encode the categorical columns. The fitted encoders are kept in
# `label_encoders` (keyed by column name) so the codes can be mapped back to
# the original labels with `inverse_transform`, or reused on new data.
# NOTE(review): LabelEncoder numbers the classes in sorted-label order, so the
# integers carry an arbitrary ordering that a linear model will treat as a
# numeric magnitude -- consider one-hot encoding instead.
label_encoders = {}
for column in ('model', 'transmission', 'fuelType', 'Make'):
    encoder = preprocessing.LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    label_encoders[column] = encoder
data

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,Make
0,8,2017,12500,1,15735,4,150.0,55.4,1.4,3
1,13,2016,16500,0,36203,0,20.0,64.2,2.0,3
2,8,2016,11000,1,29946,4,30.0,55.4,1.4,3
3,11,2017,16800,0,25952,0,145.0,67.3,2.0,3
4,10,2019,17300,1,1998,4,145.0,49.6,1.0,3
...,...,...,...,...,...,...,...,...,...,...
72430,56,2016,8680,1,25906,0,0.0,78.4,1.6,2
72431,57,2015,7830,1,59508,0,30.0,65.7,1.7,2
72432,54,2017,6830,1,13810,4,20.0,60.1,1.0,2
72433,126,2018,13994,1,23313,4,145.0,44.8,1.6,2


In [231]:
# Replace the raw mileage with its z-score, computed over the whole column
# by `standardize`.
data['mileage'] = standardize(data['mileage'])
data

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,Make
0,8,2017,12500,1,-0.348853,4,150.0,55.4,1.4,3
1,13,2016,16500,0,0.610673,0,20.0,64.2,2.0,3
2,8,2016,11000,1,0.317349,4,30.0,55.4,1.4,3
3,11,2017,16800,0,0.130113,0,145.0,67.3,2.0,3
4,10,2019,17300,1,-0.992834,4,145.0,49.6,1.0,3
...,...,...,...,...,...,...,...,...,...,...
72430,56,2016,8680,1,0.127956,0,0.0,78.4,1.6,2
72431,57,2015,7830,1,1.703195,0,30.0,65.7,1.7,2
72432,54,2017,6830,1,-0.439096,4,20.0,60.1,1.0,2
72433,126,2018,13994,1,0.006398,4,145.0,44.8,1.6,2


# Split

In [232]:
# Shuffle the rows, then separate the target (`price`) from the features.
# A fixed random_state makes the row order -- and with it the train/test
# split taken afterwards -- reproducible across kernel restarts.
data = shuffle(data, random_state=0)
y = data['price']
# Build the feature frame without mutating in place. `data` is rebound to the
# same object because later cells still refer to the features as `data`.
X = data = data.drop(columns='price')

In [233]:
# Display the feature frame (the `price` column has been removed).
X

Unnamed: 0,model,year,transmission,mileage,fuelType,tax,mpg,engineSize,Make
53216,121,2015,0,1.578027,0,240.0,42.8,3.0,6
57784,20,2017,0,-0.033730,2,140.0,72.4,1.8,5
65890,67,2018,1,-0.021542,4,145.0,52.3,1.5,4
17211,0,2017,3,-0.739170,4,145.0,39.8,3.0,0
22743,42,2017,0,-0.482975,4,145.0,54.3,1.0,1
...,...,...,...,...,...,...,...,...,...
9755,86,2015,1,3.101230,0,30.0,61.4,2.0,3
30359,65,2018,1,-0.720841,4,150.0,57.7,1.2,1
25899,42,2014,1,0.019056,4,0.0,65.7,1.0,1
76,87,2016,0,0.329303,0,200.0,47.1,2.0,3


In [234]:
# Display the target series (`price`).
y

53216    18440
57784    13998
65890    19625
17211    23995
22743    10710
         ...  
9755     10999
30359     7995
25899     7950
76       19600
59709     8300
Name: price, Length: 72435, dtype: int64

In [235]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
# `X` and `data` refer to the same feature frame at this point.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Multiple Linear Regression

## Train

In [236]:
# Fit a plain linear regression on the training split only.
Model1 = LinearRegression()
Model1.fit(X_train, y_train)

## Predict

In [237]:
# Predict prices for the held-out test rows.
y_pred1 = Model1.predict(X_test)
y_pred1

array([15704.9132296 , 22173.06515292, 15829.78421889, ...,
       22288.26945514, 23367.6152746 ,  9186.1831825 ])

## R2 Score

In [238]:
# Coefficient of determination of Model1 on the held-out test split.
r2_score(y_true=y_test, y_pred=y_pred1)

0.7091479081768856

## Mean Squared Error

In [239]:
# Mean squared error of Model1 on the held-out test split.
mean_squared_error(y_true=y_test, y_pred=y_pred1)

24555888.00612672

## Mean Absolute Error

In [240]:
# Mean absolute error of Model1 on the held-out test split.
mean_absolute_error(y_true=y_test, y_pred=y_pred1)

3253.6224034568586

# Polynomial Regression

## Degree 2

In [241]:
# Expand the features into polynomial terms up to degree 2.
# NOTE(review): the transformer is fitted on the full feature frame `X`,
# not on `X_train`.
poly = PolynomialFeatures(degree=2)
X_poly2 = poly.fit_transform(X)
X_poly2

array([[1.000e+00, 1.210e+02, 2.015e+03, ..., 9.000e+00, 1.800e+01,
        3.600e+01],
       [1.000e+00, 2.000e+01, 2.017e+03, ..., 3.240e+00, 9.000e+00,
        2.500e+01],
       [1.000e+00, 6.700e+01, 2.018e+03, ..., 2.250e+00, 6.000e+00,
        1.600e+01],
       ...,
       [1.000e+00, 4.200e+01, 2.014e+03, ..., 1.000e+00, 1.000e+00,
        1.000e+00],
       [1.000e+00, 8.700e+01, 2.016e+03, ..., 4.000e+00, 6.000e+00,
        9.000e+00],
       [1.000e+00, 2.200e+01, 2.018e+03, ..., 1.000e+00, 5.000e+00,
        2.500e+01]])

### Train

In [242]:
# Fit a linear model on the degree-2 polynomial features.
# NOTE(review): this trains on every row (`X_poly2`, `y`), so no data is held
# out for this model -- unlike Model1, which was fit on `X_train` only.
Model2 = LinearRegression()
Model2.fit(X_poly2, y)

### Predict

In [243]:
# NOTE(review): predictions are made on the same rows Model2 was trained on,
# so `y_pred2` holds in-sample estimates, not held-out ones.
y_pred2 = Model2.predict(X_poly2)
y_pred2

array([23554.22436225, 19005.66785449, 14750.19522011, ...,
        5485.14757475, 18257.14609358, 11133.96127516])

### R2 Score

In [244]:
# R2 of the degree-2 model over all rows.
# NOTE(review): `y_pred2` was predicted on the rows the model was fit on, so
# this is a training-set score and is not comparable to Model1's test score.
r2_score(y_true=y, y_pred=y_pred2)

0.8230380150610741

### Mean Squared Error

In [245]:
# Mean squared error of the degree-2 model over all rows.
# NOTE(review): computed on the rows the model was fit on (training error).
mean_squared_error(y_true=y, y_pred=y_pred2)

15302034.138804073

### Mean Absolute Error

In [246]:
# Mean absolute error of the degree-2 model over all rows.
# NOTE(review): computed on the rows the model was fit on (training error).
mean_absolute_error(y_true=y, y_pred=y_pred2)

2571.6377748115933

## Degree 3

In [247]:
# Expand the features into polynomial terms up to degree 3.
# `poly` is rebound here; the degree-2 transformer is no longer reachable
# through that name.
# NOTE(review): the transformer is fitted on the full feature frame `X`,
# not on `X_train`.
poly = PolynomialFeatures(degree=3)
X_poly3 = poly.fit_transform(X)
X_poly3

array([[1.000e+00, 1.210e+02, 2.015e+03, ..., 5.400e+01, 1.080e+02,
        2.160e+02],
       [1.000e+00, 2.000e+01, 2.017e+03, ..., 1.620e+01, 4.500e+01,
        1.250e+02],
       [1.000e+00, 6.700e+01, 2.018e+03, ..., 9.000e+00, 2.400e+01,
        6.400e+01],
       ...,
       [1.000e+00, 4.200e+01, 2.014e+03, ..., 1.000e+00, 1.000e+00,
        1.000e+00],
       [1.000e+00, 8.700e+01, 2.016e+03, ..., 1.200e+01, 1.800e+01,
        2.700e+01],
       [1.000e+00, 2.200e+01, 2.018e+03, ..., 5.000e+00, 2.500e+01,
        1.250e+02]])

### Train

In [248]:
# Fit a linear model on the degree-3 polynomial features.
# NOTE(review): this trains on every row (`X_poly3`, `y`), so no data is held
# out for this model -- unlike Model1, which was fit on `X_train` only.
Model3 = LinearRegression()
Model3.fit(X_poly3, y)

### Predict

In [249]:
# NOTE(review): predictions are made on the same rows Model3 was trained on,
# so `y_pred3` holds in-sample estimates, not held-out ones.
y_pred3 = Model3.predict(X_poly3)
y_pred3

array([18974.77437401, 17633.90646935, 14549.24108696, ...,
        7083.87786674, 17870.24549103,  8668.65025902])

### R2 Score

In [250]:
# R2 of the degree-3 model over all rows.
# NOTE(review): `y_pred3` was predicted on the rows the model was fit on, so
# this is a training-set score and is not comparable to Model1's test score.
r2_score(y_true=y, y_pred=y_pred3)

0.876812171633266

### Mean Squared Error

In [251]:
# Mean squared error of the degree-3 model over all rows.
# NOTE(review): computed on the rows the model was fit on (training error).
mean_squared_error(y_true=y, y_pred=y_pred3)

10652142.920997813

### Mean Absolute Error

In [252]:
# Mean absolute error of the degree-3 model over all rows.
# NOTE(review): computed on the rows the model was fit on (training error).
mean_absolute_error(y_true=y, y_pred=y_pred3)

2148.1876954685163