In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# sklearn is a huge library with more than 100 models (still growing) - the common practice is to import the model you want
    # it is also divided into subcategories, so you need to access that with .<subcategory name>
from sklearn.linear_model import LinearRegression

In [3]:
# Read the training predictors and the training response into pandas DataFrames
trainf = pd.read_csv('Datasets/Car_features_train.csv')  # Predictors
trainp = pd.read_csv('Datasets/Car_prices_train.csv')  # Response

# No join key is passed, so pandas joins on the column name(s) the two frames share
train = trainf.merge(trainp)

# Preview the first 10 merged rows
train.head(10)

Unnamed: 0,carID,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize,price
0,18473,bmw,6 Series,2020,Semi-Auto,11,Diesel,145,53.3282,3.0,37980
1,15064,bmw,6 Series,2019,Semi-Auto,10813,Diesel,145,53.043,3.0,33980
2,18268,bmw,6 Series,2020,Semi-Auto,6,Diesel,145,53.4379,3.0,36850
3,18480,bmw,6 Series,2017,Semi-Auto,18895,Diesel,145,51.514,3.0,25998
4,18492,bmw,6 Series,2015,Automatic,62953,Diesel,160,51.4903,3.0,18990
5,17089,bmw,6 Series,2017,Semi-Auto,20643,Diesel,145,51.3358,3.0,27825
6,13342,bmw,6 Series,2013,Semi-Auto,54483,Diesel,160,51.4219,3.0,16640
7,14659,bmw,6 Series,2016,Automatic,32449,Diesel,160,51.6433,3.0,18450
8,13739,bmw,6 Series,2017,Automatic,31328,Diesel,145,51.8007,3.0,26500
9,16234,bmw,6 Series,2012,Automatic,50319,Diesel,160,51.5976,3.0,12970


In [3]:
# Create the model as an object

model = LinearRegression() # No inputs, this will change for other models

# Train the model - separate the predictor(s) and the response for this!
X_train = train[['engineSize']]
y_train = train[['price']]

# Note the double brackets: sklearn expects the predictors as a 2D object, so
# X_train must be a DataFrame, NOT a Series. y_train is kept as a one-column
# DataFrame as well, which is why the fitted intercept comes back as a
# 1-element array instead of a scalar.

model.fit(X_train, y_train)

# Check the slight syntax differences
    # predictors and response separate
    # We need to manually slice the predictor column(s) we want to include
    # No need to assign to an output

# Return the parameters
# Only the LAST expression of a cell is displayed, so the slope is printed
# explicitly - as a bare expression it would be computed and silently dropped.
print('Slope:', model.coef_) # slope
model.intercept_ # intercept

# No .summary() here! - impossible to do much inference; this is a shortcoming of sklearn

array([-4122.03574424])

In [4]:
# Prediction

# Test data
testf = pd.read_csv('Datasets/Car_features_test.csv') # Predictors
testp = pd.read_csv('Datasets/Car_prices_test.csv') # Response
test = pd.merge(testf,testp)

# Again, separate the predictor(s) and the response of interest
X_test = test[['engineSize']]
y_test = test[['price']].to_numpy() # Easier to handle with calculations as np array

y_pred = model.predict(X_test)

# Guard against silent NumPy broadcasting: if one array were 1D and the other a
# single column, y_pred - y_test would become an n x n matrix and the metrics
# below would be wrong without raising any error.
assert y_pred.shape == y_test.shape, 'y_pred and y_test must have the same shape'

# Evaluate
test_errors = y_pred - y_test # residuals, computed once and reused for both metrics
model_rmse = np.sqrt(np.mean(test_errors**2)) # RMSE
model_mae = np.mean(np.abs(test_errors)) # MAE

print('Test RMSE: ', model_rmse)
print('Test MAE: ', model_mae)

Test RMSE:  12995.106451548696
Test MAE:  9411.325912951994


In [5]:
# Finally, compare the training and test errors to look for overfitting/underfitting

# The test RMSE was already computed in an earlier cell
print('Test RMSE:', model_rmse)

# Training RMSE: predict on the training predictors and score against the training response
y_pred_train = model.predict(X_train)
train_residuals = y_pred_train - y_train.to_numpy()
train_rmse = np.sqrt(np.mean(train_residuals**2))

print('Train RMSE:', train_rmse)

# The two values are comparable - no overfitting
# An error of around 13k for car prices - underfitting

# No .mse_resid here!


Test RMSE: 12995.106451548696
Train RMSE: 12807.526231509039


In [7]:
# Easier way to calculate metrics with sklearn tools

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# RMSE is taken as the square root of the MSE. The squared=False shortcut was
# deprecated in scikit-learn 1.4 and removed in 1.6, so it breaks on newer
# installs; np.sqrt(...) gives the same value here and works on every version.
model_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
model_mae = mean_absolute_error(y_test, y_pred)
model_r2 = r2_score(y_test, y_pred)
print('Test RMSE: ', model_rmse)
print('Test MAE: ', model_mae)
print('Test R-squared: ', model_r2)

Test RMSE:  12995.106451548696
Test MAE:  9411.325912951994
Test R-squared:  0.38699003786201447


In [8]:
# Alternative route to the test R-squared: ask the fitted model to score itself
# on the test predictors and response
model_r2 = model.score(X=X_test, y=y_test)
print('Test R-squared: ', model_r2)

Test R-squared:  0.38699003786201447


**Note:** Why did we repeat the same task in two different libraries?

- statsmodels and sklearn have different advantages - we will use both for our purposes
    - statsmodels returns a lot of statistical output, which is very helpful for inference (coming up next), but it has a limited variety of models.
    - sklearn includes many models (Lasso and Ridge this quarter, many others next quarter) and helpful tools/functions (like metrics) that statsmodels does not have, but it does not have any inference tools.