# Scikit-Learn

#### Linear Regression, KNN, and Random Forest Regressor Models for Multiple Regression


## Load packages

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant  

sns.set_theme()

## Import Dataset

In [2]:
# Load the raw data; with on_bad_lines="skip" pandas silently drops
# malformed lines (e.g. too many fields) instead of raising an error.
df = pd.read_csv(
    "car_prices.csv",
    on_bad_lines="skip",
)

## Data inspection

In [3]:
df.head(2)

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate
0,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg566472,ca,5.0,16639.0,white,black,"kia motors america, inc",20500,21500,Tue Dec 16 2014 12:30:00 GMT-0800 (PST)
1,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg561319,ca,5.0,9393.0,white,beige,"kia motors america, inc",20800,21500,Tue Dec 16 2014 12:30:00 GMT-0800 (PST)


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 558811 entries, 0 to 558810
Data columns (total 16 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   year          558811 non-null  int64  
 1   make          548510 non-null  object 
 2   model         548412 non-null  object 
 3   trim          548160 non-null  object 
 4   body          545616 non-null  object 
 5   transmission  493458 non-null  object 
 6   vin           558811 non-null  object 
 7   state         558811 non-null  object 
 8   condition     547017 non-null  float64
 9   odometer      558717 non-null  float64
 10  color         558062 non-null  object 
 11  interior      558062 non-null  object 
 12  seller        558811 non-null  object 
 13  mmr           558811 non-null  int64  
 14  sellingprice  558811 non-null  int64  
 15  saledate      558811 non-null  object 
dtypes: float64(2), int64(3), object(11)
memory usage: 68.2+ MB


In [6]:
print(df.isnull().sum())

year                0
make            10301
model           10399
trim            10651
body            13195
transmission    65353
vin                 0
state               0
condition       11794
odometer           94
color             749
interior          749
seller              0
mmr                 0
sellingprice        0
saledate            0
dtype: int64


## Data transformation

In [7]:
# Remove `transmission` (the column with by far the most missing values),
# then discard every row that still has a missing value in any other column.
df = df.drop(columns=["transmission"]).dropna()

In [8]:
# Remove the columns that are not used as model features
df = df.drop(columns=["trim", "vin", "mmr", "saledate"])

In [9]:
print(df.isnull().sum())

year            0
make            0
model           0
body            0
state           0
condition       0
odometer        0
color           0
interior        0
seller          0
sellingprice    0
dtype: int64


In [10]:
# Give three columns more descriptive names
df = df.rename(columns={"make": "brand", "body": "type", "odometer": "miles"})

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 533660 entries, 0 to 558810
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   year          533660 non-null  int64  
 1   brand         533660 non-null  object 
 2   model         533660 non-null  object 
 3   type          533660 non-null  object 
 4   state         533660 non-null  object 
 5   condition     533660 non-null  float64
 6   miles         533660 non-null  float64
 7   color         533660 non-null  object 
 8   interior      533660 non-null  object 
 9   seller        533660 non-null  object 
 10  sellingprice  533660 non-null  int64  
dtypes: float64(2), int64(2), object(7)
memory usage: 48.9+ MB


In [12]:
# Lowercase the labels of the brand, model and type columns
for text_col in ("brand", "model", "type"):
    df[text_col] = df[text_col].str.lower()

# Categorical or numeric?

* year = categorical
* brand = categorical
* model = categorical
* type = categorical
* state = categorical
* condition = categorical
* miles = numeric
* color = categorical
* interior = categorical
* seller = categorical
* ratingprice (`mmr`) = numeric (column was dropped in the previous step)
* sellingprice = numeric
* saledate = categorical (column was dropped in the previous step)

In [13]:
# Cast every non-continuous column to the pandas "category" dtype in one call
df = df.astype(
    {
        name: "category"
        for name in (
            "year",
            "brand",
            "model",
            "type",
            "state",
            "condition",
            "color",
            "interior",
            "seller",
        )
    }
)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 533660 entries, 0 to 558810
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype   
---  ------        --------------   -----   
 0   year          533660 non-null  category
 1   brand         533660 non-null  category
 2   model         533660 non-null  category
 3   type          533660 non-null  category
 4   state         533660 non-null  category
 5   condition     533660 non-null  category
 6   miles         533660 non-null  float64 
 7   color         533660 non-null  category
 8   interior      533660 non-null  category
 9   seller        533660 non-null  category
 10  sellingprice  533660 non-null  int64   
dtypes: category(9), float64(1), int64(1)
memory usage: 18.4 MB


In [15]:
# summary statistics for all categorical columns
df.describe(include=['category']).transpose()

Unnamed: 0,count,unique,top,freq
year,533660.0,26.0,2012,100612.0
brand,533660.0,53.0,ford,91908.0
model,533660.0,768.0,altima,19159.0
type,533660.0,45.0,sedan,236445.0
state,533660.0,38.0,fl,79626.0
condition,533660.0,41.0,1.9,40404.0
color,533660.0,20.0,black,106600.0
interior,533660.0,17.0,black,238668.0
seller,533660.0,12739.0,nissan-infiniti lt,19677.0


## Data preprocessing pipeline

In [16]:
# Modules
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn import set_config
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [17]:
# Numeric columns: fill missing values with the column median, then standardise
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

In [18]:
# Categorical columns: replace missing entries with the label "missing", then
# one-hot encode. handle_unknown="ignore" makes categories that were not seen
# during fit encode as all zeros instead of raising at predict time.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [19]:
# Route each column by dtype: "category" columns go to the categorical
# pipeline, every other column goes to the numeric pipeline.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, selector(dtype_exclude="category")),
        ("cat", categorical_transformer, selector(dtype_include="category")),
    ]
)

In [20]:
df.head()

Unnamed: 0,year,brand,model,type,state,condition,miles,color,interior,seller,sellingprice
0,2015,kia,sorento,suv,ca,5.0,16639.0,white,black,"kia motors america, inc",21500
1,2015,kia,sorento,suv,ca,5.0,9393.0,white,beige,"kia motors america, inc",21500
2,2014,bmw,3 series,sedan,ca,4.5,1331.0,gray,black,financial services remarketing (lease),30000
3,2015,volvo,s60,sedan,ca,4.1,14282.0,white,black,volvo na rep/world omni,27750
4,2014,bmw,6 series gran coupe,sedan,ca,4.3,2641.0,gray,black,financial services remarketing (lease),67000


## Modeling

# Multiple Regression

In [21]:
# Predictor columns for the regression models
features = ["miles", "brand", "model", "type", "condition", "color"]
X = df[features]

# Sanity check of the predictor frame: dtypes and rows with any missing value
X.info()
print("Missing values:", X.isna().any(axis=1).sum())

# Target variable
y = df["sellingprice"]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 533660 entries, 0 to 558810
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype   
---  ------     --------------   -----   
 0   miles      533660 non-null  float64 
 1   brand      533660 non-null  category
 2   model      533660 non-null  category
 3   type       533660 non-null  category
 4   condition  533660 non-null  category
 5   color      533660 non-null  category
dtypes: category(5), float64(1)
memory usage: 11.2 MB
Missing values: 0


In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [23]:
from sklearn.linear_model import LinearRegression

# Preprocessing followed by a linear regression model
lm_pipe = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("lm", LinearRegression()),
    ]
)

In [24]:
# Render estimators as an HTML diagram when they are displayed in the notebook
set_config(display="diagram")
# Fit the linear-regression pipeline (preprocessing + model) on the training split
lm_pipe.fit(X_train, y_train)

In [25]:
y_pred = lm_pipe.predict(X_test)

In [26]:
from sklearn.metrics import r2_score

r2_score(y_test, y_pred)

0.7871510077904085

In [27]:
from sklearn.metrics import mean_squared_error

mean_squared_error(y_test, y_pred)

19820180.859093003

In [28]:
# Root mean squared error of the linear model on the test split.
# The `squared=False` flag of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the square root of the MSE
# gives the same value for a single target and works on every version.
np.sqrt(mean_squared_error(y_test, y_pred))

4451.986170137212

In [29]:
from sklearn.metrics import mean_absolute_error

mean_absolute_error(y_test, y_pred)

2881.345892972646

In [74]:
# Build a one-row frame describing a single car to predict a selling price for
X_new = pd.DataFrame(
    {
        "year": [2015],
        "brand": ["kia"],
        "model": ["sorento"],
        "type": ["suv"],
        "state": ["ca"],
        "condition": [5.0],
        "miles": [16639.0],
        "color": ["white"],
        "interior": ["black"],
        "seller": ["kia motors america, inc"],
    }
)

In [75]:
X_new

Unnamed: 0,year,brand,model,type,state,condition,miles,color,interior,seller
0,2015,kia,sorento,suv,ca,5.0,16639.0,white,black,"kia motors america, inc"


In [76]:
my_prediction = lm_pipe.predict(X_new)

In [None]:
# Save the linear regression pipeline to disk with joblib
from joblib import dump

dump(lm_pipe, "lm_model.joblib")

In [77]:
df_prediction = pd.DataFrame({"pred": my_prediction})


In [78]:
df_prediction

Unnamed: 0,pred
0,21454.28274


In [72]:
# Example input record as a plain dict (values match the first row of the data set)
sample = dict(
    year=[2015],
    brand="kia",
    model="sorento",
    type="suv",
    state="ca",
    condition=[5.0],
    miles=[16639.0],
    color="white",
    interior="black",
    seller="kia motors america, inc",
)

In [52]:
# KNN
from sklearn.base import clone
from sklearn.neighbors import KNeighborsRegressor as KNR

# Create pipeline with model.
# `preprocessor` is already a step of `lm_pipe`, and Pipeline.fit() fits its
# steps in place, so passing the same object here would make both pipelines
# share (and re-fit) one transformer instance. clone() gives this pipeline its
# own unfitted copy with identical settings.
knn_pipe = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('KNN Regression', KNR(n_neighbors=5))
])

In [53]:
# Render estimators as an HTML diagram when they are displayed in the notebook
set_config(display="diagram")
# Fit the KNN pipeline (preprocessing + model) on the training split
knn_pipe.fit(X_train, y_train)

In [54]:
y_pred = knn_pipe.predict(X_test)

In [55]:
# Save the KNN pipeline (preprocessing + model) to disk with joblib
from joblib import dump

dump(knn_pipe, "knn_model.joblib")

['knn_model.joblib']

In [56]:
r2_score(y_test, y_pred)

0.840973991058399

In [57]:
mean_squared_error(y_test, y_pred)

14808264.891470972

In [58]:
# Root mean squared error of the KNN model on the test split.
# The `squared=False` flag of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the square root of the MSE
# gives the same value for a single target and works on every version.
np.sqrt(mean_squared_error(y_test, y_pred))

3848.1508405298996

In [59]:
mean_absolute_error(y_test, y_pred)

2373.666221939063

In [79]:
my_prediction = knn_pipe.predict(X_new)

In [80]:
df_prediction_knn = pd.DataFrame({"pred": my_prediction})

In [81]:
df_prediction_knn

Unnamed: 0,pred
0,19980.0


In [60]:
# RandomForest
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor as RFR

# Create pipeline with model.
# - random_state pins the forest's bootstrap/feature sampling so the metrics
#   below are reproducible from run to run (same seed as the train/test split).
# - clone() gives this pipeline its own unfitted copy of the preprocessor;
#   Pipeline.fit() fits steps in place, so reusing the object that is already
#   a step of `lm_pipe` would make the pipelines share one transformer.
rf_pipe = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('Random Forest', RFR(n_estimators=50, max_depth=5, random_state=42))
])

In [61]:
# Render estimators as an HTML diagram when they are displayed in the notebook
set_config(display="diagram")
# Fit the random-forest pipeline (preprocessing + model) on the training split
rf_pipe.fit(X_train, y_train)

In [62]:
# Save the random-forest pipeline (preprocessing + model) to disk with joblib
from joblib import dump

dump(rf_pipe, "rf_model.joblib")

['rf_model.joblib']

In [63]:
y_pred = rf_pipe.predict(X_test)

In [64]:
r2_score(y_test, y_pred)

0.5419539906431969

In [65]:
mean_squared_error(y_test, y_pred)

42652561.57895278

In [66]:
# Root mean squared error of the random forest on the test split.
# The `squared=False` flag of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the square root of the MSE
# gives the same value for a single target and works on every version.
np.sqrt(mean_squared_error(y_test, y_pred))

6530.892862308551

In [67]:
mean_absolute_error(y_test, y_pred)

4268.4538994900995