In [1]:
# Render matplotlib figures directly in the notebook output below each cell.
%matplotlib inline

from typing import List
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import pyarrow.csv as pv
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mutual_info_score
from collections import defaultdict
from joblib import dump
from ch2.ch2_util import BasicPreprocessing, RecordizeDataframe

# Apply seaborn's default theme to every figure drawn later in the notebook.
sns.set_theme()

In [2]:
# Load the raw car listings; the MSRP column is the price used as the target below.
df = pd.read_csv(filepath_or_buffer='../data/card_price/data.csv')

# Preview the first five rows to sanity-check the columns and dtypes.
df.head(n=5)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [3]:
# Hold out 20% of the rows for testing; the fixed seed keeps the shuffled split reproducible.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [4]:
# Regression target: log(1 + MSRP), taken from each split before the column is removed.
y_train = np.log1p(df_train['MSRP'].to_numpy())
y_test = np.log1p(df_test['MSRP'].to_numpy())

# Drop the target from the feature frames so it cannot be used as an input.
# NOTE: this rebinds df_train/df_test, so the cell cannot be re-run without
# re-running the split cell first (the 'MSRP' column is gone afterwards).
df_train = df_train.drop(columns='MSRP')
df_test = df_test.drop(columns='MSRP')

In [5]:
# Columns handed to BasicPreprocessing. Names are lowercase snake_case, unlike the
# raw CSV headers — presumably BasicPreprocessing normalises column names first
# (TODO confirm in ch2.ch2_util).
features = [
    # appear as numeric columns in the data preview
    'year',
    'popularity',
    'city_mpg',
    'highway_mpg',
    'engine_cylinders',
    'engine_hp',
    'number_of_doors',
    # appear as string columns in the data preview
    'make',
    'transmission_type',
    'driven_wheels',
    'vehicle_size',
    'model',
    'market_category',
    'engine_fuel_type',
]


In [6]:
# Preprocessing pipeline: project-specific cleanup -> row records -> DictVectorizer.
preprocess = Pipeline([
    # Project helper configured with the feature list (see ch2.ch2_util for details).
    ('basic', BasicPreprocessing(features)),
    # Project helper; presumably turns the frame into per-row dicts for the vectorizer — TODO confirm.
    ('to_record', RecordizeDataframe()),
    # Turns the records into the feature matrix consumed by the model.
    ('vectorizer', DictVectorizer()),
])

In [7]:
# Fit the preprocessing pipeline on the training frame and encode it in the same call;
# the fitted `preprocess` object is reused later to transform the test frame.
X_train = preprocess.fit_transform(df_train)

In [8]:
# Ordinary least-squares regression on the encoded training rows; the target is log1p(MSRP).
# `fit` returns the estimator itself, so construction and fitting can be chained.
model = linear_model.LinearRegression().fit(X_train, y_train)

# In-sample predictions (log1p scale) for inspecting the training fit.
y_hat = model.predict(X_train)

In [9]:
# Encode the held-out rows with the preprocessing that was fitted on the training data
# (transform only — nothing is re-fitted here).
X_test = preprocess.transform(df_test)

# Predict with the model trained on the training split. The previous version created a
# new LinearRegression and fitted it on (X_test, y_test), which replaced the trained
# model with one that had only seen the hold-out rows and made these predictions
# in-sample rather than a genuine held-out check. Predictions are on the log1p(MSRP)
# scale, matching y_test.
y_hat = model.predict(X_test)

In [10]:
# Persist the fitted regressor and the fitted preprocessing pipeline with joblib.
# Both paths are relative to the notebook's working directory; the 'model/' directory
# is assumed to exist already.
dump(value=model, filename='model/car_price.joblib')
# NOTE(review): the filename is spelled 'proprocess' (not 'preprocess'); kept as-is
# because code that loads this artifact may depend on the exact path.
dump(value=preprocess, filename='model/proprocess.joblib')

['model/proprocess.joblib']