In [74]:
# Import the required libraries
import pandas as pd
from pathlib import Path
import hvplot.pandas
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics, model_selection
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler, scale
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error,r2_score, mean_absolute_error
import pickle

import warnings
warnings.filterwarnings('ignore')

In [75]:
car_data= pd.read_csv(Path("./Resources/car_prices.csv"))
car_data.head(3)

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate
0,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg566472,ca,5.0,16639.0,white,black,kia motors america inc,20500.0,21500.0,Tue Dec 16 2014 12:30:00 GMT-0800 (PST)
1,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg561319,ca,5.0,9393.0,white,beige,kia motors america inc,20800.0,21500.0,Tue Dec 16 2014 12:30:00 GMT-0800 (PST)
2,2014,BMW,3 Series,328i SULEV,Sedan,automatic,wba3c1c51ek116351,ca,45.0,1331.0,gray,black,financial services remarketing (lease),31900.0,30000.0,Thu Jan 15 2015 04:30:00 GMT-0800 (PST)


In [76]:
car_data.describe(exclude=np.number)

Unnamed: 0,make,model,trim,body,transmission,vin,state,color,interior,seller,saledate
count,548536,548438,548186,545642,493485,558833,558837,558088,558088,558837,558825
unique,96,973,1963,87,4,550297,64,46,17,14263,3766
top,Ford,Altima,Base,Sedan,automatic,automatic,fl,black,black,nissan-infiniti lt,Tue Feb 10 2015 01:30:00 GMT-0800 (PST)
freq,93554,19349,55817,199437,475915,22,82945,110970,244329,19693,5334


In [77]:
car_data=car_data.drop(columns=['vin','saledate','seller'], axis=1)

In [8]:
car_data.dropna(subset=['transmission'])
car_data = car_data.dropna()

In [9]:
car_data.isnull().mean()*100

year            0.0
make            0.0
model           0.0
trim            0.0
body            0.0
transmission    0.0
state           0.0
condition       0.0
odometer        0.0
color           0.0
interior        0.0
mmr             0.0
sellingprice    0.0
dtype: float64

In [10]:
car_data['state'] = car_data['state'].str.upper()

In [11]:
car_data.head(2)

Unnamed: 0,year,make,model,trim,body,transmission,state,condition,odometer,color,interior,mmr,sellingprice
0,2015,Kia,Sorento,LX,SUV,automatic,CA,5.0,16639.0,white,black,20500.0,21500.0
1,2015,Kia,Sorento,LX,SUV,automatic,CA,5.0,9393.0,white,beige,20800.0,21500.0


In [12]:
car_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 472325 entries, 0 to 558836
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   year          472325 non-null  int64  
 1   make          472325 non-null  object 
 2   model         472325 non-null  object 
 3   trim          472325 non-null  object 
 4   body          472325 non-null  object 
 5   transmission  472325 non-null  object 
 6   state         472325 non-null  object 
 7   condition     472325 non-null  float64
 8   odometer      472325 non-null  float64
 9   color         472325 non-null  object 
 10  interior      472325 non-null  object 
 11  mmr           472325 non-null  float64
 12  sellingprice  472325 non-null  float64
dtypes: float64(4), int64(1), object(8)
memory usage: 50.4+ MB


In [13]:
make_price= car_data.make.value_counts()[:15]
make_price.index

Index(['Ford', 'Chevrolet', 'Nissan', 'Toyota', 'Dodge', 'Honda', 'Hyundai',
       'BMW', 'Kia', 'Chrysler', 'Infiniti', 'Mercedes-Benz', 'Jeep',
       'Volkswagen', 'Lexus'],
      dtype='object', name='make')

In [14]:
make_df= car_data[car_data['make'].isin(make_price.index)]

In [15]:
make_df.value_counts()

year  make        model            trim              body         transmission  state  condition  odometer  color   interior  mmr      sellingprice
1990  Chevrolet   C/K 1500 Series  454SS             Regular Cab  automatic     FL     3.0        101927.0  black   red       7225.0   8000.0          1
2013  Dodge       Avenger          SE                Sedan        automatic     SC     26.0       57114.0   silver  black     9200.0   9000.0          1
                                                                                OH     49.0       48945.0   blue    tan       9500.0   9900.0          1
                                                                                       48.0       26102.0   blue    black     10850.0  11000.0         1
                                                                                       45.0       15609.0   blue    black     11650.0  11600.0         1
                                                                                       

In [16]:
make_df =make_df.drop(columns=['trim', 'interior', 'body', 'transmission'], axis=1)


In [73]:
make_df['condition'].describe()

count    395553.000000
mean         30.948897
std          13.277015
min           1.000000
25%          24.000000
50%          35.000000
75%          42.000000
max          49.000000
Name: condition, dtype: float64

In [17]:
encoder = LabelEncoder()

In [18]:
object_columns = [i for i in make_df.columns if make_df[i].dtype == "object" ]

In [19]:
for  i in object_columns:
    make_df[i] = encoder.fit_transform(make_df[i])
    with open("encoder_"+str(i)+".pkl", 'wb') as encoder_file:
        pickle.dump(encoder, encoder_file)

In [20]:
make_df.head(3)

Unnamed: 0,year,make,model,state,condition,odometer,color,mmr,sellingprice
0,2015,9,354,2,5.0,16639.0,17,20500.0,21500.0
1,2015,9,354,2,5.0,9393.0,17,20800.0,21500.0
2,2014,0,5,2,45.0,1331.0,7,31900.0,30000.0


In [21]:
scaler = MinMaxScaler()

In [22]:
X= make_df.iloc[:,:-1]
y= make_df.iloc[:,-1]

In [23]:
for i in X.columns:
    X[i] = scaler.fit_transform(np.array(X[i]).reshape(-1,1))
    with open(str(i)+"_scaler.pkl",'wb') as scaler_file:
        pickle.dump(scaler, scaler_file) 

In [24]:
X.head()

Unnamed: 0,year,make,model,state,condition,odometer,color,mmr
0,1.0,0.642857,0.842857,0.060606,0.083333,0.016638,0.894737,0.116352
1,1.0,0.642857,0.842857,0.060606,0.083333,0.009392,0.894737,0.118057
2,0.96,0.0,0.011905,0.060606,0.916667,0.00133,0.368421,0.181134
4,0.96,0.0,0.052381,0.060606,0.875,0.00264,0.368421,0.374911
5,1.0,0.857143,0.07619,0.060606,0.0,0.005553,0.368421,0.087086


In [25]:
xtrain,xtest,ytrain,ytest = train_test_split(X,y,test_size=0.15,random_state=1)

In [26]:
xtrain.shape, ytrain.shape

((336220, 8), (336220,))

In [27]:
lasso_model = Lasso(alpha=0.001)
lasso_model.fit(xtrain,ytrain)

In [28]:
pred=lasso_model.predict(xtest)

In [29]:
r2_score(ytest,pred)

0.9662649011289366

In [30]:
with open("lasso_model.pkl", "wb") as f:
    pickle.dump(lasso_model, f)

In [47]:
model = pickle.load(open('lasso_model.pkl', 'rb'))

In [48]:
model

In [38]:
year = 2012
make = "Honda"
model = "Civic"
odometer = 123450
color = "black"
state = "CA"
condition= 34.0
mmr=16000


In [39]:
features = {"year":year, "make":make, "model" :model, "odometer":odometer, "color":color, "state":state,"mmr":mmr, "condition":condition}

In [40]:
data = {}
for i in features.keys():
    if type(features[i]) == str:
        with open("encoder_"+str(i)+".pkl", 'rb') as encoder:
            label_encoder = pickle.load(encoder)
        print(label_encoder,i)
        data[str(i)] = label_encoder.transform(np.array(features[i]).reshape(1,-1))

    elif type(features[i]) != str:
              data[str(i)] = features[i]
    with open(str(i)+"_scaler.pkl", 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)
    data[str(i)] = scaler.transform(np.array(data[i]).reshape(1,-1))

# processed_data = {key: value[0] for key, value in data.items()}
# processed_df = pd.DataFrame(processed_data)

# prediction = model.predict(processed_df.values())

# output = (prediction[0])
data

LabelEncoder() make
LabelEncoder() model
LabelEncoder() color
LabelEncoder() state


{'year': array([[0.88]]),
 'make': array([[0.35714286]]),
 'model': array([[0.1952381]]),
 'odometer': array([[0.12344925]]),
 'color': array([[0.05263158]]),
 'state': array([[0.06060606]]),
 'mmr': array([[0.09077994]]),
 'condition': array([[0.6875]])}

In [41]:
processed_data = {key: value[0] for key, value in data.items()}
processed_df = pd.DataFrame(processed_data)
processed_df

Unnamed: 0,year,make,model,odometer,color,state,mmr,condition
0,0.88,0.357143,0.195238,0.123449,0.052632,0.060606,0.09078,0.6875


In [43]:
model= pickle.load(open('lasso_model.pkl', 'rb'))

In [45]:
prediction = model.predict(processed_df.values)

# output = (prediction[0])
prediction

array([117847.21868019])

In [46]:
output = round(prediction[0],3)
output

117847.219

In [None]:
# processed_data = {key:value[0] for key, value in data.items()}
# processed_df = pd.DataFrame(processed_data)

In [None]:
# model=pickle.load(open('lasso_model.pkl', 'rb'))

In [None]:
# model.predict(processed_df.values)

In [None]:
xtrain,xtest,ytrain,ytest = train_test_split(X,y,test_size=0.15,random_state=1)

In [None]:
xtrain.shape, ytrain.shape

In [None]:
# car_data['condition', 'sellingprice'] corr

In [None]:
lasso_model = Lasso()
lasso_model.fit(xtrain,ytrain)

In [None]:
pred=lasso_model.predict(xtest)

In [None]:
msre = round(np.sqrt(mean_squared_error(ytest,pred)),2)
msre

In [None]:
lasso_model.coef_

In [None]:
lasso_model.intercept_

In [None]:
r2_score(ytest,pred)

In [None]:
opt_lass_two = Lasso(alpha=0.001)
opt_lass_two.fit(xtrain, ytrain)
opt_pred=opt_lass_two.predict(xtest)


In [None]:
opt_lass_two.intercept_

In [None]:
r2_score(ytest, opt_pred)

In [None]:
coef_dict={}
for coef, feat in zip(lasso_model.coef_,X.columns):
    coef_dict[feat] = coef
coef_dict

In [None]:
coef_names = make_df.drop('sellingprice',axis=1).columns

print(coef_names)

In [None]:
lasso = Lasso(alpha=0.001)
lasso_coef = lasso.fit(X, y).coef_

plt.plot(range(len(coef_names)), lasso_coef)
plt.xticks(range(len(coef_names)), coef_names, rotation=90)
plt.ylabel("Coefficients")
plt.show()

In [None]:
plt.bar(coef_names, lasso_coef)
plt.xticks(rotation=90)
plt.grid()
plt.title("Lasso Feature Selection")
plt.xlabel("Features")
plt.ylabel("Importance")
plt.show()