In [52]:
# Import the required libraries
import pandas as pd
from pathlib import Path
import hvplot.pandas
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics, model_selection
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error,r2_score, mean_absolute_error
import pickle
import warnings
warnings.filterwarnings('ignore')

In [53]:
car_data= pd.read_csv(Path("./Resources/car_prices.csv"))

In [54]:
car_data.dropna(subset=['transmission'])
car_data = car_data.dropna()
car_data['state'] = car_data['state'].str.upper()

In [55]:
# Check for Null Values
car_data.isnull().mean()*100

year            0.0
make            0.0
model           0.0
trim            0.0
body            0.0
transmission    0.0
vin             0.0
state           0.0
condition       0.0
odometer        0.0
color           0.0
interior        0.0
seller          0.0
mmr             0.0
sellingprice    0.0
saledate        0.0
dtype: float64

In [56]:
#converts the 'saledate' column to a datetime format and localizes it to UTC
car_data['saledate'] = pd.to_datetime(car_data['saledate'], utc=True)
car_data['saledate'] = car_data['saledate'].dt.tz_convert('UTC')

car_data['year'] = pd.to_datetime(car_data['year'], format='%Y')

# Create a new variable for vehicle age
car_data['vehicleage'] = pd.to_datetime(car_data['saledate']).dt.year - car_data['year'].dt.year


In [57]:
make_price= car_data.make.value_counts()[:15]
make_price.index
make_df= car_data[car_data['make'].isin(make_price.index)]

In [58]:
make_df=make_df.drop(columns=['vin','saledate','seller','trim', 'interior', 'body', 'transmission'], axis=1)
make_df.head(3)

Unnamed: 0,year,make,model,state,condition,odometer,color,mmr,sellingprice,vehicleage
0,2015-01-01,Kia,Sorento,CA,5.0,16639.0,white,20500.0,21500.0,-1
1,2015-01-01,Kia,Sorento,CA,5.0,9393.0,white,20800.0,21500.0,-1
2,2014-01-01,BMW,3 Series,CA,45.0,1331.0,gray,31900.0,30000.0,1


In [59]:
encoder = LabelEncoder()

In [84]:
# object_columns = [i for i in make_df.columns if make_df[i].dtype == "object" ]

In [119]:
for  i in object_columns:
    make_df[i] = encoder.fit_transform(make_df[i])
    with open("encoder_"+str(i)+".pkl", 'wb') as encoder_file:
        pickle.dump(encoder, encoder_file)

In [120]:
make_df.columns

Index(['year', 'make', 'model', 'state', 'condition', 'odometer', 'color',
       'mmr', 'sellingprice', 'vehicleage'],
      dtype='object')

In [121]:
X= make_df.drop(columns='sellingprice')
y= make_df['sellingprice']

In [122]:
# X= make_df.drop(columns="sellingprice")
# y= make_df["sellingprice"]

In [123]:
scaler = MinMaxScaler()

In [124]:
for i in X.columns:
    X[i] = scaler.fit_transform(np.array(X[i]).reshape(-1,1))
    with open(str(i)+"_scaler.pkl",'wb') as scaler_file:
        pickle.dump(scaler, scaler_file) 

In [125]:
xtrain,xtest,ytrain,ytest = train_test_split(X,y,test_size=0.15,random_state=1)
xtrain.shape, ytrain.shape

((336220, 9), (336220,))

In [126]:
lasso_model = Lasso(alpha=0.001)
lasso_model.fit(xtrain,ytrain)

In [127]:
pred=lasso_model.predict(xtest)

In [128]:
r2_score(ytest,pred)

0.9662719300090209

In [129]:
with open("lasso_model.pkl", "wb") as f:
    pickle.dump(lasso_model, f)

In [130]:
model = pickle.load(open('lasso_model.pkl', 'rb'))

In [131]:
model

In [132]:
year = 2012
make = "Honda"
model = "Civic"
odometer = 123450
color = "black"
state = "CA"
condition= 34.0
mmr=16000
vehicleage= 2

In [133]:
features = {"year":year, "make":make, "model" :model, "odometer":odometer, "color":color, "state":state,"mmr":mmr, "condition":condition, "vehicleage":vehicleage}

In [134]:
data = {}
for i in features.keys():
    if type(features[i]) == str:
        with open("encoder_"+str(i)+".pkl", 'rb') as encoder:
            label_encoder = pickle.load(encoder)
        print(label_encoder,i)
        data[str(i)] = label_encoder.transform(np.array(features[i]).reshape(1,-1))

    elif type(features[i]) != str:
              data[str(i)] = features[i]
    with open(str(i)+"_scaler.pkl", 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)
    data[str(i)] = scaler.transform(np.array(data[i]).reshape(1,-1))



LabelEncoder() make
LabelEncoder() model
LabelEncoder() color
LabelEncoder() state


In [137]:
processed_data = {key: value[0] for key, value in data.items()}
processed_df = pd.DataFrame(processed_data)

prediction = model.predict(processed_df.values())

# output = (prediction[0])
# data

AttributeError: 'str' object has no attribute 'predict'

In [49]:
# x_vars= ["year","seller", "color", "interior"]
# y_vars= ["sellingprice"]
# g= sns.PairGrid(car_data, hue="sellingprice", x_vars=x_vars, y_vars=y_vars)
# g.map_diag(sns.histplot, color=".3")
# g.map_offdiag(sns.scatterplot)
# g.add_legend()
