In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Load the car price dataset from a CSV file located relative to the working directory.
df = pd.read_csv('car_price_prediction.csv')

In [3]:
# Preview the first five rows (head() returns five rows by default).
df.head()

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,45654403,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
1,44731507,16621,1018,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3.0,192000 km,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8
2,45774419,8467,-,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2
3,45769185,3607,862,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966 km,4.0,Automatic,4x4,04-May,Left wheel,White,0
4,45809263,11726,446,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901 km,4.0,Automatic,Front,04-May,Left wheel,Silver,4


In [4]:
# Report the frame's dimensions as (number of rows, number of columns).
df.shape

(19237, 18)

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns;
# columns still stored as text are not included in this default view.
df.describe()

Unnamed: 0,ID,Price,Prod. year,Cylinders,Airbags
count,19237.0,19237.0,19237.0,19237.0,19237.0
mean,45576540.0,18555.93,2010.912824,4.582991,6.582627
std,936591.4,190581.3,5.668673,1.199933,4.320168
min,20746880.0,1.0,1939.0,1.0,0.0
25%,45698370.0,5331.0,2009.0,4.0,4.0
50%,45772310.0,13172.0,2012.0,4.0,6.0
75%,45802040.0,22075.0,2015.0,4.0,12.0
max,45816650.0,26307500.0,2020.0,16.0,16.0


In [6]:
# Count rows that exactly repeat an earlier row across every column.
# NOTE(review): no later visible cell removes these rows -- confirm whether the
# duplicates should be dropped before the train/test split.
df.duplicated().sum()

313

In [7]:
# Count NaN/None entries per column.
# Placeholder strings such as the '-' used in 'Levy' are ordinary values here and
# are not counted as missing; they are converted to NaN in the cleaning step.
df.isna().sum()

ID                  0
Price               0
Levy                0
Manufacturer        0
Model               0
Prod. year          0
Category            0
Leather interior    0
Fuel type           0
Engine volume       0
Mileage             0
Cylinders           0
Gear box type       0
Drive wheels        0
Doors               0
Wheel               0
Color               0
Airbags             0
dtype: int64

In [8]:
# Convert the text-encoded numeric columns to floats.

# 'Levy' uses '-' where no value is given, so map it to NaN before casting.
df['Levy'] = df['Levy'].replace('-', np.nan).astype(float)

# Strip the ' Turbo' suffix so the remaining text parses as a number.
# regex=False makes the literal-substring intent explicit.
df['Engine volume'] = (
    df['Engine volume']
    .str.replace(' Turbo', '', regex=False)
    .astype(float)
)

# Strip the ' km' suffix and any ',' characters before casting.
df['Mileage'] = (
    df['Mileage']
    .str.replace(' km', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

# Fill missing levies with the column mean. The result is assigned back rather
# than calling fillna(inplace=True) on df['Levy']: that chained in-place form
# operates on a temporary object, warns on pandas 2.x, and leaves df unchanged
# under Copy-on-Write, which would let NaNs reach the model.
df['Levy'] = df['Levy'].fillna(df['Levy'].mean())

In [9]:
# Integer-encode the categorical columns, keeping one fitted encoder per column
# so the same mapping can be reused later.
categorical_cols = [
    'Manufacturer', 'Model', 'Category', 'Leather interior', 'Fuel type',
    'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color',
]

# Fit every encoder first (the frame is still untouched at this point) ...
label_encoders = {col: LabelEncoder().fit(df[col]) for col in categorical_cols}

# ... then overwrite each column with its integer codes.
for col, le in label_encoders.items():
    df[col] = le.transform(df[col])

In [10]:
# Target is the price; predictors are every remaining column except the ID.
y = df['Price']
X = df.drop(columns=['ID', 'Price'])

# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Peek at the first five scaled training rows and their targets.
X_train[:5], y_train[:5]

(array([[-0.34267765,  1.40502489, -1.039444  ,  0.36470698,  0.97813632,
          0.61207151, -0.78516633,  0.21809968, -0.02727109, -0.48509661,
         -0.59737086,  0.15097284,  0.15067123, -0.28685234,  1.15912971,
          1.25436131],
        [ 1.02292353, -1.41198825, -0.98837546, -0.69970667, -0.81498543,
          0.61207151, -1.33742762, -0.35007746, -0.0302802 , -0.48509661,
         -0.59737086,  0.15097284,  0.15067123, -0.28685234,  0.78552311,
         -0.59468654],
        [ 1.34056834,  0.16553911, -0.63575937, -0.16749984,  0.97813632,
          0.61207151, -1.33742762,  1.35445397, -0.0279962 ,  1.18870424,
         -0.59737086, -1.6144687 ,  0.15067123, -0.28685234, -1.26931319,
          1.25436131],
        [-0.71462072, -1.41198825,  0.55340803,  0.18730471, -0.81498543,
          0.61207151, -1.33742762, -0.35007746, -0.02741186, -0.48509661,
         -0.59737086,  0.15097284,  0.15067123, -0.28685234, -0.14849339,
         -0.59468654],
        [ 0.00439173

In [11]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the scaled training features (fit() returns the
# estimator itself, so construction and training can be chained).
linear_model = LinearRegression().fit(X_train, y_train)

# Generate price predictions for the held-out rows.
y_pred_linear = linear_model.predict(X_test)




In [12]:
import pickle

# Persist the fitted model together with the preprocessing objects it depends on
# (the scaler and the per-column label encoders), each to its own pickle file in
# the working directory.
# The test-set predictions are already computed right after training, so they are
# not recomputed here.
# NOTE: pickle files can execute arbitrary code when loaded; only load these
# files from a trusted location.
artifacts = {
    'linear_model.pkl': linear_model,
    'scaler.pkl': scaler,
    'label_encoders.pkl': label_encoders,
}

for filename, obj in artifacts.items():
    with open(filename, 'wb') as f:
        pickle.dump(obj, f)