## **Importing Important Libraries**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error , mean_absolute_error , r2_score 
from sklearn.linear_model import LinearRegression , ridge_regression , Lasso 
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline

In [2]:
df = pd.read_csv(r"C:\Users\Lenovo\Downloads\quikr_car.csv")
df.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel


In [3]:
df.shape

(892, 6)

## **Data Info**

In [4]:
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        892 non-null    object
 1   company     892 non-null    object
 2   year        892 non-null    object
 3   Price       892 non-null    object
 4   kms_driven  840 non-null    object
 5   fuel_type   837 non-null    object
dtypes: object(6)
memory usage: 41.9+ KB


## Problems in Data:-

* Year feature has many non-year values.
* Year feture has object datatype but it should be int datatype.
* Price column has object datatype but it should be int datatype.
* Price column has many object values in it so we have to remove it.
* kms_driven column has kms and ,'s in it so we have to remove them.
* kms_driven column has object datatype so we should convert it to int.
* kms_driven column has null values .
* fuel_type column has null values . 
* keep first 3 words of the name . 

## Making backup data.

In [5]:
copy_df = df.copy()

# **Data Preprocessing**

## Cleaning Year Column

In [6]:
df['year'].head() 

0    2007
1    2006
2    2018
3    2014
4    2014
Name: year, dtype: object

In [7]:
df = df[df['year'].str.isnumeric()]
df.shape

(842, 6)

In [8]:
df['year'] = df['year'].astype(int)
df.info() 

<class 'pandas.core.frame.DataFrame'>
Index: 842 entries, 0 to 891
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        842 non-null    object
 1   company     842 non-null    object
 2   year        842 non-null    int32 
 3   Price       842 non-null    object
 4   kms_driven  840 non-null    object
 5   fuel_type   837 non-null    object
dtypes: int32(1), object(5)
memory usage: 42.8+ KB


## Cleaning Price Column

In [9]:
df['Price'] 

0             80,000
1           4,25,000
2      Ask For Price
3           3,25,000
4           5,75,000
           ...      
886         3,00,000
888         2,60,000
889         3,90,000
890         1,80,000
891         1,60,000
Name: Price, Length: 842, dtype: object

In [10]:
df = df[df['Price'] != 'Ask For Price']
df.head() 

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel
6,Ford Figo,Ford,2012,175000,"41,000 kms",Diesel


In [11]:
df['Price'] = df['Price'].str.replace(',','').astype(int)
df.head() 

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel
6,Ford Figo,Ford,2012,175000,"41,000 kms",Diesel


In [12]:
df.info() 

<class 'pandas.core.frame.DataFrame'>
Index: 819 entries, 0 to 891
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        819 non-null    object
 1   company     819 non-null    object
 2   year        819 non-null    int32 
 3   Price       819 non-null    int32 
 4   kms_driven  819 non-null    object
 5   fuel_type   816 non-null    object
dtypes: int32(2), object(4)
memory usage: 38.4+ KB


## Cleaning kms_driven Column

In [13]:
df['kms_driven'] = df['kms_driven'].str.replace('kms', '')
df['kms_driven'] = df['kms_driven'].str.replace(',', '')
df = df[df['kms_driven']!='Petrol']
df['kms_driven'] = df['kms_driven'].astype(int)
df.head() 

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40,Diesel
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,28000,Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,36000,Diesel
6,Ford Figo,Ford,2012,175000,41000,Diesel


In [14]:
df.info() 

<class 'pandas.core.frame.DataFrame'>
Index: 817 entries, 0 to 889
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        817 non-null    object
 1   company     817 non-null    object
 2   year        817 non-null    int32 
 3   Price       817 non-null    int32 
 4   kms_driven  817 non-null    int32 
 5   fuel_type   816 non-null    object
dtypes: int32(3), object(3)
memory usage: 35.1+ KB


## Cleaning fuel_type Column

In [15]:
df['fuel_type'].value_counts() 

fuel_type
Petrol    428
Diesel    386
LPG         2
Name: count, dtype: int64

In [16]:
df = df[~df['fuel_type'].isnull()]
df.head() 

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40,Diesel
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,28000,Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,36000,Diesel
6,Ford Figo,Ford,2012,175000,41000,Diesel


In [17]:
df.info() 

<class 'pandas.core.frame.DataFrame'>
Index: 816 entries, 0 to 889
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        816 non-null    object
 1   company     816 non-null    object
 2   year        816 non-null    int32 
 3   Price       816 non-null    int32 
 4   kms_driven  816 non-null    int32 
 5   fuel_type   816 non-null    object
dtypes: int32(3), object(3)
memory usage: 35.1+ KB


## Keeping first 3 words of the names

In [18]:
df['name'].value_counts() 

name
Honda City                                 13
Honda Amaze                                11
Maruti Suzuki Dzire                        10
Mahindra Scorpio S10                        9
Maruti Suzuki Swift Dzire VXi 1.2 BS IV     7
                                           ..
BMW 5 Series 520d Sedan                     1
Hyundai Verna 1.6 EX VTVT                   1
Honda Amaze 1.5 E i DTEC                    1
Mahindra XUV500 W8 AWD 2013                 1
Tata Zest XM Diesel                         1
Name: count, Length: 463, dtype: int64

In [19]:
df['name'] = df['name'].str.split().str[:3].apply(' '.join)
df.head() 

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
3,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
4,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
6,Ford Figo,Ford,2012,175000,41000,Diesel


In [20]:
df.info() 

<class 'pandas.core.frame.DataFrame'>
Index: 816 entries, 0 to 889
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        816 non-null    object
 1   company     816 non-null    object
 2   year        816 non-null    int32 
 3   Price       816 non-null    int32 
 4   kms_driven  816 non-null    int32 
 5   fuel_type   816 non-null    object
dtypes: int32(3), object(3)
memory usage: 35.1+ KB


## Removing Outliers from the Data.

In [21]:
df.describe() 

Unnamed: 0,year,Price,kms_driven
count,816.0,816.0,816.0
mean,2012.444853,411717.6,46275.531863
std,4.002992,475184.4,34297.428044
min,1995.0,30000.0,0.0
25%,2010.0,175000.0,27000.0
50%,2013.0,299999.0,41000.0
75%,2015.0,491250.0,56818.5
max,2019.0,8500003.0,400000.0


In [22]:
df = df[df['Price']<6000000].reset_index(drop=True)

In [23]:
df.head() 

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
4,Ford Figo,Ford,2012,175000,41000,Diesel


In [24]:
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 815 entries, 0 to 814
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        815 non-null    object
 1   company     815 non-null    object
 2   year        815 non-null    int32 
 3   Price       815 non-null    int32 
 4   kms_driven  815 non-null    int32 
 5   fuel_type   815 non-null    object
dtypes: int32(3), object(3)
memory usage: 28.8+ KB


## Now, I have have cleaned the data.

## Exporting the cleaned data in the form of csv. 

In [25]:
# df.to_csv('CAR.csv', index=False)

## **Importing the Cleaned data**

In [26]:
Car = pd.read_csv(r"D:\PYTHON1\CAR.csv")
Car.head()  

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
4,Ford Figo,Ford,2012,175000,41000,Diesel


In [27]:
Car.drop('name', axis=1, inplace=True)
Car.head() 

Unnamed: 0,company,year,Price,kms_driven,fuel_type
0,Hyundai,2007,80000,45000,Petrol
1,Mahindra,2006,425000,40,Diesel
2,Hyundai,2014,325000,28000,Petrol
3,Ford,2014,575000,36000,Diesel
4,Ford,2012,175000,41000,Diesel


## Spilitting it into x (independent) and y(dependent) data.

In [28]:
x = Car.drop('Price',axis=1)
y = Car['Price']

print(x.shape)
print(y.shape)

(815, 4)
(815,)


## Splitting Data into Training and Test data. 

In [29]:
x_train , x_test , y_train , y_test = train_test_split(x,y, test_size=0.2 , random_state=42)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape) 

(652, 4)
(163, 4)
(652,)
(163,)


## **Data Preprocessing**

## Encoding the Data

In [30]:
ohe = OneHotEncoder()
ohe.fit(x[['company','fuel_type']]) 

## Transforming the Columns

In [31]:
column_trans = make_column_transformer(
    (OneHotEncoder(categories=ohe.categories_), ['company', 'fuel_type']),
    remainder='passthrough'
)

In [32]:
column_trans

## Building Linear Refression Model throgh Pipe line.

In [33]:
lr = LinearRegression()

In [34]:
pipe_lr = make_pipeline(column_trans, lr)

In [35]:
pipe_lr.fit(x_train,y_train)

## Generating Predictions

In [36]:
ypred =  pipe_lr.predict(x_test)
ypred

array([  58784.37556368,  177022.73599247,  539531.94058945,
        472423.50720533,  510948.03654671,  482636.33814827,
        180137.65514921,  382453.17879124,  563987.03437325,
        265009.20139872,  329414.1809647 ,  437328.01820576,
        616844.47292666,  110066.83982316, 1389994.17958115,
        419481.12772263,  447286.47589482,  509538.05580088,
        884420.70168877,  358977.60973208,  479803.20786034,
        -35013.27138001,  563415.08087025,  669229.85490599,
        183314.22452546,  392751.26247666,  452907.94628344,
        810369.89143284,  394004.5184693 ,  485459.12889853,
        617616.61015571,  272094.27590164,   -4255.06452937,
        -29865.68985302,  393432.5649663 ,  311541.43106231,
        410902.62224397,  438494.1354422 ,  435137.70727757,
        516482.36311476,  293521.30156369,  356147.76390843,
         -7265.76738976,  633745.55024412,  304833.37062685,
        336563.59975219,  553119.91781627,  -43993.44073207,
        180241.88005964,

## Model Evaluation Function.

In [37]:
def model_eval2(y_test , ypred_test):
    print('MSE: ',mean_squared_error(y_test, ypred_test))
    print('MAE: ',mean_absolute_error(y_test, ypred_test))
    print('RMSE: ',np.sqrt(mean_squared_error(y_test, ypred_test)))
    print('R2_Score: ',r2_score(y_test, ypred_test))

In [38]:
def model_score(model):
    print('Training Score', model.score(x_train , y_train))
    print('Testing Score', model.score(x_test , y_test))  

## Evaluating the Model

In [39]:
model_eval2(y_test,ypred)

MSE:  107861671265.78772
MAE:  195783.20233273075
RMSE:  328423.00660244207
R2_Score:  0.4703341934776667


In [40]:
model_score(pipe_lr)

Training Score 0.6192606373022975
Testing Score 0.4703341934776667


## Generating the best value for random state hyperparameter.

In [41]:
score=[]
for i in range(25):
    x_train , x_test,y_train,y_test=train_test_split(x,y,test_size=0.25, random_state=i)
    lr=LinearRegression()
    pipe=make_pipeline(column_trans, lr)
    pipe.fit(x_train,y_train)
    y_pred = pipe_lr.predict(x_test)
    # print(r2_score(y_test,y_pred), i)
    score.append(r2_score(y_test,y_pred))

In [42]:
np.argmax(score)

4

In [43]:
score[np.argmax(score)]

0.7184954181344974

In [44]:
x_train , x_test,y_train,y_test=train_test_split(x,y,test_size=0.25, random_state=np.argmax(score))
lr=LinearRegression()
pipe=make_pipeline(column_trans, lr)
pipe.fit(x_train,y_train)
y_pred = pipe_lr.predict(x_test)
r2_score(y_test,y_pred)

0.7184954181344974

## Saving and Predicting with a Linear Regression Car Model

In [45]:
import pickle

In [46]:
pickle.dump(pipe_lr,open('LinearRegressionCarModel.pkl','wb'))

In [47]:
pipe_lr.predict(pd.DataFrame([['Maruti',2019,100,'Petrol']],columns=['company','year','kms_driven','fuel_type']))

array([456338.28766944])

## Predicting Outcome

In [48]:
new_df = pd.DataFrame({
    'company' : 'Maruti',
    'year':2019,                                         
    'kms_driven':100, 
    'fuel_type':'Petrol'},
    index=[0])

In [49]:
p = pipe_lr.predict(new_df)
print(p)

[456338.28766944]


## **Save Model Using Joblib**

In [50]:
import joblib

In [51]:
joblib.dump(pipe_lr, 'model_joblib_car')

['model_joblib_car']

In [52]:
model = joblib.load('model_joblib_car')

In [53]:
model.predict(new_df)

array([456338.28766944])

# **GUI**

In [54]:
from tkinter import *
import joblib

In [55]:
import tkinter as tk
import joblib
import pandas as pd

def show_entry_fields():
    try:
        # Access entry fields through the entries dictionary
        p1 = str(entries["e1"].get())
        p2 = int(entries["e2"].get())
        p3 = int(entries["e3"].get())
        p4 = str(entries["e4"].get())

        # Convert input values to appropriate data types
        p2 = float(p2)
        p3 = float(p3)

        # Load the model
        model = joblib.load('model_joblib_car')

        # Format input as a DataFrame
        input_data = pd.DataFrame({'company': [p1], 'year': [p2], 'kms_driven': [p3], 'fuel_type': [p4]})

        # Make prediction
        result = model.predict(input_data)

        # Format the result to display currency symbol and two decimal places
        formatted_result = f"₹ {result[0]:,.2f}"

        # Update the result label with the formatted result
        result_label.config(text=f"Predicted Price: {formatted_result}")
    except Exception as e:
        result_label.config(text=f"An error occurred: {str(e)}")

master = tk.Tk()
master.title("Car Price Predictor")
master.geometry("400x250")  # Set the size of the GUI window

# Create labels and entry fields with adjusted font size and weight
title_label = tk.Label(master, text="Car Price Prediction System", bg="Red", fg="white", font=("Helvetica", 14, "bold"))
title_label.grid(row=0, columnspan=2)

# Create dictionary to store entry fields
entries = {}

labels = ['company','year','kms_driven','fuel_type']
for i, label_text in enumerate(labels):
    tk.Label(master, text=f"Enter Your {label_text}", font=("Helvetica", 12)).grid(row=i + 1)
    entry = tk.Entry(master, font=("Helvetica", 12))
    entry.grid(row=i + 1, column=1)
    # Store reference to entry field in entries dictionary
    entries[f"e{i + 1}"] = entry

# Create predict button
tk.Button(master, text='Predict', command=show_entry_fields, font=("Helvetica", 12)).grid(row=len(labels) + 1, columnspan=2)

# Create result label
result_label = tk.Label(master, text="", font=("Helvetica", 12))
result_label.grid(row=len(labels) + 2, columnspan=2)

master.mainloop()
