In [196]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [197]:
# Scikit-learn modules for model building and evaluation
from sklearn.model_selection import train_test_split , cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [198]:
# Streamlit for creating interactive web applications
import streamlit as st

In [199]:
# Read the dataset
# "car_details.csv" is given as a relative path, so it is resolved against the
# kernel's current working directory (normally the folder containing this notebook).
df = pd.read_csv("car_details.csv")

# Display the first few rows to verify the dataset was loaded correctly
# (bare expression -> rich table display, consistent with the later df.head() cells)
df.head()

                           name  year  selling_price  km_driven    fuel  \
0        Maruti Swift Dzire VDI  2014         450000     145500  Diesel   
1  Skoda Rapid 1.5 TDI Ambition  2014         370000     120000  Diesel   
2      Honda City 2017-2020 EXi  2006         158000     140000  Petrol   
3     Hyundai i20 Sportz Diesel  2010         225000     127000  Diesel   
4        Maruti Swift VXI BSIII  2007         130000     120000  Petrol   

  seller_type transmission         owner     mileage   engine   max_power  \
0  Individual       Manual   First Owner   23.4 kmpl  1248 CC      74 bhp   
1  Individual       Manual  Second Owner  21.14 kmpl  1498 CC  103.52 bhp   
2  Individual       Manual   Third Owner   17.7 kmpl  1497 CC      78 bhp   
3  Individual       Manual   First Owner   23.0 kmpl  1396 CC      90 bhp   
4  Individual       Manual   First Owner   16.1 kmpl  1298 CC    88.2 bhp   

                     torque  seats  
0            190Nm@ 2000rpm    5.0  
1       250N

In [200]:
# Report the dimensions of the freshly loaded dataset
row_count, column_count = df.shape
print(f"Number of Row : {row_count}\nNumber of Columns : {column_count}")

Number of Row : 8128
Number of Columns : 13


In [201]:
# Summarize each column's dtype and non-null count to spot missing values and text-typed numerics
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           8128 non-null   object 
 1   year           8128 non-null   int64  
 2   selling_price  8128 non-null   int64  
 3   km_driven      8128 non-null   int64  
 4   fuel           8128 non-null   object 
 5   seller_type    8128 non-null   object 
 6   transmission   8128 non-null   object 
 7   owner          8128 non-null   object 
 8   mileage        7907 non-null   object 
 9   engine         7907 non-null   object 
 10  max_power      7913 non-null   object 
 11  torque         7906 non-null   object 
 12  seats          7907 non-null   float64
dtypes: float64(1), int64(3), object(9)
memory usage: 825.6+ KB


In [202]:
# Count the missing (NaN) values in each column
df.isna().sum()

name               0
year               0
selling_price      0
km_driven          0
fuel               0
seller_type        0
transmission       0
owner              0
mileage          221
engine           221
max_power        215
torque           222
seats            221
dtype: int64

In [203]:
# Summary statistics for the numeric columns
df.describe()

Unnamed: 0,year,selling_price,km_driven,seats
count,8128.0,8128.0,8128.0,7907.0
mean,2013.804011,638271.8,69819.51,5.416719
std,4.044249,806253.4,56550.55,0.959588
min,1983.0,29999.0,1.0,2.0
25%,2011.0,254999.0,35000.0,5.0
50%,2015.0,450000.0,60000.0,5.0
75%,2017.0,675000.0,98000.0,5.0
max,2020.0,10000000.0,2360457.0,14.0


In [204]:
# Summary statistics (count, unique, top, freq) for the non-numeric columns
df.describe(exclude=np.number)

Unnamed: 0,name,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque
count,8128,8128,8128,8128,8128,7907,7907,7913,7906
unique,2058,4,3,2,5,393,121,322,441
top,Maruti Swift Dzire VDI,Diesel,Individual,Manual,First Owner,18.9 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm
freq,129,4402,6766,7078,5289,225,1017,377,530


In [205]:
# Remove every row that has at least one missing value, then renumber the
# surviving rows 0..n-1 so later positional/aligned operations stay consistent
df = df.dropna(axis=0, how='any', ignore_index=True)

In [206]:
# Spot-check five random rows after dropping nulls.
# A fixed random_state keeps the sampled rows (and therefore this cell's output)
# identical on every Restart & Run All.
df.sample(5, random_state=44)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
2033,Mahindra Xylo D2,2010,160000,255000,Diesel,Individual,Manual,Second Owner,13.0 kmpl,2489 CC,95 bhp,220Nm at 1400-2600 rpm,8.0
2288,Hyundai Verna 1.4 CX,2013,550000,50000,Diesel,Individual,Manual,First Owner,23.5 kmpl,1396 CC,88.7 bhp,219.9Nm@ 1750-2750rpm,5.0
1433,Hyundai Elantra CRDi,2006,155000,110000,Diesel,Individual,Manual,Fourth & Above Owner,14.9 kmpl,1991 CC,112.2 bhp,"25@ 1,800-2,800(kgm@ rpm)",5.0
4678,Honda BRV i-DTEC V MT,2019,1100000,50000,Diesel,Individual,Manual,Second Owner,21.9 kmpl,1498 CC,98.6 bhp,200Nm@ 1750rpm,7.0
6717,Maruti Alto K10 VXI,2018,350000,40000,Petrol,Individual,Manual,First Owner,23.95 kmpl,998 CC,67.05 bhp,90Nm@ 3500rpm,5.0


In [207]:
# Report the dataset dimensions again, now that rows with nulls are gone
row_count, column_count = df.shape
print(f"Number of Row : {row_count}\nNumber of Columns : {column_count}")

Number of Row : 7906
Number of Columns : 13


In [208]:
# Re-check dtypes and non-null counts after removing rows with missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7906 entries, 0 to 7905
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           7906 non-null   object 
 1   year           7906 non-null   int64  
 2   selling_price  7906 non-null   int64  
 3   km_driven      7906 non-null   int64  
 4   fuel           7906 non-null   object 
 5   seller_type    7906 non-null   object 
 6   transmission   7906 non-null   object 
 7   owner          7906 non-null   object 
 8   mileage        7906 non-null   object 
 9   engine         7906 non-null   object 
 10  max_power      7906 non-null   object 
 11  torque         7906 non-null   object 
 12  seats          7906 non-null   float64
dtypes: float64(1), int64(3), object(9)
memory usage: 803.1+ KB


In [209]:
# Confirm that no missing (NaN) values remain in any column
df.isna().sum()

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
mileage          0
engine           0
max_power        0
torque           0
seats            0
dtype: int64

In [210]:
# Summary statistics for the numeric columns (after deleting rows with nulls)
df.describe()

Unnamed: 0,year,selling_price,km_driven,seats
count,7906.0,7906.0,7906.0,7906.0
mean,2013.983936,649813.7,69188.66,5.416393
std,3.863695,813582.7,56792.3,0.959208
min,1994.0,29999.0,1.0,2.0
25%,2012.0,270000.0,35000.0,5.0
50%,2015.0,450000.0,60000.0,5.0
75%,2017.0,690000.0,95425.0,5.0
max,2020.0,10000000.0,2360457.0,14.0


In [211]:
# Summary statistics for the non-numeric columns (after deleting rows with nulls)
df.describe(exclude=np.number)

Unnamed: 0,name,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque
count,7906,7906,7906,7906,7906,7906,7906,7906,7906
unique,1982,4,3,2,5,393,121,320,441
top,Maruti Swift Dzire VDI,Diesel,Individual,Manual,First Owner,18.9 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm
freq,129,4299,6563,6865,5215,225,1017,377,530


In [212]:
# Extract the brand of every car: the first space-separated token of its name
# (e.g. "Maruti Swift Dzire VDI" -> "Maruti").
# The vectorized .str accessor replaces the manual index loop, and the resulting
# Series keeps df's own index, so assigning it back to df cannot misalign rows
# even if df's index is not a plain 0..n-1 range.
Brands_of_car = df['name'].str.split(' ').str[0]

In [213]:
# Replace each car's full model name with just its brand.
# Series-to-column assignment aligns on index labels, so Brands_of_car must
# carry the same index as df for every row to receive its own brand.
df['name'] = Brands_of_car

In [214]:
# Count how many distinct brands remain once names are reduced to brands
brand_count = df['name'].nunique()
print(f"Number of Unique Brand of Car {brand_count}")

Number of Unique Brand of Car 31


In [215]:
# Strip the textual unit from the measurement columns (e.g. "23.4 kmpl" -> "23.4 ")
# and convert what is left to float.
# NOTE(review): the mileage pattern also removes '/', presumably to cover units
# such as "km/kg"; if so, those values end up on the same scale as kmpl - confirm
# that mixing the two units is intended.
unit_patterns = {
    'mileage': '[A-Za-z/]',
    'engine': '[A-Za-z]',
    'max_power': '[A-Za-z]',
}
df = df.replace(unit_patterns, '', regex=True)
df = df.astype({column: float for column in unit_patterns})

In [216]:
# Discard the torque column; its values are free-form strings in several
# different formats, and it is not used as a model feature
df = df.drop(columns='torque')

In [217]:
# Turn the model year into the car's age in years, measured against a fixed
# reference year. Anything that feeds new inputs to the saved model must compute
# age against the same reference year.
REFERENCE_YEAR = 2025
df = df.assign(year=REFERENCE_YEAR - df['year']).rename(columns={'year': 'age'})

In [218]:
# Preview the data after the preprocessing steps (brand extraction, unit stripping, age)
df.head()

Unnamed: 0,name,age,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,Maruti,11,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74.0,5.0
1,Skoda,11,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0
2,Honda,19,158000,140000,Petrol,Individual,Manual,Third Owner,17.7,1497.0,78.0,5.0
3,Hyundai,15,225000,127000,Diesel,Individual,Manual,First Owner,23.0,1396.0,90.0,5.0
4,Maruti,18,130000,120000,Petrol,Individual,Manual,First Owner,16.1,1298.0,88.2,5.0


In [219]:
# Verify the column dtypes after converting mileage, engine and max_power to float
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7906 entries, 0 to 7905
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           7906 non-null   object 
 1   age            7906 non-null   int64  
 2   selling_price  7906 non-null   int64  
 3   km_driven      7906 non-null   int64  
 4   fuel           7906 non-null   object 
 5   seller_type    7906 non-null   object 
 6   transmission   7906 non-null   object 
 7   owner          7906 non-null   object 
 8   mileage        7906 non-null   float64
 9   engine         7906 non-null   float64
 10  max_power      7906 non-null   float64
 11  seats          7906 non-null   float64
dtypes: float64(4), int64(3), object(5)
memory usage: 741.3+ KB


In [220]:
# Integer-encode the categorical columns.
# A separate LabelEncoder is fitted for each column and kept in `label_encoders`,
# so every column's category -> code mapping stays available afterwards
# (`.classes_` to inspect it, `.transform` to encode new inputs the same way).
# Re-fitting a single shared encoder inside the loop would keep only the mapping
# of the last column.
# NOTE(review): label encoding assigns arbitrary integer codes to nominal
# categories (e.g. brand), which a linear model interprets as magnitudes;
# consider one-hot encoding if that ordering is not meaningful.
column_to_encode = ['name','fuel','seller_type','transmission','owner']
label_encoders = {}
for column in column_to_encode:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

In [221]:
# Preview the data after the categorical columns have been integer-encoded
df.head()

Unnamed: 0,name,age,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,20,11,450000,145500,1,1,1,0,23.4,1248.0,74.0,5.0
1,26,11,370000,120000,1,1,1,2,21.14,1498.0,103.52,5.0
2,10,19,158000,140000,3,1,1,4,17.7,1497.0,78.0,5.0
3,11,15,225000,127000,1,1,1,0,23.0,1396.0,90.0,5.0
4,20,18,130000,120000,3,1,1,0,16.1,1298.0,88.2,5.0


In [222]:
# Final dtype check after encoding: every column should now be numeric
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7906 entries, 0 to 7905
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           7906 non-null   int64  
 1   age            7906 non-null   int64  
 2   selling_price  7906 non-null   int64  
 3   km_driven      7906 non-null   int64  
 4   fuel           7906 non-null   int64  
 5   seller_type    7906 non-null   int64  
 6   transmission   7906 non-null   int64  
 7   owner          7906 non-null   int64  
 8   mileage        7906 non-null   float64
 9   engine         7906 non-null   float64
 10  max_power      7906 non-null   float64
 11  seats          7906 non-null   float64
dtypes: float64(4), int64(8)
memory usage: 741.3 KB


In [223]:
# Separate the prediction target (selling price) from the feature matrix
target_column = 'selling_price'
y = df[target_column]
X = df.drop(columns=target_column)

In [224]:
# Hold out 25% of the rows for testing; the fixed seed makes the shuffled split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=44,
    shuffle=True,
)

In [225]:
# Print the shape of each piece of the train/test split
split_parts = (
    ('X_Train', X_train),
    ('X_Test', X_test),
    ('Y_Train', y_train),
    ('Y_Test', y_test),
)
for part_label, part in split_parts:
    print(f'Shape of {part_label} {part.shape}')

Shape of X_Train (5929, 11)
Shape of X_Test (1977, 11)
Shape of Y_Train (5929,)
Shape of Y_Test (1977,)


In [226]:
def Kfold(model,model_name):
    """Print the mean 10-fold cross-validation score of ``model``.

    The folds are taken from the notebook-level ``X`` and ``y`` (the full
    feature matrix and target, not only the training split), and the
    estimator's default scorer is used because no ``scoring`` is passed.

    Args:
        model: Estimator accepted by ``cross_val_score``.
        model_name: Label used in the printed message.

    Returns:
        None. The averaged score is printed as a percentage.
    """
    fold_scores = cross_val_score(model, X, y, cv=10)
    mean_score = np.mean(fold_scores)
    print(f"{model_name} score on cross validation: {mean_score * 100}%")

def train(model,model_name):
    """Fit ``model`` on the training split and print its score on both splits.

    Uses the notebook-level ``X_train``/``y_train`` for fitting and scores the
    fitted model on the training data first, then on ``X_test``/``y_test``.

    Args:
        model: Estimator exposing ``fit`` and ``score``; it is fitted in place.
        model_name: Label used in the printed messages.

    Returns:
        None. Both scores are printed as percentages, one per line.
    """
    model.fit(X_train, y_train)
    split_scores = {
        'Training': model.score(X_train, y_train),
        'Testing': model.score(X_test, y_test),
    }
    report_lines = [
        f"{model_name} model score on {split_name} data: {split_score * 100}%"
        for split_name, split_score in split_scores.items()
    ]
    print("\n".join(report_lines))

def r2(model,model_name):
    """Print the R2 score of an already fitted ``model`` on the test split.

    Predictions are made on the notebook-level ``X_test`` and compared with
    ``y_test`` via ``r2_score``.

    Args:
        model: Fitted estimator exposing ``predict``.
        model_name: Label used in the printed message.

    Returns:
        None. The score is printed as a percentage.
    """
    test_predictions = model.predict(X_test)
    test_r2 = r2_score(y_test, test_predictions)
    print(f"R2 Score for {model_name} is {test_r2 * 100}%")

In [227]:
# Baseline model: plain linear regression.
# Order matters: train() is the call that fits `lr`, so r2() (which predicts
# with it) has to run afterwards.
lr = LinearRegression()
lr_label = 'Linear Regression'
Kfold(lr, lr_label)
train(lr, lr_label)
r2(lr, lr_label)

Linear Regression score on cross validation: 66.00283903822164%
Linear Regression model score on Training data: 67.61174801974936%
Linear Regression model score on Testing data: 68.8358170667601%
R2 Score for Linear Regression is 68.8358170667601%


In [228]:
# Persist the fitted linear-regression model so it can be reloaded with joblib.load.
# NOTE(review): only the model is saved here; whatever loads 'lr.sav' must
# reproduce the same feature preprocessing/encoding used in this notebook.
joblib.dump(lr, filename='lr.sav')

['lr.sav']