In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Load the raw car listings; the relative path is resolved against the current working directory.
cars_data = pd.read_csv('Cardetails.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
cars_data.head()

In [None]:
# Remove the 'torque' column in place. Re-running this cell on the same frame
# raises KeyError because the column is already gone.
cars_data.drop(columns=['torque'], inplace=True)

In [None]:
# Preview again to confirm the column was removed.
cars_data.head()

In [None]:
# (rows, columns) after dropping 'torque'.
cars_data.shape

(8128, 12)

PREPROCESSING

In [None]:
# NULL CHECK: count the missing values in each column.

cars_data.isnull().sum()

Unnamed: 0,0
name,0
year,0
selling_price,0
km_driven,0
fuel,0
seller_type,0
transmission,0
owner,0
mileage,221
engine,221


In [None]:
# Remove every row that contains at least one null value from the main data set (in place).
cars_data.dropna(inplace=True)

In [None]:
# (rows, columns) after removing rows with nulls.
cars_data.shape

(7907, 12)

In [None]:
# DUPLICATE CHECK: number of rows that exactly repeat an earlier row.
cars_data.duplicated().sum()

np.int64(1189)

In [None]:
# DROP DUPLICATES: keep the first occurrence of each repeated row (in place).
cars_data.drop_duplicates(inplace = True)

In [None]:
# (rows, columns) after removing duplicate rows.
cars_data.shape

(6718, 12)

In [None]:
# Column dtypes and non-null counts after the null/duplicate clean-up.
cars_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6718 entries, 0 to 8125
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           6718 non-null   object 
 1   year           6718 non-null   int64  
 2   selling_price  6718 non-null   int64  
 3   km_driven      6718 non-null   int64  
 4   fuel           6718 non-null   object 
 5   seller_type    6718 non-null   object 
 6   transmission   6718 non-null   object 
 7   owner          6718 non-null   object 
 8   mileage        6718 non-null   object 
 9   engine         6718 non-null   object 
 10  max_power      6718 non-null   object 
 11  seats          6718 non-null   float64
dtypes: float64(1), int64(3), object(8)
memory usage: 682.3+ KB


DATA ANALYSIS

In [None]:
# Print the distinct values found in every column, one column at a time.
for column_name, column_values in cars_data.items():
  print(f'Unique values of {column_name}')
  print(column_values.unique())
  print("================")

Unique values of name
['Maruti Swift Dzire VDI' 'Skoda Rapid 1.5 TDI Ambition'
 'Honda City 2017-2020 EXi' ... 'Tata Nexon 1.5 Revotorq XT'
 'Ford Freestyle Titanium Plus Diesel BSIV'
 'Toyota Innova 2.5 GX (Diesel) 8 Seater BS IV']
Unique values of year
[2014 2006 2010 2007 2017 2001 2011 2013 2005 2009 2016 2012 2002 2015
 2018 2003 2019 2008 2020 1999 2000 1983 2004 1996 1994 1995 1998 1997
 1991]
Unique values of selling_price
[  450000   370000   158000   225000   130000   440000    96000    45000
   350000   200000   500000    92000   280000   180000   400000   778000
   150000   680000   174000   950000   525000   600000   575000   275000
   300000   220000   254999   670000    70000   730000   650000   330000
   366000  1149000   425000  2100000   925000   675000   819999   390000
  1500000   700000  1450000  1090000   850000  1650000  1750000  1590000
  1689999  1425000   265000   190000   630000   540000   448000   745000
  1025000   235000  1700000    50000  1200000   610000

In [None]:
def get_brand_name(car_name):
  """Return the first whitespace-delimited word of ``car_name``.

  For a listing name such as "Maruti Swift Dzire VDI" this is the brand
  ("Maruti"). Leading whitespace is ignored, so a padded name no longer
  yields an empty brand. An empty or all-whitespace string returns "".
  """
  # split() with no separator skips leading/trailing whitespace, which the
  # earlier split(" ") followed by strip() could not do.
  words = car_name.split(maxsplit=1)
  return words[0] if words else ""

In [None]:
def clean_data(value):
  """Convert a string such as "23.4 kmpl" or "1248 CC" to a float.

  Only the text before the first space is parsed. When that part is empty
  (for example the value is "" or " bhp"), 0.0 is returned.

  Raises:
    ValueError: if the leading token is non-empty but not a valid number.
  """
  # The separator must be a single space: str.split('') raises
  # "ValueError: empty separator" for every input.
  number_text = value.split(' ')[0].strip()
  if number_text == '':
    return 0.0
  return float(number_text)


In [None]:
# Quick sanity check of get_brand_name on a sample listing name.
get_brand_name("Maruti Swift Dezire VDI")

'Maruti'

In [None]:
# Collapse each full listing name down to the value returned by get_brand_name.
cars_data['name'] = cars_data['name'].map(get_brand_name)

In [None]:
# List the distinct brand values.
# NOTE(review): the saved output below already shows integer codes (with 'Jeep'
# and 'Mercedes-Benz' left as text), although the encoding cell appears further
# down; the cells were presumably run out of order. Re-run from a fresh kernel.
cars_data['name'].unique()

array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 'Jeep', 'Mercedes-Benz', 13, 14,
       15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30],
      dtype=object)

In [None]:
# Keep only the number in front of the unit and store it as a float.
# get_brand_name would leave the values as strings. Text that cannot be
# parsed (including an empty leading token) becomes 0.0.
cars_data['mileage'] = pd.to_numeric(
    cars_data['mileage'].astype(str).str.split(' ').str[0], errors='coerce'
).fillna(0.0)

In [None]:
# Keep only the number in front of the unit and store it as a float.
# get_brand_name would leave the values as strings. Text that cannot be
# parsed (including an empty leading token such as in " bhp") becomes 0.0.
cars_data['max_power'] = pd.to_numeric(
    cars_data['max_power'].astype(str).str.split(' ').str[0], errors='coerce'
).fillna(0.0)

In [None]:
# Keep only the number in front of the unit and store it as a float.
# get_brand_name would leave the values as strings. Text that cannot be
# parsed (including an empty leading token) becomes 0.0.
cars_data['engine'] = pd.to_numeric(
    cars_data['engine'].astype(str).str.split(' ').str[0], errors='coerce'
).fillna(0.0)

In [None]:
# Print the distinct values of every column again, now that the columns have been cleaned.
for column_name, column_values in cars_data.items():
  print(f'Unique values of {column_name}')
  print(column_values.unique())
  print("================")

In [None]:
# Encode every brand as an integer: its 1-based position in this list.
# The result is assigned back to the column. Calling
# `.replace(..., inplace=True)` on `cars_data['name']` mutates a temporary
# Series, which pandas warns about and which leaves the DataFrame unchanged
# once Copy-on-Write is enabled.
brand_names = [
    'Maruti', 'Skoda', 'Honda', 'Hyundai', 'Toyota', 'Ford', 'Renault', 'Mahindra',
    'Tata', 'Chevrolet', 'Datsun', 'Jeep', 'Mercedes-Benz', 'Mitsubishi', 'Audi',
    'Volkswagen', 'BMW', 'Nissan', 'Lexus', 'Jaguar', 'Land', 'MG', 'Volvo', 'Daewoo',
    'Kia', 'Fiat', 'Force', 'Ambassador', 'Ashok', 'Isuzu', 'Opel',
]
brand_codes = {brand: code for code, brand in enumerate(brand_names, start=1)}
cars_data['name'] = cars_data['name'].replace(brand_codes)

In [None]:
# List the distinct transmission categories before encoding them.
cars_data['transmission'].unique()

array(['Manual', 'Automatic'], dtype=object)

In [None]:
# Encode transmission as an integer. Assign the result back instead of using
# inplace=True on the column selection, which acts on a temporary Series.
cars_data['transmission'] = cars_data['transmission'].replace({'Manual': 1, 'Automatic': 2})

In [None]:
# List the distinct seller categories before encoding them.
cars_data['seller_type'].unique()

array(['Individual', 'Dealer', 'Trustmark Dealer'], dtype=object)

In [None]:
# Encode seller type as an integer. Assign the result back instead of using
# inplace=True on the column selection, which acts on a temporary Series.
cars_data['seller_type'] = cars_data['seller_type'].replace(
    {'Individual': 1, 'Dealer': 2, 'Trustmark Dealer': 3}
)

In [None]:
# List the distinct fuel categories before encoding them.
cars_data['fuel'].unique()

array(['Diesel', 'Petrol', 'LPG', 'CNG'], dtype=object)

In [None]:
# Encode fuel type as an integer. Assign the result back instead of using
# inplace=True on the column selection, which acts on a temporary Series.
cars_data['fuel'] = cars_data['fuel'].replace(
    {'Diesel': 1, 'Petrol': 2, 'LPG': 3, 'CNG': 4}
)

In [None]:
# Column dtypes and non-null counts after encoding the categorical columns.
# NOTE(review): the saved output below reports 8128 rows, nulls and a 'torque'
# column, which does not match the frame produced by the earlier drop cells;
# it looks like output from a different run. Re-run from a fresh kernel.
cars_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           8128 non-null   object 
 1   year           8128 non-null   int64  
 2   selling_price  8128 non-null   int64  
 3   km_driven      8128 non-null   int64  
 4   fuel           8128 non-null   object 
 5   seller_type    8128 non-null   int64  
 6   transmission   8128 non-null   int64  
 7   owner          8128 non-null   object 
 8   mileage        7907 non-null   object 
 9   engine         7907 non-null   object 
 10  max_power      7913 non-null   object 
 11  torque         7906 non-null   object 
 12  seats          7907 non-null   float64
dtypes: float64(1), int64(5), object(7)
memory usage: 825.6+ KB


In [None]:
# Renumber the rows 0..n-1 in place. The previous index is kept as a new
# 'index' column, which a later cell drops.
cars_data.reset_index(inplace=True)

In [None]:
# Display the frame to check the new row numbering and the added 'index' column.
cars_data

Unnamed: 0,index,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,0,1,2014,450000,145500,1,1,1,First Owner,23.4,1248,74,5.0
1,1,2,2014,370000,120000,1,1,1,Second Owner,21.14,1498,103.52,5.0
2,2,3,2006,158000,140000,2,1,1,Third Owner,17.7,1497,78,5.0
3,3,4,2010,225000,127000,1,1,1,First Owner,23.0,1396,90,5.0
4,4,1,2007,130000,120000,2,1,1,First Owner,16.1,1298,88.2,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6713,8121,1,2013,260000,50000,2,1,1,Second Owner,18.9,998,67.1,5.0
6714,8122,4,2014,475000,80000,1,1,1,Second Owner,22.54,1396,88.73,5.0
6715,8123,4,2013,320000,110000,2,1,1,First Owner,18.5,1197,82.85,5.0
6716,8124,4,2007,135000,119000,1,1,1,Fourth & Above Owner,16.8,1493,110,5.0


In [None]:
# List the distinct owner categories before encoding them.
cars_data['owner'].unique()

array(['First Owner', 'Second Owner', 'Third Owner',
       'Fourth & Above Owner', 'Test Drive Car'], dtype=object)

In [None]:
# Encode the ownership history as an integer. Assign the result back instead of
# using inplace=True on the column selection, which acts on a temporary Series.
cars_data['owner'] = cars_data['owner'].replace({
    'First Owner': 1,
    'Second Owner': 2,
    'Third Owner': 3,
    'Fourth & Above Owner': 4,
    'Test Drive Car': 5,
})

In [None]:
# Remove the 'index' column that reset_index added, so it is not used as a feature.
# Re-running this cell on the same frame raises KeyError because the column is already gone.
cars_data.drop(columns=['index'],inplace=True)

In [None]:
# Display the fully prepared frame before splitting it into features and target.
cars_data

In [None]:
# Target: the selling price. Features: every remaining column.
output_data = cars_data['selling_price']
input_data = cars_data.drop('selling_price', axis=1)

TRAIN / TEST SPLIT

In [None]:
# Hold out 20% of the rows for evaluation. random_state is fixed so the split,
# and therefore the fitted coefficients and predictions, are the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    input_data, output_data, test_size=0.2, random_state=42
)

MODEL CREATION

In [None]:
# Create an unfitted linear regression model with scikit-learn's default settings.
model = LinearRegression()

TRAIN MODEL

In [None]:
# Fit the model on the training features and their selling prices.
model.fit(x_train , y_train)

In [None]:
# Predicted selling prices for the held-out test rows.
predict = model.predict(x_test)

In [None]:
# Display the predicted prices.
predict

array([ 443442.72717693,  178181.20213327,  520631.72414033, ...,
       2709491.3878383 ,  973211.1842147 ,  306122.84410261])

In [None]:
# Show one training row as a reference for the feature column order.
x_train.head(1)

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
5989,10,2011,100000,1,1,1,1,18.3,1991,147.9,5.0


In [None]:
# A single hand-built listing to score, with one value per feature column.
input_data_model = pd.DataFrame({
    "name": [10],
    "year": [2015],
    "km_driven": [10000],
    "fuel": [1],
    "seller_type": [1],
    "transmission": [1],
    "owner": [1],
    "mileage": [18.3],
    "engine": [1991],
    "max_power": [147.9],
    "seats": [5.0],
})

In [None]:
# Predicted selling price for the hand-built listing.
model.predict(input_data_model)

array([1152955.30215886])

In [None]:
import pickle as pk

In [None]:
# Serialize the fitted model. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open('model.pkl', 'wb') as model_file:
  pk.dump(model, model_file)

In [None]:
# Download the pickled model through the browser. google.colab is imported
# here, so this cell needs an environment where that package is available.
from google.colab import files
files.download('model.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
import numpy as np
import pickle as pk
import streamlit as st

In [None]:
# Load the pickled model; the context manager closes the file afterwards.
# pickle can execute arbitrary code while loading, so only open a model.pkl
# that comes from a trusted source.
with open('model.pkl', 'rb') as model_file:
  model = pk.load(model_file)

In [None]:
# Streamlit page heading.
st.header('car Price Prediction Model')