In [1]:
import pandas  as pd  #dataframe
import numpy as np  #mathematical computations
from sklearn.model_selection import ShuffleSplit , cross_val_score, train_test_split #Random shuffling,#Score cross validation , #Splitting the dataset into training and testing
from sklearn.linear_model import LinearRegression  #Linear Regression
from sklearn.linear_model import Lasso  #Lasso Regression
from sklearn.tree import DecisionTreeRegressor #Decision Tree Regression
from sklearn.ensemble import RandomForestRegressor  #Random Forest Regression
from sklearn.model_selection import GridSearchCV, ShuffleSplit #Hyper parameter tuning,
from warnings import simplefilter #Filtering warnings

simplefilter(action='ignore', category=FutureWarning)
import pickle #saving the model


In [2]:
dataset=pd.read_csv("C:\\Users\\hp\\Downloads\\used_cars_dataset_v2.csv")

In [3]:
print(dataset.head())

           Brand      model  Year  Age     kmDriven Transmission   Owner  \
0          Honda       City  2001   23    98,000 km       Manual  second   
1         Toyota     Innova  2009   15  190000.0 km       Manual  second   
2     Volkswagen  VentoTest  2010   14    77,246 km       Manual   first   
3  Maruti Suzuki      Swift  2017    7    83,500 km       Manual  second   
4  Maruti Suzuki     Baleno  2019    5    45,000 km    Automatic   first   

  FuelType PostedDate                                       AdditionInfo  \
0   Petrol     Nov-24  Honda City v teck in mint condition, valid gen...   
1   Diesel     Jul-24  Toyota Innova 2.5 G (Diesel) 7 Seater, 2009, D...   
2   Diesel     Nov-24  Volkswagen Vento 2010-2013 Diesel Breeze, 2010...   
3   Diesel     Nov-24     Maruti Suzuki Swift 2017 Diesel Good Condition   
4   Petrol     Nov-24       Maruti Suzuki Baleno Alpha CVT, 2019, Petrol   

     AskPrice  
0  ₹ 1,95,000  
1  ₹ 3,75,000  
2  ₹ 1,84,999  
3  ₹ 5,65,000  
4  ₹ 6

In [4]:
#drop the columns that are not as important for the model
dataset2=dataset.drop(['AdditionInfo','PostedDate'],axis=1).reindex(['Brand','model','Year','Age','kmDriven','Transmission','Owner','FuelType','AskPrice'],axis=1)
dataset2.head(30)

Unnamed: 0,Brand,model,Year,Age,kmDriven,Transmission,Owner,FuelType,AskPrice
0,Honda,City,2001,23,"98,000 km",Manual,second,Petrol,"₹ 1,95,000"
1,Toyota,Innova,2009,15,190000.0 km,Manual,second,Diesel,"₹ 3,75,000"
2,Volkswagen,VentoTest,2010,14,"77,246 km",Manual,first,Diesel,"₹ 1,84,999"
3,Maruti Suzuki,Swift,2017,7,"83,500 km",Manual,second,Diesel,"₹ 5,65,000"
4,Maruti Suzuki,Baleno,2019,5,"45,000 km",Automatic,first,Petrol,"₹ 6,85,000"
5,BMW,X3,2014,10,"83,000 km",Automatic,first,Diesel,"₹ 13,50,000"
6,Toyota,Innova,2014,10,"168,000 km",Manual,second,Diesel,"₹ 10,25,000"
7,BMW,5 Series,2019,5,"25,000 km",Automatic,second,Diesel,"₹ 59,50,000"
8,Maruti Suzuki,maruti-suzuki-dzire,2020,4,"33,759 km",Manual,second,Petrol,"₹ 6,22,000"
9,Ford,Ecosport,2017,7,"69,713 km",Manual,second,Petrol,"₹ 6,49,000"


In [5]:
dataset2.shape

(14993, 9)

In [6]:
#preprocessing

In [7]:
print(dataset2.isnull().sum())  #checking for missing values

Brand            0
model            0
Year             0
Age              0
kmDriven        88
Transmission     0
Owner            0
FuelType         0
AskPrice         0
dtype: int64


In [8]:
dataset2.dropna(inplace=True)

In [9]:
dataset2.shape

(14905, 9)

In [10]:
print(dataset2['kmDriven'].head())

0      98,000 km
1    190000.0 km
2      77,246 km
3      83,500 km
4      45,000 km
Name: kmDriven, dtype: object


In [11]:
# Convert the column to string type first
dataset2['kmDriven'] = dataset2['kmDriven'].astype(str)

# Remove commas and 'km' text
dataset2['kmDriven'] = dataset2['kmDriven'].str.replace(',', '').str.replace('km', '').str.strip()

# Convert the cleaned string values to numeric
dataset2['kmDriven'] = pd.to_numeric(dataset2['kmDriven'], errors='coerce')

# Adding .0 to integers
dataset2['kmDriven'] = dataset2['kmDriven'].astype(float)

# Print the result 
print(dataset2['kmDriven'].head())

0     98000.0
1    190000.0
2     77246.0
3     83500.0
4     45000.0
Name: kmDriven, dtype: float64


In [12]:
print(dataset2.duplicated().sum()) #checks for duplicates

1161


In [13]:
dataset2

Unnamed: 0,Brand,model,Year,Age,kmDriven,Transmission,Owner,FuelType,AskPrice
0,Honda,City,2001,23,98000.0,Manual,second,Petrol,"₹ 1,95,000"
1,Toyota,Innova,2009,15,190000.0,Manual,second,Diesel,"₹ 3,75,000"
2,Volkswagen,VentoTest,2010,14,77246.0,Manual,first,Diesel,"₹ 1,84,999"
3,Maruti Suzuki,Swift,2017,7,83500.0,Manual,second,Diesel,"₹ 5,65,000"
4,Maruti Suzuki,Baleno,2019,5,45000.0,Automatic,first,Petrol,"₹ 6,85,000"
...,...,...,...,...,...,...,...,...,...
14988,Maruti Suzuki,Wagon-R,2022,2,31000.0,Manual,first,hybrid,"₹ 5,75,000"
14989,BMW,X1,2017,7,45000.0,Automatic,second,Diesel,"₹ 13,45,000"
14990,Maruti Suzuki,New-gen Swift,2011,13,60000.0,Automatic,second,Diesel,"₹ 1,60,000"
14991,Tata,Zest,2019,5,62000.0,Manual,second,hybrid,"₹ 3,80,000"


In [14]:
dataset2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14905 entries, 0 to 14992
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Brand         14905 non-null  object 
 1   model         14905 non-null  object 
 2   Year          14905 non-null  int64  
 3   Age           14905 non-null  int64  
 4   kmDriven      14905 non-null  float64
 5   Transmission  14905 non-null  object 
 6   Owner         14905 non-null  object 
 7   FuelType      14905 non-null  object 
 8   AskPrice      14905 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 1.1+ MB


In [15]:
#data analysis

In [16]:
for col in dataset2.columns:
    print('Unique values of ' +col)
    print(dataset2[col].unique())
    print('=====================\n')

Unique values of Brand
['Honda' 'Toyota' 'Volkswagen' 'Maruti Suzuki' 'BMW' 'Ford' 'Kia'
 'Mercedes-Benz' 'Hyundai' 'Audi' 'Renault' 'MG' 'Volvo' 'Skoda' 'Tata'
 'Mahindra' 'Mini' 'Land Rover' 'Jeep' 'Chevrolet' 'Jaguar' 'Fiat'
 'Aston Martin' 'Porsche' 'Nissan' 'Force' 'Mitsubishi' 'Lexus' 'Isuzu'
 'Datsun' 'Ambassador' 'Rolls-Royce' 'Bajaj' 'Opel' 'Ashok' 'Bentley'
 'Ssangyong' 'Maserati' 'Toyota Land' 'Citroen' 'Lamborghini' 'Hummer']

Unique values of model
['City' 'Innova' 'VentoTest' 'Swift' 'Baleno' 'X3' '5 Series'
 'maruti-suzuki-dzire' 'Ecosport' 'Alto-K10' 'Carnival' 'Swift-Dzire'
 'Corolla' 'GLE COUPE' 'Xcent' 'Seltos' 'Ertiga' '3 Series GT' 'Endeavour'
 'Innova Crysta' 'A3' 'KWID' 'Hector' 'Celerio' 'Vitara-Brezza'
 '2.8 Legender 4X2' 'S90' 'Venue' 'Creta' 'Alcazar' 'i20' 'E-Class' 'Polo'
 'Verna' 'A4' 'Fortuner' 'C-Class' 'Kushaq' 'Ciaz' 'Safari' 'BRV' 'Duster'
 'Wagon-R' 'Bolero Power Plus' 'Eon' 'Hector Plus' 'XUV500' 'GLS' 'i10'
 'GLA Class' 'Carens' 'Ignis' 'Grand i10'

In [17]:
# Standardize and clean the 'FuelType' column
dataset2['FuelType'] = dataset2['FuelType'].str.strip().str.capitalize()

# Separate 'Hybrid/CNG' into distinct fuel types
dataset2['FuelType'] = dataset2['FuelType'].replace({'Hybrid/cng': 'CNG'})

# Check unique values after cleaning
print(dataset2['FuelType'].unique())

['Petrol' 'Diesel' 'CNG' 'Hybrid']


In [18]:
# Remove everything except digits and decimal points (to allow decimal numbers)
dataset2['AskPrice'] = dataset2['AskPrice'].replace(r'[^\d.]', '', regex=True)

# Convert the 'AskPrice' column to numeric, forcing errors to NaN
dataset2['AskPrice'] = pd.to_numeric(dataset2['AskPrice'], errors='coerce')
# Verify if there are any NaN values left
print(dataset2['AskPrice'].isnull().sum())

# Print some rows to inspect the 'AskPrice' values
print(dataset2['AskPrice'].head())

# Conversion rates
conversion_rate_usd = 0.012  # To convert to USD
conversion_rate_kes = 140    # Approximate conversion rate from USD to KES

# Perform the conversion to USD
dataset2['Price_USD'] = round(dataset2['AskPrice'] * conversion_rate_usd, 2)

# Convert from USD to KES
dataset2['Price_KES'] = round(dataset2['Price_USD'] * conversion_rate_kes, 2)

# Display the updated columns
print(dataset2[['AskPrice', 'Price_USD', 'Price_KES']].head())


0
0    195000
1    375000
2    184999
3    565000
4    685000
Name: AskPrice, dtype: int64
   AskPrice  Price_USD  Price_KES
0    195000    2340.00   327600.0
1    375000    4500.00   630000.0
2    184999    2219.99   310798.6
3    565000    6780.00   949200.0
4    685000    8220.00  1150800.0


In [19]:
dataset2 = dataset2.drop(['AskPrice', 'Price_USD'], axis=1)  # Drop both columns
print(dataset2.head())



           Brand      model  Year  Age  kmDriven Transmission   Owner  \
0          Honda       City  2001   23   98000.0       Manual  second   
1         Toyota     Innova  2009   15  190000.0       Manual  second   
2     Volkswagen  VentoTest  2010   14   77246.0       Manual   first   
3  Maruti Suzuki      Swift  2017    7   83500.0       Manual  second   
4  Maruti Suzuki     Baleno  2019    5   45000.0    Automatic   first   

  FuelType  Price_KES  
0   Petrol   327600.0  
1   Diesel   630000.0  
2   Diesel   310798.6  
3   Diesel   949200.0  
4   Petrol  1150800.0  


In [20]:

# Standardize the 'Brand' column
# Convert the Brand column to string, then to lowercase, replace hyphens with spaces, and strip whitespace.
dataset2['Brand'] = (
    dataset2['Brand']
    .astype(str)
    .str.lower()
    .str.replace('-', ' ', regex=True)
    .str.strip()
)

# Debug: Print unique brand names before mapping
print("Unique brands before mapping:")
print(dataset2['Brand'].unique())
#  Define the mapping dictionary

brand_mapping = {
    'honda': 1,
    'toyota': 2,
    'volkswagen': 3,
    'maruti suzuki': 4,
    'bmw': 5,
    'ford': 6,
    'kia': 7,
    'mercedes benz': 8,
    'hyundai': 9,
    'audi': 10,
    'renault': 11,
    'mg': 12,
    'volvo': 13,
    'skoda': 14,
    'tata': 15,
    'mahindra': 16,
    'mini': 17,
    'land rover': 18,
    'jeep': 19,
    'chevrolet': 20,
    'jaguar': 21,
    'fiat': 22,
    'aston martin': 23,
    'porsche': 24,
    'nissan': 25,
    'force': 26,
    'mitsubishi': 27,
    'lexus': 28,
    'isuzu': 29,
    'datsun': 30,
    'ambassador': 31,
    'rolls royce': 32,
    'bajaj': 33,
    'opel': 34,
    'ashok': 35,
    'bentley': 36,
    'ssangyong': 37,
    'maserati': 38,
    'toyota land': 39,
    'citroen': 40,
    'lamborghini': 41,
    'hummer': 42
}

# Replace brand names with their numeric codes

dataset2['Brand'] = dataset2['Brand'].replace(brand_mapping).infer_objects(copy=False)

print("\nUnique brands after mapping:")
print(dataset2['Brand'].unique())

print("\nDataFrame head after mapping:")
print(dataset2.head())

print("\nDataFrame columns:")
print(dataset2.columns)

  


Unique brands before mapping:
['honda' 'toyota' 'volkswagen' 'maruti suzuki' 'bmw' 'ford' 'kia'
 'mercedes benz' 'hyundai' 'audi' 'renault' 'mg' 'volvo' 'skoda' 'tata'
 'mahindra' 'mini' 'land rover' 'jeep' 'chevrolet' 'jaguar' 'fiat'
 'aston martin' 'porsche' 'nissan' 'force' 'mitsubishi' 'lexus' 'isuzu'
 'datsun' 'ambassador' 'rolls royce' 'bajaj' 'opel' 'ashok' 'bentley'
 'ssangyong' 'maserati' 'toyota land' 'citroen' 'lamborghini' 'hummer']

Unique brands after mapping:
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42]

DataFrame head after mapping:
   Brand      model  Year  Age  kmDriven Transmission   Owner FuelType  \
0      1       City  2001   23   98000.0       Manual  second   Petrol   
1      2     Innova  2009   15  190000.0       Manual  second   Diesel   
2      3  VentoTest  2010   14   77246.0       Manual   first   Diesel   
3      4      Swift  2017    7   83500.0       Manual  second   D

In [21]:
#  we convert Brand to  an integer
dataset2['Brand'] = dataset2['Brand'].astype(int)

print(dataset2.info())


<class 'pandas.core.frame.DataFrame'>
Index: 14905 entries, 0 to 14992
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Brand         14905 non-null  int32  
 1   model         14905 non-null  object 
 2   Year          14905 non-null  int64  
 3   Age           14905 non-null  int64  
 4   kmDriven      14905 non-null  float64
 5   Transmission  14905 non-null  object 
 6   Owner         14905 non-null  object 
 7   FuelType      14905 non-null  object 
 8   Price_KES     14905 non-null  float64
dtypes: float64(2), int32(1), int64(2), object(4)
memory usage: 1.1+ MB
None


In [22]:
dataset2['Transmission'].unique()


array(['Manual', 'Automatic'], dtype=object)

In [23]:
dataset2['Transmission'] = dataset2['Transmission'].replace(['Manual', 'Automatic'], [1, 2])



In [24]:

# we convert the model names to string, lowercase them, and strip extra whitespace.
dataset2['model'] = dataset2['model'].astype(str).str.lower().str.strip()
#  then,create a new column 'Model_ID'
# This will assign sequential numbers (1, 2, 3, …) to models within each brand.
# The numbering resets for each unique brand.
dataset2['Model_ID'] = dataset2.groupby('Brand')['model'].transform(lambda x: pd.factorize(x)[0] + 1)

print("Sample of Brand, model, and Model_ID:")
print(dataset2[['Brand', 'model', 'Model_ID']].head(20))



Sample of Brand, model, and Model_ID:
    Brand                model  Model_ID
0       1                 city         1
1       2               innova         1
2       3            ventotest         1
3       4                swift         1
4       4               baleno         2
5       5                   x3         1
6       2               innova         1
7       5             5 series         2
8       4  maruti-suzuki-dzire         3
9       6             ecosport         1
10      4             alto-k10         4
11      7             carnival         1
12      4          swift-dzire         5
13      2              corolla         2
14      8            gle coupe         1
15      9                xcent         1
16      7               seltos         2
17      4               ertiga         6
18      4               ertiga         6
19      5          3 series gt         3


In [25]:
dataset2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14905 entries, 0 to 14992
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Brand         14905 non-null  int32  
 1   model         14905 non-null  object 
 2   Year          14905 non-null  int64  
 3   Age           14905 non-null  int64  
 4   kmDriven      14905 non-null  float64
 5   Transmission  14905 non-null  int64  
 6   Owner         14905 non-null  object 
 7   FuelType      14905 non-null  object 
 8   Price_KES     14905 non-null  float64
 9   Model_ID      14905 non-null  int64  
dtypes: float64(2), int32(1), int64(4), object(3)
memory usage: 1.2+ MB


In [26]:
dataset2

Unnamed: 0,Brand,model,Year,Age,kmDriven,Transmission,Owner,FuelType,Price_KES,Model_ID
0,1,city,2001,23,98000.0,1,second,Petrol,327600.0,1
1,2,innova,2009,15,190000.0,1,second,Diesel,630000.0,1
2,3,ventotest,2010,14,77246.0,1,first,Diesel,310798.6,1
3,4,swift,2017,7,83500.0,1,second,Diesel,949200.0,1
4,4,baleno,2019,5,45000.0,2,first,Petrol,1150800.0,2
...,...,...,...,...,...,...,...,...,...,...
14988,4,wagon-r,2022,2,31000.0,1,first,Hybrid,966000.0,10
14989,5,x1,2017,7,45000.0,2,second,Diesel,2259600.0,4
14990,4,new-gen swift,2011,13,60000.0,2,second,Diesel,268800.0,46
14991,15,zest,2019,5,62000.0,1,second,Hybrid,638400.0,5


In [27]:
dataset2['FuelType'].unique()

array(['Petrol', 'Diesel', 'CNG', 'Hybrid'], dtype=object)

In [28]:
dataset2['FuelType'] = dataset2['FuelType'].replace(['Petrol', 'Diesel', 'CNG', 'Hybrid'], [1, 2, 3, 4]).infer_objects(copy=False)


In [29]:
dataset2.reset_index(inplace=True) #ensures that the serial numbers are going systematically

In [30]:
dataset2

Unnamed: 0,index,Brand,model,Year,Age,kmDriven,Transmission,Owner,FuelType,Price_KES,Model_ID
0,0,1,city,2001,23,98000.0,1,second,1,327600.0,1
1,1,2,innova,2009,15,190000.0,1,second,2,630000.0,1
2,2,3,ventotest,2010,14,77246.0,1,first,2,310798.6,1
3,3,4,swift,2017,7,83500.0,1,second,2,949200.0,1
4,4,4,baleno,2019,5,45000.0,2,first,1,1150800.0,2
...,...,...,...,...,...,...,...,...,...,...,...
14900,14988,4,wagon-r,2022,2,31000.0,1,first,4,966000.0,10
14901,14989,5,x1,2017,7,45000.0,2,second,2,2259600.0,4
14902,14990,4,new-gen swift,2011,13,60000.0,2,second,2,268800.0,46
14903,14991,15,zest,2019,5,62000.0,1,second,4,638400.0,5


In [31]:
#  we drop the original 'model' column
dataset2.drop(columns=['model'], inplace=True)

# then,reorder columns so that 'Model_ID' takes the place of the dropped 'model' column
#  we get the list of columns from the DataFrame
cols = list(dataset2.columns)
#  then we remove 'Model_ID' from its current position
cols.remove('Model_ID')
#  then we insert 'Model_ID' at the position where 'model' was originally (assuming it was the second column, index 1)
cols.insert(1, 'Model_ID')
#  reorder the DataFrame columns
dataset2 = dataset2[cols]

# Rename 'Model_ID' to 'model'
dataset2.rename(columns={'Model_ID': 'model'}, inplace=True)

print(dataset2.head())
print(dataset2.columns)


   index  model  Brand  Year  Age  kmDriven  Transmission   Owner  FuelType  \
0      0      1      1  2001   23   98000.0             1  second         1   
1      1      1      2  2009   15  190000.0             1  second         2   
2      2      1      3  2010   14   77246.0             1   first         2   
3      3      1      4  2017    7   83500.0             1  second         2   
4      4      2      4  2019    5   45000.0             2   first         1   

   Price_KES  
0   327600.0  
1   630000.0  
2   310798.6  
3   949200.0  
4  1150800.0  
Index(['index', 'model', 'Brand', 'Year', 'Age', 'kmDriven', 'Transmission',
       'Owner', 'FuelType', 'Price_KES'],
      dtype='object')


In [32]:
dataset2

Unnamed: 0,index,model,Brand,Year,Age,kmDriven,Transmission,Owner,FuelType,Price_KES
0,0,1,1,2001,23,98000.0,1,second,1,327600.0
1,1,1,2,2009,15,190000.0,1,second,2,630000.0
2,2,1,3,2010,14,77246.0,1,first,2,310798.6
3,3,1,4,2017,7,83500.0,1,second,2,949200.0
4,4,2,4,2019,5,45000.0,2,first,1,1150800.0
...,...,...,...,...,...,...,...,...,...,...
14900,14988,10,4,2022,2,31000.0,1,first,4,966000.0
14901,14989,4,5,2017,7,45000.0,2,second,2,2259600.0
14902,14990,46,4,2011,13,60000.0,2,second,2,268800.0
14903,14991,5,15,2019,5,62000.0,1,second,4,638400.0


In [33]:
dataset2['Owner'].unique()

array(['second', 'first'], dtype=object)

In [34]:
pd.set_option('future.no_silent_downcasting', True)
dataset2['Owner'] = dataset2['Owner'].replace(['second', 'first'], [1,2])



In [35]:
dataset2

Unnamed: 0,index,model,Brand,Year,Age,kmDriven,Transmission,Owner,FuelType,Price_KES
0,0,1,1,2001,23,98000.0,1,1,1,327600.0
1,1,1,2,2009,15,190000.0,1,1,2,630000.0
2,2,1,3,2010,14,77246.0,1,2,2,310798.6
3,3,1,4,2017,7,83500.0,1,1,2,949200.0
4,4,2,4,2019,5,45000.0,2,2,1,1150800.0
...,...,...,...,...,...,...,...,...,...,...
14900,14988,10,4,2022,2,31000.0,1,2,4,966000.0
14901,14989,4,5,2017,7,45000.0,2,1,2,2259600.0
14902,14990,46,4,2011,13,60000.0,2,1,2,268800.0
14903,14991,5,15,2019,5,62000.0,1,1,4,638400.0


In [36]:
dataset2.drop(columns=['index'],inplace=True)

In [37]:
dataset2

Unnamed: 0,model,Brand,Year,Age,kmDriven,Transmission,Owner,FuelType,Price_KES
0,1,1,2001,23,98000.0,1,1,1,327600.0
1,1,2,2009,15,190000.0,1,1,2,630000.0
2,1,3,2010,14,77246.0,1,2,2,310798.6
3,1,4,2017,7,83500.0,1,1,2,949200.0
4,2,4,2019,5,45000.0,2,2,1,1150800.0
...,...,...,...,...,...,...,...,...,...
14900,10,4,2022,2,31000.0,1,2,4,966000.0
14901,4,5,2017,7,45000.0,2,1,2,2259600.0
14902,46,4,2011,13,60000.0,2,1,2,268800.0
14903,5,15,2019,5,62000.0,1,1,4,638400.0


In [38]:
#here we split input features and output features
#output column= Price_KES
#input columns= the remaining columns
input_data= dataset2.drop(columns=['Price_KES'])
output_data= dataset2['Price_KES']

In [39]:
x_train,x_test,y_train,y_test= train_test_split(input_data,output_data,test_size=0.2)

In [40]:
#model Creation

In [41]:
model= LinearRegression()

In [42]:
#Train Model

In [43]:
model.fit(x_train,y_train)

In [44]:
predict= model.predict( x_test)

In [45]:
predict

array([2795306.54664004,  776476.76988256, 1313106.46781719, ...,
       1412418.16726023, 2565181.33239222,   36623.63459265])

In [46]:
x_train.head(1)

Unnamed: 0,model,Brand,Year,Age,kmDriven,Transmission,Owner,FuelType
9657,1,1,2016,8,72205.0,2,2,1


In [47]:
input_data_model=pd.DataFrame(
    [[21,4,2018,6,90000.0,2,2,2]],
    columns=['model','Brand','Year','Age','kmDriven','Transmission','Owner','FuelType'])


In [48]:
input_data_model

Unnamed: 0,model,Brand,Year,Age,kmDriven,Transmission,Owner,FuelType
0,21,4,2018,6,90000.0,2,2,2


In [49]:
model.predict(input_data_model)

array([2229938.24194345])

In [50]:

#  We are Definining features and target.
# we use all our other columns other than Price_KES For our x and for our y we use Price_KES
x = dataset2.drop('Price_KES', axis=1)
y = dataset2['Price_KES']

# We perform cross-validation using Linear Regression
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
lr = LinearRegression()
scores = cross_val_score(lr, x, y, cv=cv, scoring='r2')

print(f'Cross-validation R² scores: {scores}')
print(f'Mean R² score: {scores.mean():.3f}')

# then we Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


# then we Train Linear Regression on the training set and evaluate on the test set
lr.fit(x_train, y_train)
test_score = lr.score(x_test, y_test)
print(f'Linear Regression Test R² score: {test_score:.3f}')



Cross-validation R² scores: [0.14722015 0.1591168  0.16863903 0.15665974 0.1589472 ]
Mean R² score: 0.158
Linear Regression Test R² score: 0.160


In [75]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, ShuffleSplit, train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error

# Function to remove outliers based on IQR
def remove_outliers(df):
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    # Removing outliers from numeric columns only
    df_clean = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]
    return df_clean

# Function to find the best model using GridSearchCV
def find_best_model_using_gridsearchcv(X, y):
    algos = {
        'linear_regression': {
            'model': LinearRegression(),
            'params': {}
        },
        'lasso': {
            'model': Lasso(random_state=42),
            'params': {
                'alpha': [1, 2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(random_state=42),
            'params': {
                'criterion': ['squared_error', 'friedman_mse'],
                'splitter': ['best', 'random']
            }
        },
        'random_forest': {
            'model': RandomForestRegressor(random_state=42),
            'params': {
                'criterion': ['squared_error', 'friedman_mse'],
                'n_estimators': list(range(1, 50, 5))
            }
        }
    }
    
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    
    for algo_name, config in algos.items():
        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X_train, y_train)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })
        
        # If it's the best score, we'll store the best model for predictions
        if algo_name == 'random_forest':  # Assuming RandomForest was the best model
            best_model = gs.best_estimator_
    
    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params']), best_model

# Step 1: Remove outliers from the dataset
dataset2_clean = remove_outliers(dataset2)

# Step 2: Split the cleaned data into training and testing sets
X = dataset2_clean.drop('Price_KES', axis=1)
y = dataset2_clean['Price_KES']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Find the best model using GridSearchCV
results, best_model = find_best_model_using_gridsearchcv(X_train, y_train)

# Step 4: Use the best model to make predictions
y_pred = best_model.predict(X_test)

# Step 5: Calculate MAPE (Mean Absolute Percentage Error)
mape = mean_absolute_percentage_error(y_test, y_pred) * 100

# Step 6: Calculate percentage accuracy
accuracy = 100 - mape

# Output results
print(results)
print(f"MAPE: {mape}%")
print(f"Accuracy: {accuracy}%")

               model  best_score  \
0  linear_regression    0.360932   
1              lasso    0.360932   
2      decision_tree    0.746699   
3      random_forest    0.862932   

                                         best_params  
0                                                 {}  
1                {'alpha': 1, 'selection': 'cyclic'}  
2  {'criterion': 'friedman_mse', 'splitter': 'best'}  
3  {'criterion': 'friedman_mse', 'n_estimators': 46}  
MAPE: 21.885080419186288%
Accuracy: 78.1149195808137%


In [76]:
print(X_train.shape)
print(y_train.shape)


(9150, 8)
(9150,)


In [77]:
# Ensure the rows in X and y are consistent
X = dataset2_clean.drop('Price_KES', axis=1)
y = dataset2_clean['Price_KES']


In [78]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [79]:
dataset2_clean = dataset2_clean.dropna()


In [80]:
model = RandomForestRegressor(criterion='squared_error', n_estimators=26, random_state=42)


In [81]:
model.fit(X_train,y_train)

In [82]:

def predict_price(brand, car_model, year, age, kmDriven, Transmission, Owner, FuelType):
    # we create a DataFrame with the same column names as used in training
    input_data = pd.DataFrame(columns=['model', 'Brand', 'Year', 'Age', 'kmDriven', 'Transmission', 'Owner', 'FuelType'],
                              data=[[0, 0, year, age, kmDriven, 0, 0, 0]])  # Initialize with default values
    
    # Set the values for the brand, car model, and other features
    if brand.lower() == 'honda':
        input_data['Brand'] = 1
    if car_model.lower() == 'city':
        input_data['model'] = 1
    
    input_data['Year'] = year
    input_data['Age'] = age
    input_data['kmDriven'] = kmDriven

    if Transmission.lower() == 'automatic':
        input_data['Transmission'] = 1
    elif Transmission.lower() == 'manual':
        input_data['Transmission'] = 2

    if Owner.lower() == 'first':
        input_data['Owner'] = 1
    elif Owner.lower() == 'second':
        input_data['Owner'] = 2

    if FuelType.lower() == 'diesel':
        input_data['FuelType'] = 1
    elif FuelType.lower() == 'petrol':
        input_data['FuelType'] = 2
    elif FuelType.lower() == 'cng':
        input_data['FuelType'] = 3
    elif FuelType.lower() == 'hybrid':
        input_data['FuelType'] = 4
    
    # Pass the DataFrame to the model for prediction
    return float(format(model.predict(input_data)[0], '.2f'))

   

In [83]:
predicted_price = predict_price('Honda', 'city', 2001, 23, 98000.0, 'Manual', 'second', 'Petrol')
print(predicted_price)



907910.77


In [84]:
predicted_price = predict_price('Maruti Suzuki','Swift',2017,7,83500.0,'Manual','second','Diesel')
print(predicted_price)

1281969.18


In [92]:
predicted_price= predict_price('Maruti Suzuki','Baleno',2020,3,83500.0,'Automatic','second','Diesel')
print(predicted_price)

1650406.15


In [89]:
predicted_price=predict_price('BMW','X3',2023,0,2000.0,'automatic','second','petrol')
print(predicted_price)

2039067.69


In [90]:
predicted_price= predict_price('mercedez-benz','gle',2020,3,80000,'automatic','first','petrol')
print(predicted_price)

1662360.0


In [93]:
predicted_price=predict_price( 'Volvo','GLE COUPE',2016,8,12000,'automatic','second','petrol')
print(predicted_price)


995776.92


In [94]:
predicted_price=predict_price('Skoda','xcent',2001,22,500000,'manual', 'second','petrol')
print(predicted_price)

657557.17


In [95]:
predicted_price=predict_price('Tata','Seltos',2009,14,200000,'manual','second','diesel')
print(predicted_price)

546354.07


In [96]:
predicted_price=predict_price('Mahindra','Ertiga',2007,16,150000,'automatic','second','diesel')
print(predicted_price)

486683.02


In [97]:
predicted_price=predict_price('Ford','Ecosport', 2017,7,69713,'Manual','second','Petrol')
print(predicted_price)

1438569.12


In [98]:
predicted_price=predict_price('Tata','Zest',2022,1,62000.0,'Manual','second','hybrid')
print(predicted_price)

1873716.92


In [99]:
predicted_price=predict_price('Toyota','Innova',2023,0,19000.0,'manual','second','diesel')
print(predicted_price)

2175341.54


In [100]:
import pickle
with open('SECOND HAND CAR PRICE PREDICTION V2_model.pickle','wb') as file:
    pickle.dump(model,file)

In [101]:
with open('SECOND HAND CAR PRICE PREDICTION V2_model.pickle','rb') as file:
    loaded_model= pickle.load(file)

In [102]:
predicted_price=predict_price('Ford','Ecosport', 2017,7,69713,'Manual','second','Petrol')
print(predicted_price)

1438569.12


In [103]:
print("model loaded successfully")

model loaded successfully
