>>> ## Olb bike price prediction project 

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 


In [2]:
df = pd.read_csv(r"E:\used bike\Used_Bikes.csv")
df.head()

Unnamed: 0,bike_name,price,city,kms_driven,owner,age,power,brand
0,TVS Star City Plus Dual Tone 110cc,35000.0,Ahmedabad,17654.0,First Owner,3.0,110.0,TVS
1,Royal Enfield Classic 350cc,119900.0,Delhi,11000.0,First Owner,4.0,350.0,Royal Enfield
2,Triumph Daytona 675R,600000.0,Delhi,110.0,First Owner,8.0,675.0,Triumph
3,TVS Apache RTR 180cc,65000.0,Bangalore,16329.0,First Owner,4.0,180.0,TVS
4,Yamaha FZ S V 2.0 150cc-Ltd. Edition,80000.0,Bangalore,10000.0,First Owner,3.0,150.0,Yamaha


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32648 entries, 0 to 32647
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   bike_name   32648 non-null  object 
 1   price       32648 non-null  float64
 2   city        32648 non-null  object 
 3   kms_driven  32648 non-null  float64
 4   owner       32648 non-null  object 
 5   age         32648 non-null  float64
 6   power       32648 non-null  float64
 7   brand       32648 non-null  object 
dtypes: float64(4), object(4)
memory usage: 2.0+ MB


In [4]:
## duplicates 
df.duplicated().sum()

25324

In [5]:
df.shape 

(32648, 8)

In [6]:
df.drop_duplicates(inplace=True)

In [7]:
df.duplicated().sum()

0

In [8]:
df.shape 

(7324, 8)

In [9]:
# missing value  
df.isnull().sum()

bike_name     0
price         0
city          0
kms_driven    0
owner         0
age           0
power         0
brand         0
dtype: int64

In [10]:
cat_col = df.select_dtypes(include='O')
cat_col.head()

Unnamed: 0,bike_name,city,owner,brand
0,TVS Star City Plus Dual Tone 110cc,Ahmedabad,First Owner,TVS
1,Royal Enfield Classic 350cc,Delhi,First Owner,Royal Enfield
2,Triumph Daytona 675R,Delhi,First Owner,Triumph
3,TVS Apache RTR 180cc,Bangalore,First Owner,TVS
4,Yamaha FZ S V 2.0 150cc-Ltd. Edition,Bangalore,First Owner,Yamaha


In [11]:
num_col = df.select_dtypes(exclude='O')
num_col.head()

Unnamed: 0,price,kms_driven,age,power
0,35000.0,17654.0,3.0,110.0
1,119900.0,11000.0,4.0,350.0
2,600000.0,110.0,8.0,675.0
3,65000.0,16329.0,4.0,180.0
4,80000.0,10000.0,3.0,150.0


In [12]:
# bike_price 
# feature selection  
cat_col.drop(['bike_name','city'],axis=1,inplace=True) 

In [13]:
cat_col.owner.value_counts()

owner
First Owner             6642
Second Owner             588
Third Owner               84
Fourth Owner Or More      10
Name: count, dtype: int64

In [14]:
dt = {'First Owner':1,'Second Owner':2,'Third Owner':3,'Fourth Owner Or More':4}
dt 

{'First Owner': 1,
 'Second Owner': 2,
 'Third Owner': 3,
 'Fourth Owner Or More': 4}

In [48]:
cat_col['owner'] = cat_col['owner'].map(dt)  # only single time execute 
cat_col.head()

Unnamed: 0,owner,brand
0,,6
1,,2
2,,14
3,,6
4,,5


In [16]:
cat_col['brand'].nunique()

23

In [17]:
brand_list = list(cat_col['brand'].value_counts().keys())
brand_list

['Bajaj',
 'Royal Enfield',
 'Hero',
 'Honda',
 'Yamaha',
 'TVS',
 'KTM',
 'Suzuki',
 'Harley-Davidson',
 'Kawasaki',
 'Hyosung',
 'Mahindra',
 'Benelli',
 'Triumph',
 'Ducati',
 'BMW',
 'Jawa',
 'Indian',
 'MV',
 'Rajdoot',
 'LML',
 'Yezdi',
 'Ideal']

In [18]:
# for loop
# dict={}
# j=1
# for i in ls:
#     dict[i]=j
#     j+=1
# print(dict)

In [19]:
brand_dict = {}
for value,key in enumerate(brand_list,start=1):
    print(key , " >>> ",value)
    brand_dict[key]  = value
    

Bajaj  >>>  1
Royal Enfield  >>>  2
Hero  >>>  3
Honda  >>>  4
Yamaha  >>>  5
TVS  >>>  6
KTM  >>>  7
Suzuki  >>>  8
Harley-Davidson  >>>  9
Kawasaki  >>>  10
Hyosung  >>>  11
Mahindra  >>>  12
Benelli  >>>  13
Triumph  >>>  14
Ducati  >>>  15
BMW  >>>  16
Jawa  >>>  17
Indian  >>>  18
MV  >>>  19
Rajdoot  >>>  20
LML  >>>  21
Yezdi  >>>  22
Ideal  >>>  23


In [20]:
cat_col['brand'] = cat_col['brand'].map(brand_dict)


In [21]:
# combining two dataframes 
df2 = pd.concat([cat_col,num_col],axis='columns')
df2.head()

Unnamed: 0,owner,brand,price,kms_driven,age,power
0,1,6,35000.0,17654.0,3.0,110.0
1,1,2,119900.0,11000.0,4.0,350.0
2,1,14,600000.0,110.0,8.0,675.0
3,1,6,65000.0,16329.0,4.0,180.0
4,1,5,80000.0,10000.0,3.0,150.0


In [22]:
df2.isnull().sum()

owner         0
brand         0
price         0
kms_driven    0
age           0
power         0
dtype: int64

In [23]:
# splitting x and y 
x = df2.drop('price',axis='columns')
y = df2[['price']]

In [24]:
y 

Unnamed: 0,price
0,35000.0
1,119900.0
2,600000.0
3,65000.0
4,80000.0
...,...
9362,25000.0
9369,35000.0
9370,450000.0
9371,139000.0


In [25]:
#### training testing split 
from sklearn.model_selection import train_test_split

In [26]:
x_train,x_test , y_train, y_test = train_test_split(x,y,test_size=0.2)  # 20% testing 

In [27]:
from sklearn.linear_model import LinearRegression   # class 

In [28]:
# object  
lnr = LinearRegression()

In [29]:
x_train.head()

Unnamed: 0,owner,brand,kms_driven,age,power
5362,1,1,17000.0,5.0,150.0
2404,2,7,10536.0,4.0,250.0
2848,1,2,11600.0,7.0,350.0
642,1,1,11500.0,4.0,220.0
9034,1,7,20000.0,6.0,200.0


In [30]:
# Train 
lnr.fit(x_train,y_train) 


In [31]:
## model testing or model evaluation phase   
print("Training score : ",lnr.score(x_train,y_train)*100)
print("Testing score : ",lnr.score(x_test,y_test)*100) 

# 90% above 
# model score ?  
# difference between training score and testing not should be more than 5
# model overfite or underfit 

# score  ==>  Down  90% 
# difference ==>  okay  5 % 


# ## overfit model 
# Training score == > High 
# Testing score  == > Low 


# ## Underfit model 
# Training score == > Low   73  
# Testing score  == > Low/High  85  ,  60

# generailed model 
# score high 
# differnce not > 5 
# underfit 

Training score :  73.85680975151558
Testing score :  74.7878621128157


In [32]:
x_test.shape 

(1465, 5)

In [33]:
x_test.head()

Unnamed: 0,owner,brand,kms_driven,age,power
7661,1,1,26592.0,4.0,220.0
5353,1,1,40000.0,6.0,150.0
2484,1,6,7500.0,5.0,110.0
5395,1,3,16830.0,6.0,100.0
7956,1,1,17000.0,4.0,150.0


In [34]:
### prediction  

unseen_bike = [[1,2,14000.0,2.0,410.0]]    # x variables 
lnr.predict(unseen_bike)



array([[178353.13088828]])

In [35]:
prediction = lnr.predict(x_test)

In [36]:
y_test['prediction'] = prediction 

In [37]:
y_test.head(40)

Unnamed: 0,price,prediction
7661,65000.0,57337.276749
5353,59000.0,13210.715614
2484,40000.0,59252.076566
5395,36000.0,12973.843394
7956,45000.0,20417.960433
977,20500.0,-21289.148954
6929,23000.0,-104.16578
6138,89000.0,46900.721954
4834,263000.0,230951.903523
7147,25700.0,69161.518479


In [38]:
final_data = pd.concat([x_test,y_test],axis='columns')
final_data.head()

#(actual price - predicted price )  ==> should be low 

Unnamed: 0,owner,brand,kms_driven,age,power,price,prediction
7661,1,1,26592.0,4.0,220.0,65000.0,57337.276749
5353,1,1,40000.0,6.0,150.0,59000.0,13210.715614
2484,1,6,7500.0,5.0,110.0,40000.0,59252.076566
5395,1,3,16830.0,6.0,100.0,36000.0,12973.843394
7956,1,1,17000.0,4.0,150.0,45000.0,20417.960433


In [39]:
lnr # algorithm 

In [40]:
ls = ['vashisht', 'vishab','antish','disha'] 
ls 

['vashisht', 'vishab', 'antish', 'disha']

In [41]:
import joblib 

In [42]:
joblib.dump(ls,'student_list.lb')     # to save your variable   

['student_list.lb']

In [43]:
studet_list = joblib.load('student_list.lb') 

In [44]:
studet_list

['vashisht', 'vishab', 'antish', 'disha']

In [45]:
# saving model on a local system 
joblib.dump(lnr,'regression_model.lb')

['regression_model.lb']

In [46]:
final_data.to_csv('prediction_data.csv',index=False)

In [47]:
## algorithm  --> model 
# deployment 