In [1]:
# ==============================
# Import Libraries
# ==============================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import pickle as pk
import os
from sklearn.model_selection import train_test_split
import streamlit as st



In [2]:
# ==============================
# Load Dataset
# ==============================
# Raw used-bike listings; the relative path is resolved against the
# directory the kernel was started in.
DATASET_PATH = "Used_Bikes.csv"
bikes_data = pd.read_csv(DATASET_PATH)

# Report the (rows, columns) shape of the untouched data so the effect of
# later cleaning steps can be compared against it.
raw_shape = bikes_data.shape
print("Dataset Loaded... \nShape:", raw_shape)

# Trailing expression: the notebook renders the first five rows.
bikes_data.head(5)

Dataset Loaded... 
Shape: (32648, 8)


Unnamed: 0,bike_name,price,city,kms_driven,owner,age,power,brand
0,TVS Star City Plus Dual Tone 110cc,35000.0,Ahmedabad,17654.0,First Owner,3.0,110.0,TVS
1,Royal Enfield Classic 350cc,119900.0,Delhi,11000.0,First Owner,4.0,350.0,Royal Enfield
2,Triumph Daytona 675R,600000.0,Delhi,110.0,First Owner,8.0,675.0,Triumph
3,TVS Apache RTR 180cc,65000.0,Bangalore,16329.0,First Owner,4.0,180.0,TVS
4,Yamaha FZ S V 2.0 150cc-Ltd. Edition,80000.0,Bangalore,10000.0,First Owner,3.0,150.0,Yamaha


In [3]:
# ==============================
# Data Cleaning
# ==============================
# Drop every row that has a missing value in any column, then drop rows that
# are exact duplicates of an earlier row. Written as one chain so the cleaned
# frame is produced in a single step and rebound to the same name.
bikes_data = bikes_data.dropna().drop_duplicates()

cleaned_shape = bikes_data.shape
print("Cleaned Data... \nShape:", cleaned_shape)

Cleaned Data... 
Shape: (7324, 8)


In [4]:
# ==============================
# Feature Engineering: Extract Model (optional, skip if not needed)
# ==============================
# Placeholder step: the dataset already ships a 'brand' column, so no
# brand/model extraction from 'bike_name' is performed. Only a notice is printed.
extraction_notice = "Brand column exists. Skipping extraction."
print(extraction_notice)

Brand column exists. Skipping extraction.


In [5]:
# ==============================
# Remove Unwanted Brands (optional, skip or filter if needed)
# ==============================
# Placeholder step: no brands are filtered out of the bike data. Rare brands
# could be removed here if that is ever wanted; for now only a notice is printed.
brand_filter_notice = "Skipping unwanted brand removal for bikes."
print(brand_filter_notice)

Skipping unwanted brand removal for bikes.


In [6]:
# ==============================
# Clean Numeric Columns
# ==============================
# The numeric columns are expected to load as float/int already, so no
# conversion is done here. Instead of merely asserting that in a message,
# verify it: a stray string value in the CSV would turn a whole column into
# `object` dtype and only surface later as a confusing model-fitting error.
numeric_columns = ['price', 'kms_driven', 'age', 'power']
non_numeric = [
    name for name in numeric_columns
    if not pd.api.types.is_numeric_dtype(bikes_data[name])
]
if non_numeric:
    raise TypeError(
        f"Expected numeric dtypes but found non-numeric columns: {non_numeric}"
    )

print("Numeric columns are clean.")

Numeric columns are clean.


In [7]:
# ==============================
# Encoding Categorical Columns
# ==============================
# Ownership has a natural order, so it gets a fixed, hand-written rank.
owner_mapping = {
    'First Owner': 1, 'Second Owner': 2, 'Third Owner': 3,
    'Fourth Owner Or More': 4
}

# Guard before touching anything: Series.map turns every label missing from
# the mapping into NaN without complaint. That happens for a new owner
# category in the data, and also when this cell is re-run on an already
# encoded frame (the column then holds integers, not labels). Failing here
# keeps brand_mapping / city_mapping from being overwritten with integer keys.
unmapped_owners = sorted(
    set(bikes_data['owner'].unique()) - set(owner_mapping), key=str
)
if unmapped_owners:
    raise ValueError(
        f"Unmapped 'owner' values {unmapped_owners}: extend owner_mapping, "
        "or reload the dataset if this cell has already been run."
    )

# Brand and city receive 1-based integer codes in order of first appearance
# in the cleaned frame, so the codes depend on the row order of the data.
# NOTE(review): integer codes impose an arbitrary ordering on nominal
# categories; for a linear model one-hot encoding is usually a better fit.
brand_mapping = {b: i for i, b in enumerate(bikes_data['brand'].unique(), start=1)}
city_mapping = {c: i for i, c in enumerate(bikes_data['city'].unique(), start=1)}

# Replace the text labels with their numeric codes.
bikes_data['owner'] = bikes_data['owner'].map(owner_mapping)
bikes_data['brand'] = bikes_data['brand'].map(brand_mapping)
bikes_data['city'] = bikes_data['city'].map(city_mapping)

print("Encoding Done...")

Encoding Done...


In [8]:
# ==============================
# Train-Test Split
# ==============================
# Predictors fed to the model, in the column order the model will expect.
feature_columns = ['brand', 'city', 'kms_driven', 'owner', 'age', 'power']

# Features (X) and regression target (y).
input_data = bikes_data.loc[:, feature_columns]
output_data = bikes_data.loc[:, 'price']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
split_parts = train_test_split(
    input_data,
    output_data,
    test_size=0.2,
    random_state=42,
)
x_train, x_test, y_train, y_test = split_parts

print("Data Split Done... \nTrain Size:", x_train.shape, "\nTest Size:", x_test.shape)
print("Training feature names:", list(x_train.columns))

Data Split Done... 
Train Size: (5859, 6) 
Test Size: (1465, 6)
Training feature names: ['brand', 'city', 'kms_driven', 'owner', 'age', 'power']


In [9]:
# ==============================
# Model Training
# ==============================
# Ordinary least-squares regression fitted on the training split.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(x_train, y_train)

print("Model Training Done...")

Model Training Done...


## Model Accuracy
The R² score (the coefficient of determination, reported below as a percentage and labelled "accuracy") indicates how well the model predicts bike prices. A higher value (closer to 100%) means better prediction.

In [10]:
# ==============================
# Model Evaluation
# ==============================
# Score the fitted model on the held-out test split.
y_pred = model.predict(x_test)

# R² (coefficient of determination) scaled to a percentage. It is a
# goodness-of-fit measure rather than a classification accuracy, even though
# the printed label calls it "Model Accuracy".
r2 = r2_score(y_test, y_pred)*100
# MAE and RMSE are expressed in the same units as the target values.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# The emoji and the superscript two were previously stored as mis-decoded
# byte sequences; they are written here as the intended characters.
print("📊 Model Evaluation:")
print(f"R² Score: {r2:.2f}% (Model Accuracy)")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")


📊 Model Evaluation:
R² Score: 75.87% (Model Accuracy)
MAE: 34660.11
RMSE: 68542.73


In [11]:
# ==============================
# Sample Prediction
# ==============================
# One hand-built listing used as a smoke test of the fitted model:
# TVS, Ahmedabad, 10000 kms, First Owner, 3 years, 110cc.
sample_row = {
    'brand': brand_mapping['TVS'],
    'city': city_mapping['Ahmedabad'],
    'kms_driven': 10000,
    'owner': 1,
    'age': 3,
    'power': 110,
}

# Lay the values out in the same column order the model was trained with.
sample_values = [sample_row[name] for name in feature_columns]
sample_input = pd.DataFrame([sample_values], columns=feature_columns)

print("Sample Input:", sample_input)
print("Sample Prediction:", model.predict(sample_input))

Sample Input:    brand  city  kms_driven  owner  age  power
0      1     1       10000      1    3    110
Sample Prediction: [-9346.43245949]


In [12]:
# ==============================
# Save Model
# ==============================
# The pickle is written next to wherever the kernel is currently running.
file_path = os.path.join(os.getcwd(), "bike_model.pkl")

# Use a context manager so the file handle is flushed and closed even if
# pickling fails; passing a bare open(...) to dump() left it open.
with open(file_path, "wb") as model_file:
    pk.dump(model, model_file)

# NOTE(review): only the estimator is persisted. The owner/brand/city
# encodings built earlier are not saved, so any consumer of this file has to
# reproduce exactly the same codes - confirm how the consuming app does that.
# Pickle files must only be loaded from trusted sources.
print("✅ Model saved at:", file_path)

✅ Model saved at: d:\Kuldeep_Clg\sem-5\Project\ML\bike_model.pkl
