# Import


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
import re

# Load Dataset

In [7]:
df = pd.read_csv("car_price_dataset.csv", encoding="latin1")
df.head()

Unnamed: 0,year,kilometers_driven,fuel_type,transmission,owner_type,mileage,engine,power,seats,price
0,2006,157617,Petrol,Automatic,Second,13.2,2994,267.05,4,11.68
1,2019,26689,Petrol,Manual,Fourth & Above,19.23,3467,237.86,7,13.51
2,2014,16174,CNG,Automatic,Third,10.19,2410,288.32,5,12.25
3,2010,93461,Electric,Manual,Third,14.93,4288,91.82,7,13.67
4,2007,131174,Petrol,Manual,First,24.53,3254,254.33,8,12.51


# Preprocessing and Cleaning 


In [9]:
def extract_number(text):
    num = re.findall(r"[\d\.]+", str(text))
    if num:  
        return float(num[0])
    else:
        return None 


df['Mileage'] = df['mileage'].apply(extract_number)
df['engine'] = df['engine'].apply(extract_number)
df['power'] = df['power'].apply(extract_number)

In [13]:
df

Unnamed: 0,year,kilometers_driven,fuel_type,transmission,owner_type,mileage,engine,power,seats,price,Mileage,Engine,Power
0,2006,157617,Petrol,Automatic,Second,13.20,2994,267.05,4,11.68,13.20,2994.0,267.05
1,2019,26689,Petrol,Manual,Fourth & Above,19.23,3467,237.86,7,13.51,19.23,3467.0,237.86
2,2014,16174,CNG,Automatic,Third,10.19,2410,288.32,5,12.25,10.19,2410.0,288.32
3,2010,93461,Electric,Manual,Third,14.93,4288,91.82,7,13.67,14.93,4288.0,91.82
4,2007,131174,Petrol,Manual,First,24.53,3254,254.33,8,12.51,24.53,3254.0,254.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,2022,172316,LPG,Manual,Fourth & Above,25.88,4602,224.65,8,16.05,25.88,4602.0,224.65
596,2015,137717,LPG,Manual,Third,15.52,4075,293.47,6,15.63,15.52,4075.0,293.47
597,2006,183595,Petrol,Manual,First,27.54,4454,376.66,5,16.35,27.54,4454.0,376.66
598,2012,129085,Diesel,Automatic,Fourth & Above,28.88,3234,175.29,7,11.94,28.88,3234.0,175.29


In [11]:
df.dropna(inplace=True)
df.shape

(600, 11)

# Train and Split

In [13]:

X = df.drop('price', axis=1)
y = df['price']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline 

In [15]:
categorical_features = ['fuel_type', 'transmission', 'owner_type']
numerical_features = ['year', 'kilometers_driven', 'seats','mileage','engine','power']


categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])


model_pipeline.fit(X_train, y_train)

# Evaluation 

In [15]:
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

y_pred = model_pipeline.predict(X_test)


r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"R2 Score: {r2:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")

R2 Score: 0.8725
MSE: 1.3331
RMSE: 1.1546



# pridicting System

In [18]:
def predict_car_price(model_pipeline, year, kilometers_driven, fuel_type, transmission, owner_type, seats, Mileage, engine, power):
   
    input_data = pd.DataFrame({
        'year': [year],
        'kilometers_driven': [kilometers_driven],
        'fuel_type': [fuel_type],
        'transmission': [transmission],
        'owner_type': [owner_type],
        'seats': [seats],
        'mileage': [Mileage],
        'engine': [engine],    
        'power': [power]      
    })
    
  
    prediction = model_pipeline.predict(input_data)
    
    return prediction[0]

In [20]:
predicted_price = predict_car_price(
    model_pipeline=model_pipeline,
    year=2010,
    kilometers_driven=72000,
    fuel_type='CNG',
    transmission='Manual',
    owner_type='First',
    seats=5.0,
    Mileage=26.6,  
    engine=998,    
    power=58.16    
)

print(f"Predicted Price: {predicted_price:.2f} Lakh")

Predicted Price: 6.61 Lakh


# Save Pipeline

In [24]:
import pickle 

pickle.dump(model_pipeline,open("pipeline.pkl22",'wb'))