## Import Libraries

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

## Read CSV file

In [2]:
# Load the car listings from a CSV file (path is relative to the working
# directory) and preview the first five rows.
df = pd.read_csv('car data.csv')
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Year,Selling_Price,Present_Price,Driven_kms,Owner
count,301.0,301.0,301.0,301.0,301.0
mean,2013.627907,4.661296,7.628472,36947.20598,0.043189
std,2.891554,5.082812,8.642584,38886.883882,0.247915
min,2003.0,0.1,0.32,500.0,0.0
25%,2012.0,0.9,1.2,15000.0,0.0
50%,2014.0,3.6,6.4,32000.0,0.0
75%,2016.0,6.0,9.9,48767.0,0.0
max,2018.0,35.0,92.6,500000.0,3.0


## Clean data

In [4]:
# Keep only rows without missing values; the result is bound to a new name,
# so the raw frame `df` stays available unchanged.
df_cleaned = df.dropna()

In [5]:
# Count missing values per column. Note this inspects the raw frame `df`,
# not `df_cleaned`.
df.isnull().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Driven_kms       0
Fuel_Type        0
Selling_type     0
Transmission     0
Owner            0
dtype: int64

In [6]:
# Display the cleaned frame (the rendered output is truncated by pandas).
df_cleaned

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.60,6.87,42450,Diesel,Dealer,Manual,0
...,...,...,...,...,...,...,...,...,...
296,city,2016,9.50,11.60,33988,Diesel,Dealer,Manual,0
297,brio,2015,4.00,5.90,60000,Petrol,Dealer,Manual,0
298,city,2009,3.35,11.00,87934,Petrol,Dealer,Manual,0
299,city,2017,11.50,12.50,9000,Diesel,Dealer,Manual,0


## Splitting data into train and test dataset

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Target is the selling price; features are every other column except the
# car name.
y = df_cleaned['Selling_Price']
x = df_cleaned.drop(['Selling_Price', 'Car_Name'], axis=1)

# Column groups: categorical features and numeric features.
cat_cols = ['Fuel_Type', 'Selling_type', 'Transmission']
num_cols = ['Year', 'Present_Price', 'Driven_kms', 'Owner']

## Preprocessing (scaling numeric data, encoding categorical data) and train/test split

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Numeric columns are standardised; categorical columns are one-hot encoded.
# Categories not seen while fitting are ignored instead of raising an error.
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols),
])



## Building the pipeline, fitting the model, and predicting

In [10]:
# Chain the preprocessing step and a 100-tree random forest into one
# estimator, so the scaler and encoder are fitted on the training rows only.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42)),
])

# fit() returns the fitted pipeline, so prediction can be chained onto it.
y_pred = model.fit(x_train, y_train).predict(x_test)

# Evaluate on the held-out rows.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")


Mean Squared Error (MSE): 0.8263443475409831
R-squared (R2): 0.9641274851604863


In [11]:
# Print the first few predictions next to the actual prices.
# Slicing + zip (rather than indexing positions 0..4) means a test split with
# fewer than five rows prints what is available instead of raising IndexError.
for predicted, actual in zip(y_pred[:5], y_test.iloc[:5]):
    print(f"Predicted price: {predicted:.2f}, Actual price: {actual:.2f}")

Predicted price: 0.44, Actual price: 0.35
Predicted price: 11.15, Actual price: 10.11
Predicted price: 4.90, Actual price: 4.95
Predicted price: 0.21, Actual price: 0.15
Predicted price: 7.70, Actual price: 6.95


## Predicting the price of a new car from a hand-built DataFrame

In [12]:
import numpy as np

# One hypothetical listing, built as a single record with the same feature
# columns the model was trained on.
# NOTE(review): Year=2020 is later than the newest year in the training data
# (2018 according to the describe() output) — treat this prediction with care.
new_car = pd.DataFrame([{
    'Year': 2020,
    'Present_Price': 12.50,
    'Driven_kms': 15000,
    'Fuel_Type': 'Petrol',
    'Selling_type': 'Dealer',
    'Transmission': 'Manual',
    'Owner': 0,
}])

predicted_price = model.predict(new_car)
print(f"Predicted Price for new car: {predicted_price[0]:.2f}")

Predicted Price for new car: 10.15


In [13]:
# Show the raw prediction array for the same row (unrounded value).
model.predict(new_car)

array([10.154])