# **Day-16 | Car Price Prediction using RANDOM FOREST**

# *Load Dataset from Local directory*

In [9]:
from google.colab import files
uploaded = files.upload()
import pandas as pd

Saving used_car_dataset.csv to used_car_dataset.csv


# *Load Dataset*

In [11]:
dataset = pd.read_csv('used_car_dataset.csv')
dataset = dataset.drop(['car_name'],axis=1)

### *Summarize Dataset*

In [12]:
print(dataset.shape)
print(dataset.head(5))

(2105, 5)
  car_price_in_rupees kms_driven fuel_type       city  year_of_manufacture
0         ₹ 4.45 Lakh  22,402 km    Petrol     Mumbai                 2016
1         ₹ 2.93 Lakh  10,344 km    Petrol    Kolkata                 2019
2        ₹ 22.49 Lakh  12,999 km    Diesel  Bangalore                 2021
3         ₹ 6.95 Lakh  45,000 km    Petrol      Thane                 2016
4           ₹ 12 Lakh  11,193 km    Petrol    Kolkata                 2019


### *Splitting Dataset into X & Y*
### *This X contains Both Numerical & Text Data*

In [13]:
Xdata = dataset.drop('car_price_in_rupees',axis='columns')
numericalCols=Xdata.select_dtypes(exclude=['object']).columns
X=Xdata[numericalCols]
X

Unnamed: 0,year_of_manufacture
0,2016
1,2019
2,2021
3,2016
4,2019
...,...
2100,2015
2101,2013
2102,2018
2103,2017


In [17]:
Y = dataset['car_price_in_rupees']
Y

Unnamed: 0,car_price_in_rupees
0,₹ 4.45 Lakh
1,₹ 2.93 Lakh
2,₹ 22.49 Lakh
3,₹ 6.95 Lakh
4,₹ 12 Lakh
...,...
2100,₹ 3.6 Lakh
2101,₹ 22 Lakh
2102,₹ 8.38 Lakh
2103,₹ 6.75 Lakh


### *Scaling the Independent Variables (Features)*

In [14]:
from sklearn.preprocessing import scale
cols = X.columns
X = pd.DataFrame(scale(X))
X.columns = cols
X

Unnamed: 0,year_of_manufacture
0,-0.363031
1,0.701782
2,1.411657
3,-0.363031
4,0.701782
...,...
2100,-0.717969
2101,-1.427844
2102,0.346844
2103,-0.008094


### *Splitting Dataset into Train & Test*

In [44]:
# Remove commas and ' km' text, then convert to numeric
data['kms_driven'] = data['kms_driven'].astype(str).str.replace(',', '').str.replace(' km', '').astype(float)

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.20,random_state=0)

In [45]:
from sklearn.ensemble import RandomForestRegressor


In [46]:
# Example setup, you need to replace with your actual data preparation steps
X = dataset[['kms_driven', 'fuel_type', 'city', 'year_of_manufacture']]  # Use the correct feature columns
Y = dataset['car_price_in_rupees']  # Use the correct target column


In [47]:
# Check data types of each column
print(X.dtypes)


kms_driven             float64
fuel_type               object
city                    object
year_of_manufacture      int64
dtype: object


In [48]:
import pandas as pd

# One-Hot Encode the categorical columns 'fuel_type' and 'city'
X = pd.get_dummies(X, columns=['fuel_type', 'city'], drop_first=True)


In [49]:
print(X.columns)


Index(['kms_driven', 'year_of_manufacture', 'fuel_type_Diesel',
       'fuel_type_Diesel + 1', 'fuel_type_Electric', 'fuel_type_Hybrid',
       'fuel_type_LPG', 'fuel_type_Petrol', 'fuel_type_Petrol + 1',
       'city_Ambattur', 'city_Bangalore', 'city_Chennai', 'city_Delhi',
       'city_Faridabad', 'city_Gurgaon', 'city_Hyderabad', 'city_Kolkata',
       'city_Mumbai', 'city_Noida', 'city_Pallikarnai', 'city_Poonamallee',
       'city_Pune', 'city_Thane', 'city_Thiruvallur'],
      dtype='object')


In [50]:
# Example if column has leading or trailing spaces
X.columns = X.columns.str.strip()


In [51]:

# Assuming 'data' is your original DataFrame
X = dataset[['kms_driven', 'fuel_type', 'city', 'year_of_manufacture']]
Y = dataset['car_price_in_rupees']


In [52]:
print(X.head())  # Display first few rows
print(X.info())  # Display concise summary


   kms_driven fuel_type       city  year_of_manufacture
0     22402.0    Petrol     Mumbai                 2016
1     10344.0    Petrol    Kolkata                 2019
2     12999.0    Diesel  Bangalore                 2021
3     45000.0    Petrol      Thane                 2016
4     11193.0    Petrol    Kolkata                 2019
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2105 entries, 0 to 2104
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   kms_driven           2105 non-null   float64
 1   fuel_type            2105 non-null   object 
 2   city                 2105 non-null   object 
 3   year_of_manufacture  2105 non-null   int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 65.9+ KB
None


In [53]:
data = pd.read_csv('used_car_dataset.csv')  # Adjust path to your file
print(data.columns)  # Verify all columns are loaded correctly


Index(['car_name', 'car_price_in_rupees', 'kms_driven', 'fuel_type', 'city',
       'year_of_manufacture'],
      dtype='object')


In [54]:
import pandas as pd

# Load your data
data = pd.read_csv('used_car_dataset.csv')  # Replace with your actual file path

# Cleaning 'kms_driven'
dataset['kms_driven'] = dataset['kms_driven'].astype(str).str.replace(',', '').str.replace(' km', '').astype(float)

# Cleaning 'car_price_in_rupees'
# Convert to string, remove rupee symbol and commas
dataset['car_price_in_rupees'] = dataset['car_price_in_rupees'].astype(str).str.replace('₹', '').str.replace(',', '').str.strip()

# Use regex to extract numeric part and detect 'Lakh' or 'Crore'
dataset['car_price_numeric'] = dataset['car_price_in_rupees'].str.extract(r'(\d+\.?\d*)').astype(float)
dataset['price_unit'] = dataset['car_price_in_rupees'].str.extract(r'(\D+)').fillna('')

# Convert 'Lakh' to numeric and 'Crore' to numeric
dataset['car_price_in_rupees'] = dataset.apply(
    lambda x: x['car_price_numeric'] * 100000 if 'Lakh' in x['price_unit'] else x['car_price_numeric'] * 10000000 if 'Crore' in x['price_unit'] else x['car_price_numeric'],
    axis=1
)

# Drop intermediate columns used for conversion
dataset.drop(['car_price_numeric', 'price_unit'], axis=1, inplace=True)

# Handling Missing Values (optional step if needed)
dataset['kms_driven'].fillna(dataset['kms_driven'].mean(), inplace=True)
dataset['car_price_in_rupees'].fillna(dataset['car_price_in_rupees'].mean(), inplace=True)

# Verify the cleaned data
print(dataset.head())  # Print the first few rows to check the cleaned data
print(dataset.dtypes)  # Check the data types to ensure all are correctly converted


   car_price_in_rupees  kms_driven fuel_type       city  year_of_manufacture
0                 4.45     22402.0    Petrol     Mumbai                 2016
1                 2.93     10344.0    Petrol    Kolkata                 2019
2                22.49     12999.0    Diesel  Bangalore                 2021
3                 6.95     45000.0    Petrol      Thane                 2016
4           1200000.00     11193.0    Petrol    Kolkata                 2019
car_price_in_rupees    float64
kms_driven             float64
fuel_type               object
city                    object
year_of_manufacture      int64
dtype: object


### *Feature Encoding*

In [56]:
print(data.columns)
data.columns = data.columns.str.strip()
print(data[['fuel_type', 'city']].head())  # Check if these columns exist and contain data

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

# Load your data
data = pd.read_csv('used_car_dataset.csv')  # Replace with your actual file path

# Check and clean column names
data.columns = data.columns.str.strip()

# Verify columns
print(data.columns)  # Ensure 'fuel_type' and other columns are listed

# Check data presence
print(dataset[['fuel_type', 'city']].head())  # Check if these columns exist

# Define features and target variable
X = dataset.drop('car_price_in_rupees', axis=1)
y = dataset['car_price_in_rupees']

# Identify categorical columns
categorical_cols = ['fuel_type', 'city']
numerical_cols = ['kms_driven', 'year_of_manufacture']  # Add other numerical columns if any

# Define preprocessing for categorical and numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ]
)

# Define the pipeline with preprocessing and model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor())
])

# Split data into train and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Train the model
pipeline.fit(x_train, y_train)


Index(['car_name', 'car_price_in_rupees', 'kms_driven', 'fuel_type', 'city',
       'year_of_manufacture'],
      dtype='object')
  fuel_type       city
0    Petrol     Mumbai
1    Petrol    Kolkata
2    Diesel  Bangalore
3    Petrol      Thane
4    Petrol    Kolkata
Index(['car_name', 'car_price_in_rupees', 'kms_driven', 'fuel_type', 'city',
       'year_of_manufacture'],
      dtype='object')
  fuel_type       city
0    Petrol     Mumbai
1    Petrol    Kolkata
2    Diesel  Bangalore
3    Petrol      Thane
4    Petrol    Kolkata


### *Feature Scaling*

In [57]:
from sklearn.preprocessing import StandardScaler

# Define numerical columns
numerical_cols = ['kms_driven', 'year_of_manufacture']

# Define preprocessing for numerical features with scaling
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ]
)

# Define the pipeline with preprocessing and model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor())
])


Model evaluvation and training

In [58]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Train the model
pipeline.fit(x_train, y_train)

# Predict on test data
y_pred = pipeline.predict(x_test)

# Evaluate performance
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')


Mean Absolute Error: 382674.3632452025
Mean Squared Error: 1012438941969.7948
R-squared: -0.14021899673097638


Hyper Parameter Tuning

In [61]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Define categorical and numerical columns
categorical_cols = ['fuel_type', 'city']
numerical_cols = ['kms_driven', 'year_of_manufacture']

# Define the preprocessor with handle_unknown='ignore'
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)


In [62]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

# Define the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor())
])

# Fit the pipeline
pipeline.fit(x_train, y_train)


In [64]:
param_grid = {
    'model__n_estimators': [50, 100],
    'model__max_depth': [None, 10],
    'model__min_samples_split': [2, 5]
}
# Use a smaller sample of the training data
x_train_subset = x_train.sample(frac=0.2, random_state=42)
y_train_subset = y_train.loc[x_train_subset.index]

# Run Grid Search on the subset
grid_search.fit(x_train_subset, y_train_subset)
from sklearn.model_selection import RandomizedSearchCV

# Define the RandomizedSearchCV
random_search = RandomizedSearchCV(pipeline, param_distributions=param_grid, n_iter=10, cv=5, scoring='neg_mean_absolute_error', random_state=42)

# Fit RandomizedSearchCV
random_search.fit(x_train, y_train)




Model Validation

In [66]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Identify categorical features
categorical_features = ['fuel_type', 'city']

# Create a ColumnTransformer to handle categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)


In [67]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

# Create a pipeline that first applies transformations, then fits the model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor())
])

# Fit the pipeline
pipeline.fit(x_train, y_train)


In [68]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation
cv_scores = cross_val_score(pipeline, x_train, y_train, cv=5, scoring='neg_mean_squared_error')

# Convert negative MSE to positive MSE
cv_scores = -cv_scores

print(f"Cross-Validated MSE Scores: {cv_scores}")
print(f"Mean Cross-Validated MSE: {cv_scores.mean()}")
print(f"Standard Deviation of Cross-Validated MSE: {cv_scores.std()}")


Cross-Validated MSE Scores: [1.44060198e+12 1.58894298e+12 1.63440840e+12 9.07856920e+11
 7.21805702e+11]
Mean Cross-Validated MSE: 1258723195082.796
Standard Deviation of Cross-Validated MSE: 372733659825.8376


Model Evaluation

In [69]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict on the test set
y_pred = pipeline.predict(x_test)

# Calculate MSE and R-squared
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


Mean Squared Error: 1045711530310.221
R-squared: -0.17769092291187927


In [70]:
importances = pipeline.named_steps['model'].feature_importances_

# Retrieve feature names from the ColumnTransformer
feature_names = pipeline.named_steps['preprocessor'].transformers_[0][1].get_feature_names_out(categorical_features)

# Combine feature names and their importance scores
feature_importances = sorted(zip(feature_names, importances), key=lambda x: x[1], reverse=True)

print("Feature Importances:")
for feature, importance in feature_importances:
    print(f"{feature}: {importance}")


Feature Importances:
fuel_type_Diesel: 0.02602043221139971
city_Hyderabad: 0.023775344053999843
city_Ahmedabad: 0.022056047959979366
city_Pune: 0.019635883113372554
city_Thane: 0.019230341306239844
city_Mumbai: 0.018418351142631073
city_Faridabad: 0.015718009640471896
city_Bangalore: 0.012609063771273432
fuel_type_Electric: 0.012525193588582716
fuel_type_Petrol: 0.007535152476256057
city_Kolkata: 0.004351285453557976
city_Ambattur: 0.0037632399271106585
city_Chennai: 0.002260240113276161
fuel_type_CNG: 0.0019598346358051326
city_Pallikarnai: 0.0009907698574426228
city_Delhi: 0.0009524972669930056
city_Poonamallee: 0.0007509335059891791
fuel_type_Diesel + 1: 0.0005193545211850651
fuel_type_Petrol + 1: 0.000507763868593455
city_Thiruvallur: 0.0004385634843616012
fuel_type_LPG: 3.261461463273547e-05
city_Noida: 5.69703043437487e-14
fuel_type_Hybrid: 3.289845608298369e-14
city_Gurgaon: 8.664393252154744e-15


In [72]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [None, 10, 20, 30]
}

# Create GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit GridSearchCV
grid_search.fit(x_train, y_train)

# Print best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", -grid_search.best_score_)


Best Parameters: {'model__max_depth': 10, 'model__n_estimators': 200}
Best Score: 1178145204478.4902


In [73]:
import joblib

# Save the model
joblib.dump(pipeline, 'car_price_predictor_model.pkl')


['car_price_predictor_model.pkl']