<a href="https://colab.research.google.com/github/Himanshuu002/Vehicle-Price-Prediction/blob/main/vehicle_price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: Some Kaggle data sources are private.
# Run this cell first so that your Kaggle data sources can be imported.
# NOTE(review): kagglehub.login() presumably prompts for Kaggle credentials —
# confirm against the kagglehub documentation.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: Run this cell in order to import your Kaggle data sources,
# then feel free to delete this cell.
# NOTE: This notebook environment differs from Kaggle's Python
# environment, so there may be missing libraries used by your
# notebook.

# Fetch the 'imhimanshu02/vechile-price-data' dataset and keep whatever
# kagglehub.dataset_download returns.
# NOTE(review): presumably the local directory of the downloaded files — confirm.
imhimanshu02_vechile_price_data_path = kagglehub.dataset_download('imhimanshu02/vechile-price-data')

print('Data source import complete.')


## Step 1: Importing Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## Step 2: Load the Dataset and Initial Exploration

In [None]:
# Load the CSV from the directory returned by the kagglehub download when that
# cell has been run; otherwise fall back to the directory Kaggle notebooks
# mount. This keeps the notebook loadable both on Kaggle and elsewhere
# (e.g. Colab or a local machine), where /kaggle/input may not exist.
from pathlib import Path

try:
    data_dir = Path(imhimanshu02_vechile_price_data_path)
except NameError:
    # The download cell was skipped or deleted: use the Kaggle input mount.
    data_dir = Path("/kaggle/input/vechile-price-data")

df = pd.read_csv(data_dir / "dataset.csv")

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# (number of rows, number of columns).
df.shape

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Descriptive statistics of the DataFrame (default settings).
df.describe()

In [None]:
# All column names as a plain Python list.
print(df.columns.tolist())


In [None]:
# dtype of every column.
df.dtypes

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Missing-value handling:
# - Numeric columns (int64 / float64): fill with the column median
# - Categorical columns (object): fill with the column mode

numeric_col = df.select_dtypes(include=['int64', 'float64']).columns
categorical_col = df.select_dtypes(include=['object']).columns

df[numeric_col] = df[numeric_col].fillna(df[numeric_col].median())

for col in categorical_col:
    modes = df[col].mode()
    # mode() is empty when the column holds no non-null value at all; there is
    # nothing to impute with, so leave the column alone instead of raising.
    if modes.empty:
        continue
    df[col] = df[col].fillna(modes.iloc[0])

In [None]:
# Total number of missing cells in the whole DataFrame
# (per-column null counts summed together).
df.isnull().sum().sum()

In [None]:
# Report which columns fell into the numeric and the categorical group.
print(" Numerical Columns:", numeric_col.tolist(), sep="\n")

print("\n Categorical Columns:", categorical_col.tolist(), sep="\n")

In [None]:
# Print the frequency table of every categorical column, each followed by a
# 40-character divider line.
divider = '*' * 40
for column_name in categorical_col:
    print(df[column_name].value_counts(), divider, sep='\n')

In [None]:
# Draw one 20-bin histogram per numeric column, each in its own figure.
for column_name in numeric_col:
    plt.figure(figsize=(8, 6))
    sns.histplot(data=df, x=column_name, bins=20)
    plt.title(f'Distribution of {column_name}')
    plt.show()

In [None]:
# Calculate the pairwise correlation matrix of the numerical features
# (DataFrame.corr() called with its default settings).
corr_matrix = df[numeric_col].corr()

In [None]:
# Annotated heatmap of the pairwise correlations between numerical columns.
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', square=True, cmap='coolwarm')
# The figure shows correlations, not distributions, so the title says so.
plt.title('Correlation Matrix of Numerical Columns')
plt.show()

In [None]:
# Name of the column whose correlations are inspected below (the price column).
target_col = 'price'

In [None]:
# Correlation of every numerical column with the target, strongest positive first.
correlation_with_target = corr_matrix.loc[:, target_col].sort_values(ascending=False)
print("\n Correlation with target variable:", correlation_with_target, sep="\n")

In [None]:
# A feature counts as "low correlation" when the MAGNITUDE of its correlation
# with the target is below 0.05. The absolute value has to be taken before the
# comparison; otherwise strongly negative correlations (e.g. -0.6) would also
# be selected and dropped even though they are informative.
low_corr_mask = correlation_with_target.abs() < 0.05
low_corr_features = correlation_with_target[low_corr_mask].index.tolist()
print('low correlation features:',low_corr_features)

In [None]:
# Dropping the low-correlation features

df = df.drop(columns=low_corr_features)


In [None]:
# Preview the first rows again after the column drop.
df.head()

In [None]:
# Separate the predictors (X) from the target (Y). The column name comes from
# target_col so the target is defined in exactly one place.
X = df.drop(columns=target_col)
Y = df[target_col]

In [None]:
# Applying one-hot encoding via pandas.get_dummies.
# drop_first=True removes the first level of each encoded column, leaving
# k-1 dummy columns for a k-level categorical.
X_encoded = pd.get_dummies(X,drop_first=True)

In [None]:
# (rows, columns) of the encoded feature table.
X_encoded.shape

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the encoded features with a default StandardScaler:
# learn the statistics first, then apply them to the same table.
scaler = StandardScaler()
X_scaled = scaler.fit(X_encoded).transform(X_encoded)

In [None]:
from sklearn.model_selection import train_test_split

# Split the UNSCALED encoded features first (80/20, fixed seed), then fit the
# scaler on the training rows only and reuse those statistics for the test
# rows. Fitting the scaler on the full table before splitting would let
# test-set statistics leak into training.
x_train, x_test, y_train, y_test = train_test_split(
    X_encoded, Y, test_size=0.2, random_state=537
)
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# (rows, columns) of the training feature matrix.
x_train.shape

In [None]:
# Model 1: linear regression — fit on the training split, predict the test split.
lr = LinearRegression().fit(x_train, y_train)
lr_pred = lr.predict(x_test)



In [None]:
# Model 2: random forest.
# The forest is seeded so its metrics and feature importances are reproducible
# from run to run; 537 matches the seed used for the train/test split.
rf = RandomForestRegressor(random_state=537)
rf.fit(x_train, y_train)
rf_pred = rf.predict(x_test)

In [None]:
# Evaluation function
def evaluate_model(name, y_true, y_pred):
    """Print MAE, RMSE and R² for one model's predictions.

    Args:
        name: Label shown in the report header.
        y_true: Ground-truth target values.
        y_pred: Predicted target values to compare against ``y_true``.
    """
    print(f"\n {name} Evaluation:")

    mae = mean_absolute_error(y_true, y_pred)
    print(f"MAE  : {mae:.2f}")

    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"RMSE : {rmse:.2f}")

    r2 = r2_score(y_true, y_pred)
    print(f"R²   : {r2:.4f}")

In [None]:
# Report the same metrics for each model, in the order they were trained.
model_predictions = (
    ("Linear Regression", lr_pred),
    ("Random Forest Regressor", rf_pred),
)
for model_name, predictions in model_predictions:
    evaluate_model(model_name, y_test, predictions)

# Model Visualization

In [None]:
# Linear Regression - Actual vs Predicted
price_bounds = [y_test.min(), y_test.max()]

plt.figure(figsize=(8, 5))
plt.scatter(y_test, lr_pred, alpha=0.5)
# Dashed red diagonal marks where predicted equals actual.
plt.plot(price_bounds, price_bounds, 'r--')
plt.title("Linear Regression: Actual vs Predicted")
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.grid(True)
plt.show()

In [None]:
# Random Forest Regressor - Actual vs Predicted
price_bounds = [y_test.min(), y_test.max()]

plt.figure(figsize=(8, 5))
plt.scatter(y_test, rf_pred, alpha=0.5, color='orange')
# Dashed red diagonal marks where predicted equals actual.
plt.plot(price_bounds, price_bounds, 'r--')
plt.title("Random Forest: Actual vs Predicted")
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.grid(True)
plt.show()


In [None]:
# Feature names after one-hot encoding (the column labels of X_encoded),
# kept so importance scores can be labelled later.
feature_names = X_encoded.columns

In [None]:
# Pair each encoded feature name with its random-forest importance score and
# order the table from most to least important.
importances = rf.feature_importances_
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

In [None]:
# Plot the top-N most important features. A single constant drives both the
# slice and the title so the two cannot drift apart.
top_n = 10

plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df.head(top_n))
plt.title(f"Top {top_n} Important Features (Random Forest)")
plt.xlabel("Importance Score")
plt.ylabel("Feature Name")
plt.tight_layout()
plt.show()