In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

df = pd.read_csv('vehicles_copy.csv')
print(df.describe())
print(df.info())

          Unnamed: 0           year  seating_capacity  kilometers_driven  \
count  499746.000000  499746.000000     499746.000000      499746.000000   
mean   249872.500000    2011.995770          5.999532         120.362064   
std    144264.388151       7.204957          2.583512         108.449333   
min         0.000000    2000.000000          2.000000           0.000000   
25%    124936.250000    2006.000000          4.000000          30.730000   
50%    249872.500000    2012.000000          6.000000          89.290000   
75%    374808.750000    2018.000000          8.000000         185.150000   
max    499745.000000    2024.000000         10.000000         479.980000   

       selling_price      owner_age  proposed_purchase_price  
count  499746.000000  499746.000000            499746.000000  
mean    17658.561496      59.013005             19425.720447  
std      7580.377670      23.933346              8358.384150  
min      4500.000000      18.000000              4725.020000  


In [22]:
# Handle missing values

df.fillna(df.median(numeric_only=True), inplace=True)  
df.fillna(df.mode().iloc[0], inplace=True)  


In [23]:
# Remove duplicates

df.drop_duplicates(inplace=True)

In [24]:
# Remove outliers

def remove_outliers(data, col):
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return data[(data[col] >= lower_bound) & (data[col] <= upper_bound)]

for column in df.select_dtypes(include=['number']).columns:
    df = remove_outliers(df, column)


In [25]:
# Encode the data

label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


In [26]:
# Split the data

target_column = 'selling_price'
X = df.drop(columns=[target_column])
y = df[target_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print("Data preprocessing done")

(395216, 18) (98805, 18) (395216,) (98805,)
Data preprocessing done


In [27]:
lr = LinearRegression(fit_intercept=True)
lr.fit(X_train, y_train)

y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

train_rmse = root_mean_squared_error(y_train, y_train_pred)
test_rmse_lr = root_mean_squared_error(y_test, y_test_pred)

print("\n ====================== For Linear regression ======================")
print(f"Train RMSE: {train_rmse}")
print(f"Test RMSE: {test_rmse_lr}")


Train RMSE: 428.3542020325439
Test RMSE: 425.11402772185386
