In [None]:
import io

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.svm import SVR


In [None]:
# Load the dataset
data = pd.read_csv("Australian Vehicle Prices.csv")

# First few rows, for a quick look at the raw values
data_head = data.head()

# DataFrame.info() writes its report to a stream and returns None, so capture the
# text through `buf=` instead of storing the (always-None) return value.
info_buffer = io.StringIO()
data.info(buf=info_buffer)
data_info = info_buffer.getvalue()
print(data_info)

# Statistical summary of the numerical columns
data_description = data.describe()

data_head, data_description


# Data Cleaning

In [None]:
# 1. Convert 'Price', 'Kilometres', 'Doors', and 'Seats' to numeric; anything that
#    cannot be parsed becomes NaN. Columns that are already numeric are left alone,
#    so the cell can be re-run without failing on the `.str` accessor.
for column in ['Price', 'Kilometres']:
    if not pd.api.types.is_numeric_dtype(data[column]):
        # Keep only digits and the decimal point before parsing
        digits_only = data[column].str.replace('[^0-9.]', '', regex=True)
        data[column] = pd.to_numeric(digits_only, errors='coerce')

for column in ['Doors', 'Seats']:
    if not pd.api.types.is_numeric_dtype(data[column]):
        # Keep the first run of digits; the raw string avoids the invalid '\d' escape
        first_number = data[column].str.extract(r'(\d+)')[0]
        data[column] = pd.to_numeric(first_number, errors='coerce')

# 2. Fill missing values for numeric columns with each column's median.
#    Assigning the result back replaces `data[col].fillna(..., inplace=True)`, which
#    operates on a temporary object and does not update `data` under Copy-on-Write.
# NOTE(review): 'Price' is used as the regression target further down; imputing it
# with the median invents labels. Consider dropping rows with a missing price instead.
numeric_columns = ['Price', 'Kilometres', 'Doors', 'Seats']
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].median())

# 3. For categorical columns with missing values, fill with 'Unknown'
categorical_columns_with_na = ['Car/Suv', 'Location', 'BodyType']
data[categorical_columns_with_na] = data[categorical_columns_with_na].fillna('Unknown')

# Summary of every column after cleaning, to verify the changes
cleaned_data_description = data.describe(include='all')
cleaned_data_description

# Exploratory Data Analysis (EDA):

In [None]:
sns.set_style("whitegrid")


def _show_plot(plotter, title, xlabel, ylabel, rotate_xticks=False, **plot_kwargs):
    """Draw one seaborn plot on a fresh 10x6 figure, label it, and show it.

    Parameters
    ----------
    plotter : callable
        Seaborn plotting function (e.g. ``sns.histplot``); called as
        ``plotter(**plot_kwargs)``.
    title, xlabel, ylabel : str
        Figure title and axis labels.
    rotate_xticks : bool, default False
        When True, rotate the x tick labels by 45 degrees.
    **plot_kwargs
        Keyword arguments forwarded unchanged to ``plotter``.
    """
    plt.figure(figsize=(10, 6))
    plotter(**plot_kwargs)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    if rotate_xticks:
        plt.xticks(rotation=45)
    plt.show()


# Distribution of vehicle prices: shows the range most prices fall into and any outliers.
_show_plot(sns.histplot, 'Distribution of Vehicle Prices', 'Price', 'Frequency',
           data=data['Price'], bins=30, kde=True)

# Price vs. year: checks whether newer vehicles tend to be priced higher.
_show_plot(sns.scatterplot, 'Vehicle Price vs. Year', 'Year', 'Price',
           x='Year', y='Price', data=data)

# Distribution of kilometres: how much use the vehicles in the dataset have seen.
_show_plot(sns.histplot, 'Distribution of Kilometres', 'Kilometres', 'Frequency',
           data=data['Kilometres'], bins=30, kde=True)

# Price vs. kilometres: checks how price relates to distance driven.
_show_plot(sns.scatterplot, 'Price vs. Kilometres', 'Kilometres', 'Price',
           x='Kilometres', y='Price', data=data)

# Distribution of year: the age range of the vehicles in the dataset.
_show_plot(sns.histplot, 'Distribution of Year', 'Year', 'Frequency',
           data=data['Year'], bins=30, kde=True)

# Price vs. number of doors: checks whether door count is associated with price.
_show_plot(sns.boxplot, 'Price vs. Number of Doors', 'Doors', 'Price',
           x='Doors', y='Price', data=data)

# Distribution of fuel types: how often each fuel type appears.
_show_plot(sns.countplot, 'Distribution of Fuel Types', 'Fuel Type', 'Count',
           rotate_xticks=True, x='FuelType', data=data)

# Price vs. fuel type: compares the price spread across fuel types.
_show_plot(sns.boxplot, 'Price vs. Fuel Type', 'Fuel Type', 'Price',
           rotate_xticks=True, x='FuelType', y='Price', data=data)


# Feature Engineering

In [None]:
# Vehicle age is measured against the most recent model year present in the data
current_year = data['Year'].max()
data['VehicleAge'] = current_year - data['Year']

# Categorical variables to expand into one-hot indicator columns
categorical_features = ['FuelType', 'Transmission', 'DriveType']
one_hot_encoded_data = pd.get_dummies(data[categorical_features])

# Swap the original categorical columns for their one-hot encoded counterparts
data_prepared = pd.concat(
    [data.drop(columns=categorical_features), one_hot_encoded_data],
    axis=1,
)

# Preview the prepared dataset to verify the new columns
data_prepared.head()


# Data Preprocessing

In [None]:
# Columns left out of the feature matrix: the target plus the columns not used as features
excluded_columns = [
    'Price', 'Brand', 'Model', 'Car/Suv', 'Title', 'UsedOrNew',
    'Engine', 'FuelConsumption', 'ColourExtInt', 'Location',
    'CylindersinEngine', 'BodyType',
]

# Target and feature matrix
y = data_prepared['Price']
X = data_prepared.drop(columns=excluded_columns)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Shapes of the four pieces, to confirm the split sizes
tuple(part.shape for part in (X_train, X_test, y_train, y_test))


# Model Selection and Training

In [None]:
# Uses X and y defined in the preprocessing cell.

# Split BEFORE imputing so the held-out rows never influence the imputation medians
# (fitting the imputer on all of X leaks test-set information into training).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the per-column medians from the training rows only, then apply them to both splits
imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Full feature matrix imputed with the training medians; kept so the name X_imputed stays available
X_imputed = imputer.transform(X)

# Candidate regressors, seeded where the estimator accepts a random_state
# NOTE(review): SVR is sensitive to feature scale and the features are not scaled here — consider a scaler.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=42),
    "Support Vector Regressor": SVR()
}

# Fit each model on the training split and record its RMSE on the test split
results_imputed = {}

for name, model in models.items():
    model.fit(X_train_imputed, y_train)
    predictions = model.predict(X_test_imputed)
    # Take the square root explicitly: the `squared=False` argument was deprecated in
    # scikit-learn 1.4 and removed in 1.6, where passing it raises a TypeError.
    rmse = float(np.sqrt(mean_squared_error(y_test, predictions)))
    results_imputed[name] = rmse

results_imputed

