# Subproject 1 – Used Car Prices Prediction
Machine Learning – M.Sc. in Electrical and Computer Engineering


Importing libraries

In [None]:
import pandas as pd

In [None]:
import numpy as np
import matplotlib.pyplot as plt


Reading files using pandas library

In [None]:
# Load the training set, the test set and the sample submission from the
# current working directory into pandas DataFrames.
df_train, df_test, df_sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)


# Exploratory Data Analysis (EDA)

The `shape` attribute gives the dimensions (number of rows, number of columns) of the data I am working with.

In [None]:
# Training data: dimensions as (number of rows, number of columns)

df_train.shape

In [None]:
# Test data: dimensions as (number of rows, number of columns)

df_test.shape

The `head()` method shows only the first rows of the data. An integer `n` can be passed as an argument to choose how many rows are shown.
Passing `n` shows the first n rows (index 0 to n-1).

In [None]:
# Training data: preview of the first rows (no n passed, so the default row count is shown)

df_train.head()

In [None]:
# Test data: preview of the first rows (no n passed, so the default row count is shown)

df_test.head()

Using info() method to see the summary details about the dataframes like the index, datatypes, columns, non-null values and the memory usage.

In [None]:
# Training data: index range, column dtypes, non-null counts and memory usage

df_train.info()

In [None]:
# Test data: index range, column dtypes, non-null counts and memory usage

df_test.info()

Checking the data type of each column in the datasets we are dealing with.

In [None]:
# Training data: dtype of every column

df_train.dtypes

In [None]:
# Test data: dtype of every column

df_test.dtypes

Descriptive statistics of the datasets, obtained with the `describe()` method.

In [None]:
# Training data: summary statistics produced by describe() with its default settings
df_train.describe()

In [None]:
# Test data: summary statistics produced by describe() with its default settings
df_test.describe()

In [None]:
# Observation: the training target ("price") is skewed, with comparatively few
# high-priced cars.
#
# The target distribution is inspected to spot skewness and outliers in used-car
# prices, because both affect the choice of model and possible target transformations.

# Histogram of the target variable
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(df_train["price"], bins=50)
ax.set_title("Distribution of Car Prices")
ax.set_xlabel("Price")
ax.set_ylabel("Frequency")
plt.show()

Separating the numerical columns from the categorical columns.

In [None]:
# Training data: split the columns by dtype into numerical and categorical groups.
# Both selections are taken from df_train so that the two lists describe the
# same frame (the categorical list was previously taken from df_test).

numerical_columns = df_train.select_dtypes(include=["int64", "float64"]).columns
categorical_columns = df_train.select_dtypes(include=["object", "bool", "category"]).columns

print("Numerical columns:", numerical_columns)
print("Categorical columns:", categorical_columns)


In [None]:
# Bar chart of the ten most frequent values of the 'brand' column.
#
# Purpose: identify dominant and rare brands, which guides the choice of encoding
# strategy and dimensionality control. The same chart can be drawn for any other
# categorical column by changing the column name.

top_brand_counts = df_train["brand"].value_counts().head(10)

fig, ax = plt.subplots(figsize=(6, 4))
top_brand_counts.plot(kind="bar", ax=ax)
ax.set_title("Top 10 Car Brands")
ax.set_ylabel("Count")
plt.show()




In [None]:
# The relationship between price and fuel type is analysed to verify its influence
# on vehicle value and to justify categorical encoding.

# DataFrame.boxplot(by=...) creates its own figure unless an Axes is supplied, so a
# preceding plt.figure(...) would only leave an empty figure behind and its figsize
# would be ignored. Passing ax makes the plot use the figure sized here.
fig, ax = plt.subplots(figsize=(6, 4))
df_train.boxplot(column="price", by="fuel_type", ax=ax)
ax.set_title("Price vs Fuel Type")
fig.suptitle("")  # removes the automatic "Boxplot grouped by ..." super-title
plt.show()

Missing values

The pandas library provides `isnull()` to flag missing values, `sum()` to count the missing values in each column (if any exist), and `sort_values()` to sort the resulting counts.

In [None]:
# Training data: count the missing values per column (largest count first) and
# collect the names of the columns that contain at least one missing value.

train_missing_values = df_train.isna().sum().sort_values(ascending=False)

has_missing = df_train.isna().any()
train_missing_columns = has_missing[has_missing].index.tolist()
train_missing_columns

In [None]:
# Test data: number of missing values per column, largest count first

df_test.isna().sum().sort_values(ascending=False)

In [None]:
# The mean of the boolean isnull() mask is the fraction of missing values in each
# column; multiplying by 100 turns it into a percentage.
#
# Missing-value analysis quantifies data incompleteness and justifies the
# imputation strategies applied during preprocessing.

train_missing_percentage = df_train.isnull().mean().sort_values(ascending=False) * 100

# A bare expression is only rendered when it is the last statement of a cell, so the
# table is shown explicitly with display().
display(train_missing_percentage[train_missing_percentage > 0])

# Bar chart of the missing-value percentage of every feature column (target excluded)
fig, ax = plt.subplots()
(df_train.drop(columns=["price"]).isnull().mean() * 100).plot(kind="bar", ax=ax)
ax.set_title("Missing values")
ax.set_ylabel("missing")
plt.show()


# Data preprocessing

## Filling missing data

The `info()` output from the EDA above shows that the training data has missing values in the fuel_type, accident and clean_title features.
These are categorical features, so the missing values need to be filled before training.

In [None]:
# Missing values are replaced by the most frequent value (mode) of the corresponding
# training column: pandas' mode() supplies the value and fillna() writes it into the gaps.

# Mode of every training column that has missing values, computed once up front.
train_modes = {column: df_train[column].mode()[0] for column in train_missing_columns}

for column, fill_value in train_modes.items():
    df_train[column] = df_train[column].fillna(fill_value)

# For the test set, only columns that had gaps in the training data and that also
# exist in the test data are filled (this leaves out 'price'). The training modes are
# reused so that train and test are treated consistently; unseen data may or may not
# contain missing values, so the fill is applied regardless.
missing_columns_test = [col for col in train_missing_columns if col in df_test.columns]

for column in missing_columns_test:
    df_test[column] = df_test[column].fillna(train_modes[column])


In [None]:
# Re-check the non-null counts of the training data after the imputation step
df_train.info()

In [None]:
# Re-check the non-null counts of the test data after the imputation step
df_test.info()

## Encoding data

We now have to convert all categorical features into numerical ones, because most machine learning models cannot work with categorical features directly.

To do this, I will use the `get_dummies` function from the pandas library, which implements one-hot encoding.

In [None]:
# X = df_train.drop(['price'], axis=1)
# y = df_train['price']

In [None]:
# One-hot encode the non-numeric columns of both frames (drop_first=True drops the
# first level of each encoded column).
df_train_encoded = pd.get_dummies(df_train, drop_first=True)

df_test_encoded = pd.get_dummies(df_test, drop_first=True)

# Align the test frame with the training *feature* columns: dummy columns missing in
# the test data are added as 0 and columns unknown to the training data are dropped.
# The target 'price' is excluded; otherwise the test frame would receive an
# artificial all-zero 'price' column that no model was trained on.
feature_columns = df_train_encoded.columns.drop('price')
df_test_encoded = df_test_encoded.reindex(columns=feature_columns, fill_value=0)

Splitting the train data into features (X) and target (y) variables

In [None]:
from sklearn.model_selection import train_test_split

# Target vector and feature matrix taken from the encoded training frame
y = df_train_encoded['price']
X = df_train_encoded.drop(columns=['price'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Model Training

Baseline models

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor



from sklearn.metrics import mean_squared_error

# Baseline regressors, keyed by a human-readable model name.
base_models = {
    "Linear regression": LinearRegression(),
    # Seeded like the tuned decision tree so the baseline result is reproducible.
    "Decision tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(
        n_estimators=50,
        max_depth=12,
        min_samples_split=10,
        min_samples_leaf=5,
        max_features=0.4,
        max_samples=0.7,
        n_jobs=-1,
        random_state=42,
    ),
    "k-NN": KNeighborsRegressor(n_neighbors=3),
}

# Collects one result record (dict) per evaluated baseline model.
base_models_information = []

In [None]:
def fit_eval_plot(name, model, ax):
    """Fit a model, evaluate it on the hold-out split and plot actual vs. predicted.

    The function reads the notebook-level variables ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``; they must be defined before it is called.

    Parameters
    ----------
    name : str
        Label used in the plot title and stored in the returned record.
    model : estimator
        Object exposing ``fit``, ``score`` and ``predict``.
    ax : matplotlib.axes.Axes
        Axes that receives the scatter plot and the ideal (y = x) line.

    Returns
    -------
    dict
        ``{"model": name, "rmse": <float>, "score": <float>, "pred": <predictions>}``
        where ``score`` is the value returned by ``model.score`` on the hold-out split.
    """
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)

    base_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, base_pred))

    ax.scatter(y_test, base_pred, c='g', alpha=0.5, label='Predicted')

    # Diagonal spanning the joint range of actual and predicted values: points on this
    # line are perfect predictions.
    min_val = min(y_test.min(), base_pred.min())
    max_val = max(y_test.max(), base_pred.max())
    ax.plot([min_val, max_val], [min_val, max_val], c='r', label='Ideal')

    ax.set_ylabel('Predicted')
    ax.set_xlabel('Actual')
    # Rounded so the title stays readable instead of printing full float precision.
    ax.set_title(f'{name} / RMSE={rmse:.2f} / score={score:.4f}')
    ax.legend()

    # The score is returned as well so that it is not lost after plotting.
    return {"model": name, "rmse": rmse, "score": score, "pred": base_pred}


In [None]:
# Baseline: Linear Regression - fit, evaluate, plot and record the result

fig, ax = plt.subplots(figsize=(8, 5))
linreg_baseline = fit_eval_plot("Linear regression", base_models["Linear regression"], ax)
base_models_information.append(linreg_baseline)
fig.tight_layout()
plt.show()

In [None]:

# Baseline: Decision tree - fit, evaluate, plot and record the result

fig, ax = plt.subplots(figsize=(8, 5))
tree_baseline = fit_eval_plot("Decision tree", base_models["Decision tree"], ax)
base_models_information.append(tree_baseline)
fig.tight_layout()
plt.show()


In [None]:
# Baseline: Random forest - fit, evaluate, plot and record the result

fig, ax = plt.subplots(figsize=(8, 5))
forest_baseline = fit_eval_plot("Random Forest", base_models["Random Forest"], ax)
base_models_information.append(forest_baseline)
fig.tight_layout()
plt.show()


In [None]:
# Baseline: k-nearest neighbours - fit, evaluate, plot and record the result

fig, ax = plt.subplots(figsize=(8, 5))
knn_baseline = fit_eval_plot("k-NN", base_models["k-NN"], ax)
base_models_information.append(knn_baseline)
fig.tight_layout()
plt.show()


Feature scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only; its mean/std are then reused for the
# test set so that both are standardized with identical statistics.
standard_Scaler = StandardScaler().fit(X)

# Keep X's index so that the target assigned below aligns row by row.
df_train_scaled = pd.DataFrame(
    standard_Scaler.transform(X), columns=X.columns, index=X.index
)

# transform(), not fit_transform(): refitting here would overwrite the training
# statistics with test-set statistics. The test frame is first aligned to the exact
# feature columns the scaler was fitted on (this also removes any stray 'price' column).
df_test_scaled = standard_Scaler.transform(
    df_test_encoded.reindex(columns=X.columns, fill_value=0)
)
df_train_scaled.plot(kind='box', figsize=(20,5))

# Re-attach the (unscaled) target so features and target travel in one frame.
df_train_scaled['price'] = y

Models Tuning

In [None]:
from sklearn.model_selection import GridSearchCV, KFold

# Accumulators filled by run_gridsearch: one summary record per tuned model, and the
# refitted best estimator per model name.
results = []
best_models = {}

# scikit-learn maximizes scores, hence the negated RMSE scorer.
scoring = "neg_root_mean_squared_error"

cv = KFold(n_splits=3, shuffle=True, random_state=42)

# Standardized features and the (unscaled) target that was re-attached as 'price'.
X_scaled = df_train_scaled.drop(columns=['price'])
y_scaled = df_train_scaled['price']

# 80/20 split, consistent with the baseline split. test_size (not train_size) is
# used on purpose: train_size=0.2 would tune the models on only 20% of the data.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_scaled, test_size=0.2, random_state=42
)

def run_gridsearch(name, model, grid, scoring):
    """Grid-search a model on the training split and record the winning configuration.

    Uses the notebook-level ``cv``, ``X_train`` and ``y_train``. The best score is
    negated before it is reported, which assumes ``scoring`` is a negated error
    metric (the notebook passes "neg_root_mean_squared_error").

    Side effects: appends a ``{"model", "rmse", "params"}`` record to ``results``,
    stores the best estimator in ``best_models[name]`` and prints a short summary.

    Returns the fitted ``GridSearchCV`` object.
    """
    search = GridSearchCV(estimator=model, param_grid=grid, scoring=scoring, cv=cv)
    search.fit(X_train, y_train)

    best_params = search.best_params_
    best_rmse = -search.best_score_

    best_models[name] = search.best_estimator_
    results.append({"model": name, "rmse": best_rmse, "params": best_params})

    print(f"{name} | RMSE={best_rmse:.4f}")
    print(f"params={best_params} ")
    return search

In [None]:
# Hyper-parameter grid for linear regression: every combination of fitting an
# intercept or not, and constraining the coefficients to be positive or not.
linreg_grid = {
    "fit_intercept": [True, False],
    "positive": [False, True]
}

# Run the search; the outcome is stored under the key "linreg".
run_gridsearch("linreg", LinearRegression(), linreg_grid, scoring)


In [None]:
# Hyper-parameter grid for k-NN: the number of neighbours and the weighting scheme are
# varied; the metric is held fixed at Minkowski with p=2 (i.e. Euclidean distance).
knn_grid = {
    "n_neighbors": [5, 10],
    "weights": ["uniform", "distance"],
    "metric": ["minkowski"],
    "p": [2],
}

# Run the search; the outcome is stored under the key "knn".
run_gridsearch("knn", KNeighborsRegressor(), knn_grid, scoring)


In [None]:
# Hyper-parameter grid for the decision tree: depth limit and the minimum sample
# counts for splits/leaves are varied; max_features and ccp_alpha are held fixed.
dt_grid = {
    "max_depth": [None, 10],
    "min_samples_split": [2, 10],
    "min_samples_leaf": [1, 4],
    "max_features": ["sqrt"],
    "ccp_alpha": [0.0],
}

# Run the search with a seeded tree; the outcome is stored under the key "dt".
run_gridsearch("dt", DecisionTreeRegressor(random_state=42), dt_grid, scoring)


In [None]:
# Hyper-parameter grid for the random forest: depth limit and the minimum sample
# counts for splits/leaves are varied; the number of trees, max_features and
# bootstrap are held fixed.
rf_grid = {
    "n_estimators": [100],
    "max_depth": [None, 20],
    "min_samples_split": [2, 10],
    "min_samples_leaf": [1, 4],
    "max_features": ["sqrt"],
    "bootstrap": [True],
}

# Run the search with a seeded forest; the outcome is stored under the key "rf".
run_gridsearch("rf", RandomForestRegressor(random_state=42), rf_grid, scoring)


In [None]:
# Tabulate the records collected in `results`, sorted by RMSE in ascending order
# (lowest error first).
results_df = pd.DataFrame(results).sort_values("rmse")
results_df


Kaggle Submission File Generation

In [None]:
# Name of the tuned model with the lowest RMSE (results_df is sorted ascending by rmse)
best_model_name = results_df.iloc[0]['model']

# Best estimator found by the grid search for that model
best = best_models[best_model_name]

# The tuned models were fitted on one-hot encoded, standardized features, so the raw
# df_test cannot be passed to predict(). The test features are rebuilt here with
# exactly the training feature columns (same names, same order) and standardized with
# a scaler fitted on the training features X, i.e. with the training statistics.
test_features = df_test_encoded.reindex(columns=X.columns, fill_value=0)
submission_scaler = StandardScaler().fit(X)
test_features_scaled = pd.DataFrame(
    submission_scaler.transform(test_features),
    columns=X.columns,
    index=df_test.index,
)

# Make predictions on the prepared test set
predictions = best.predict(test_features_scaled)

# Create the submission DataFrame
submission_df = pd.DataFrame({'id': df_test['id'], 'price': predictions})

# Ensure prices are non-negative, as car prices cannot be negative
submission_df['price'] = submission_df['price'].clip(lower=0)

# Display the first few rows of the submission file
display(submission_df.head())

# Save to CSV for Kaggle submission
submission_df.to_csv('submission.csv', index=False)