# Subproject 1 – Used Car Prices Prediction
Machine Learning – M.Sc. in Electrical and Computer Engineering


Importing libraries

In [1]:
import pandas as pd

In [2]:
import numpy as np
import matplotlib.pyplot as plt


Reading datasets files and making dataframes using pandas library

In [3]:
# Load the three competition CSV files from the working directory, one
# DataFrame per file stem (train / test / sample_submission).
df_train, df_test, df_sample_submission = (
    pd.read_csv(f"{stem}.csv") for stem in ("train", "test", "sample_submission")
)


# Exploratory Data Analysis (EDA)

After reading the datasets into DataFrames, I analyse the data.

Purpose: 

This lets me understand the kind of data I am working with.
It shows which features are present in the data and whether they are categorical or numerical.
It also reveals whether there are missing values, and in which columns, which helps me choose a suitable method for filling them.

To do this I use the pandas attributes `shape` and `dtypes` and the methods `head()`, `info()` and `describe()`.

During this EDA, I will also explore some other related and important aspects, which will let me make better decisions when training on the data.

The `shape` attribute gives the dimensions (rows x columns) of the data I am working with.

In [None]:
# Dimensions of the training data as (rows, columns).

df_train.shape

In [None]:
# Dimensions of the test data as (rows, columns).

df_test.shape

The head() method shows the first rows of the data (five by default); "n" can be passed as an argument if a specific number of rows should be shown.

Passing n as an argument shows the first n rows (positions 0 to n-1).

In [None]:
# First rows of the training data (default number of rows).

df_train.head()

In [None]:
# First rows of the test data (default number of rows).

df_test.head()

The info() method prints a summary of a DataFrame: the index, the columns with their datatypes and non-null counts, and the memory usage.

This method also shows the number of non-null values for each column present in the datasets.

In [None]:
# Summary of the training data: columns, dtypes, non-null counts and memory usage.

df_train.info()

In [None]:
# Summary of the test data: columns, dtypes, non-null counts and memory usage.

df_test.info()

using describe() will give the descriptive statistics of the dataset such as the mean, count, std and others

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) of the training data.
df_train.describe()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) of the test data.
df_test.describe()

Distribution Visualization

In [None]:
# Box plots of the numeric features only (non-numeric columns cannot be drawn).
# The target ('price') and the row identifier ('id') are left out, and every
# feature gets its own y-axis: the remaining numeric columns live on very
# different scales and would otherwise flatten each other on a shared axis.
(
    df_train.drop(columns=["id", "price"])
    .select_dtypes(include="number")
    .plot(kind="box", subplots=True, sharey=False, figsize=(20, 10))
)
plt.show()



Target visualization

In [None]:
# Histogram of the target ('price') in the training data.
#
# The plot shows a skewed distribution with comparatively few high-priced cars.
# Looking at the target this way helps to spot skewness and outliers, which
# influences the choice of model and of a possible target transformation.
#
# Because of the skew, a log transform of the target is a natural option: it
# compresses the high-price tail and makes errors behave proportionally.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(df_train["price"], bins=50)
ax.set_title("Distribution of Car Prices")
ax.set_xlabel("Price")
ax.set_ylabel("Frequency")
plt.show()

separating numerical column and categorical column:

This part lists the numerical and the categorical feature columns we have. Doing this helped me understand whether data encoding will be needed.

In [4]:
# for train data
# Both lists are derived from df_train so that they describe the same frame;
# the categorical list was previously taken from df_test by mistake.

numerical_columns = df_train.select_dtypes(include=["int64", "float64"]).columns
categorical_columns = df_train.select_dtypes(include=["object", "bool", "category"]).columns

print("Numerical columns:", numerical_columns)
print("Categorical columns:", categorical_columns)


Numerical columns: Index(['id', 'model_year', 'milage', 'price'], dtype='object')
Categorical columns: Index(['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col',
       'int_col', 'accident', 'clean_title'],
      dtype='object')


In [None]:
# Bar chart of the ten most frequent brands in the training data.
#
# It shows that a small number of brands dominate the dataset. Knowing which
# brands are dominant and which are rare guides the choice of encoding strategy
# and helps to keep the dimensionality under control. The same plot can be
# produced for any other categorical column by changing the column name.
fig, ax = plt.subplots(figsize=(6, 4))
df_train["brand"].value_counts().head(10).plot(kind="bar", ax=ax)
ax.set_title("Top 10 Car Brands")
ax.set_ylabel("Count")
plt.show()




In [None]:
# Price distribution per fuel type.
# The relationship between price and fuel type is analysed to verify that the
# fuel type influences the vehicle value, which justifies encoding it: the model
# can only tell fuel types apart once they are encoded numerically.
#
# The axes are created first and handed to boxplot() so that the requested
# figure size is actually applied; a bare plt.figure() call would be left
# behind as an empty figure because the grouped boxplot draws on its own figure.
fig, ax = plt.subplots(figsize=(6, 4))
df_train.boxplot(column="price", by="fuel_type", ax=ax)
ax.set_title("Price vs Fuel Type")
fig.suptitle("")  # removes automatic subtitle
plt.show()

Missing values

The pandas library provides isnull() to flag missing values, sum() to count the missing values per column (if any exist), and sort_values() to sort the result in ascending or descending order.

In [5]:
# Missing-value analysis for the training data. It quantifies how incomplete
# the data is and justifies the imputation applied during preprocessing.
#
# The boolean null mask is built once and reused:
#   * sum()  -> number of missing values per column
#   * any()  -> whether a column has at least one missing value
#   * mean() -> fraction of missing values per column (x 100 = percentage)
null_mask = df_train.isnull()

train_missing_values = null_mask.sum().sort_values(ascending=False)
train_missing_columns = [column for column in df_train.columns if null_mask[column].any()]

train_missing_percentage = null_mask.mean().sort_values(ascending=False) * 100

train_missing_columns

['fuel_type', 'accident', 'clean_title']

In [6]:
# for test data: number of missing values per column, largest first.
# display() is required here because only the last expression of a cell is
# rendered automatically; without it this result was computed and thrown away.
display(df_test.isnull().sum().sort_values(ascending=False))

# for train data: percentage of missing values, restricted to affected columns.
train_missing_percentage[train_missing_percentage > 0]

clean_title    11.360876
fuel_type       2.696080
accident        1.300568
dtype: float64

In [None]:
# Bar chart of the percentage of missing values per feature column of the
# training data (the target 'price' is excluded).
ax = (df_train.drop(columns="price").isnull().mean() * 100).plot(kind="bar")
ax.set_title("Missing values")
ax.set_ylabel("missing")
plt.show()


# Data preprocessing

## Filling missing data

From using the info() method during the EDA above on the train data, it was seen that there are some missing values, namely in the fuel_type, accident and clean_title features.

These features with missing values are categorical; the missing values therefore need to be filled before training, because the models cannot be trained on data containing NaNs.

In [7]:
# Mode imputation. The most frequent value of every training column that has
# missing values is looked up once and then used to fill both data sets.
train_modes = {column: df_train[column].mode()[0] for column in train_missing_columns}

# Training set: fill every gap with the column's own mode.
for column, mode_value in train_modes.items():
    df_train[column] = df_train[column].fillna(mode_value)

# Test set: only columns that had missing values in the training data and that
# also exist in the test data are handled (this leaves out 'price', which the
# test data does not carry). The fill value is always the mode of the
# *training* column, so both sets are treated consistently.
missing_columns_test = [col for col in train_missing_columns if col in df_test.columns]

for column in missing_columns_test:
    df_test[column] = df_test[column].fillna(train_modes[column])


In [8]:
# Confirm that the missing values were filled: every column of the training
# data should now report the full number of non-null entries.

df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188533 entries, 0 to 188532
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            188533 non-null  int64 
 1   brand         188533 non-null  object
 2   model         188533 non-null  object
 3   model_year    188533 non-null  int64 
 4   milage        188533 non-null  int64 
 5   fuel_type     188533 non-null  object
 6   engine        188533 non-null  object
 7   transmission  188533 non-null  object
 8   ext_col       188533 non-null  object
 9   int_col       188533 non-null  object
 10  accident      188533 non-null  object
 11  clean_title   188533 non-null  object
 12  price         188533 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 18.7+ MB


In [None]:
# Same check for the test data after imputation.
df_test.info()

## Encoding Categorical Features

It was seen previously that there are some categorical features present in our datasets; these categorical features need to be encoded as numerical features.

To do this, I used the get_dummies() function from the pandas library, which performs one-hot encoding of categorical data.

In [9]:
# One-hot encode the training data; drop_first removes one reference level per
# categorical column.
df_train_encoded = pd.get_dummies(df_train, drop_first=True)

# Encode the test data WITHOUT drop_first and then keep exactly the training
# feature columns. Applying drop_first to the test data on its own could drop a
# different reference level than in the training data, and the reindex would
# then silently encode that level as all zeros. Reindexing onto the training
# columns discards the training reference levels and any test-only levels, and
# adds training-only levels as 0.
# The target 'price' is excluded: it is not a feature and must not show up as a
# constant column in the test matrix.
train_feature_columns = df_train_encoded.columns.drop("price", errors="ignore")
df_test_encoded = pd.get_dummies(df_test).reindex(columns=train_feature_columns, fill_value=0)

Splitting the train data into features (X) and target (y) variables

In [10]:
from sklearn.model_selection import train_test_split

# The target is the 'price' column; every other encoded column is a feature.
y = df_train_encoded["price"]
X = df_train_encoded.drop(columns="price")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

### Cell 52
Scales features, log-transforms the target, builds polynomial features, and prepares test features.

In [None]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# Exploratory preview of the scaled / log / polynomial representation.
# The results are stored under their own names on purpose: the baseline models
# are meant to be trained on the raw split (X_train, X_test, y_train, y_test),
# so this cell must not overwrite those variables.

# fit the scaler on the training features
standard_Scaler = StandardScaler().fit(X)

df_train_scaled = pd.DataFrame(standard_Scaler.transform(X), columns = X.columns)
# Reuse the scaler fitted on the training features (transform, not
# fit_transform, so the test data is not used to re-fit it) and hand it exactly
# the columns it was fitted on.
df_test_scaled = standard_Scaler.transform(
    df_test_encoded.reindex(columns=X.columns, fill_value=0)
)
df_train_scaled.plot(kind='box', figsize=(20,5))

# log-transform the target; delta keeps log() defined for a price of 0
delta = 1
y_logged =  np.log(y+delta)

# NOTE(review): a degree-2 expansion grows quadratically with the number of
# one-hot columns - confirm this is feasible for the encoded feature count.
polynimial_features = PolynomialFeatures(degree=2, include_bias=False).fit(df_train_scaled)

df_train_scaled_polynomial = pd.DataFrame(polynimial_features.transform(df_train_scaled), columns = polynimial_features.get_feature_names_out(df_train_scaled.columns))

df_train_scaled_polynomial.plot(kind='box', figsize=(20,5))

df_train_scaled_polynomial['price'] = y

# splitting
X_scaled = df_train_scaled_polynomial.drop(['price'], axis=1)
y_scaled = df_train_scaled_polynomial['price']

# Hold out 20% for evaluation (test_size, not train_size: the model should be
# trained on the larger part), kept separate from the raw baseline split.
X_poly_train, X_poly_valid, y_log_train, y_log_valid = train_test_split(
    X_scaled, y_logged, test_size=0.2, random_state=42
)

# Model Training

After analysing the data, filling the columns with missing values and encoding the categorical features as numerical features, I moved on to training the models.

Baseline models without tuning or applying polynomial features or log transform.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor



from sklearn.metrics import mean_squared_error

# Untuned reference models. The tree-based estimators get a fixed random_state
# (the same seed the tuned tree and forest models use) so that the baseline
# scores are reproducible from run to run.
base_models = {
    "Linear regression": LinearRegression(),
    "Decision tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "k-NN": KNeighborsRegressor()
}

# One result record per baseline model, appended by the cells that follow.
base_models_information = []

Defines a helper to train a model, compute RMSE/R^2, and plot predicted vs actual.

In [None]:
def fit_eval_plot(name, model, ax):
    """Fit ``model`` on the training split, evaluate it on the hold-out split and plot it.

    The notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test`` are
    read directly.

    Parameters
    ----------
    name : str
        Label used in the plot title and in the returned record.
    model : estimator
        Object exposing ``fit`` and ``predict``; it is fitted in place.
    ax : matplotlib.axes.Axes
        Axes that receive the predicted-vs-actual scatter plot.

    Returns
    -------
    dict
        ``{"model": name, "rmse": ..., "score": ..., "pred": ...}`` where
        ``rmse`` and ``score`` (R^2) are computed on the hold-out split and
        ``pred`` holds the hold-out predictions.
    """
    from sklearn.metrics import r2_score

    model.fit(X_train, y_train)

    # Predict once and derive both metrics from the same predictions.
    # r2_score is what a scikit-learn regressor's score() computes, so this
    # avoids a second, potentially slow, prediction pass over the hold-out set.
    base_pred = model.predict(X_test)
    score = r2_score(y_test, base_pred)
    rmse = np.sqrt(mean_squared_error(y_test, base_pred))

    ax.scatter(y_test, base_pred, c='g', alpha=0.5, label='Predicted')
    # The diagonal spans the joint range of actual and predicted values.
    min_val = min(y_test.min(), base_pred.min())
    max_val = max(y_test.max(), base_pred.max())
    ax.plot([min_val, max_val], [min_val, max_val], c='r', label='Ideal')

    ax.set_ylabel('Predicted')
    ax.set_xlabel('Actual')
    ax.set_title(f'{name} / RMSE={rmse} / score={score}')
    ax.legend()

    return {"model": name, "rmse": rmse, "score": score, "pred": base_pred}


Runs the baseline Linear Regression model and plots predicted vs actual.

In [None]:
# Baseline run: linear regression, with its predicted-vs-actual plot.
fig, ax = plt.subplots(figsize=(8, 5))
base_models_information.append(
    fit_eval_plot("Linear regression", base_models["Linear regression"], ax)
)
fig.tight_layout()
plt.show()

Runs the baseline Decision Tree model and plots predicted vs actual.

In [None]:

# Baseline run: decision tree, with its predicted-vs-actual plot.
fig, ax = plt.subplots(figsize=(8, 5))
base_models_information.append(
    fit_eval_plot("Decision tree", base_models["Decision tree"], ax)
)
fig.tight_layout()
plt.show()


Runs the baseline Random Forest model and plots predicted vs actual.

In [None]:
# Baseline run: random forest, with its predicted-vs-actual plot.
fig, ax = plt.subplots(figsize=(8, 5))
base_models_information.append(
    fit_eval_plot("Random Forest", base_models["Random Forest"], ax)
)
fig.tight_layout()
plt.show()


Runs the baseline k-NN model and plots predicted vs actual.

In [None]:
# Baseline run: k-nearest neighbours, with its predicted-vs-actual plot.
fig, ax = plt.subplots(figsize=(8, 5))
base_models_information.append(
    fit_eval_plot("k-NN", base_models["k-NN"], ax)
)
fig.tight_layout()
plt.show()


The predicted-vs-actual plots above suggest that the baseline models underfit the data.

Features scaling

Scales features, log-transforms the target, builds polynomial features, and prepares test features.

In [None]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# Start from a fresh raw split (same parameters as the original split of X, y).
# This cell reassigns X_train / X_test / y_train / y_test, so without this line
# re-running it - or running it after another cell has already replaced those
# variables - would scale, log-transform and expand the data a second time.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# scale using the training split to avoid leakage
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

# The scaler only accepts the columns it was fitted on, in the same order, so
# the encoded competition test set is aligned to the training features first
# (extra columns are dropped, missing ones are added as 0).
test_features = df_test_encoded.reindex(columns=X_train.columns, fill_value=0)
df_test_scaled = pd.DataFrame(
    scaler.transform(test_features), columns=test_features.columns, index=test_features.index
)

X_train_scaled.plot(kind="box", figsize=(20, 5))

# log-transform target for modeling
y_train = np.log1p(y_train)
y_test = np.log1p(y_test)

# polynomial features based on the scaled training data
# NOTE(review): a degree-2 expansion grows quadratically with the number of
# one-hot columns - confirm this is feasible for the encoded feature count.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train = pd.DataFrame(
    poly.fit_transform(X_train_scaled),
    columns=poly.get_feature_names_out(X_train_scaled.columns),
    index=X_train_scaled.index,
)
X_test = pd.DataFrame(
    poly.transform(X_test_scaled),
    columns=poly.get_feature_names_out(X_train_scaled.columns),
    index=X_test_scaled.index,
)

X_train.plot(kind="box", figsize=(20, 5))

# Same expansion for the competition test set, used for the submission.
df_test_poly = pd.DataFrame(
    poly.transform(df_test_scaled),
    columns=poly.get_feature_names_out(df_test_scaled.columns),
    index=df_test_scaled.index,
)


Models Tuning

Defines grid search helper with CV RMSE and predicted vs actual plots.

In [None]:
from sklearn.model_selection import GridSearchCV, KFold

# Shared grid-search configuration.
# scoring: scikit-learn reports the RMSE negated so that larger is better.
scoring = "neg_root_mean_squared_error"

# 3-fold cross-validation, shuffled with a fixed seed for reproducibility.
cv = KFold(shuffle=True, random_state=42, n_splits=3)

# Filled by run_gridsearch(): one summary record per model, and the best
# estimator found for each model name.
results = list()
best_models = dict()

def run_gridsearch(name, model, grid, scoring):
    """Grid-search ``model`` over ``grid``, record the winner and plot its hold-out fit.

    The search is fitted on the notebook-level ``X_train`` / ``y_train`` using
    the shared ``cv`` splitter. A summary record is appended to ``results`` and
    the best estimator is stored in ``best_models[name]``. The predicted-vs-actual
    plot uses the notebook-level ``X_test`` / ``y_test``.

    Parameters
    ----------
    name : str
        Key under which the outcome is recorded and label used in the output.
    model : estimator
        Estimator handed to ``GridSearchCV``.
    grid : dict
        Parameter grid handed to ``GridSearchCV``.
    scoring : str
        Scoring name handed to ``GridSearchCV``; the best score is negated
        before it is reported, which yields an RMSE for the negated-RMSE
        scoring used in this notebook.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    search = GridSearchCV(estimator=model, param_grid=grid, scoring=scoring, cv=cv)
    search.fit(X_train, y_train)

    winner = search.best_estimator_
    chosen = search.best_params_
    cv_rmse = -search.best_score_

    # Bookkeeping first, then the textual report.
    results.append({"model": name, "rmse": cv_rmse, "params": chosen})
    best_models[name] = winner
    print(f"{name} | RMSE={cv_rmse:.4f}")
    print(f"params={chosen} ")

    # Predicted vs actual on the hold-out split for the best estimator; the
    # diagonal spans the joint range of actual and predicted values.
    holdout_pred = winner.predict(X_test)
    lo = min(y_test.min(), holdout_pred.min())
    hi = max(y_test.max(), holdout_pred.max())

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.scatter(y_test, holdout_pred, c='g', alpha=0.5, label='Predicted')
    ax.plot([lo, hi], [lo, hi], c='r', label='Ideal')
    ax.set(
        xlabel='Actual',
        ylabel='Predicted',
        title=f"{name} (best) / CV_RMSE={cv_rmse:.4f}",
    )
    ax.legend()
    fig.tight_layout()
    plt.show()

    return search


Runs grid search for Linear Regression.

In [None]:
# Linear regression: with/without intercept, with/without the constraint that
# all coefficients are positive.
linreg_grid = dict(
    fit_intercept=[True, False],
    positive=[False, True],
)

run_gridsearch("linreg", LinearRegression(), linreg_grid, scoring)


Runs grid search for k-NN.

In [None]:
# k-NN: two neighbourhood sizes and both weighting schemes; the distance is
# fixed to Minkowski with p=2.
knn_grid = dict(
    n_neighbors=[5, 10],
    weights=["uniform", "distance"],
    metric=["minkowski"],
    p=[2],
)

run_gridsearch("knn", KNeighborsRegressor(), knn_grid, scoring)


Runs grid search for Decision Tree.

In [None]:
# Decision tree: depth and minimum split/leaf sizes are searched; the feature
# subsampling rule and the pruning strength are held fixed.
dt_grid = dict(
    max_depth=[None, 10],
    min_samples_split=[2, 10],
    min_samples_leaf=[1, 4],
    max_features=["sqrt"],
    ccp_alpha=[0.0],
)

run_gridsearch("dt", DecisionTreeRegressor(random_state=42), dt_grid, scoring)


Runs grid search for Random Forest.

In [None]:
# Random forest: depth and minimum split/leaf sizes are searched; the number of
# trees, the feature subsampling rule and bootstrapping are held fixed.
rf_grid = dict(
    n_estimators=[100],
    max_depth=[None, 20],
    min_samples_split=[2, 10],
    min_samples_leaf=[1, 4],
    max_features=["sqrt"],
    bootstrap=[True],
)

run_gridsearch("rf", RandomForestRegressor(random_state=42), rf_grid, scoring)


Builds and displays the sorted grid-search results table.

In [None]:
# All grid-search records in one table, lowest cross-validated RMSE first.
results_df = pd.DataFrame(results).sort_values(by="rmse", ascending=True)
results_df


Kaggle Submission File Generation

Generates submission predictions from the best model and saves submission.csv.

In [None]:
# results_df is sorted by RMSE, so its first row names the best model; fetch
# the matching fitted estimator.
best_model_name = results_df["model"].iloc[0]
best = best_models[best_model_name]

# Ids come from the raw test set.
test_ids = df_test["id"].copy()

# Prefer the polynomial test features; fall back to the plain encoded features
# when the polynomial ones have not been created in this session.
try:
    X_submit = df_test_poly
except NameError:
    X_submit = df_test_encoded

predictions = best.predict(X_submit)

# Heuristic: a training target whose maximum is below 1000 is taken to be
# log-transformed, in which case the predictions are mapped back to the price
# scale with expm1 (the inverse of log1p).
use_log_target = y_train.max() < 1000
predictions = np.expm1(predictions) if use_log_target else predictions

# Build the submission table; prices are clipped at 0 because a car price
# cannot be negative.
submission_df = pd.DataFrame({"id": test_ids, "price": predictions}).assign(
    price=lambda frame: frame["price"].clip(lower=0)
)

# Preview, then write the file for the Kaggle upload.
display(submission_df.head())
submission_df.to_csv("submission.csv", index=False)
