In [1]:
import pandas as pd

# Read the labelled training file: it carries the feature columns plus the 'price' target.
df_1 = pd.read_csv('train.csv')

# Split the labelled frame into the target series and the feature matrix.
y_train = df_1['price']
X_train = df_1.drop(columns=['price'])

# Read test.csv as-is; no 'price' column is dropped from it, so it is used purely as features.
X_valid = pd.read_csv('test.csv')

In [2]:
# Preview the first five rows of the freshly loaded test features.
X_valid.head(n=5)

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,54273,Mercedes-Benz,E-Class E 350,2014,73000,Gasoline,302.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,A/T,White,Beige,None reported,Yes
1,54274,Lexus,RX 350 Base,2015,128032,Gasoline,275.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,8-Speed A/T,Silver,Black,None reported,Yes
2,54275,Mercedes-Benz,C-Class C 300,2015,51983,Gasoline,241.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Blue,White,None reported,Yes
3,54276,Land,Rover Range Rover 5.0L Supercharged Autobiogra...,2018,29500,Gasoline,518.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,White,White,At least 1 accident or damage reported,Yes
4,54277,BMW,X6 xDrive40i,2020,90000,Gasoline,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,White,Black,At least 1 accident or damage reported,Yes


# For each frame, collect the names of the columns that contain at least one null.
col_with_missing_val_X_train = X_train.columns[X_train.isnull().any()].tolist()
col_with_missing_val_X_valid = X_valid.columns[X_valid.isnull().any()].tolist()

# Heading and column list go on separate lines, one pair per frame.
print('columns with missing value in X_train', col_with_missing_val_X_train, sep='\n')
print('columns with missing value in X_valid', col_with_missing_val_X_valid, sep='\n')

In [3]:
# Next step: the categorical variables. Gather every object-dtype (text) column of the training features.
object_cols = list(X_train.select_dtypes(include='object').columns)
print(object_cols)

['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']


In [4]:
# Report how many distinct values each text column holds (its cardinality).
for col, n_unique in X_train[object_cols].nunique().items():
    print(f'{col}: {n_unique}')

brand: 53
model: 1827
fuel_type: 7
engine: 1061
transmission: 46
ext_col: 260
int_col: 124
accident: 2
clean_title: 1


In [5]:
# Only text columns with fewer than 54 distinct values are one-hot encoded for now;
# the wider ones would add too many indicator columns.
cardinality = X_train[object_cols].nunique()
low_cadinality_cols = cardinality[cardinality < 54].index.tolist()
print(low_cadinality_cols)

['brand', 'fuel_type', 'transmission', 'accident', 'clean_title']


In [6]:
# now we do the one hot encoding for the low cadinality cols
from sklearn.preprocessing import OneHotEncoder
# Keep the encoder's default sparse output and densify explicitly with .toarray().
# The keyword that requests dense output was renamed across scikit-learn releases
# (`sparse` -> `sparse_output`), so not passing it keeps this cell working on both.
# handle_unknown='ignore' encodes categories unseen during fit as all-zero rows.
encoder = OneHotEncoder(handle_unknown = 'ignore')

# fit on the training data only, then apply the learned categories to the validation data
encoded_train = encoder.fit_transform(X_train[low_cadinality_cols]).toarray()
encoded_valid = encoder.transform(X_valid[low_cadinality_cols]).toarray()

# create dataframes of the encoded data; reusing the source index keeps the rows
# aligned when the frames are concatenated column-wise below
encoded_feature_names = encoder.get_feature_names_out(low_cadinality_cols)
encoded_train_df = pd.DataFrame(encoded_train, index = X_train.index, columns = encoded_feature_names)
encoded_valid_df = pd.DataFrame(encoded_valid, index = X_valid.index, columns = encoded_feature_names)

# drop the original columns from the dataset
X_train = X_train.drop(low_cadinality_cols, axis = 1)
X_valid = X_valid.drop(low_cadinality_cols, axis = 1)

# concatenate the encoded data and the modified X_train and X_valid
X_train = pd.concat([X_train, encoded_train_df], axis = 1)
X_valid = pd.concat([X_valid, encoded_valid_df], axis = 1)



The engine column is very important for giving our model good predictive power, but because of its high cardinality
we can't use one-hot encoding on it. Instead, we can extract useful information from the engine column, such as horsepower, number of cylinders, and displacement.

We can achieve this using Python's "re" (Regular Expression) module.

In [7]:
# now we write a function to extract useful information from the engine column
import re
def extract_engine_feature(engine_str):
    """Parse horsepower, displacement and cylinder count out of an engine description.

    Parameters
    ----------
    engine_str : str
        Free-text engine description, e.g.
        "302.0HP 3.5L V6 Cylinder Engine Gasoline Fuel".
        Any non-string value (for example NaN or None from a missing cell)
        is treated as "nothing to extract" instead of raising a TypeError.

    Returns
    -------
    tuple
        (horsepower, displacement, cylinders) where horsepower and displacement
        are floats, cylinders is an int, and any part that is not found is None.
    """
    # re.search only accepts strings; missing cells arrive as float NaN / None.
    if not isinstance(engine_str, str):
        return None, None, None

    # horsepower: a number immediately followed by "HP", e.g. "302.0HP"
    hp_match = re.search(r"(\d+(\.\d+)?)HP", engine_str)
    horsepower = float(hp_match.group(1)) if hp_match else None

    # displacement: a number immediately followed by "L", e.g. "3.5L"
    displacement_match = re.search(r"(\d+(\.\d+)?)L", engine_str)
    displacement = float(displacement_match.group(1)) if displacement_match else None

    # cylinders: the digits directly before " Cylinder", e.g. "V6 Cylinder" -> 6
    cylinder_match = re.search(r"(\d+) Cylinder", engine_str)
    cylinders = int(cylinder_match.group(1)) if cylinder_match else None

    return horsepower, displacement, cylinders

# Apply the extraction function to the 'engine' column; each row yields a
# (horsepower, displacement, cylinders) tuple
engine_columns = ['horsepower', 'displacement', 'cylinders']
engine_feature_train = X_train['engine'].apply(extract_engine_feature)
engine_feature_valid = X_valid['engine'].apply(extract_engine_feature)

# convert the tuples to dataframes; reuse the source index so that the column-wise
# concat below pairs every parsed row with the row it came from (pd.concat with
# axis=1 aligns on index labels, not on position)
engine_train_df = pd.DataFrame(engine_feature_train.tolist(), columns = engine_columns, index = X_train.index)
engine_valid_df = pd.DataFrame(engine_feature_valid.tolist(), columns = engine_columns, index = X_valid.index)

# concatenate the new data to the original
X_train = pd.concat([X_train, engine_train_df], axis = 1)
X_valid = pd.concat([X_valid, engine_valid_df], axis = 1)

# Drop the original 'engine' column as it is now redundant
X_train = X_train.drop(columns = ['engine'])
X_valid = X_valid.drop(columns = ['engine'])

                               

In [8]:
# Average cylinder count over the rows where one was parsed (missing values are skipped).
X_train['cylinders'].mean(skipna=True)

6.282346600662701

In [9]:
# Re-run the null check now that the engine-derived columns have been added.
col_with_missing_val_X_train = X_train.columns[X_train.isnull().any()].tolist()
col_with_missing_val_X_valid = X_valid.columns[X_valid.isnull().any()].tolist()

# Heading and column list go on separate lines, one pair per frame.
print('columns with missing value in X_train', col_with_missing_val_X_train, sep='\n')
print('columns with missing value in X_valid', col_with_missing_val_X_valid, sep='\n')

columns with missing value in X_train
['horsepower', 'displacement', 'cylinders']
columns with missing value in X_valid
['horsepower', 'displacement', 'cylinders']


In [10]:
# since some of my new columns have missing values, i will have to do some imputation
from sklearn.impute import SimpleImputer
columns_to_impute = ['horsepower', 'displacement', 'cylinders']
my_imputer = SimpleImputer(strategy='mean')

# We first make a copy to avoid changing the original data
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# Add 0/1 indicator columns recording which values were missing before imputation.
for col in columns_to_impute:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull().astype(int)
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull().astype(int)

# Apply imputation only to the specified columns. The imputer is fitted on the
# training data and merely applied to the validation data. The imputer returns a
# bare array, so the source index is re-attached: the assignment below aligns on
# index labels and would otherwise mismatch rows on a non-default index.
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(X_train_plus[columns_to_impute]),
    columns=columns_to_impute,
    index=X_train_plus.index,
)
imputed_X_valid = pd.DataFrame(
    my_imputer.transform(X_valid_plus[columns_to_impute]),
    columns=columns_to_impute,
    index=X_valid_plus.index,
)

# Write the imputed values back next to the untouched columns and the indicators
X_train_plus[columns_to_impute] = imputed_X_train
X_valid_plus[columns_to_impute] = imputed_X_valid

# Assign back to the original variables
X_train = X_train_plus
X_valid = X_valid_plus


In [11]:
# Preview the first five training rows after encoding and imputation.
X_train.head(n=5)

Unnamed: 0,id,model,model_year,milage,ext_col,int_col,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,...,transmission_–,accident_At least 1 accident or damage reported,accident_None reported,clean_title_Yes,horsepower,displacement,cylinders,horsepower_was_missing,displacement_was_missing,cylinders_was_missing
0,0,F-150 Lariat,2018,74349,Blue,Gray,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,375.0,3.5,6.0,0,0,0
1,1,335 i,2007,80000,Black,Black,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,300.0,3.0,6.0,0,0,0
2,2,XF Luxury,2009,91491,Purple,Beige,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,300.0,4.2,8.0,0,0,0
3,3,X7 xDrive40i,2022,2437,Gray,Brown,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,335.0,3.0,6.0,0,0,0
4,4,Firebird Base,2001,111000,White,Black,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,200.0,3.8,6.0,0,0,0


In [12]:
# With the imputation done, list whichever object-dtype (text) columns are still left.
object_cols = list(X_train.select_dtypes(include='object').columns)
print(object_cols)

['model', 'ext_col', 'int_col']


In [13]:
# The remaining text columns are removed from both frames:
#   - exterior / interior colour: it influences a buyer's choice, but is assumed
#     here not to be a strong determinant of the car's price;
#   - model: high cardinality and no regular pattern to extract features from.
X_train, X_valid = (frame.drop(columns=object_cols) for frame in (X_train, X_valid))

In [14]:
# Preview the first five rows of the fully processed test features.
X_valid.head(n=5)

Unnamed: 0,id,model_year,milage,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,...,transmission_–,accident_At least 1 accident or damage reported,accident_None reported,clean_title_Yes,horsepower,displacement,cylinders,horsepower_was_missing,displacement_was_missing,cylinders_was_missing
0,54273,2014,73000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,302.0,3.5,6.0,0,0,0
1,54274,2015,128032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,275.0,3.5,6.0,0,0,0
2,54275,2015,51983,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,241.0,2.0,4.0,0,0,0
3,54276,2018,29500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,518.0,5.0,8.0,0,0,0
4,54277,2020,90000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,335.0,3.0,6.0,0,0,0


 Having finished the data processing stage, let's now do the modeling.
 
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import train_test_split

# Range of max_leaf_nodes to evaluate
max_leaf_nodes_values = [5, 10, 20, 30, 40, 50, 100, 200, 400, 500, 700, 1000]

# Initialize variables to store the best MAE and corresponding max_leaf_nodes
best_mae = float("inf")
best_max_leaf_nodes = None

# Hold out 20% of the labelled rows for tuning. The split uses its own names so
# that X_train / y_train (all labelled rows) and X_valid (the rows loaded from
# test.csv) are NOT overwritten; the final fit and the prediction export rely on them.
X_fit, X_holdout, y_fit, y_holdout = train_test_split(X_train, y_train, test_size = 0.2, random_state = 0)

# Evaluate each max_leaf_nodes value on the hold-out rows
for max_leaf_nodes in max_leaf_nodes_values:
    model = RandomForestRegressor(max_leaf_nodes=max_leaf_nodes, random_state=42)
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_holdout)
    mae = mean_absolute_error(y_holdout, predictions)

    # Keep the setting with the lowest hold-out error seen so far
    if mae < best_mae:
        best_mae = mae
        best_max_leaf_nodes = max_leaf_nodes

    print(f"max_leaf_nodes: {max_leaf_nodes}, MAE: {mae}")

print(f"Best max_leaf_nodes: {best_max_leaf_nodes} with MAE: {best_mae}")

Running the above as code, I got a best max_leaf_nodes of 200 with an MAE of 17129.43210.
Based on this, I think the minimum of the validation error lies around this value.

In [15]:
from sklearn.ensemble import RandomForestRegressor
import joblib
# Train the final forest with max_leaf_nodes fixed at 200, then persist it to disk.
my_model = RandomForestRegressor(
    n_estimators=100,
    max_leaf_nodes=200,
    random_state=0,
)
my_model.fit(X_train, y_train)
joblib.dump(my_model, 'Car_price_predictor.joblib')

['Car_price_predictor.joblib']

In [16]:
# Reload the persisted model and predict prices for the prepared X_valid features.
# NOTE(review): joblib.load unpickles the file, so only load model files you trust;
# this one is assumed to be the file written by the training cell above.
loaded_model = joblib.load('Car_price_predictor.joblib')
predictions = loaded_model.predict(X_valid)

# Keep the car IDs under a name that does not shadow Python's built-in id()
car_ids = X_valid['id']

# Create a DataFrame to store car IDs and predicted prices
output_df = pd.DataFrame({'id': car_ids, 'price': predictions})

# Save the predictions to a CSV file
output_df.to_csv('predictions.csv', index=False)
print("Predictions saved to predictions.csv successfully.")

Predictions saved to predictions.csv successfully.
