In [27]:
# STEP 1 - Load Data and Initial Inspection
import pandas as pd

df = pd.read_csv('../data/car-price-prediction-x/train.csv')
print(df.head())
print(df.info())
print(df.describe())


           model  year motor_type        running wheel    color   type  \
0         toyota  2022     petrol       3000  km  left  skyblue  sedan   
1  mercedes-benz  2014     petrol     132000  km  left    black  sedan   
2            kia  2018     petrol   95000  miles  left    other  sedan   
3  mercedes-benz  2002     petrol  137000  miles  left   golden  sedan   
4  mercedes-benz  2017     petrol     130000  km  left    black  sedan   

      status  motor_volume  price  
0  excellent           2.0  24500  
1  excellent           2.0  25500  
2  excellent           2.0  11700  
3  excellent           3.2  12000  
4       good           2.0  26000  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1642 entries, 0 to 1641
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         1642 non-null   object 
 1   year          1642 non-null   int64  
 2   motor_type    1642 non-null   object 
 3   running     

In [28]:
# STEP 1.5 - Clean 'running' column, convert to numeric 'running_km'
def convert_running_to_km(x):
    if 'km' in x:
        return float(x.replace(' km', '').replace(',', ''))
    elif 'miles' in x:
        return float(x.replace(' miles', '').replace(',', '')) * 1.60934
    else:
        return None

df['running_km'] = df['running'].apply(convert_running_to_km)
df.drop(columns=['running'], inplace=True)

print(df[['running_km']].describe())
print(df.info())


         running_km
count  1.642000e+03
mean   1.192104e+05
std    9.676625e+04
min    1.000000e+01
25%    5.632690e+04
50%    9.878604e+04
75%    1.609139e+05
max    1.251708e+06
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1642 entries, 0 to 1641
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         1642 non-null   object 
 1   year          1642 non-null   int64  
 2   motor_type    1642 non-null   object 
 3   wheel         1642 non-null   object 
 4   color         1642 non-null   object 
 5   type          1642 non-null   object 
 6   status        1642 non-null   object 
 7   motor_volume  1642 non-null   float64
 8   price         1642 non-null   int64  
 9   running_km    1642 non-null   float64
dtypes: float64(2), int64(2), object(6)
memory usage: 128.4+ KB
None


In [29]:
# STEP 2 - Add 'is_luxury' flag based on model
luxury_brands = ['mercedes-benz', 'bmw', 'audi']
df['is_luxury'] = df['model'].apply(lambda x: 1 if x in luxury_brands else 0)
print(df[['model', 'is_luxury']].head(10))


           model  is_luxury
0         toyota          0
1  mercedes-benz          1
2            kia          0
3  mercedes-benz          1
4  mercedes-benz          1
5  mercedes-benz          1
6         nissan          0
7         nissan          0
8  mercedes-benz          1
9        hyundai          0


In [30]:
# STEP 3 - One-hot encode categorical columns
df_encoded = pd.get_dummies(
    df,
    columns=['model', 'motor_type', 'wheel', 'color', 'type', 'status'],
    drop_first=True
)
print(df_encoded.head())
print(f"Encoded columns: {df_encoded.columns.tolist()}")


   year  motor_volume  price  running_km  is_luxury  model_kia  \
0  2022           2.0  24500     3000.00          0      False   
1  2014           2.0  25500   132000.00          1      False   
2  2018           2.0  11700   152887.30          0       True   
3  2002           3.2  12000   220479.58          1      False   
4  2017           2.0  26000   130000.00          1      False   

   model_mercedes-benz  model_nissan  model_toyota  motor_type_gas  ...  \
0                False         False          True           False  ...   
1                 True         False         False           False  ...   
2                False         False         False           False  ...   
3                 True         False         False           False  ...   
4                 True         False         False           False  ...   

   type_Universal  type_hatchback  type_minivan / minibus  type_pickup  \
0           False           False                   False        False   
1   

In [31]:
# STEP 4 - Train/Test Split
from sklearn.model_selection import train_test_split

X = df_encoded.drop('price', axis=1)
y = df_encoded['price']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")


X_train shape: (1313, 38), X_test shape: (329, 38)


In [32]:
# STEP 5 - Train Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)


0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [34]:
# STEP 6 - Evaluate Model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R2: {r2:.2f}")


MAE: 2158.82
RMSE: 14014736.47
R2: 0.68


In [35]:
# STEP 7 - Save model and feature columns
import joblib

joblib.dump(model, '../notebooks/model.joblib')
joblib.dump(X_train.columns, '../notebooks/X_columns.joblib')


['../notebooks/X_columns.joblib']

In [36]:
# 1. Load original data
import pandas as pd
df = pd.read_csv('../data/car-price-prediction-x/train.csv')

In [37]:
# 2. Clean 'running' column — convert to numeric km only
def convert_running_to_km(val):
    val = str(val).lower().replace(',', '').strip()
    if 'km' in val:
        return float(val.replace('km', '').strip())
    elif 'mile' in val:
        return float(val.replace('miles', '').replace('mile', '').strip()) * 1.60934
    else:
        return float(val)  # fallback

df['running_km'] = df['running'].apply(convert_running_to_km)

In [41]:
# 3. Create is_luxury flag
luxury_brands = ['bmw', 'mercedes-benz', 'audi']
df['is_luxury'] = df['model'].apply(lambda x: 1 if x in luxury_brands else 0)

In [40]:
# 4. One-hot encode categorical features — drop_first=True to avoid dummy trap
cat_cols = ['model', 'motor_type', 'wheel', 'color', 'type', 'status']
df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)


In [42]:
# 5. Select features and target
feature_cols = [c for c in df_encoded.columns if c != 'price' and c != 'running']  # exclude original running and price target
X = df_encoded[feature_cols]
y = df_encoded['price']

In [43]:
# 6. Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:
# 7. Train Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [46]:
# 8. Evaluate
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
y_pred = model.predict(X_test)

print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", mean_squared_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred))

MAE: 2158.823125633232
RMSE: 14014736.473877652
R2: 0.6815367355276998


In [47]:
# 9. Save model and columns
import joblib
joblib.dump(model, '../notebooks/model.joblib')
joblib.dump(list(X.columns), '../notebooks/X_columns.joblib')

print("Model and feature columns saved successfully.")

Model and feature columns saved successfully.
