In [2]:
# --- Step 0: Imports ---
import pandas as pd
import numpy as np
import io  # Used to simulate a file for the example
from sklearn.model_selection import train_test_split
import torch  # For add_safe_globals
import typing  # For add_safe_globals
import collections  # For add_safe_globals
from omegaconf.base import ContainerMetadata, Metadata # For add_safe_globals
from omegaconf.listconfig import ListConfig # For add_safe_globals
from omegaconf.nodes import AnyNode # For add_safe_globals

# PyTorch Tabular imports
from pytorch_tabular import TabularModel
from pytorch_tabular.config import (
    DataConfig,
    TrainerConfig,
    ExperimentConfig,
    OptimizerConfig
)
from pytorch_tabular.models import FTTransformerConfig

print("--- Imports successful ---")

--- Imports successful ---


In [3]:
import pandas as pd
import numpy as np
try:
    # Try reading with the default engine first
    df = pd.read_csv("Car data to fed to the model.csv")
except pd.errors.ParserError as e:
    print(f"ParserError encountered: {e}. Trying to read with engine='python' and on_bad_lines='skip'.")
    # If a ParserError occurs, try reading with the 'python' engine and skip bad lines
    # Note: Using engine='python' can be slower than the default 'c' engine.
    df = pd.read_csv("Car data to fed to the model.csv", engine='python', on_bad_lines='skip')

df.head()

Unnamed: 0,myear,body,transmission,fuel,km,oem,model,variant,listed_price,utype,...,Height,Wheel Base,Kerb Weight,Gear Box,Drive Type,Seats,Steering Type,state,owner_type,Max Torque At
0,2016,hatchback,manual,cng,69162,maruti,maruti wagon r,lxi cng,370000,dealer,...,1700.0,2400.0,960.0,5 speed,fwd,5.0,power,uttar pradesh,first,3500.0
1,2015,hatchback,manual,cng,45864,maruti,maruti celerio,green vxi,365000,dealer,...,1560.0,2425.0,915.0,5 speed,fwd,5.0,power,maharashtra,first,3500.0
2,2015,sedan,manual,cng,81506,honda,honda amaze,s plus i-vtec,421000,dealer,...,1505.0,2405.0,950.0,5 speed,fwd,5.0,power,delhi,second,4500.0
3,2013,hatchback,manual,cng,115893,maruti,maruti wagon r,lxi cng,240000,dealer,...,1700.0,2400.0,960.0,5 speed,fwd,5.0,power,delhi,second,3500.0
4,2022,muv,manual,cng,18900,maruti,maruti ertiga,vxi cng,1175000,dealer,...,1690.0,2740.0,1250.0,5 speed,2wd,7.0,,maharashtra,first,4200.0


In [4]:
# --- Step 2: Clean Column Names ---
print("--- Cleaning column names... ---")
# Save original names before cleaning for Step 8 testing
original_categorical_cols = [
    'body', 'transmission', 'fuel','utype',
    'Engine Type', 'Drive Type', 'Steering Type', 'state', 'owner_type'
]
original_numerical_cols = [
    'myear', 'km', 'No of Cylinder', 'Length', 'Width', 'Height',
    'Wheel Base', 'Kerb Weight', 'Gear Box', 'Seats', 'Max Torque At'
]
original_target_col = 'listed_price'

df.columns = df.columns.str.replace(' ', '_').str.lower()
print("--- Column names cleaned ---")

--- Cleaning column names... ---
--- Column names cleaned ---


In [5]:
# 3.2: Remove Outliers (Crucial for stability)
print(f"Removing outliers (price < ₹20,000). Original shape: {df.shape}")
df = df[df['listed_price'] > 20000].copy()
print(f"New shape after outlier removal: {df.shape}")

Removing outliers (price < ₹20,000). Original shape: (7793, 30)
New shape after outlier removal: (7792, 30)


In [6]:
# 3.3: !! LOG TRANSFORM TARGET !!
# We use np.log1p which is log(1 + x) to securely handle any zero values
df['log_listed_price'] = np.log1p(df['listed_price'])
# Drop the original price column
df = df.drop(columns=[original_target_col])

In [7]:
# --- Step 3: Define Cleaned Columns ---
categorical_cols = [col.replace(' ', '_').lower() for col in original_categorical_cols]
numerical_cols = [col.replace(' ', '_').lower() for col in original_numerical_cols]
target_col = 'log_listed_price'# Assuming target name was already clean

# 3.5: Fix 'gear_box' column
print("Cleaning 'gear_box' column (e.g., '5 speed' -> 5.0)...")
df['gear_box'] = df['gear_box'].astype(str).str.extract(r'(\d+)').astype(float)

Cleaning 'gear_box' column (e.g., '5 speed' -> 5.0)...


In [8]:
# --- Step 4: Data Cleaning and Preprocessing ---
print("\n--- Cleaning data (filling missing values)... ---")
for col in numerical_cols:
    if df[col].isnull().any():
        median_val = df[col].median()
        df[col] = df[col].fillna(median_val)
        print(f"Filled missing '{col}' with median: {median_val}")
for col in categorical_cols:
    if df[col].isnull().any():
        mode_val = df[col].mode()[0]
        df[col] = df[col].fillna(mode_val)
        print(f"Filled missing '{col}' with mode: {mode_val}")

df[categorical_cols] = df[categorical_cols].astype(str)
df[numerical_cols] = df[numerical_cols].astype(float)
df[target_col] = df[target_col].astype(float)
print("\n--- Data cleaning complete! ---")


--- Cleaning data (filling missing values)... ---
Filled missing 'no_of_cylinder' with median: 4.0
Filled missing 'length' with median: 4225.0
Filled missing 'width' with median: 1699.0
Filled missing 'height' with median: 1555.0
Filled missing 'wheel_base' with median: 2552.0
Filled missing 'kerb_weight' with median: 1180.0
Filled missing 'gear_box' with median: 5.0
Filled missing 'seats' with median: 5.0
Filled missing 'max_torque_at' with median: 3000.0
Filled missing 'body' with mode: hatchback
Filled missing 'engine_type' with mode: in-line engine
Filled missing 'drive_type' with mode: fwd
Filled missing 'steering_type' with mode: power
Filled missing 'state' with mode: maharashtra
Filled missing 'owner_type' with mode: first

--- Data cleaning complete! ---


In [9]:
# --- Step 5: Split Data ---
if len(df) >= 20: # Need enough data for a meaningful split
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
else:
    print("\nWarning: Dataset is very small (<20 samples). Using full dataset for train and validation.")
    train_df = df.copy()
    val_df = df.copy()

print(f"\nTraining samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")


Training samples: 6233
Validation samples: 1559


In [10]:
# --- Step 6: Configure the FT-Transformer ---
print("\n--- Configuring model... ---")
data_config = DataConfig(
    target=[target_col],
    continuous_cols=numerical_cols,
    categorical_cols=categorical_cols,
)
trainer_config = TrainerConfig(
    auto_lr_find=False, # Set to False for tiny datasets, manually set LR
    # learning_rate=1e-4, # If auto_lr_find is False, uncomment and set manually
    batch_size=32,    # Smaller batch size for tiny dataset
    max_epochs=50,     # Fewer epochs might be needed for tiny dataset
    accelerator="auto",
    devices=-1,
    # Added early stopping to prevent overfitting on small data
    early_stopping="valid_loss", # Monitor validation loss
    early_stopping_patience=5, # Stop if valid_loss doesn't improve for 5 epochs
    check_val_every_n_epoch=1,
)
model_config = FTTransformerConfig(
    task="regression",
    input_embed_dim=32,
    num_heads=4,        # Reduced heads for smaller dataset
    num_attn_blocks=2,  # Reduced layers for smaller dataset
    attn_dropout=0.1,
    ff_dropout=0.1,
    embedding_dropout=0.1,
    learning_rate = 1e-4 # Manually setting learning rate
)
optimizer_config = OptimizerConfig() # Use default Adam
experiment_config = ExperimentConfig(
    project_name="Car_Price_Log_Transform_v1",
    log_target="tensorboard" # Log locally
)


--- Configuring model... ---


In [11]:
# (Keep all previous imports and code up to Step 7 initialization)
import torch
from omegaconf.base import ContainerMetadata, Metadata
from omegaconf.listconfig import ListConfig
from omegaconf.nodes import AnyNode
from omegaconf.dictconfig import DictConfig # <--- ADD this import
import typing
import collections

print("\n--- Initializing Model... ---")
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    experiment_config=experiment_config,
)

# --- UPDATE THIS LINE AGAIN TO FIX THE NEW UNPICKLING ERROR ---
# Add DictConfig to the list of allowed globals
torch.serialization.add_safe_globals([
    ContainerMetadata,
    typing.Any,
    dict,
    collections.defaultdict,
    ListConfig,
    list,
    int,
    AnyNode,
    Metadata,
    DictConfig # <--- Updated fix line (added DictConfig)
])

print("--- Starting Model Training... ---")
# Pass the DataFrames to the .fit() method (No change here)
tabular_model.fit(
    train=train_df,
    validation=val_df
)
print("--- Training Complete! ---")

# --- Step 8: Save and Test the Model ---
# (No changes needed here)
# ... save model ...
# ... load model ...
# ... predict ...


--- Initializing Model... ---


INFO:lightning_fabric.utilities.seed:Seed set to 42
INFO:pytorch_tabular.tabular_model:Preparing the DataLoaders


--- Starting Model Training... ---


INFO:pytorch_tabular.tabular_datamodule:Setting up the datamodule for regression task
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This

Output()

INFO:pytorch_tabular.tabular_model:Training the model completed
INFO:pytorch_tabular.tabular_model:Loading the best model


--- Training Complete! ---


In [12]:
# --- Step 8: Save and Test the Model (CLEANED NAMES VERSION) ---
import pandas as pd
import numpy as np

# --- Step 7: Save and Test the Model ---
model_save_path = "saved_car_model_log_v1"
print(f"\n--- Saving model to folder: {model_save_path} ---")

try:
    tabular_model.save_model(model_save_path)
    print("--- Model saved! ---")
except Exception as e:
    print(f"Error saving model: {e}")

# --- Load the model back ---
print("--- Loading model from disk... ---")
try:
    loaded_model = TabularModel.load_model(model_save_path)
    print("--- Model loaded successfully! ---")

    # --- Create Test Sample (using CLEANED names) ---
    test_data_dict_cleaned = {
        'body': 'SUV', 'transmission': 'Automatic', 'fuel': 'Diesel',
        'utype': 'Used', 'engine_type': 'V6', 'drive_type': 'AWD',
        'steering_type': 'Power', 'state': 'NY', 'owner_type': 'Second',
        'myear': 2020, 'km': 35000, 'no_of_cylinder': 6, 'length': 5052,
        'width': 1968, 'height': 1741, 'wheel_base': 2994,
        'kerb_weight': 2135, 'gear_box': 8, 'seats': 7, 'max_torque_at': 1500
    }
    test_df_cleaned = pd.DataFrame([test_data_dict_cleaned])

    # --- Ensure Correct Data Types ---
    print("\n--- Setting data types for test data... ---")
    test_df_cleaned[categorical_cols] = test_df_cleaned[categorical_cols].astype(str)
    test_df_cleaned[numerical_cols] = test_df_cleaned[numerical_cols].astype(float)
    print("--- Data types set successfully! ---")

    print(f"\n--- Predicting log_price for: ---\n{test_df_cleaned.T}")

    # --- Make Prediction ---
    print("--- Attempting prediction... ---")
    prediction_df = loaded_model.predict(test_df_cleaned)

    # --- Extract Result ---
    # The prediction is the LOG of the price
    predicted_log_price = prediction_df[f"{target_col}_prediction"].values[0]

    # --- !! CONVERT PREDICTION BACK !! ---
    # We use np.expm1 (inverse of np.log1p) to convert back to Rupees
    predicted_price_rupees = np.expm1(predicted_log_price)

    print("\n--- PREDICTION: ---")
    if pd.isna(predicted_price_rupees):
        print("Predicted Price: nan (Training might have been unstable)")
    else:
        print(f"Predicted Log Price: {predicted_log_price:,.4f}")
        print(f"Predicted Price (Rupees): ₹{predicted_price_rupees:,.2f}") # Final Price

except Exception as e:
    print(f"Error during loading or prediction: {e}")
    import traceback
    traceback.print_exc()

print("\n--- Step 8 Finished ---")


--- Saving model to folder: saved_car_model_log_v1 ---
--- Model saved! ---
--- Loading model from disk... ---


INFO:pytorch_tabular.tabular_model:Preparing the Trainer
INFO:pytorch_lightning.utilities.rank_zero:Trainer already configured with model summary callbacks: [<class 'pytorch_lightning.callbacks.rich_model_summary.RichModelSummary'>]. Skipping setting a default `ModelSummary` callback.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


--- Model loaded successfully! ---

--- Setting data types for test data... ---
--- Data types set successfully! ---

--- Predicting log_price for: ---
                        0
body                  SUV
transmission    Automatic
fuel               Diesel
utype                Used
engine_type            V6
drive_type            AWD
steering_type       Power
state                  NY
owner_type         Second
myear              2020.0
km                35000.0
no_of_cylinder        6.0
length             5052.0
width              1968.0
height             1741.0
wheel_base         2994.0
kerb_weight        2135.0
gear_box              8.0
seats                 7.0
max_torque_at      1500.0
--- Attempting prediction... ---

--- PREDICTION: ---
Predicted Log Price: 14.8021
Predicted Price (Rupees): ₹2,682,078.50

--- Step 8 Finished ---


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

In [13]:
import shutil
import os

folder_to_zip = "saved_car_model_log_v1"
zip_filename = f"{folder_to_zip}.zip"

# Create a zip archive of the folder
shutil.make_archive(folder_to_zip, 'zip', folder_to_zip)

# Provide a link to download the zip file
from google.colab import files
files.download(zip_filename)

print(f"'{folder_to_zip}' has been zipped as '{zip_filename}' and is ready for download.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

'saved_car_model_log_v1' has been zipped as 'saved_car_model_log_v1.zip' and is ready for download.
