In [20]:
# train_model.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import pickle

# Load the dataset
df = pd.read_csv("car_price_dataset.csv")

# Drop unnecessary columns
# You can tweak this list based on your dataset
df = df.drop(columns=["car_id", "owner", "torque", "max_power"], errors="ignore")

# Drop rows with missing target values
df = df.dropna(subset=["selling_price"])

# Handle missing values (simplest method: fill with median)
df = df.fillna(df.median(numeric_only=True))

# Encode categorical columns
# Corrected 'fuel' to 'fuel_type' to match the DataFrame's column name
label_cols = ["seller_type", "transmission_type", "fuel_type", "car_name", "brand", "model"]

# Filter label_cols to only include columns that are actually present in the DataFrame
existing_label_cols = [col for col in label_cols if col in df.columns]

label_encoders = {}
for col in existing_label_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Remove outliers in target
df = df[df["selling_price"] < df["selling_price"].quantile(0.99)]

# Select features (X) and target (y)
X = df.drop(columns=["selling_price"])
y = df["selling_price"]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestRegressor(n_estimators=2, random_state=42)
model.fit(X_train, y_train)

# Save the model and encoders
with open("car_price_model.pkl", "wb") as f:
    pickle.dump((model, label_encoders), f)

print("✅ Model and encoders saved to car_price_model.pkl")


✅ Model and encoders saved to car_price_model.pkl
