In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# =============================
# 1) Load CSV
# =============================
# Keep the dataset location in a single named value so it is easy to repoint.
csv_path = "/Users/indreshmr/Desktop/mlds/CarPrice_Assignment.csv"
data = pd.read_csv(csv_path)

# Quick sanity report: confirmation, (rows, columns), and the first rows.
print("Data loaded successfully")
print("Shape of dataset:", data.shape)
print(data.head())

# =============================
# 2) Drop obvious ID/text cols (edit as you like)
# =============================
# Identifier / free-text columns to discard; errors="ignore" means any of
# these that are absent from the frame are skipped instead of raising.
columns_to_drop = ["car_ID", "CarName"]
data = data.drop(columns=columns_to_drop, errors="ignore")
print("\nAfter dropping unnecessary columns:", data.shape)

# =============================
# 3) Choose target & split features
# =============================
# Use 'price' if present, else fall back to the last numeric column.
if "price" in data.columns:
    target = "price"
else:
    num_cols_all = data.select_dtypes(include=[np.number]).columns.tolist()
    if not num_cols_all:
        # Without this guard the fallback below would fail with an opaque
        # IndexError on the empty list; say what is actually wrong instead.
        raise ValueError(
            "No 'price' column and no numeric columns found; "
            "cannot choose a target."
        )
    target = num_cols_all[-1]

# Separate the predictors from the target column.
X = data.drop(columns=[target])
y = data[target]

# Detect dtypes on X (NOT the full data), so target is not included
num_features = X.select_dtypes(include=[np.number]).columns.tolist()
cat_features = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

print("\nNumerical Features:", num_features)
print("Categorical Features:", cat_features)
print("Target:", target)

# =============================
# 4) Preprocessing pipelines
# =============================
# Numeric columns: mean-impute missing values, then apply StandardScaler.
num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Categorical columns: impute with the most frequent value, then one-hot
# encode into a dense array; handle_unknown="ignore" keeps transform from
# raising on categories that were not seen during fit.
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", onehot_encoder),
])

# Route each column group to its pipeline; columns in neither list are dropped.
column_routes = [
    ("num", num_transformer, num_features),
    ("cat", cat_transformer, cat_features),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder="drop")

# =============================
# 5) Train/Test split
# =============================
# Hold out 20% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of every partition (the first label carries the blank line).
for shape_label, partition in (
    ("\nX_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(shape_label, partition.shape)

# =============================
# 6) Fit/transform
# =============================
# Fit the preprocessor on the training split only, then apply the already
# fitted transformer to the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed  = preprocessor.transform(X_test)

print("\nPreprocessing complete!")
print("Transformed X_train shape:", X_train_processed.shape)
print("Transformed X_test shape:", X_test_processed.shape)

# (Optional) get feature names after OHE
try:
    feature_names = preprocessor.get_feature_names_out()
except Exception as exc:
    # Best-effort step at script top level: keep going, but report why the
    # names are unavailable instead of hiding the failure.
    print("Could not retrieve transformed feature names:", exc)
else:
    print("Total transformed features:", len(feature_names))


Data loaded successfully
Shape of dataset: (205, 26)
   car_ID  symboling                   CarName fueltype aspiration doornumber  \
0       1          3        alfa-romero giulia      gas        std        two   
1       2          3       alfa-romero stelvio      gas        std        two   
2       3          1  alfa-romero Quadrifoglio      gas        std        two   
3       4          2               audi 100 ls      gas        std       four   
4       5          2                audi 100ls      gas        std       four   

       carbody drivewheel enginelocation  wheelbase  ...  enginesize  \
0  convertible        rwd          front       88.6  ...         130   
1  convertible        rwd          front       88.6  ...         130   
2    hatchback        rwd          front       94.5  ...         152   
3        sedan        fwd          front       99.8  ...         109   
4        sedan        4wd          front       99.4  ...         136   

   fuelsystem  boreratio  s