# -- 🚗 Used Car Price Prediction --

# 6. Pipeline - Model Deployment Pipeline

Bu notebookta:

- Feature engineering işlemleri
- Leakage'siz final feature set
- RandomForest final modelinin eğitimi
- Preprocessing + Model birleşik pipeline yapısı
- Model kaydetme
- Inference fonksiyonunun oluşturulması

işlemleri yapılacaktır.


In [1]:
# Kütüphaneler

import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor

# Do not truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)


In [2]:
# Data loading + data cleaning

# The path is hard-coded to a Kaggle input directory; the CSV must exist there.
df = pd.read_csv("/kaggle/input/automl9/used_cars_dataset_v2.csv")

def clean_km(x) -> float:
    """Convert a raw odometer value such as "45,000 km" to a float.

    The value is turned into lower-case text, the "km" unit and the comma
    thousands separators are removed, and what remains is parsed as a number.

    Args:
        x: Raw cell value (string, number, or a missing value).

    Returns:
        The distance as a float, or ``np.nan`` when the value is missing or
        cannot be parsed as a number.
    """
    if pd.isna(x):
        return np.nan
    text = str(x).lower().replace("km", "").replace(",", "").strip()
    try:
        return float(text)
    except ValueError:
        # Only a failed parse means "missing"; anything else (for example
        # KeyboardInterrupt) must propagate instead of being swallowed.
        return np.nan

def clean_price(x) -> float:
    """Convert a raw asking price such as "₹ 1,95,000" or "Rs. 5,00,000" to a float.

    The value is lower-cased first so that the "rs." / "rs" prefixes are
    removed regardless of how they are capitalised, matching what
    ``clean_km`` does for its unit suffix.

    Args:
        x: Raw cell value (string, number, or a missing value).

    Returns:
        The price as a float, or ``np.nan`` when the value is missing or
        cannot be parsed as a number.
    """
    if pd.isna(x):
        return np.nan

    text = str(x).lower()
    # Tokens are removed in this order; "rs." must come before "rs" so the
    # trailing dot does not survive and turn "rs.500" into ".500".
    noise = (
        "\u20b9",              # rupee sign
        "\u00e2\u201a\u00b9",  # rupee sign whose UTF-8 bytes were decoded as cp1252
        ",",                   # thousands / lakh separators
        "rs.",
        "rs",
    )
    for token in noise:
        text = text.replace(token, "")

    try:
        return float(text.strip())
    except ValueError:
        # Only a failed parse means "missing"; other exceptions must propagate.
        return np.nan

# Add numeric versions of the raw text columns, then drop every row in which
# either of the two values could not be parsed.
df = df.assign(
    kmDriven_clean=df["kmDriven"].apply(clean_km),
    AskPrice_clean=df["AskPrice"].apply(clean_price),
).dropna(subset=["kmDriven_clean", "AskPrice_clean"])


In [3]:
# Feature Engineering

df_fe = df.copy()

df_fe["price_per_km"] = df_fe["AskPrice_clean"] / (df_fe["kmDriven_clean"] + 1)
df_fe["km_per_year"] = df_fe["kmDriven_clean"] / (df_fe["Age"] + 1)
df_fe["log_kmDriven"] = np.log1p(df_fe["kmDriven_clean"])

final_features = [
    "Brand", "model", "Year", "Age",
    "kmDriven_clean", "Transmission", 
    "Owner", "FuelType",
    "price_per_km", "km_per_year", "log_kmDriven"]

X = df_fe[final_features]
y = df_fe["AskPrice_clean"]


In [4]:
# Train / Validation / Test Split

X_train_full, X_temp, y_train_full, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42)

X_val, X_test_final, y_val, y_test_final = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42)

X_train_full.shape, X_val.shape, X_test_final.shape



((10433, 11), (2236, 11), (2236, 11))

In [5]:
# Preprocessing

cat_cols = ["Brand", "model", "Transmission", "Owner", "FuelType"]
num_cols = ["Year", "Age", "kmDriven_clean",
            "price_per_km", "km_per_year", "log_kmDriven"]

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols)])


## 6.1. Final Model (RandomForest – En iyi performans gösteren model)


In [6]:
# RandomForest final model - the best-performing model
# Final model pipeline: preprocessing and regressor wrapped in one estimator

final_model = Pipeline(
    [
        ("preprocess", preprocessor),
        (
            "model",
            RandomForestRegressor(n_jobs=-1, random_state=42, n_estimators=300),
        ),
    ]
)

final_model


In [7]:
# Model training + performance

# Fit on the training split only, then score once on the held-out test split.
final_model.fit(X_train_full, y_train_full)
y_pred_test = final_model.predict(X_test_final)

r2 = r2_score(y_test_final, y_pred_test)
mae = mean_absolute_error(y_test_final, y_pred_test)
mse = mean_squared_error(y_test_final, y_pred_test)
rmse = np.sqrt(mse)  # same units as the price, unlike mse

mse, rmse, mae, r2


(122082371739.60013, 349402.8788370241, 37605.636273106735, 0.9431364384555981)

In [None]:
# Save the model
# The whole pipeline object (preprocessing + regressor) is written to one file;
# the path is relative, so it lands in the current working directory.

joblib.dump(final_model, "final_car_price_model.pkl")
print("Model kaydedildi: final_car_price_model.pkl")


## 6.2. Inference Fonksiyonu 

In [None]:
#Inference Fonksiyonu

def predict_price(model, data_dict):
    """Predict the asking price of a single car.

    The derived columns are rebuilt with the same formulas used for the
    training frame, and only the columns named in the module-level
    ``final_features`` list are handed to the model.

    Example ``data_dict``::

        {
            "Brand": "Toyota",
            "model": "Corolla",
            "Year": 2018,
            "Age": 5,
            "kmDriven_clean": 45000,
            "Transmission": "Manual",
            "Owner": "First Owner",
            "FuelType": "Petrol"
        }

    Args:
        model: Object with a ``predict`` method that accepts a DataFrame.
        data_dict: Mapping of raw feature name to value for one car. It must
            contain "kmDriven_clean" and "Age"; a missing key raises KeyError.

    Returns:
        The first (and only) element of ``model.predict``'s result.
    """
    row = pd.DataFrame([data_dict])
    km = row["kmDriven_clean"]

    # price_per_km needs the asking price, which is normally unknown here;
    # without it the column is filled with 0.
    if "AskPrice_clean" in row:
        price_per_km = row["AskPrice_clean"] / (km + 1)
    else:
        price_per_km = 0

    row = row.assign(
        price_per_km=price_per_km,
        km_per_year=km / (row["Age"] + 1),
        log_kmDriven=np.log1p(km),
    )

    predictions = model.predict(row[final_features])
    return predictions[0]


## 6.3. Final Pipeline Özet Bulguları

- Model başarılı bir genelleme performansı göstermektedir.
- R² değeri ~0.94 civarında olup sektör için kabul edilebilir seviyededir.
- Ortalama hata (MAE) fiyat segmentine göre makuldür.
- RandomForest modeli final model olarak doğrulanmıştır.

- Tüm preprocessing adımları pipeline içine alındı.
- RandomForest en iyi model olarak seçildi.
- Model başarıyla test edildi ve kaydedildi.
- Inference fonksiyonu oluşturuldu.

