In [65]:
import os
import warnings
from datetime import datetime

import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [67]:
# Suppress harmless warnings often triggered by specific library versions
warnings.filterwarnings('ignore')

In [109]:
# --- 1. CONFIGURATION AND DATA LOADING ---
# Location of the raw cars dataset (CSV). The default is the author's local
# Windows path; set the CARS_DATA_PATH environment variable to point the
# notebook at another copy of the file without editing this cell.
LOCAL_DATA_FILE_PATH = os.environ.get(
    "CARS_DATA_PATH",
    r"C:\Users\unnim\OneDrive\Desktop\KRISHNADEV\DATA SCIENCE entry Elevate\Final Project\cars_data.csv",
)
# Name of the column the models are trained to predict.
TARGET_COLUMN = 'Price'
# File the best-scoring pipeline is written to. NOTE: despite the name, the
# saved model is whichever candidate scores best, not necessarily a random forest.
MODEL_FILENAME = 'random_forest_car_predictor.joblib'



In [111]:
# Column groups used throughout the notebook.
# 'Cylinders' stands in as the performance indicator.

# Nominal columns (one-hot encoded by the preprocessing step)
CATEGORICAL_FEATURES = [
    'Make',
    'Model',
    'Body Type',
    'Transmission',
    'Fuel Type',
    'Location',
]

# Numeric columns; 'Age' does not exist in the raw data and is engineered later
NUMERICAL_FEATURES = [
    'Mileage',
    'Cylinders',
    'Age',
]

# Columns left out of the model inputs; 'Year' is only used to derive 'Age'
DROP_FEATURES = [
    'Description',
    'Color',
    'Year',
]



In [113]:
def load_data(file_path):
    """Load the dataset at ``file_path`` into a DataFrame.

    Args:
        file_path: Path to a CSV file readable by ``pd.read_csv``.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` when the file is missing,
        cannot be opened, is empty, or cannot be parsed as CSV. A diagnostic
        message is printed in every failure case.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at the specified path: {file_path}")
        print("Please check the path is correct and the file exists.")
        return None
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        # Callers only test for None, so report every other read failure the
        # same way instead of letting a traceback escape.
        print(f"Error: Could not read data from {file_path}: {exc}")
        return None

    print(f"Data loaded successfully from: {file_path}. Initial shape: {df.shape}")
    return df

In [117]:
# --- 2. FEATURE ENGINEERING AND PREPROCESSING ---

def preprocess_data(df, current_year=None):
    """
    Clean the raw cars data and build the model inputs.

    Steps: derive 'Age' from 'Year', trim extreme prices, impute missing
    values, and select the modelling columns. The input frame is not modified.

    Args:
        df: Raw DataFrame containing 'Year', the target column, and every
            column listed in NUMERICAL_FEATURES / CATEGORICAL_FEATURES other
            than the engineered 'Age'.
        current_year: Reference year used to compute 'Age'. Defaults to the
            current calendar year; pass a fixed value for reproducible output.

    Returns:
        Tuple ``(X, y, features)`` where ``X`` holds the feature columns,
        ``y`` the target column, and ``features`` the ordered list of feature
        names (numerical first, then categorical).
    """
    if current_year is None:
        current_year = datetime.now().year

    # Work on a copy so the caller's DataFrame keeps its original columns/dtypes
    df_work = df.copy()

    # 2.1 Feature Engineering: Calculate Car Age
    # Non-numeric years become NaN and are removed by the dropna step below
    df_work['Year'] = pd.to_numeric(df_work['Year'], errors='coerce')
    df_work['Age'] = current_year - df_work['Year']

    # 2.2 Clean Target Variable
    # Percentile trim (not an IQR rule): keep only rows whose price lies
    # strictly between the 5th and 95th percentiles.
    price = df_work[TARGET_COLUMN]
    lower_price = price.quantile(0.05)
    upper_price = price.quantile(0.95)
    df_clean = df_work[(price > lower_price) & (price < upper_price)].copy()
    print(f"Data cleaned. Shape after price outlier removal: {df_clean.shape}")

    # 2.3 Handle Missing Values
    # Coerce 'Cylinders' to numeric so text entries turn into NaN
    df_clean['Cylinders'] = pd.to_numeric(df_clean['Cylinders'], errors='coerce')

    # Impute missing 'Cylinders' with the most common value. mode() is empty
    # when every value is missing; in that case the rows are dropped below.
    cylinder_modes = df_clean['Cylinders'].mode()
    if not cylinder_modes.empty:
        df_clean['Cylinders'] = df_clean['Cylinders'].fillna(cylinder_modes.iloc[0])

    # Missing categorical values become the explicit category 'Unknown'
    for col in CATEGORICAL_FEATURES:
        df_clean[col] = df_clean[col].fillna('Unknown')

    # Drop rows still missing a numerical feature or the target
    df_clean = df_clean.dropna(subset=NUMERICAL_FEATURES + [TARGET_COLUMN])

    # Safe to cast now that no NaN remains in 'Cylinders'
    df_clean = df_clean.assign(Cylinders=df_clean['Cylinders'].astype(int))

    # 2.4 Select Final Features
    features = NUMERICAL_FEATURES + CATEGORICAL_FEATURES
    X = df_clean[features]
    y = df_clean[TARGET_COLUMN]

    return X, y, features



In [125]:
# --- 3. PIPELINE SETUP (TRANSFORMERS) ---

def create_preprocessor():
    """Build the ColumnTransformer applied to the feature columns.

    Columns in NUMERICAL_FEATURES are standardised with StandardScaler and
    columns in CATEGORICAL_FEATURES are one-hot encoded, with categories not
    seen during fitting ignored. Any column outside those two lists is dropped.
    """
    column_steps = [
        # Scaling matters for scale-sensitive estimators
        ('num', StandardScaler(), NUMERICAL_FEATURES),
        # Nominal data needs a one-hot representation
        ('cat', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_FEATURES),
    ]
    return ColumnTransformer(transformers=column_steps, remainder='drop')

 

In [131]:


# --- 4. MODEL TRAINING AND EVALUATION ---

def train_and_evaluate_model(X, y, preprocessor):
    """Incomplete draft of the training routine.

    NOTE(review): this definition stops right after computing test-set
    predictions. It never scores the models, never saves anything, and
    returns ``None`` implicitly. A later cell in this notebook defines
    ``train_and_evaluate_model`` again in full, so this version is replaced
    as soon as that cell runs.
    """

    # Split data: 80% train / 20% test with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    print(f"\nTraining set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")

    # Initialize models (candidate regressors keyed by display name)
    models = {
        "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, max_depth=15),
        "LinearRegression": LinearRegression()
    }

    # Bookkeeping for the best model; none of these are updated in this draft
    results = {}
    best_r2 = -np.inf
    best_model_name = ""
    best_pipeline = None

    for name, model in models.items():
        print(f"\n--- Training {name} ---")

        # Create the full pipeline: Preprocessing -> Model
        pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', model)])

        # Train the model
        pipeline.fit(X_train, y_train)

        # Predict on the test set (the predictions are discarded in this draft)
        y_pred = pipeline.predict(X_test)

       

In [139]:
# --- 4. MODEL TRAINING AND EVALUATION ---

def train_and_evaluate_model(X, y, preprocessor):
    """Split the data, fit each candidate model, report metrics, and save the best.

    Args:
        X: Feature DataFrame holding the columns the preprocessor selects.
        y: Target values aligned with ``X``.
        preprocessor: Unfitted transformer used as the first pipeline step.
            A fresh clone is fitted for every model, so the pipelines do not
            share fitted state and the instance passed in is left unfitted.

    Returns:
        Tuple ``(results, best_pipeline)``. ``results`` maps each model name to
        a dict with its test-set 'MAE', 'RMSE' and 'R2'. ``best_pipeline`` is
        the fitted pipeline with the highest R2, which is also written to
        MODEL_FILENAME; it is ``None`` if no model produced a comparable R2.
    """
    # Hold out 20% of the rows for evaluation (fixed seed for repeatability)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    print(f"\nTraining set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")

    # Candidate regressors keyed by display name
    models = {
        "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, max_depth=15),
        "LinearRegression": LinearRegression()
    }

    results = {}
    best_r2 = -np.inf
    best_model_name = ""
    best_pipeline = None

    for name, model in models.items():
        print(f"\n--- Training {name} ---")

        # Preprocessing -> Model. Pipeline does not copy its steps, so clone
        # the preprocessor; otherwise every pipeline would hold the same
        # object and a later fit would refit the step of an earlier pipeline.
        pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)), ('regressor', model)])

        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        # Evaluate on the held-out set
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        results[name] = {'MAE': mae, 'RMSE': rmse, 'R2': r2}

        print(f"MAE (Mean Absolute Error): AED {mae:,.2f}")
        print(f"RMSE (Root Mean Squared Error): AED {rmse:,.2f}")
        print(f"R-squared ($R^2$): {r2:.4f}")

        # Track the best model; the strict comparison keeps the first on ties
        if r2 > best_r2:
            best_r2 = r2
            best_model_name = name
            best_pipeline = pipeline

    # Explicit None check: Pipeline truthiness follows its length, not whether
    # a model was actually selected.
    if best_pipeline is not None:
        joblib.dump(best_pipeline, MODEL_FILENAME)
        print(f"\n✅ Best model ({best_model_name}) saved to {MODEL_FILENAME}")

    return results, best_pipeline


In [141]:
# --- 5. PREDICTION EXAMPLE FUNCTION ---

def predict_car_price(model, new_car_data, current_year=None):
    """Predict the price of a single car with a fitted model.

    Args:
        model: Fitted estimator or pipeline whose ``predict`` accepts a
            DataFrame with the NUMERICAL_FEATURES + CATEGORICAL_FEATURES columns.
        new_car_data: Mapping of raw attributes for one car. It must contain
            'Year' plus every feature column other than the engineered 'Age'.
        current_year: Reference year used to compute 'Age'. Defaults to the
            current calendar year; pass the year used at training time to keep
            the feature consistent with the fitted model.

    Returns:
        The first element of ``model.predict`` for the single-row input.

    Raises:
        KeyError: If 'Year' or a required feature is missing from the input.
        ValueError: If 'Year' cannot be interpreted as a number.
    """
    if current_year is None:
        current_year = datetime.now().year

    # One-row frame built from the input mapping (the mapping is not modified)
    df_new = pd.DataFrame([new_car_data])

    # Engineer 'Age' as in training. The year is converted to a number first so
    # a value supplied as text (e.g. "2021") works; an unparsable year raises
    # here instead of silently producing NaN.
    df_new['Age'] = current_year - pd.to_numeric(df_new['Year'])

    # Selecting the model's feature columns leaves 'Year' and any extras behind
    X_new = df_new[NUMERICAL_FEATURES + CATEGORICAL_FEATURES]

    return model.predict(X_new)[0]

In [143]:
# --- 6. MAIN EXECUTION BLOCK ---

if __name__ == "__main__":
    # 1. Load Data from the path configured at the top of the notebook.
    # NOTE: the following cell repeats this load before running the full workflow.
    df = load_data(LOCAL_DATA_FILE_PATH)
    if df is None:
        # load_data returns None when the file could not be loaded
        print("Data loading failed. Cannot proceed with preprocessing or model training.")
        # exit() is an interactive-shell helper; raising SystemExit stops the
        # run with a failure status without depending on that helper.
        raise SystemExit(1)

Data loaded successfully from: C:\Users\unnim\OneDrive\Desktop\KRISHNADEV\DATA SCIENCE entry Elevate\Final Project\cars_data.csv. Initial shape: (10000, 12)


In [157]:
 # --- 6. MAIN EXECUTION BLOCK ---

if __name__ == "__main__":
    # 1. Load Data from the path configured at the top of the notebook
    df = load_data(LOCAL_DATA_FILE_PATH)
    if df is None:
        # load_data returns None when the file could not be loaded
        print("Data loading failed. Cannot proceed with preprocessing or model training.")
        # exit() is an interactive-shell helper; raising SystemExit stops the
        # run with a failure status without depending on that helper.
        raise SystemExit(1)

    # 2. Preprocess Data and Get Features
    X, y, features = preprocess_data(df)

    # 3. Create Preprocessing Pipeline (Transformer)
    preprocessor = create_preprocessor()

    # 4. Train and Evaluate Models
    results, best_pipeline = train_and_evaluate_model(X, y, preprocessor)

    # 5. Demonstration: Predict the price of a new hypothetical car
    # Explicit None check: Pipeline truthiness follows its length.
    if best_pipeline is not None:
        print("\n--- Model Demonstration ---")
        example_car = {
            'Make': 'toyota',
            'Model': 'camry',
            'Year': 2021,
            'Mileage': 50000,
            'Body Type': 'Sedan',
            'Cylinders': 4,
            'Transmission': 'Automatic Transmission',
            'Fuel Type': 'Gasoline',
            'Location': 'Dubai'
        }

        # Predict the price
        predicted_price = predict_car_price(best_pipeline, example_car)

        print("Hypothetical Car Details:")
        for k, v in example_car.items():
            print(f"  {k}: {v}")

        # Name the estimator that was actually selected; the best pipeline is
        # not always the random forest, so the label must not be hard-coded.
        best_model_label = type(best_pipeline.named_steps['regressor']).__name__
        print(f"\n💰 Predicted Price (using {best_model_label}): AED {predicted_price:,.2f}")


Data loaded successfully from: C:\Users\unnim\OneDrive\Desktop\KRISHNADEV\DATA SCIENCE entry Elevate\Final Project\cars_data.csv. Initial shape: (10000, 12)
Data cleaned. Shape after price outlier removal: (9000, 13)

Training set size: 7200, Test set size: 1800

--- Training RandomForest ---
MAE (Mean Absolute Error): AED 82,638.20
RMSE (Root Mean Squared Error): AED 125,160.85
R-squared ($R^2$): 0.5276

--- Training LinearRegression ---
MAE (Mean Absolute Error): AED 75,654.15
RMSE (Root Mean Squared Error): AED 116,237.97
R-squared ($R^2$): 0.5926

✅ Best model (LinearRegression) saved to random_forest_car_predictor.joblib

--- Model Demonstration ---
Hypothetical Car Details:
  Make: toyota
  Model: camry
  Year: 2021
  Mileage: 50000
  Body Type: Sedan
  Cylinders: 4
  Transmission: Automatic Transmission
  Fuel Type: Gasoline
  Location: Dubai

💰 Predicted Price (using Random Forest): AED 27,773.19
