## Car Price Predictor

In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [50]:
# Load data
df = pd.read_csv('used_cars.csv')

# Data Cleaning
# Remove '$' and ',' from the 'price' column and convert it to numeric
df['price'] = df['price'].replace('[\$,]', '', regex=True).astype(float)

# Convert mileage to numeric (remove ' mi.' and commas)
df['milage'] = df['milage'].replace('[\s,mi.]', '', regex=True).astype(float)

# Handle missing values: Drop rows with missing 'price'
df = df.dropna(subset=['price'])

# Convert categorical columns to numeric using Label Encoding
label_cols = ['brand', 'model', 'fuel_type', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']
label_encoders = {}
for col in label_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le  # Save the encoder


In [52]:
# Features and target variable
X = df[['brand', 'model', 'model_year', 'milage', 'fuel_type', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']]
y = df['price']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: ${mae:.2f}")

# Optional: Print some predictions with brand names
predictions = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred, 'Brand': X_test['brand']})
# Decode the brand labels back to their original names
predictions['Brand'] = predictions['Brand'].apply(lambda x: label_encoders['brand'].inverse_transform([int(x)])[0])
print(predictions.head())

import joblib
# Save the results to a CSV file
predictions[['Brand', 'Actual', 'Predicted Price', 'Difference']].to_csv('car_price_prediction_results_with_brands.csv', index=False)

# Save the model if needed
joblib.dump(model, 'car_price_predictor_model.pkl')
print("Model saved as 'car_price_predictor_model.pkl'")

Mean Absolute Error: $31370.70
       Actual     Predicted          Brand
2580  28000.0  49302.045068          Lexus
3660   5900.0  28329.724796      Chevrolet
897   41000.0  46737.078708            RAM
2091  40250.0  54609.966169  Mercedes-Benz
1044  77999.0  50370.828087           Ford


KeyError: "['Actual Price', 'Predicted Price', 'Difference'] not in index"

## The End