In [2]:
import pandas as pd

# Read the used-car listings from CSV into the working DataFrame `df`
file_path = "used_cars.csv"  # Update this with your actual file path
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as a sanity check on the load
df.head(n=5)




Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,Ford,Utility Police Interceptor Base,2013,"51,000 mi.",E85 Flex Fuel,300.0HP 3.7L V6 Cylinder Engine Flex Fuel Capa...,6-Speed A/T,Black,Black,At least 1 accident or damage reported,Yes,"$10,300"
1,Hyundai,Palisade SEL,2021,"34,742 mi.",Gasoline,3.8L V6 24V GDI DOHC,8-Speed Automatic,Moonlight Cloud,Gray,At least 1 accident or damage reported,Yes,"$38,005"
2,Lexus,RX 350 RX 350,2022,"22,372 mi.",Gasoline,3.5 Liter DOHC,Automatic,Blue,Black,None reported,,"$54,598"
3,INFINITI,Q50 Hybrid Sport,2015,"88,900 mi.",Hybrid,354.0HP 3.5L V6 Cylinder Engine Gas/Electric H...,7-Speed A/T,Black,Black,None reported,Yes,"$15,500"
4,Audi,Q3 45 S line Premium Plus,2021,"9,835 mi.",Gasoline,2.0L I4 16V GDI DOHC Turbo,8-Speed Automatic,Glacier White Metallic,Black,None reported,,"$34,999"


In [6]:
print(df.columns)  # List the column labels currently present in df (reflects any earlier in-place edits)

Index(['brand', 'model', 'model_year', 'milage', 'fuel_type', 'transmission',
       'ext_col', 'int_col', 'accident', 'clean_title', 'price',
       'engine_size'],
      dtype='object')


In [7]:
# Preprocessing: turn the text-formatted numeric columns into floats, derive
# `engine_size`, and impute missing values. Every step is guarded by a column
# check and is safe to re-run on an already-cleaned `df`.

# Convert 'milage' to numeric, e.g. "51,000 mi." -> 51000.0.
# regex=False makes the patterns literal; with regex=True the "." in " mi."
# would match any character.
if 'milage' in df.columns:
    df['milage'] = (
        df['milage']
        .astype(str)
        .str.replace(' mi.', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )
else:
    print("⚠️ Warning: 'milage' column not found in dataset. Skipping this step.")

# Convert 'price' to numeric, e.g. "$10,300" -> 10300.0.
# The "$" must be removed literally: as a regex it is the end-of-string anchor,
# so it would strip nothing and the float conversion would fail on "$10300".
if 'price' in df.columns:
    df['price'] = (
        df['price']
        .astype(str)
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )
else:
    print("⚠️ Warning: 'price' column not found in dataset. Skipping this step.")

# Derive the engine displacement in litres from the free-text 'engine' column.
# The number must be followed by a litre unit ("3.7L", "3.5 Liter") so that a
# leading horsepower figure such as "300.0HP 3.7L V6 ..." is not picked up.
# Rows with no litre figure become NaN and are imputed below.
if 'engine' in df.columns:
    df['engine_size'] = (
        df['engine']
        .str.extract(r'(\d+(?:\.\d+)?)\s*(?:L\b|[Ll]iter)', expand=False)
        .astype(float)
    )
    df = df.drop(columns=['engine'])  # Drop original column after extraction
    print("✅ Extracted 'engine_size' from 'engine' and dropped 'engine'.")
elif 'engine_size' in df.columns:
    print("✅ 'engine_size' column already exists. Skipping extraction step.")
else:
    print("⚠️ Warning: 'engine' column not found in dataset. Skipping extraction.")

# Handle missing values: numeric columns get their column median first, then
# whatever is still missing (the text columns) is labelled "Unknown".
df = df.fillna(df.median(numeric_only=True))
df = df.fillna("Unknown")

# Verify changes
print("✅ Data preprocessing completed successfully.")


✅ 'engine_size' column already exists. Skipping extraction step.
✅ Data preprocessing completed successfully.


In [8]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column so the dummy columns are not perfectly collinear.
df_encoded = pd.get_dummies(
    df,
    columns=[
        'brand',
        'fuel_type',
        'transmission',
        'ext_col',
        'int_col',
        'clean_title',
        'accident',
    ],
    drop_first=True,
)

# 'model' is excluded from the feature set rather than encoded;
# errors='ignore' keeps this step safe if the column is already gone.
df_encoded = df_encoded.drop(columns=['model'], errors='ignore')

# Check the new dataset structure. info() writes its report to stdout and
# returns None, so it is called directly; wrapping it in print() would add a
# stray "None" line to the output.
df_encoded.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4009 entries, 0 to 4008
Columns: 604 entries, model_year to accident_Unknown
dtypes: float64(3), int64(1), uint8(600)
memory usage: 2.4 MB
None


In [9]:
from sklearn.model_selection import train_test_split

# Target is the listing price; every other encoded column is a predictor
y = df_encoded['price']
X = df_encoded.drop(columns=['price'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many rows landed in each partition
print(f"Training Data: {X_train.shape[0]} samples")
print(f"Testing Data: {X_test.shape[0]} samples")


Training Data: 3207 samples
Testing Data: 802 samples


In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Build the linear regression model and fit it on the training split
# (fit() returns the estimator itself, so construction and training chain)
lin_reg = LinearRegression().fit(X_train, y_train)

# Score the held-out rows
y_pred_lin = lin_reg.predict(X_test)

# Compute MAE, MSE and R² against the true test prices, in that order
mae_lin, mse_lin, r2_lin = (
    metric(y_test, y_pred_lin)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Report the test-set metrics
print("📊 Linear Regression Model Performance:")
print(f"✅ Mean Absolute Error (MAE): {mae_lin:.2f}")
print(f"✅ Mean Squared Error (MSE): {mse_lin:.2f}")
print(f"✅ R² Score: {r2_lin:.4f}")


📊 Linear Regression Model Performance:
✅ Mean Absolute Error (MAE): 24124.90
✅ Mean Squared Error (MSE): 18777226564.13
✅ R² Score: 0.0813


In [11]:
from sklearn.tree import DecisionTreeRegressor

# Build the decision tree regressor (seeded for repeatable results) and fit it
# on the training split; fit() returns the estimator itself
dt_reg = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Score the held-out rows
y_pred_dt = dt_reg.predict(X_test)

# Compute MAE, MSE and R² against the true test prices, in that order
mae_dt, mse_dt, r2_dt = (
    metric(y_test, y_pred_dt)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Report the test-set metrics
print("📊 Decision Tree Regression Model Performance:")
print(f"✅ Mean Absolute Error (MAE): {mae_dt:.2f}")
print(f"✅ Mean Squared Error (MSE): {mse_dt:.2f}")
print(f"✅ R² Score: {r2_dt:.4f}")


📊 Decision Tree Regression Model Performance:
✅ Mean Absolute Error (MAE): 20841.51
✅ Mean Squared Error (MSE): 18365886955.96
✅ R² Score: 0.1015
