In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 1: Read the car listings CSV into a DataFrame
data = pd.read_csv('true_car_listings.csv')
print("Data loaded successfully.")

# Quick sanity check: the leading rows, followed by describe()'s summary
# statistics. Each label and its table are separated by a newline.
print("First few rows of the dataset:", data.head(), sep="\n")
print("Summary statistics of the dataset:", data.describe(), sep="\n")

# Predictor columns, grouped by the preprocessing each group receives
categorical_features = ['City', 'State', 'Make', 'Model']
numeric_features = ['Year', 'Mileage']

# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones. handle_unknown='ignore' lets the encoder transform rows
# whose category was not present when it was fitted instead of raising.
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Full model: preprocessing followed by an ordinary least-squares regressor
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Separate the target (Price) from the predictors. X keeps every other column;
# the pipeline's ColumnTransformer only picks up the columns it was given, so
# extra columns are not passed to the regressor.
X = data.drop(columns='Price')
y = data['Price']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("Data split into training and testing sets.")

# Fit preprocessing and regressor together on the training rows only, so the
# scaler and encoder never see the test rows.
pipeline.fit(X_train, y_train)
print("Model trained successfully.")

# Predict prices on the held-out test set
predictions = pipeline.predict(X_test)

# Evaluate the model: mean absolute error is in the same units as Price;
# R^2 is the coefficient of determination. R^2 is not a classification
# accuracy (it can be negative), so it is reported under its own name.
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
print(f"Mean Absolute Error: {mae}")
print(f"R^2 Score: {r2}")


Data loaded successfully.
First few rows of the dataset:
   Price  Year  Mileage              City State                Vin   Make  \
0   8995  2014    35725           El Paso    TX  19VDE2E53EE000083  Acura   
1  10888  2013    19606  Long Island City    NY  19VDE1F52DE012636  Acura   
2   8995  2013    48851           El Paso    TX  19VDE2E52DE000025  Acura   
3  10999  2014    39922           Windsor    CO  19VDE1F71EE003817  Acura   
4  14799  2016    22142            Lindon    UT  19UDE2F32GA001284  Acura   

          Model  
0    ILX6-Speed  
1    ILX5-Speed  
2    ILX6-Speed  
3    ILX5-Speed  
4  ILXAutomatic  
Summary statistics of the dataset:
               Price           Year       Mileage
count  852122.000000  852122.000000  8.521220e+05
mean    21464.100210    2013.289145  5.250779e+04
std     13596.202241       3.414987  4.198896e+04
min      1500.000000    1997.000000  5.000000e+00
25%     13000.000000    2012.000000  2.383600e+04
50%     18500.000000    2014.000000  