In [9]:
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [10]:

# Load the car-price dataset from disk into a pandas DataFrame.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks
# like a row index that was saved into the CSV -- confirm, and consider
# reading with index_col=0 if nothing downstream relies on that column.
csv_path = 'Resources/model_car_prices_no_mmr.csv'
df = pd.read_csv(csv_path)

# Preview the first rows as a quick sanity check of the load.
df.head()


Unnamed: 0,year,make,body,color,interior,sellingprice,odometer,automatic
0,2015,18,SUV,17,1,21500.0,16639.0,True
1,2015,18,SUV,17,0,21500.0,9393.0,True
2,2014,2,Sedan,8,1,30000.0,1331.0,True
3,2015,40,Sedan,17,1,27750.0,14282.0,True
4,2014,2,Sedan,8,1,67000.0,2641.0,True


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# `df` is expected to be the DataFrame loaded earlier in the notebook.

# Columns fed to the model. 'body' is deliberately left out: the data is
# split by body type and one model is trained per type, so it is not used as
# a feature. The original author notes these columns are already encoded.
# Keep this order stable -- it determines the column order of X.
features = [
    'year',
    'odometer',
    'make',
    'interior',
    'color',
    'automatic',
]

# Function to remove outliers for given column
def remove_outliers(df, column, factor=1.5):
    """Drop rows whose ``column`` value falls outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where
    ``Q1``/``Q3`` are the 25th/75th percentiles of ``df[column]`` and
    ``IQR = Q3 - Q1``. Both fences are inclusive.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified; a filtered selection is returned.
    column : str
        Name of the numeric column used to compute the fences.
    factor : float, default 1.5
        Multiplier applied to the IQR. The default reproduces the original
        hard-coded 1.5 rule; larger values keep more rows.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``column`` value lies within the fences.
        Rows where ``column`` is NaN are dropped (NaN never compares inside
        the fences), and an empty input yields an empty result.
    """
    # One quantile call for both quartiles; unpacking iterates the values.
    q1, q3 = df[column].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    # Series.between is inclusive on both ends by default, matching the
    # original `>= lower & <= upper` comparison.
    return df[df[column].between(lower_bound, upper_bound)]

# Body types to model; one independent regressor is trained per entry.
unique_bodies = ['SUV', 'Sedan', 'Convertible', 'Coupe', 'Wagon', 'Hatchback', 'Crew_Truck', 'Minivan', 'Van', 'Ext_Truck', 'Truck']

# train_test_split needs at least one row on each side of the split, so a
# subset smaller than this cannot be modelled at all.
MIN_ROWS_PER_BODY = 2

# One record per successfully modelled body type, so the metrics can be
# compared after the loop instead of only being printed.
body_metrics_records = []

# Process each body type separately
for body_type in unique_bodies:
    print(f"Results for body type: {body_type}")

    # Filter the DataFrame for the current body type
    df_body = df[df['body'] == body_type]

    # Remove outliers from the 'odometer' column
    df_body = remove_outliers(df_body, 'odometer')

    # A label that is missing from the data (or misspelled in the list above)
    # leaves too few rows; train_test_split would raise ValueError and abort
    # every remaining body type, so skip it with a message instead.
    if len(df_body) < MIN_ROWS_PER_BODY:
        print(f'Skipping {body_type}: only {len(df_body)} row(s) after outlier removal\n')
        continue

    # Set up X and y (excluding 'body' from the model)
    X = df_body[features]
    y = df_body['sellingprice']

    # Split the data into training and testing sets (fixed seed for reproducibility)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

    # Initialize the Random Forest Regressor (fixed seed for reproducibility)
    rf_model = RandomForestRegressor(n_estimators=500, random_state=1)

    # Train the model
    rf_model.fit(X_train, y_train)

    # Make predictions using the testing set
    y_pred = rf_model.predict(X_test)

    # Calculate the performance metrics on the held-out rows
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Print the performance metrics
    print(f'Mean Squared Error (MSE) for {body_type}: {mse}')
    print(f'R-squared for {body_type}: {r2}\n')

    # Keep the numbers so they survive past this iteration.
    body_metrics_records.append(
        {'body': body_type, 'n_rows': len(df_body), 'mse': mse, 'r2': r2}
    )

# Per-body metrics as a table (empty, with the same columns, if every body
# type was skipped). Not displayed here so the cell's printed output is unchanged.
body_metrics = pd.DataFrame(body_metrics_records, columns=['body', 'n_rows', 'mse', 'r2'])


Results for body type: SUV
Mean Squared Error (MSE) for SUV: 30171491.491845623
R-squared for SUV: 0.7296187322428098

Results for body type: Sedan
Mean Squared Error (MSE) for Sedan: 13128699.078272583
R-squared for Sedan: 0.7757312249101904

Results for body type: Convertible
Mean Squared Error (MSE) for Convertible: 33964613.780782
R-squared for Convertible: 0.829880514799062

Results for body type: Coupe
Mean Squared Error (MSE) for Coupe: 44814381.02653974
R-squared for Coupe: 0.7551008630238139

Results for body type: Wagon
Mean Squared Error (MSE) for Wagon: 5515284.175321737
R-squared for Wagon: 0.9004660020661694

Results for body type: Hatchback
Mean Squared Error (MSE) for Hatchback: 7261070.286251895
R-squared for Hatchback: 0.7492245928591454

Results for body type: Crew_Truck
Mean Squared Error (MSE) for Crew_Truck: 27749827.72891401
R-squared for Crew_Truck: 0.7251494558543361

Results for body type: Minivan
Mean Squared Error (MSE) for Minivan: 4355316.656488592
R-squar