# Part a

In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Part a: fit an unrestricted (full-grown) regression tree that predicts Price,
# report training/validation MSE, and list the most important predictors.

# Load the data
data = pd.read_csv('ToyotaCorolla.csv', delimiter=',')

# Predictor columns are declared exactly once. The design matrix below and the
# feature-importance labels further down are both derived from this list, so
# they cannot drift out of sync when a column is added, removed or reordered.
categorical_cols = ['Fuel_Type']
predictor_cols = [
    'Age_08_04', 'KM', 'Fuel_Type', 'HP', 'Automatic', 'Doors',
    'Quarterly_Tax', 'Mfr_Guarantee', 'Guarantee_Period', 'Airco',
    'Automatic_airco', 'CD_Player', 'Powered_Windows', 'Sport_Model',
    'Tow_Bar',
]

# Selecting predictors and the target variable
X = data[predictor_cols]
y = data['Price']

# Preprocessing: convert the categorical column(s) into dummies and pass every
# other predictor through untouched.
# - handle_unknown='ignore': a category that is absent from a training split
#   (or from a cross-validation fold) is encoded as all zeros instead of
#   raising at transform time.
# - verbose_feature_names_out=False: output names are not prefixed with the
#   transformer name, so they read 'Fuel_Type_<level>', 'Age_08_04', 'KM', ...
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False)

# Split the data into training (60%) and validation (40%) sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.4, random_state=1)

# Build a pipeline with preprocessing and model. No depth/size limits are set
# on the tree, so it is grown with the estimator's default stopping rules.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', DecisionTreeRegressor(random_state=1))])

# Fit the model
pipeline.fit(X_train, y_train)

# Predictions
y_train_pred = pipeline.predict(X_train)
y_val_pred = pipeline.predict(X_val)

# Calculate and print MSE
mse_train = mean_squared_error(y_train, y_train_pred)
mse_val = mean_squared_error(y_val, y_val_pred)

print(f"Training MSE: {mse_train}")
print(f"Validation MSE: {mse_val}")

# To find the most important features, we look at the feature importances of the tree.
# The labels come from the fitted preprocessor itself, so they follow the real
# column order of the matrix the tree was trained on (dummies first, then the
# passed-through columns) rather than a hand-maintained copy of that order.
feature_names = list(pipeline.named_steps['preprocessor'].get_feature_names_out())
importances = pipeline.named_steps['regressor'].feature_importances_
assert len(feature_names) == len(importances), "feature labels do not match the fitted tree's inputs"
important_features = pd.Series(importances, index=feature_names).sort_values(ascending=False)

print("Most important features:")
print(important_features.head(4))

Training MSE: 0.0
Validation MSE: 2101312.659130435
Most important features:
Age_08_04          0.834615
KM                 0.060138
HP                 0.051740
Automatic_airco    0.013349
dtype: float64


# Part b

In [29]:
from sklearn.model_selection import GridSearchCV

# Part b: limit tree size via cross-validated grid search and compare the
# tuned tree's validation MSE with the full-grown tree fitted earlier.

# Inclusive bounds of the search grid. range() excludes its stop value, so the
# upper bounds get "+ 1" below; previously range(3, 10) stopped at depth 9 even
# though the stated intent was to explore depths 3 to 10.
MAX_DEPTH_MIN, MAX_DEPTH_MAX = 3, 10
MIN_SAMPLES_SPLIT_MIN, MIN_SAMPLES_SPLIT_MAX = 2, 9

# Define the parameter grid to search over. The 'regressor__' prefix routes
# each parameter to the pipeline step named 'regressor'.
param_grid = {
    'regressor__max_depth': range(MAX_DEPTH_MIN, MAX_DEPTH_MAX + 1),  # Tree depths 3..10 inclusive
    'regressor__min_samples_split': range(MIN_SAMPLES_SPLIT_MIN, MIN_SAMPLES_SPLIT_MAX + 1)  # Minimum samples to split a node, 2..9 inclusive
}

# Create the GridSearchCV object: 5-fold CV, scored by negated MSE (higher is
# better), also recording training-fold scores in cv_results_.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)

# Fit the model using grid search (uses the training split only; the
# validation split stays untouched for the comparison below)
grid_search.fit(X_train, y_train)

# Best parameters found
print("Best parameters:", grid_search.best_params_)

# Predictions with the best found model
y_val_pred_optimized = grid_search.predict(X_val)

# Calculate and print MSE for the optimized model
mse_val_optimized = mean_squared_error(y_val, y_val_pred_optimized)
print(f"Optimized Validation MSE: {mse_val_optimized}")

# Comparing with the full-grown tree (mse_val comes from the Part a cell)
print(f"Full-grown Tree Validation MSE: {mse_val}")

Best parameters: {'regressor__max_depth': 6, 'regressor__min_samples_split': 7}
Optimized Validation MSE: 1445948.9422725793
Full-grown Tree Validation MSE: 2101312.659130435
