In [None]:
# 27-03-2024
# CSC354 – Assignmen2 – ML – Decision Trees
# Hamna Shahbaz
# FA21-BSE-048
# Performing classification and regression tasks using decision trees and generating a report about them.

In [11]:
#Question 01:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

dataset = pd.read_csv('/datasaurus.csv')

# Split the dataset into features and target variable
X = dataset.drop(columns=['dataset'])
y = dataset['dataset']

# Train-test Split (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline model with default parameters: J48 (DecisionTreeClassifier)
j48_classifier = DecisionTreeClassifier(random_state=42)
j48_classifier.fit(X_train, y_train)
j48_predictions_baseline = j48_classifier.predict(X_test)
j48_accuracy_baseline = accuracy_score(y_test, j48_predictions_baseline)
print("J48 Baseline Accuracy:", j48_accuracy_baseline)

# Baseline model with default parameters: Random Forest
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)
rf_predictions_baseline = rf_classifier.predict(X_test)
rf_accuracy_baseline = accuracy_score(y_test, rf_predictions_baseline)
print("Random Forest Baseline Accuracy:", rf_accuracy_baseline)

# Hyperparameter tuning using Randomized Search for J48
j48_param_dist = {
    "criterion": ["gini", "entropy"],
    "splitter": ["best", "random"],
    "max_depth": [None, 10, 20, 30, 40, 50],
    "min_samples_split": [2, 5, 10, 20],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["auto", "sqrt", "log2"]
}
j48_random_search = RandomizedSearchCV(j48_classifier, j48_param_dist, n_iter=100, cv=5, random_state=42, n_jobs=-1)
j48_random_search.fit(X_train, y_train)
j48_best_params_random = j48_random_search.best_params_
print("J48 Best Parameters (Random Search):", j48_best_params_random)

# Adjusting parameter search spaces for Grid Search (Random Forest)
rf_param_grid = {
    "n_estimators": [100, 200, 300],
    "criterion": ["gini", "entropy"],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
    "max_features": ["auto", "sqrt"]
}

rf_grid_search = GridSearchCV(rf_classifier, rf_param_grid, cv=5, n_jobs=-1)
rf_grid_search.fit(X_train, y_train)
rf_best_params_grid = rf_grid_search.best_params_
print("Random Forest Best Parameters (Grid Search):", rf_best_params_grid)

# Evaluate models with best parameters
j48_best_classifier = DecisionTreeClassifier(**j48_best_params_random, random_state=42)
j48_best_classifier.fit(X_train, y_train)
j48_predictions_best = j48_best_classifier.predict(X_test)
j48_accuracy_best = accuracy_score(y_test, j48_predictions_best)
print("J48 Best Accuracy:", j48_accuracy_best)

rf_best_classifier = RandomForestClassifier(**rf_best_params_grid, random_state=42)
rf_best_classifier.fit(X_train, y_train)
rf_predictions_best = rf_best_classifier.predict(X_test)
rf_accuracy_best = accuracy_score(y_test, rf_predictions_best)
print("Random Forest Best Accuracy:", rf_accuracy_best)


J48 Baseline Accuracy: 0.3972972972972973
Random Forest Baseline Accuracy: 0.46216216216216216




J48 Best Parameters (Random Search): {'splitter': 'best', 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 40, 'criterion': 'entropy'}


KeyboardInterrupt: 

In [10]:
#Question 02:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

dataset = pd.read_csv('/cars-dataset.csv')

# Check the column names in the dataset
print(dataset.columns)

# Split the dataset into features and target variable
X = dataset.drop(columns=['selling_price'])  # Features
y = dataset['selling_price']  # Target variable

# Preprocess categorical variables using one-hot encoding to avoid ValueErrors
X_encoded = pd.get_dummies(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

dt_regressor_baseline = DecisionTreeRegressor(random_state=42)
dt_regressor_baseline.fit(X_train, y_train)
dt_predictions_baseline = dt_regressor_baseline.predict(X_test)
mse_baseline = mean_squared_error(y_test, dt_predictions_baseline)
print("Baseline Mean Squared Error:", mse_baseline)

# Hyperparameter tuning using Randomized Search
dt_param_dist = {
    "criterion": ["friedman_mse", "squared_error"],
    "splitter": ["best", "random"],
    "max_depth": [None, 10, 20, 30, 40, 50],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["auto", "sqrt", "log2"]
}

dt_random_search = RandomizedSearchCV(dt_regressor_baseline, dt_param_dist, n_iter=100, cv=5, random_state=42, n_jobs=-1)
dt_random_search.fit(X_train, y_train)
best_params_random = dt_random_search.best_params_
print("Best Parameters (Random Search):", best_params_random)

# Hyperparameter tuning using Grid Search
dt_param_grid = {
    "criterion": ["friedman_mse", "squared_error"],
    "splitter": ["best", "random"],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
    "max_features": ["auto", "sqrt"]
}

dt_grid_search = GridSearchCV(dt_regressor_baseline, dt_param_grid, cv=5, n_jobs=-1)
dt_grid_search.fit(X_train, y_train)
best_params_grid = dt_grid_search.best_params_
print("Best Parameters (Grid Search):", best_params_grid)

# Evaluate model with best parameters
dt_regressor_best_random = DecisionTreeRegressor(**best_params_random, random_state=42)
dt_regressor_best_random.fit(X_train, y_train)
dt_predictions_best_random = dt_regressor_best_random.predict(X_test)
mse_best_random = mean_squared_error(y_test, dt_predictions_best_random)
print("Best Mean Squared Error (Random Search):", mse_best_random)

dt_regressor_best_grid = DecisionTreeRegressor(**best_params_grid, random_state=42)
dt_regressor_best_grid.fit(X_train, y_train)
dt_predictions_best_grid = dt_regressor_best_grid.predict(X_test)
mse_best_grid = mean_squared_error(y_test, dt_predictions_best_grid)
print("Best Mean Squared Error (Grid Search):", mse_best_grid)


Index(['year', 'km_driven', 'fuel', 'seller_type', 'transmission', 'owner',
       'selling_price'],
      dtype='object')
Baseline Mean Squared Error: 190042705342.0196




Best Parameters (Random Search): {'splitter': 'best', 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 20, 'criterion': 'squared_error'}
Best Parameters (Grid Search): {'criterion': 'friedman_mse', 'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 2, 'splitter': 'best'}
Best Mean Squared Error (Random Search): 166372967605.11325
Best Mean Squared Error (Grid Search): 165453664215.49405




**QUestion 03:**
In question 01 the manual viewing of data did not provide me with enough information so I was forced to learn the importance of data visualization; which showed that apparently  mathematically different data is so interconnected so as to be used to draw a whole shape. The hyperparametering of the datasaurus dataset consumed a great amount of time for it to execute properly due to the continuosly optimal parameter searching by random and gird searches. Different accuracy values were generated at each iteration of the hyperparametering that was used for exploration of the concept.
In question 2, classification tasks are fairly familiar and easy for me but regression is complex and new as I haven't been able to pay much attention to it. I had to deal with a great deals of warnings to properly comprehend and understand the nature of dynamic hyperparametering for a regression task. Overall I had quite a bit of difficulty trying to learn and track the output and the datasets used in these questions.