# 1. Voting Classifier
#### In this assignment, you are expected to build an ensemble of different models and train it on cover type dataset.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

## 1.1. Load dataset
#### You will need to read the data from the file (cover.csv). It contains 581012 samples and 54 attributes for each sample. The target column is Cover_Type.

In [2]:
# Load the dataset
df = pd.read_csv('cover.csv')

# Separate features and target
X = df.drop('Cover_Type', axis=1)
y = df['Cover_Type']

## 1.2. Prepare dataset
#### Split the data into train, validation, and test sets using train_test_split twice with 0.2 test_size. Your final distribution will be 371847-92962-116203.

In [3]:
# First split to separate out the test set
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Second split to divide the remaining data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)

# Check the sizes of each dataset to confirm they match the required distribution
train_size = X_train.shape[0]
val_size = X_val.shape[0]
test_size = X_test.shape[0]

train_size, val_size, test_size


(371847, 92962, 116203)

## 1.3. Modeling
#### Train 4-5 different classifiers on the data. You can train RandomForestClassifier, ExtraTreesClassifier, LinearSVC, SGDClassifier, MLPClassifier, etc. Evaluate their performances using validation set. Note that training may take quite a while (up to 30 minutes) depending on the hardware.

In [4]:
RandomForest_clf = RandomForestClassifier(n_jobs=-1, random_state=42)
ExtraTree_clf = ExtraTreesClassifier(n_jobs=-1, random_state=42)
MLP_clf = MLPClassifier(random_state=42)
GradientBoosting_clf = GradientBoostingClassifier(random_state=42)

# List of classifiers for easier management, now including GradientBoostingClassifier
classifiers = [RandomForest_clf, ExtraTree_clf, MLP_clf, GradientBoosting_clf]

# Train the classifiers
for clf in classifiers:
    print(f"Training {clf.__class__.__name__}...")
    clf.fit(X_train, y_train)
    print(f"Training completed for {clf.__class__.__name__}")

# Evaluate each classifier on the validation set
for clf in classifiers:
    # Predict using the pipeline, which will apply scaling automatically for SVC and SGD
    y_pred = clf.predict(X_val)
    print(f"{clf.__class__.__name__}: {accuracy_score(y_val, y_pred)}")

Training RandomForestClassifier...
Training completed for RandomForestClassifier
Training ExtraTreesClassifier...


MemoryError: could not allocate 16777216 bytes

## 1.4. Ensembling
#### Create a hard and soft voting classifier using the models you have trained. You can use VotingClassifier. Check its performance on the validation set. Do you get better or worse performance than any of the individual classifiers?

In [7]:
# Hard and soft voting classifier
hard_voting_clf = VotingClassifier(
    estimators=[('rf', RandomForest_clf), ('et', ExtraTree_clf), ('mlp', MLP_clf), ('gb', GradientBoosting_clf)],
    voting='hard')
soft_voting_clf = VotingClassifier(
    estimators=[('rf', RandomForest_clf), ('et', ExtraTree_clf), ('mlp', MLP_clf), ('gb', GradientBoosting_clf)],
    voting='soft')

# Training voting classifiers
hard_voting_clf.fit(X_train, y_train)
soft_voting_clf.fit(X_train, y_train)

# Checking performance of the voting classifiers on the validation set
for clf in (hard_voting_clf, soft_voting_clf):
    y_pred = clf.predict(X_val)
    print(clf.__class__.__name__, accuracy_score(y_val, y_pred))


VotingClassifier 0.8853940319700523
VotingClassifier 0.9114799595533659


#### Check if any of the models hurts the performance of the ensemble. You can access the estimators of the ensemble using estimators_ attribute. If so, drop those using set_params and reevaluate.

In [8]:
# Evaluating the performance of each individual classifier
individual_accuracies = {}
for clf in (RandomForest_clf, ExtraTree_clf, MLP_clf, GradientBoosting_clf):
    clf_name = clf.__class__.__name__
    y_pred = clf.predict(X_val)
    individual_accuracy = accuracy_score(y_val, y_pred)
    individual_accuracies[clf_name] = individual_accuracy
    print(f"{clf_name} accuracy: {individual_accuracy}")

# Evaluating the hard voting classifier
hard_voting_accuracy = accuracy_score(y_val, hard_voting_clf.predict(X_val))
print(f"Hard Voting Classifier accuracy: {hard_voting_accuracy}")

# Evaluating the soft voting classifier
soft_voting_accuracy = accuracy_score(y_val, soft_voting_clf.predict(X_val))
print(f"Soft Voting Classifier accuracy: {soft_voting_accuracy}")

# Checking whether any individual classifier has a higher accuracy than the ensemble
for clf_name, accuracy in individual_accuracies.items():
    if accuracy > hard_voting_accuracy:
        print(f"{clf_name} is hurting hard voting ensemble performance.")
    if accuracy > soft_voting_accuracy:
        print(f"{clf_name} is hurting soft voting ensemble performance.")

RandomForestClassifier accuracy: 0.9509799703104495
ExtraTreesClassifier accuracy: 0.9497859340375637
MLPClassifier accuracy: 0.7685505905638863
GradientBoostingClassifier accuracy: 0.7733697639895871
Hard Voting Classifier accuracy: 0.8853940319700523
Soft Voting Classifier accuracy: 0.9114799595533659
RandomForestClassifier is hurting hard voting ensemble performance.
RandomForestClassifier is hurting soft voting ensemble performance.
ExtraTreesClassifier is hurting hard voting ensemble performance.
ExtraTreesClassifier is hurting soft voting ensemble performance.


In [21]:
# Displaying the names of the current models within the hard voting ensemble before any changes
print("Current models in the hard voting ensemble prior to any exclusions:", hard_voting_clf.estimators_)

# Removing the MLPClassifier from the ensemble designated for hard voting
hard_voting_clf.set_params(estimators=[(model_name, estimator) for model_name, estimator in hard_voting_clf.estimators if model_name != 'MLPClassifier'])

# Updating the hard voting ensemble with the new set of estimators
hard_voting_clf.fit(X_train, y_train)

# Assessing the updated performance of the hard voting ensemble
predictions_hard = hard_voting_clf.predict(X_val)
accuracy_after_removal_hard = accuracy_score(y_val, predictions_hard)
print("Accuracy of the updated hard voting ensemble (MLP excluded): {}".format(accuracy_after_removal_hard))


Current models in the hard voting ensemble prior to any exclusions: [RandomForestClassifier(n_jobs=-1, random_state=42), ExtraTreesClassifier(n_jobs=-1, random_state=42), MLPClassifier(random_state=42), GradientBoostingClassifier(random_state=42)]


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Aspect
- Elevation
- Hillshade_3pm
- Hillshade_9am
- Hillshade_Noon
- ...
Feature names seen at fit time, yet now missing:
- -3.718306912453772384e+02
- 1.018255499889504426e+04


# 2. Random Forest
#### In this assignment, you are expected to build a random forest that classifies a toy dataset.

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

## 2.1. Load dataset
#### You will need to read the data from the file (data.csv). It contains 15000 samples and two features for each sample.

In [11]:
df = pd.read_csv('data.csv')

## 2.2. Prepare dataset
#### Split the data into train and test sets with 0.2 test size.

In [12]:
# Splitting the data into features and target variable
X = df.drop(columns=df.columns[-1])  # Features (all columns except the last one)
y = df[df.columns[-1]] # Target variable (the last column)

# Splitting the dataset into train and test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## 2.3. Modeling
#### Train a DecisionTreeClassifier on the data. Use GridSearchCV to tune the hyperparameters.

In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Decision Tree Classifier
decision_tree_clf = DecisionTreeClassifier(random_state=42)

# Hyperparameter grid for tuning
hyperparam_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Grid search
grid_search = GridSearchCV(decision_tree_clf, hyperparam_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Printing the best parameters and the best score
print(f"Best parameters:, {grid_search.best_params_}")
print(f"Best score:, {grid_search.best_score_}")



Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters:, {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Best score:, 0.8389872516326247


#### Train the best model on the whole train set (do you need to?) and evaluate the model on the test set.

In [15]:
# Training the best model on the whole train set
best_decision_tree_clf = grid_search.best_estimator_
best_decision_tree_clf.fit(X_train, y_train)

# Evaluating the model on the test set
y_pred_test = best_decision_tree_clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f"Test accuracy of the best Decision Tree model: {test_accuracy}")

Test accuracy of the best Decision Tree model: 0.8523333333333334


#### Generate 1,200 subsets of the training set, each containing 100 randomly chosen instances. You can use ShuffleSplit.

In [16]:
from sklearn.model_selection import ShuffleSplit

# Generating 1,200 subsets using ShuffleSplit
new_subsets = ShuffleSplit(n_splits=1200, train_size=100, random_state=42)

#### Train one tree on each subset, using the best model you previously found. Evaluate the performance of the trees using the test set. Did you get lower or higher accuracy? Why?

In [18]:
# Training one tree on each subset
subset_trees = [DecisionTreeClassifier(**grid_search.best_params_, random_state=42) for _ in range(1200)]
subset_accuracies = []

for train_index, _ in new_subsets.split(X_train):
    X_subset, y_subset = X_train.iloc[train_index], y_train.iloc[train_index]
    for tree in subset_trees:
        tree.fit(X_subset, y_subset)

    # Evaluate the tree on the test set
    y_pred_test_subset = tree.predict(X_test)
    subset_accuracy = accuracy_score(y_test, y_pred_test_subset)
    subset_accuracies.append(subset_accuracy)

average_subset_accuracy = sum(subset_accuracies) / len(subset_accuracies)
print(f"Average accuracy of the trees on the test set: {average_subset_accuracy}")

Average accuracy of the trees on the test set: 0.8014688888888879


#### For each instance in the test set, predict its class using 1200 trees, and keep only the most frequent prediction. You can use mode from scipy.stats. Evaluate these predictions. Did you get lower or higher accuracy?

In [20]:
from scipy.stats import mode
import numpy as np
from sklearn.metrics import accuracy_score

# Array for predictions with dimensions (number of test instances, number of trees)
predictions_matrix = np.array([tree.predict(X_test) for tree in subset_trees]).transpose()

# The most common prediction across trees for each test instance
majority_vote_predictions, _ = mode(predictions_matrix, axis=1)
majority_vote_predictions = majority_vote_predictions.squeeze()

# Accuracy of the ensemble model
accuracy_of_ensemble = accuracy_score(y_test, majority_vote_predictions)
print("Ensemble accuracy: {}".format(accuracy_of_ensemble))

Ensemble accuracy: 0.8076666666666666
