<a href="https://colab.research.google.com/github/JaxDoge/CS6410/blob/main/Group5Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --upgrade datasets
!pip install --upgrade catboost

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-

Import all necessary libraries

# Loading Data

In [2]:
from datasets import load_dataset

import pyarrow as pa
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from joblib import Memory

from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from tempfile import mkdtemp

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, label_binarize
from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import torch

Import Dataset

In [3]:
dataset = load_dataset("mstz/covertype", "covertype")["train"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/19.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/581012 [00:00<?, ? examples/s]

In [4]:
seed = 123
cv_fold = 3

df_new = dataset.to_pandas()
df_new.head()

# Drop rows if responsor is missing
df_new = df_new.dropna(subset=['cover_type'])

Data Pre-Processing

In [5]:
X = df_new.drop('cover_type', axis = 1)
y = df_new['cover_type']

In [6]:

numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
bool_features = X.select_dtypes(include=['bool']).columns.tolist()

In [7]:
# Manually apply transformation to each boolean feature
for column in bool_features:
    X[column] = X[column].astype(int)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.99, random_state=seed)

In [9]:
# Function to convert booleans to integers
def bool_to_int(x):
    return x.astype(int)

In [10]:
# Define scorers with multi-class handling
def multiclass_roc_auc_score(y_true, y_pred, average="macro"):
    y_true = label_binarize(y_true, classes=np.unique(y_true))

    return roc_auc_score(y_true, y_pred, average=average, multi_class="ovr")

In [11]:
# For numerical values
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# For Bool value
bool_transformer = Pipeline(steps=[
    ('bool_to_int', FunctionTransformer(bool_to_int))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('bool', 'passthrough', bool_features)
    ])

In [12]:
# Feature Selection with XGboost.
xgb_classifier = XGBClassifier(n_estimators=100, random_state=seed)
feature_selection = SelectFromModel(estimator=xgb_classifier, threshold='median')

In [13]:


roc_auc_scorer = make_scorer(multiclass_roc_auc_score, needs_proba=True, average='macro')

scoring = {
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1_score': make_scorer(f1_score, average='weighted'),
    'roc_auc': roc_auc_scorer,
    'accuracy': 'accuracy'
}

# Logistic Regression

In [14]:
# Create a temporary folder to store the pipeline's memory
cachedir = mkdtemp()

# Construct the final modeling pipeline with all components with data imbalance handler
lr_pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('oversampler', RandomOverSampler(random_state=seed)),
    ('feature_selection', feature_selection),
    ('model', LogisticRegression(multi_class='multinomial'))
    ], memory=Memory(cachedir, verbose=0)
)
lr_pipeline

In [15]:
# Perform cross-validation with parameter tuning
# Define the parameter grid
param_grid = {
    'model__C': [0.1, 0.2],
    'model__solver': ['sag']
}

grid_search = GridSearchCV(lr_pipeline, param_grid, cv=cv_fold, scoring=scoring, refit='recall')
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best Recall score found: ", grid_search.best_score_)

Best parameters found:  {'model__C': 0.1, 'model__solver': 'sag'}
Best Recall score found:  0.49553017147587014


In [16]:
# Test on test data set
best_pipeline = grid_search.best_estimator_

predictions = best_pipeline.predict(X_test)
y_proba = best_pipeline.predict_proba(X_test)

# Evaluate the predictions
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')
roc = roc_auc_score(label_binarize(y_test, classes=np.unique(y_train)), y_proba, multi_class='ovr', average='macro')
accuracy = accuracy_score(y_test, predictions)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC:", roc)

Accuracy: 0.5221539563492478
Precision: 0.5810465339303259
Recall: 0.5221539563492478
F1 Score: 0.5404030153700666
ROC: 0.8504223980264188


# XGBoost Tree

In [17]:
# Switch the model in the pipeline
xgb_pipeline = lr_pipeline
xgb_pipeline.set_params(model=XGBClassifier(use_label_encoder=False))

In [18]:
param_grid = {
    'model__n_estimators': [300, 400],
    'model__learning_rate': [0.25, 0.5],
    'model__max_depth': [2],
    'model__colsample_bytree': [0.8],
    'model__subsample': [0.8],
    'model__reg_alpha': [1, 1.5],
    'model__reg_lambda': [3, 3.5],
    'model__eval_metric': ['mlogloss']
}

grid_search = GridSearchCV(xgb_pipeline, param_grid, scoring=scoring, refit='recall', cv=cv_fold, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best Recall score found: ", grid_search.best_score_)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best parameters found:  {'model__colsample_bytree': 0.8, 'model__eval_metric': 'mlogloss', 'model__learning_rate': 0.25, 'model__max_depth': 2, 'model__n_estimators': 400, 'model__reg_alpha': 1, 'model__reg_lambda': 3, 'model__subsample': 0.8}
Best Recall score found:  0.5111957266853545


In [19]:
# Test on test data set
best_pipeline = grid_search.best_estimator_

predictions = best_pipeline.predict(X_test)
y_proba = best_pipeline.predict_proba(X_test)

# Evaluate the predictions
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')
roc = roc_auc_score(label_binarize(y_test, classes=np.unique(y_train)), y_proba, multi_class='ovr', average='macro')
accuracy = accuracy_score(y_test, predictions)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC:", roc)

Accuracy: 0.5233031178612035
Precision: 0.583543198432635
Recall: 0.5233031178612035
F1 Score: 0.5411211967739498
ROC: 0.8544726721069121


# CatBoost Tree

In [23]:
# Switch the model in the pipeline
catb_pipeline = xgb_pipeline
catb_pipeline.set_params(model=CatBoostClassifier(random_seed=seed, loss_function='MultiClass', verbose=False))

In [None]:
param_grid = {
    'model__iterations': [400, 500, 800],
    'model__learning_rate': [0.1, 0.25, 0.5],
    'model__max_depth': [2, 3, 4],
}

grid_search = GridSearchCV(catb_pipeline, param_grid, scoring=scoring, refit='recall', cv=cv_fold, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best Recall score found: ", grid_search.best_score_)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [22]:
# Test on test data set
best_pipeline = grid_search.best_estimator_

predictions = best_pipeline.predict(X_test)
y_proba = best_pipeline.predict_proba(X_test)

# Evaluate the predictions
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')
roc = roc_auc_score(label_binarize(y_test, classes=np.unique(y_train)), y_proba, multi_class='ovr', average='macro')
accuracy = accuracy_score(y_test, predictions)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC:", roc)

Accuracy: 0.5233031178612035
Precision: 0.583543198432635
Recall: 0.5233031178612035
F1 Score: 0.5411211967739498
ROC: 0.8538068873896739


# Neural Network