# Integration Tests 

In [None]:
## Import the Forest-Guided Clustering package
from fgclustering import FgClustering

## Imports for datasets
from palmerpenguins import load_penguins
from sklearn.datasets import load_breast_cancer, load_iris, load_boston, fetch_california_housing

## Additional imports for use-cases
import joblib
import pandas as pd

from random_word import RandomWords
from sklearn.datasets import make_classification, make_regression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import GridSearchCV

## Binary Classification: Breast Cancer Dataset

In [None]:
# Load the breast cancer data as one DataFrame (features + target) and replace
# the integer class codes with readable labels.
data_breast_cancer = load_breast_cancer(as_frame=True).frame
data_breast_cancer['target'] = data_breast_cancer['target'].map({0: 'malignant', 1: 'benign'})

# Separate the feature matrix from the label column.
X_breast_cancer = data_breast_cancer.drop(columns='target')
y_breast_cancer = data_breast_cancer['target']

# Tune a Random Forest over a small grid with 5-fold cross-validation;
# oob_score=True makes the out-of-bag accuracy available on the fitted forest.
classifier = RandomForestClassifier(
    max_samples=0.8,
    bootstrap=True,
    oob_score=True,
    random_state=42,
)
grid = {'max_depth': [2, 5], 'max_features': ['sqrt', 'log2']}
grid_classifier = GridSearchCV(estimator=classifier, param_grid=grid, cv=5)
grid_classifier.fit(X_breast_cancer, y_breast_cancer)
rf_breast_cancer = grid_classifier.best_estimator_

print('Parameters of best prediction model:', grid_classifier.best_params_, sep='\n')
print('OOB accuracy of prediction model:', rf_breast_cancer.oob_score_, sep='\n')

In [None]:
# Run Forest-Guided Clustering on the tuned breast cancer forest with default settings.
fgc = FgClustering(
    model=rf_breast_cancer,
    data=data_breast_cancer,
    target_column='target',
)
fgc.run()

In [None]:
# Draw the three FGC visualisations for the breast cancer clustering.
# thr_pvalue=0.01 is presumably a p-value cut-off on the displayed features —
# confirm against the FgClustering documentation.
fgc.plot_global_feature_importance()
fgc.plot_local_feature_importance()
fgc.plot_decision_paths(thr_pvalue=0.01)

## Multiclass Classification: Iris Dataset

In [None]:
# Load the iris data as one DataFrame (features + target) and replace the
# integer class codes with the species names.
data_iris = load_iris(as_frame=True).frame
data_iris['target'] = data_iris['target'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})

# Separate the feature matrix from the label column.
X_iris = data_iris.drop(columns='target')
y_iris = data_iris['target']

# Tune a Random Forest over a small grid with 5-fold cross-validation;
# oob_score=True makes the out-of-bag accuracy available on the fitted forest.
classifier = RandomForestClassifier(
    max_samples=0.8,
    bootstrap=True,
    oob_score=True,
    random_state=42,
)
grid = {'max_depth': [2, 5], 'max_features': ['sqrt', 'log2']}
grid_classifier = GridSearchCV(estimator=classifier, param_grid=grid, cv=5)
grid_classifier.fit(X_iris, y_iris)
rf_iris = grid_classifier.best_estimator_

print('Parameters of best prediction model:', grid_classifier.best_params_, sep='\n')
print('OOB accuracy of prediction model:', rf_iris.oob_score_, sep='\n')

In [None]:
# Run Forest-Guided Clustering on the tuned iris forest with default settings.
fgc = FgClustering(
    model=rf_iris,
    data=data_iris,
    target_column='target',
)
fgc.run()

In [None]:
# Draw the three FGC visualisations for the iris clustering.
# thr_pvalue=0.01 is presumably a p-value cut-off on the displayed features —
# confirm against the FgClustering documentation.
fgc.plot_global_feature_importance()
fgc.plot_local_feature_importance()
fgc.plot_decision_paths(thr_pvalue=0.01)

## Multiclass Classification: Penguins Dataset

In [None]:
# Load the Palmer penguins data and use the species column as prediction target.
data_penguins = load_penguins().rename(columns={'species': 'target'})

# Drop rows with missing values and report the row count before and after.
print(f"Before omiting the missing values the dataset has {data_penguins.shape[0]} instances")
data_penguins = data_penguins.dropna()
print(f"After omiting the missing values the dataset has {data_penguins.shape[0]} instances")

# One-hot encode the categorical features so the Random Forest can consume them;
# drop_first=True drops the first dummy level of each encoded feature.
data_penguins_encoded = pd.get_dummies(
    data_penguins,
    columns=['island', 'sex'],
    prefix=['island', 'sex'],
    drop_first=True,
)

# Separate the encoded feature matrix from the label column.
X_penguins = data_penguins_encoded.drop(columns='target')
y_penguins = data_penguins_encoded['target']

# Tune a Random Forest over a small grid with 5-fold cross-validation;
# oob_score=True makes the out-of-bag accuracy available on the fitted forest.
classifier = RandomForestClassifier(
    max_samples=0.8,
    bootstrap=True,
    oob_score=True,
    random_state=42,
)
grid = {'max_depth': [2, 5], 'max_features': ['sqrt', 'log2']}
grid_classifier = GridSearchCV(estimator=classifier, param_grid=grid, cv=5)
grid_classifier.fit(X_penguins, y_penguins)
rf_penguins = grid_classifier.best_estimator_

print('Parameters of best prediction model:', grid_classifier.best_params_, sep='\n')
print('OOB accuracy of prediction model:', rf_penguins.oob_score_, sep='\n')

In [None]:
# Run Forest-Guided Clustering on the tuned penguins forest, using the
# one-hot encoded frame the forest was trained on.
fgc = FgClustering(
    model=rf_penguins,
    data=data_penguins_encoded,
    target_column='target',
)
fgc.run()

In [None]:
# Draw the three FGC visualisations for the penguins clustering
# (features still in their one-hot encoded form at this point).
# thr_pvalue=0.01 is presumably a p-value cut-off on the displayed features —
# confirm against the FgClustering documentation.
fgc.plot_global_feature_importance()
fgc.plot_local_feature_importance()
fgc.plot_decision_paths(thr_pvalue=0.01)

In [None]:
# Recompute the feature statistics on the original (not one-hot encoded)
# penguins features, with island and sex typed as categorical columns.
X = data_penguins.drop(columns='target').astype({'island': 'category', 'sex': 'category'})

fgc.calculate_statistics(X)

In [None]:
# Draw the three FGC visualisations for the penguins clustering again.
# thr_pvalue=0.01 is presumably a p-value cut-off on the displayed features —
# confirm against the FgClustering documentation.
fgc.plot_global_feature_importance()
fgc.plot_local_feature_importance()
fgc.plot_decision_paths(thr_pvalue=0.01)

## Regression: Boston Housing Dataset

In [None]:
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs against an older scikit-learn — confirm the
# pinned version or replace the dataset.
data = load_boston()

# Create an empty frame with one column per feature name, fill it with the
# feature matrix, then append the response as 'target'.
data_boston = pd.DataFrame(columns=data['feature_names'], index=range(data['data'].shape[0]))
data_boston.loc[:,:] = data['data']
data_boston['target'] = data['target']

# features need to be converted to the correct dtypes: every column becomes
# float64 except CHAS, which is cast to a categorical column
for feature in data_boston.columns:
    if feature != 'CHAS':
        data_boston[feature] = data_boston[feature].astype('float64')
data_boston['CHAS'] = data_boston['CHAS'].astype('category')

# Separate the feature matrix from the response column.
X_boston = data_boston.loc[:, data_boston.columns != 'target']
y_boston = data_boston.target

# Tune a Random Forest regressor over a small grid with 5-fold cross-validation
# (the variable names say "classifier", but the estimator is a regressor).
grid = {'max_depth':[2, 5], 'max_features': ['sqrt', 'log2']}
classifier = RandomForestRegressor(n_estimators=100, bootstrap=True, oob_score=True, random_state=42)
grid_classifier = GridSearchCV(classifier, grid, cv=5)
grid_classifier.fit(X_boston, y_boston)
rf_boston = grid_classifier.best_estimator_

print('Parameters of best prediction model:')
print(grid_classifier.best_params_)
print('OOB R^2 of prediction model:')
print(rf_boston.oob_score_)

In [None]:
# Run Forest-Guided Clustering on the tuned Boston housing forest with default settings.
fgc = FgClustering(
    model=rf_boston,
    data=data_boston,
    target_column='target',
)
fgc.run()

In [None]:
# Draw the three FGC visualisations for the Boston housing clustering.
# thr_pvalue=0.01 is presumably a p-value cut-off on the displayed features —
# confirm against the FgClustering documentation.
fgc.plot_global_feature_importance()
fgc.plot_local_feature_importance()
fgc.plot_decision_paths(thr_pvalue=0.01)

## Regression: California Housing Dataset

In [None]:
# Load the California housing data as one DataFrame (features + target) and
# rename the response column to 'target'.
data_housing = fetch_california_housing(as_frame=True)
data_housing = data_housing.frame
data_housing.rename(columns={'MedHouseVal':'target'}, inplace=True)

# for sake of runtime we only use the first 3000 samples
data_housing = data_housing[:3000]

# Separate the feature matrix from the response column.
X_housing = data_housing.loc[:, data_housing.columns != 'target']
y_housing = data_housing.target

# Tune a Random Forest regressor over a small grid with 5-fold cross-validation
# (the variable names say "classifier", but the estimator is a regressor);
# oob_score=True makes the out-of-bag R^2 available on the fitted forest.
grid = {'max_depth':[2, 5], 'max_features': ['sqrt', 'log2']}
classifier = RandomForestRegressor(n_estimators=100, bootstrap=True, oob_score=True, random_state=42)
grid_classifier = GridSearchCV(classifier, grid, cv=5)
grid_classifier.fit(X_housing, y_housing)
rf_housing = grid_classifier.best_estimator_

print('Parameters of best prediction model:')
print(grid_classifier.best_params_)
print('OOB R^2 of prediction model:')
print(rf_housing.oob_score_)

In [None]:
# Run Forest-Guided Clustering on the tuned California housing forest,
# explicitly selecting the 'pam' clustering method with 'k-medoids++'
# initialisation and six parallel jobs.
fgc = FgClustering(
    model=rf_housing,
    data=data_housing,
    target_column='target',
)
fgc.run(
    method_clustering='pam',
    init_clustering='k-medoids++',
    n_jobs=6,
)

In [None]:
# Draw the three FGC visualisations for the California housing clustering
# obtained with the 'pam' method.
# thr_pvalue=0.01 is presumably a p-value cut-off on the displayed features —
# confirm against the FgClustering documentation.
fgc.plot_global_feature_importance()
fgc.plot_local_feature_importance()
fgc.plot_decision_paths(thr_pvalue=0.01)

In [None]:
# Same model and data as the 'pam' run, but clustered with the 'alternate'
# method (presumably the faster k-medoids variant, hence the name — confirm
# against the FgClustering documentation).
fgc_fast = FgClustering(
    model=rf_housing,
    data=data_housing,
    target_column='target',
)
fgc_fast.run(
    method_clustering='alternate',
    init_clustering='k-medoids++',
    n_jobs=6,
)

In [None]:
# Draw the three FGC visualisations for the California housing clustering
# obtained with the 'alternate' method.
# thr_pvalue=0.01 is presumably a p-value cut-off on the displayed features —
# confirm against the FgClustering documentation.
fgc_fast.plot_global_feature_importance()
fgc_fast.plot_local_feature_importance()
fgc_fast.plot_decision_paths(thr_pvalue=0.01)

## Artificial Datasets

### Binary Classification

In [None]:
# create random feature labels
n_features = 10
feature_names = []

# Keep drawing until n_features usable labels are collected. A word is skipped
# when it is empty/None, already taken, or equal to 'target' (the name used for
# the response column of the artificial datasets), because duplicate column
# names would make the DataFrames built from these labels ambiguous.
r = RandomWords()
for i in range(10 * n_features):  # bounded so a failing word source cannot loop forever
    if len(feature_names) == n_features:
        break
    word = r.get_random_word()
    if word and word != 'target' and word not in feature_names:
        feature_names.append(word)

# Top up with generic labels if the word source did not yield enough usable words.
feature_names.extend(f'feature_{idx}' for idx in range(len(feature_names), n_features))

In [None]:
# make classification dataset
X, y = make_classification(
    n_samples=300,
    n_features=n_features,
    n_informative=4,
    n_redundant=2,
    n_classes=2,
    n_clusters_per_class=1,
    random_state=1,
)

# Wrap the feature matrix in a DataFrame with the random labels as header and
# attach the class as a readable string label.
data_classification = pd.DataFrame(X, columns=feature_names)
data_classification['target'] = pd.Series(y).map({0: 'alpaca', 1: 'lion'})

data_classification.head()

In [None]:
# Separate the feature matrix from the label column.
X_classification = data_classification.drop(columns='target')
y_classification = data_classification['target']

# Tune a Random Forest over a small grid with 5-fold cross-validation;
# oob_score=True makes the out-of-bag accuracy available on the fitted forest.
classifier = RandomForestClassifier(
    max_samples=0.8,
    bootstrap=True,
    oob_score=True,
    random_state=42,
)
grid = {'max_depth': [2, 5, 10], 'max_features': ['sqrt', 'log2']}
grid_classifier = GridSearchCV(estimator=classifier, param_grid=grid, cv=5)
grid_classifier.fit(X_classification, y_classification)
rf_classification = grid_classifier.best_estimator_

print('Parameters of best prediction model:', grid_classifier.best_params_, sep='\n')
print('OOB accuracy of prediction model:', rf_classification.oob_score_, sep='\n')

In [None]:
# Run Forest-Guided Clustering on the artificial classification forest
# using three parallel jobs.
fgc = FgClustering(
    model=rf_classification,
    data=data_classification,
    target_column='target',
)
fgc.run(n_jobs=3)

In [None]:
# Draw the three FGC visualisations for the artificial classification data;
# unlike the other sections, plot_decision_paths is called with its defaults.
fgc.plot_global_feature_importance()
fgc.plot_local_feature_importance()
fgc.plot_decision_paths()

### Regression

In [None]:
# make regression dataset
X, y = make_regression(
    n_samples=500,
    n_features=n_features,
    n_informative=4,
    n_targets=1,
    noise=0,
    random_state=1,
)

# Wrap the feature matrix in a DataFrame with the random labels as header and
# attach the continuous response as 'target'.
data_regression = pd.DataFrame(X, columns=feature_names)
data_regression['target'] = y

data_regression.head()

In [None]:
# Separate the feature matrix from the response column.
X_regression = data_regression.drop(columns='target')
y_regression = data_regression['target']

# Tune a Random Forest regressor over a small grid with 5-fold cross-validation
# (the variable names say "classifier", but the estimator is a regressor);
# oob_score=True makes the out-of-bag R^2 available on the fitted forest.
classifier = RandomForestRegressor(
    max_samples=0.8,
    bootstrap=True,
    oob_score=True,
    random_state=42,
)
grid = {'max_depth': [2, 5], 'max_features': ['sqrt', 'log2']}
grid_classifier = GridSearchCV(estimator=classifier, param_grid=grid, cv=5)
grid_classifier.fit(X_regression, y_regression)
rf_regression = grid_classifier.best_estimator_

print('Parameters of best prediction model:', grid_classifier.best_params_, sep='\n')
print('OOB R^2 of prediction model:', rf_regression.oob_score_, sep='\n')

In [None]:
# Run Forest-Guided Clustering on the artificial regression forest using three
# parallel jobs. discart_value_JI=0.7 is presumably the Jaccard-index stability
# cut-off below which a clustering is discarded — confirm against the
# FgClustering.run documentation.
fgc = FgClustering(
    model=rf_regression,
    data=data_regression,
    target_column='target',
)
fgc.run(
    n_jobs=3,
    discart_value_JI=0.7,
)

In [None]:
# Draw the three FGC visualisations for the artificial regression data.
# thr_pvalue=0.01 is presumably a p-value cut-off on the displayed features and
# num_cols=3 presumably the number of subplot columns — confirm against the
# FgClustering documentation.
fgc.plot_global_feature_importance()
fgc.plot_local_feature_importance()
fgc.plot_decision_paths(thr_pvalue=0.01, num_cols=3)