In [None]:
## Tune model hyperparameters
## From feature extraction -> (np array) -> UMAP Feature Selection -> (np array) -> ML model
## Calculate F2 score of results -> determine best model parameters

# Use F2 score (weigh recall higher) because in epilepsy detection, it is most important to detect ALL true positives

In [24]:
from sklearn.pipeline import make_pipeline
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA

In [2]:
!pip install umap-learn

Collecting umap-learn
  Downloading umap-learn-0.5.4.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.8/90.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.10.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: umap-learn, pynndescent
  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Created wheel for umap-learn: filename=umap_learn-0.5.4-py3-none-any.whl size=86770 sha256=057195abc8965b0131c13934cb2b2cf1dea9c2d90838e224f635934d78141898
  Stored in directory: /root/.cache/pip/wheels/fb/66/29/199acf5784d0f7b8add6d466175ab45506c96e386ed5dd0633
  Building wheel for pynndescent (setup.py) ... [?25l[?25hdone
  Created wheel for py

In [3]:
import umap.umap_ as umap

In [26]:
# should receive 32x178 arrays per file
# each array is labeled epilepsy or not epilepsy

def train_test_tune(data, labels):
  # Cross validate loop
  # Inside loop - UMAP + model
  # Use grid search
  # May not work for unsupervised models
  # Add in normalization

  # Inputs: numpy array of data, numpy array of labels
  # Outputs: best hyperparameters for each classical ml model

  # svc_pipeline = make_pipeline(umap.UMAP(), SVC())

  # full_pipeline = make_pipeline(
  #       GridSearchCV(
  #         estimator = svc_pipeline,
  #         param_grid = [{'umap':[umap.UMAP()],
  #                        'umap__n_components':[5, 10], 'umap__n_neighbors':[5, 10]},
  #                       {'svc': [SVC()],
  #                        'svc__kernel':['linear', 'rbf'], 'svc__C':[1, 10]}],
  #         n_jobs=1,
  #         scoring="neg_mean_squared_error",
  #         cv=5,
  #         verbose=2
  #       ))
  svc_pipeline = make_pipeline(PCA(), SVC())

  full_pipeline = make_pipeline(
        GridSearchCV(
          estimator = svc_pipeline,
          param_grid = [{'pca':[PCA()],
                         'pca__n_components':[2, 5]},
                        {'svc': [SVC()],
                         'svc__kernel':['linear', 'rbf'], 'svc__C':[1, 10]}],
          n_jobs=1,
          scoring="neg_mean_squared_error",
          error_score="raise",
          cv=5,
          verbose=2
        ))


  full_pipeline.fit(data, labels)


  print('Cross validate to determine optimal feature selection and model hyperparameters')

  # return all of best parameters of each model as a multidimensional list
  parameters = [['kernel', 'C', 'gamma', 'degree'], ['n_estimators', 'min_samples_leaf', 'max_features'],
                ['_covariance_type'], ['n_clusters'], ['n_components', 'n_neighbors', 'min_dist', 'metrics']]

  return parameters


In [27]:
data = np.random.rand(10,32,166)
labels = np.array([0,1,0,1,0,1,0,1,0,1])

param = train_test_tune(data,labels)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


ValueError: ignored

### Initial UMAP Testing

In [18]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [19]:
!pip install umap-learn



In [None]:
# penguins = pd.read_csv("https://raw.githubusercontent.com/allisonhorst/palmerpenguins/c19a904462482430170bfe2c718775ddb7dbb885/inst/extdata/penguins.csv")
# penguins.head()
# penguins = penguins.dropna()
# penguins.species.value_counts()

Adelie       146
Gentoo       119
Chinstrap     68
Name: species, dtype: int64

In [20]:
import umap.umap_ as umap

In [22]:
# Create UMAP object
reducer = umap.UMAP()

# Clean up data (won't need to do this once we have feature extraction(?)
# penguin_data = penguins[
# [
#         "bill_length_mm",
#         "bill_depth_mm",
#         "flipper_length_mm",
#         "body_mass_g",
#     ]
# ].values

scaled_penguin_data = np.random.rand(10,32,166)

# Turn data into z-scores
# scl = StandardScaler()
# scaled_penguin_data = scl.fit_transform(penguin_data)

# Data has been reduced into two features from four
embedding = reducer.fit_transform(scaled_penguin_data)
embedding.shape

ValueError: ignored