In [1]:
import numpy as np
import pandas as pd
from numpy import array
import matplotlib.pyplot as plt
from scipy.stats import randint, uniform

from sklearn.feature_selection import f_classif, mutual_info_classif, SelectFdr

from sklearn.feature_selection import GenericUnivariateSelect
from sklearn.svm import LinearSVC, NuSVC
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier  

import xgboost as xgb
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split

from warnings import filterwarnings
filterwarnings('ignore')


In [2]:
class hyper_model:
    """Three-stage hyper-parameter tuning wrapper around a few classifiers.

    ``fit`` trains the estimator selected by ``model_name`` with its stock
    hyper-parameters, runs a ``RandomizedSearchCV`` over a coarse search
    space, then a ``GridSearchCV`` over a narrow grid centred on the
    random-search winner.  Every stage is scored on the held-out test split,
    the scores are logged in ``all_models_perf`` and the best of the three
    fitted estimators is exposed through ``best_model``, ``best_params`` and
    ``best_model_perf``.

    Supported ``model_name`` values: ``'linear_svm'``, ``'non_linear_svm'``,
    ``'random_forest_classifier'`` and ``'xg_boost'``.
    """

    # Row labels written to ``all_models_perf['Model Iteration']``.
    _STOCK_LABEL = 'Stock Params'
    _RANDOM_LABEL = 'Random HP Search Best Model'
    _GRID_LABEL = 'Further Grid HP Search Best Model'

    def __init__(self,model_name,X_train,y_train,X_test,y_test):
        """Store the data splits and initialise the (still empty) results.

        Parameters
        ----------
        model_name : str
            Key of the estimator family to tune (see the class docstring).
        X_train, y_train
            Training split; used for the stock fit and for both
            cross-validated searches.
        X_test, y_test
            Held-out split; used only to score the fitted estimators.
        """
        self.model_name = model_name
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.best_model = None
        self.best_params = None
        self.best_model_perf = None
        self.all_models_perf = pd.DataFrame(None,columns=['Model Iteration','Score'])
        self.stock_model = None
        self.rd_model = None
        self.grid_model = None

    def fit(self):
        """Run the stock fit, the random search and the refining grid search.

        Both searches are cross-validated on the training split only, so the
        test split is never seen during tuning; it is used solely to score the
        three resulting estimators.  Populates ``all_models_perf``,
        ``stock_model``, ``rd_model``, ``grid_model``, ``best_model``,
        ``best_params`` and ``best_model_perf``.

        Raises
        ------
        ValueError
            If ``model_name`` is not one of the supported keys.
        """
        spec_builders = {
            'linear_svm': self._linear_svm_spec,
            'non_linear_svm': self._non_linear_svm_spec,
            'random_forest_classifier': self._random_forest_spec,
            'xg_boost': self._xg_boost_spec,
        }
        if self.model_name not in spec_builders:
            raise ValueError(
                'Unknown model_name {!r}; expected one of {}'.format(
                    self.model_name, sorted(spec_builders)))

        model, random_space, n_iter, refine = spec_builders[self.model_name]()

        # Stage 1: initial model fit with stock hyper-parameters.
        model.fit(self.X_train, self.y_train)
        init_score = model.score(self.X_test, self.y_test)
        self._log_score(self._STOCK_LABEL, init_score)
        print('Initial Model with Stock Params Score : {}'.format(init_score))

        # Stage 2: random search cross validation (the search clones `model`).
        random_search = RandomizedSearchCV(
            model,
            param_distributions=random_space,
            random_state=42,
            n_iter=self._cap_n_iter(random_space, n_iter),
            cv=4,
            verbose=1,
            n_jobs=1,
            return_train_score=True,
        )
        random_search.fit(self.X_train, self.y_train)
        rd_model = random_search.best_estimator_
        rd_score = rd_model.score(self.X_test, self.y_test)
        self._log_score(self._RANDOM_LABEL, rd_score)
        print('Random HP Search Best Model Score : {}'.format(rd_score))
        print('Improvement of {}% relative to the initial model'.format(
            self._improvement_pct(rd_score, init_score)))

        # Stage 3: further grid search around the random-search winner.
        grid_search = GridSearchCV(
            model,
            param_grid=refine(random_search.best_params_),
            cv=4,
            verbose=1,
            n_jobs=1,
        )
        grid_search.fit(self.X_train, self.y_train)
        grid_model = grid_search.best_estimator_
        grid_score = grid_model.score(self.X_test, self.y_test)
        self._log_score(self._GRID_LABEL, grid_score)
        print('Further Grid HP Search Best Model Score : {}'.format(grid_score))
        print('Improvement of {}% relative to the initial model'.format(
            self._improvement_pct(grid_score, init_score)))

        self._select_best([
            (self._STOCK_LABEL, init_score, model),
            (self._RANDOM_LABEL, rd_score, rd_model),
            (self._GRID_LABEL, grid_score, grid_model),
        ])

        self.stock_model = model
        self.rd_model = rd_model
        self.grid_model = grid_model

    # ------------------------------------------------------------------
    # Bookkeeping helpers
    # ------------------------------------------------------------------

    def _log_score(self, label, score):
        """Append one ``(label, score)`` row to ``all_models_perf``."""
        # Enlargement via .loc replaces DataFrame.append (removed in pandas 2.0).
        self.all_models_perf.loc[len(self.all_models_perf)] = [label, score]

    def _select_best(self, candidates):
        """Keep the highest-scoring ``(label, score, estimator)`` candidate.

        Ties go to the earliest candidate, i.e. the simpler tuning stage.
        Returns the label of the selected candidate.
        """
        label, score, estimator = max(candidates, key=lambda item: item[1])
        print('Overall Best Model : {}'.format(label))
        self.best_model = estimator
        self.best_params = estimator.get_params()
        self.best_model_perf = score
        return label

    @staticmethod
    def _improvement_pct(score, baseline):
        """Relative gain of ``score`` over ``baseline`` in percent (NaN if baseline is 0)."""
        if baseline == 0:
            return float('nan')
        return ((score - baseline) / baseline) * 100

    @staticmethod
    def _cap_n_iter(param_space, n_iter):
        """Limit ``n_iter`` to the size of the space when it is a finite grid."""
        sizes = []
        for candidates in param_space.values():
            if hasattr(candidates, 'rvs'):
                # A sampling distribution: the space cannot be enumerated.
                return n_iter
            sizes.append(len(candidates))
        return min(n_iter, int(np.prod(sizes)))

    @staticmethod
    def _around(center, half_width, low=None, high=None, num=10, as_int=False):
        """Candidate values spread over ``center +/- half_width``.

        The interval is clipped to ``[low, high]`` so the grid never contains
        values the estimator would reject (e.g. ``C <= 0`` or ``nu > 1``).
        With ``as_int=True`` the values are rounded to integers.  Duplicates
        are removed and plain Python numbers are returned.
        """
        start = center - half_width
        stop = center + half_width
        if low is not None:
            start = max(start, low)
            stop = max(stop, low)
        if high is not None:
            start = min(start, high)
            stop = min(stop, high)
        values = np.linspace(start, stop, num)
        if as_int:
            values = np.rint(values).astype(int)
        return np.unique(values).tolist()

    # ------------------------------------------------------------------
    # Per-model specifications.  Each returns
    # (stock estimator, random-search space, n_iter, refine), where
    # refine(best_params) builds the grid for the final grid search.
    # ------------------------------------------------------------------

    def _linear_svm_spec(self):
        """Specification for the linear SVM (``LinearSVC``)."""
        model = LinearSVC(random_state=0, tol=1e-5)
        random_space = {'C': [0.1, 1, 10, 100, 1000]}

        def refine(best):
            # C must stay strictly positive.
            return {'C': self._around(best['C'], 5, low=1e-3)}

        return model, random_space, 300, refine

    def _non_linear_svm_spec(self):
        """Specification for the kernel SVM (``NuSVC``)."""
        model = NuSVC(gamma='auto', random_state=1, kernel='poly')
        random_space = {
            'nu': [0.01, 0.3, 0.5, 0.7, 1],
            'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
            'kernel': ['rbf', 'poly', 'sigmoid'],
        }

        def refine(best):
            return {
                # nu is only valid in (0, 1]; gamma must stay positive.
                'nu': self._around(best['nu'], 0.1, low=0.01, high=1.0),
                'gamma': self._around(best['gamma'], 0.1, low=1e-4),
                'kernel': [best['kernel']],
            }

        return model, random_space, 300, refine

    def _random_forest_spec(self):
        """Specification for ``RandomForestClassifier``."""
        model = RandomForestClassifier(verbose=1)
        random_space = {
            'bootstrap': [True, False],
            'max_depth': [50, 60, 70, 80, 90, 100],
            # 'auto' was an alias of 'sqrt' for classifiers and has been
            # removed from scikit-learn; 'log2' is offered as the alternative.
            'max_features': ['sqrt', 'log2'],
            'min_samples_leaf': [1, 2, 4],
            'min_samples_split': [2, 5, 10],
            'n_estimators': [1000, 1200, 1400, 1600, 1800, 2000],
        }

        def refine(best):
            return {
                'bootstrap': [best['bootstrap']],
                # These three are counts: keep them integral and within bounds
                # (floats would be rejected or interpreted as fractions).
                'max_depth': self._around(best['max_depth'], 15, low=1, as_int=True),
                'max_features': [best['max_features']],
                'min_samples_leaf': self._around(
                    best['min_samples_leaf'], 1, low=1, as_int=True),
                'min_samples_split': self._around(
                    best['min_samples_split'], 1, low=2, as_int=True),
                'n_estimators': [best['n_estimators']],
            }

        return model, random_space, 30, refine

    def _xg_boost_spec(self):
        """Specification for ``xgboost.XGBClassifier``."""
        model = xgb.XGBClassifier()
        # scipy.stats.uniform(loc, scale) samples from [loc, loc + scale].
        random_space = {
            'colsample_bytree': uniform(0.7, 0.3),
            'gamma': uniform(0, 0.5),
            'learning_rate': uniform(0.03, 0.3),
            'max_depth': randint(2, 6),
            'n_estimators': randint(100, 150),
            'subsample': uniform(0.6, 0.4),
        }

        def refine(best):
            return {
                # Fractions are clipped to (0, 1]; gamma may not go negative.
                'colsample_bytree': self._around(
                    best['colsample_bytree'], 0.1, low=0.1, high=1.0),
                'gamma': self._around(best['gamma'], 0.1, low=0.0),
                'learning_rate': self._around(
                    best['learning_rate'], 0.1, low=0.01, high=1.0),
                'max_depth': [best['max_depth']],
                'n_estimators': [best['n_estimators']],
                'subsample': self._around(
                    best['subsample'], 0.1, low=0.1, high=1.0),
            }

        return model, random_space, 300, refine
                
                

In [3]:
# Mount Google Drive at /content/drive (Colab-only API); the cells below
# change into a folder under it and load the .npy files from there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
cd drive/MyDrive/CEA/

/content/drive/MyDrive/CEA


#### Data Preparation

In [5]:
# Load the feature matrix from the current working directory.
X = np.load('X_pref_lss_shift.npy')

In [6]:
# Enforce the expected (n_samples, n_features) layout. `reshape` is preferred
# over assigning to `.shape`, which NumPy discourages.
X = X.reshape(480, 11457)

In [7]:
# Load the labels that go with X (used as the target for feature selection
# and as the 'Pleasance' column below).
y = np.load('y_pref_lss_shift.npy')

In [8]:
# Enforce a single-column layout for the labels. `reshape` is preferred over
# assigning to `.shape`, which NumPy discourages.
y = y.reshape(480, 1)

In [22]:
# Show every column label except the last one.
# NOTE(review): `df` is only assigned in a later cell, so on a fresh kernel
# (Restart & Run All) this cell raises NameError — move it below or remove it.
df.columns[:-1]

Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,     9,
       ...
       11447, 11448, 11449, 11450, 11451, 11452, 11453, 11454, 11455, 11456],
      dtype='object', length=11457)

In [27]:
# One column per feature (named 0..n_features-1), with the label inserted as
# the leading 'Pleasance' column. The feature count is taken from X instead of
# being hard-coded, and y is flattened so a (n_samples, 1) array is accepted.
df = pd.DataFrame(X, columns=range(X.shape[1]))
df.insert(0, 'Pleasance', np.ravel(y))

In [28]:
# Preview the first rows: the 'Pleasance' label followed by the feature columns.
df.head()

Unnamed: 0,Pleasance,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,...,11417,11418,11419,11420,11421,11422,11423,11424,11425,11426,11427,11428,11429,11430,11431,11432,11433,11434,11435,11436,11437,11438,11439,11440,11441,11442,11443,11444,11445,11446,11447,11448,11449,11450,11451,11452,11453,11454,11455,11456
0,HP,-1.704227,1.925462,-0.106916,-0.52504,-0.431107,-2.071764,-2.566338,-3.049446,-3.21523,-3.361049,-3.759208,-1.114603,-1.315468,-2.52204,-1.961206,-1.902875,-1.786457,-2.171262,-1.818855,-1.794381,-2.338208,0.627008,-0.277206,-1.599694,-2.480201,0.307,-0.698269,-1.621889,1.775852,0.204341,-0.526373,-1.676566,-2.829765,-1.281726,-0.118907,-0.814016,1.215617,-1.022385,-1.570267,...,-0.519382,-0.903702,-0.041838,-0.630033,-0.861217,-1.00518,-0.270685,1.449069,1.297987,-0.379659,-0.940588,-0.949879,-0.919468,1.172928,-0.082074,-0.207358,0.753851,0.938954,1.196739,-2.309771,-0.12228,-1.562214,-2.750969,-2.833302,-1.782996,-2.259498,-0.387538,-0.695699,-0.920711,1.151091,-0.90218,-1.555644,-0.347132,0.095122,-0.721606,0.593643,1.289841,0.700836,0.093239,0.620513
1,HP,0.784934,-0.431708,-0.303649,-0.242427,-0.019843,2.068726,0.722113,1.067205,1.306717,0.921533,1.209853,0.704976,0.27742,-0.37941,0.648984,-0.365461,-0.970451,-0.369939,-0.247815,-0.071891,0.45532,-1.65432,0.523864,1.23319,1.162943,0.979981,1.30346,1.267262,0.315339,0.573701,0.504845,-0.981162,-0.455958,-0.749199,-1.650827,-0.795522,-1.376439,0.000525,-0.061682,...,-1.954675,-2.20528,-2.296662,-1.544042,-0.782207,-0.071884,-0.159004,-1.269453,-0.191307,1.525302,1.695688,1.334699,-0.110598,0.533445,1.546837,1.608114,0.799164,1.299705,0.509888,-0.169066,-0.49411,-0.868996,1.728676,1.335425,0.772371,1.055809,-0.311071,-1.925679,-1.701003,-1.702712,-1.732676,-1.598245,-0.861691,0.269639,-1.197135,-1.772642,-0.257704,-2.193575,0.633638,-1.915317
2,HP,1.651002,-0.713096,-0.671642,-0.545217,-0.21475,1.608899,1.295009,1.378288,1.257798,0.910032,0.399402,2.116071,1.532138,0.564015,1.911376,0.692246,-1.559284,0.732906,0.290827,-0.610992,-0.508413,-0.17796,-1.316687,0.055704,0.80197,-0.038479,0.705371,1.25566,0.071422,0.866842,-0.330154,1.52051,0.796959,1.101152,-0.980723,0.638614,-1.920285,-1.439364,-0.991175,...,0.559852,0.174908,0.514745,0.97544,2.34065,2.84367,2.292057,-2.241348,-0.140358,-1.315152,-0.988279,-0.173327,1.140883,1.848733,-0.593065,-0.448917,2.025195,-0.217067,-0.753952,1.148847,-0.12858,1.28919,1.909322,1.858358,2.006626,2.229551,2.959271,1.505167,1.88714,-0.043402,1.578903,1.4521,0.704673,-1.014668,0.079712,-0.318978,0.430328,0.023924,-2.355849,-1.67773
3,HP,-2.252201,0.74638,1.167366,-0.268236,-0.576084,-1.160257,-0.580592,-1.118481,-1.571498,-1.00969,-0.615242,-1.561618,-0.606804,-0.72081,-1.207996,-1.01369,-0.182615,-0.806681,0.144711,1.094884,-0.246308,0.48915,0.606202,-0.505775,-0.822612,-0.371397,-0.698592,-0.763609,-0.489503,-0.795111,-0.612309,-1.787912,-1.509154,-1.53738,1.303255,0.112557,1.338032,2.033037,1.405216,...,1.88032,1.125101,0.986171,-0.738961,-0.70836,-0.723879,-1.230551,0.128641,0.033675,-1.161334,-0.885043,-1.012474,-1.612306,-1.337056,-1.399128,-1.083608,-1.35423,-0.608377,-0.286494,-0.012946,0.869028,-0.198108,-0.636096,-1.017157,-0.458988,-1.099202,-0.375585,-1.933792,-1.677349,-1.566688,-1.633401,-1.206223,-0.83766,1.239585,2.361403,-0.455235,0.139887,-1.696722,2.214205,1.023099
4,HP,0.282161,-0.71711,-0.504083,-0.017307,0.11491,0.805828,-0.695419,-0.212335,0.396045,0.540109,0.216686,-0.259559,-0.314577,0.634926,0.406299,0.996021,-1.172315,-1.490953,0.767344,0.347172,-0.788802,-1.720638,-1.077181,-0.526394,-0.435804,-1.371072,-0.97008,-0.590674,-1.454369,-1.602668,1.408044,1.681355,2.467435,3.057041,-1.354072,-1.344196,-1.190083,0.659241,1.238605,...,0.510252,0.99751,0.038122,0.711285,-0.219353,-0.839999,-0.408223,0.720258,0.485219,0.557591,0.333529,0.47851,0.952142,-0.599886,0.47953,0.613897,-0.168005,0.971246,1.778535,1.535442,-0.972893,1.045675,-0.556147,-0.249968,-0.344364,-0.240159,-0.747252,1.123062,-0.019771,1.070729,0.661991,-0.72294,-0.338803,-1.278308,-0.726343,-1.81482,-0.407999,1.476642,0.584545,1.143373


## Feature Selection

### Filter

In [9]:
# Univariate filter: keep the 3 features with the highest mutual information
# with the class label.
transformer = GenericUnivariateSelect(
    score_func=mutual_info_classif,
    mode='k_best',
    param=3,
)

In [10]:
# scikit-learn expects 1-D class labels, so flatten the (n_samples, 1) column
# vector before fitting the selector.
X_trans = transformer.fit_transform(X, np.ravel(y))

In [11]:
# Sanity check: one column per selected feature is expected (param=3 above).
X_trans.shape

(480, 3)

#### PCA

In [12]:
from sklearn.decomposition import PCA

Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,     9,
       ...
       11447, 11448, 11449, 11450, 11451, 11452, 11453, 11454, 11455, 11456],
      dtype='object', length=11457)

In [30]:
# Project the feature columns (everything except the 'Pleasance' label) onto
# 3 principal components. random_state makes the projection reproducible in
# case scikit-learn picks its randomized SVD solver for data this wide.
pca = PCA(n_components=3, random_state=42)

X_pca = pca.fit_transform(df.drop(columns='Pleasance'))

In [31]:
# Share of the total variance captured by the fitted components, in percent.
tot_var = np.sum(pca.explained_variance_ratio_) * 100

### Data Visualization

In [34]:
import plotly.express as px

##### PCA

In [32]:
# Display the PCA-projected samples (one row per sample, one column per component).
X_pca

array([[-52.19988263,   0.06656338,  43.54160776],
       [ 35.92005037, -13.91231222,  -9.53909839],
       [ -1.41984768, -42.20226351,  24.8118998 ],
       ...,
       [  3.91759733,  55.81398304,  19.09824146],
       [-30.45244534, -21.91738147,  24.16519087],
       [ -1.40201966,  32.50277446,  -8.88577992]])

In [36]:
# 3-D view of the three principal components, coloured by the 'Pleasance'
# label; the title reports the variance explained by the projection.
fig = px.scatter_3d(
    X_pca,
    x=0,
    y=1,
    z=2,
    color=df['Pleasance'],
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'},
    title=f'Total Explained Variance: {tot_var:.2f}%',
)
fig.show()

In [40]:
# 2-D view restricted to the first two principal components, coloured by the
# 'Pleasance' label.
fig = px.scatter(
    X_pca,
    x=0,
    y=1,
    color=df['Pleasance'],
    labels={'0': 'PC 1', '1': 'PC 2'},
)
fig.show()

##### Feature Selection

In [37]:
# 3-D view of the features kept by the univariate filter, coloured by the
# 'Pleasance' label. The axes are selected input features, not principal
# components, so they are labelled accordingly.
fig = px.scatter_3d(
    X_trans, x=0, y=1, z=2, color=df['Pleasance'],
    title='Top 3 features selected by mutual information',
    labels={'0': 'Selected feature 1',
            '1': 'Selected feature 2',
            '2': 'Selected feature 3'}
)
fig.show()

In [38]:
# 2-D view of the first two features kept by the univariate filter, coloured
# by the 'Pleasance' label. The axes are selected input features, not
# principal components, so they are labelled accordingly.
fig = px.scatter(
    X_trans, x=0, y=1, color=df['Pleasance'],
    title='First two features selected by mutual information',
    labels={'0': 'Selected feature 1', '1': 'Selected feature 2'}
)
fig.show()