In [1]:
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import RidgeCV, LassoCV, Lasso, Ridge
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA, KernelPCA
from qiskit.circuit.library import ZZFeatureMap, PauliFeatureMap
from qiskit.primitives import Sampler
from qiskit_algorithms.state_fidelities import ComputeUncompute
from qiskit_machine_learning.kernels import FidelityQuantumKernel

In [2]:
train = pd.read_csv("../data/train_fe.csv")
test = pd.read_csv("../data/test_fe.csv")

In [3]:
train.columns

Index(['cryosleep', 'vip', 'side', 'destination_55_cancri_e',
       'destination_pso_j318_5_22', 'destination_trappist_1e',
       'homeplanet_earth', 'homeplanet_europa', 'homeplanet_mars', 'age',
       'roomservice', 'foodcourt', 'shoppingmall', 'spa', 'vrdeck',
       'transported'],
      dtype='object')

In [4]:
x_train = train.drop("transported", axis = 1)
y_train = train["transported"]
x_test = test.drop("transported", axis = 1)
y_test = test["transported"]

In [5]:
estimator = Lasso()

In [6]:
selector = RFECV(estimator, min_features_to_select = 10, step=12, cv = 5)
selector = selector.fit(x_train, y_train)

In [7]:
selector.get_feature_names_out()

array(['destination_trappist_1e', 'homeplanet_earth', 'homeplanet_europa',
       'homeplanet_mars', 'age', 'roomservice', 'foodcourt',
       'shoppingmall', 'spa', 'vrdeck'], dtype=object)

In [8]:
selector.feature_names_in_

array(['cryosleep', 'vip', 'side', 'destination_55_cancri_e',
       'destination_pso_j318_5_22', 'destination_trappist_1e',
       'homeplanet_earth', 'homeplanet_europa', 'homeplanet_mars', 'age',
       'roomservice', 'foodcourt', 'shoppingmall', 'spa', 'vrdeck'],
      dtype=object)

In [9]:
selector.ranking_

array([2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [10]:
x_train.shape

(6047, 15)

In [11]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6047 entries, 0 to 6046
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   cryosleep                  6047 non-null   float64
 1   vip                        6047 non-null   float64
 2   side                       6047 non-null   float64
 3   destination_55_cancri_e    6047 non-null   float64
 4   destination_pso_j318_5_22  6047 non-null   float64
 5   destination_trappist_1e    6047 non-null   float64
 6   homeplanet_earth           6047 non-null   float64
 7   homeplanet_europa          6047 non-null   float64
 8   homeplanet_mars            6047 non-null   float64
 9   age                        6047 non-null   float64
 10  roomservice                6047 non-null   float64
 11  foodcourt                  6047 non-null   float64
 12  shoppingmall               6047 non-null   float64
 13  spa                        6047 non-null   float

### Regular PCA

In [12]:
pca = PCA(n_components=0.95)
pca.fit(x_train)

In [13]:
pca.n_components_

6

In [14]:
np.sum(pca.explained_variance_ratio_)

0.9598630798979065

In [15]:
x_train_pca = pca.transform(x_train)
x_test_pca = pca.transform(x_test)

In [16]:
col = []
for i in range(pca.n_components_):
    col.append("V_" + str(i))

In [17]:
train_sav = pd.DataFrame(x_train_pca, columns=col)
train_sav["transported"] = y_train
test_sav = pd.DataFrame(x_test_pca, columns=col)
test_sav["transported"] = y_test

In [18]:
train_0 = train_sav[train_sav["transported"] == 0].drop_duplicates().reset_index(drop=True)
train_1 = train_sav[train_sav["transported"] == 1].reset_index(drop=True)

In [19]:
len(train_0), len(train_1)

(2789, 3032)

In [20]:
np.random.seed(42)
numbers = np.random.randint(0,high=len(train_1), size=len(train_0))

In [21]:
len(numbers)

2789

In [22]:
train_1 = train_1.iloc[numbers,:].reset_index(drop=True)
train_new = pd.concat([train_0, train_1], axis = 0).reset_index(drop=True)

In [23]:
train_new.to_csv("../data/train_pca_new.csv", index=False)

In [24]:
train_sav.to_csv("../data/train_pca.csv", index=False)

In [25]:
test_sav.to_csv("../data/test_pca.csv", index=False)

In [26]:
train_0 = train[train["transported"] == 0].drop_duplicates().reset_index(drop=True)
train_1 = train[train["transported"] == 1].reset_index(drop=True)

In [27]:
np.random.seed(42)
numbers = np.random.randint(0,high=len(train_1), size=len(train_0))

In [28]:
train_1 = train_1.iloc[numbers,:].reset_index(drop=True)
train = pd.concat([train_0, train_1], axis = 0).reset_index(drop=True)

In [29]:
train.to_csv("../data/train_fe_small.csv", index=False)

### Quantum PCA

In [30]:
x_train.shape[1]

15

In [31]:
#sampler = Sampler()
#fidelity = ComputeUncompute(sampler=sampler)
#feature_map = PauliFeatureMap(feature_dimension=x_train.shape[1], reps=2, entanglement="linear")
#qpca_kernel = FidelityQuantumKernel(fidelity=fidelity, feature_map=feature_map)

In [32]:
#matrix_train = qpca_kernel.evaluate(x_vec=x_train)
#matrix_test = qpca_kernel.evaluate(x_vec=x_test, y_vec=x_train)

In [33]:
#kernel_pca_q = KernelPCA(n_components=0.95, kernel="precomputed")
#train_q= kernel_pca_q.fit_transform(matrix_train)
#test_features_q = kernel_pca_q.transform(matrix_test)