In [1]:
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import RidgeCV, LassoCV, Lasso, Ridge
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA, KernelPCA
from qiskit.circuit.library import ZZFeatureMap, PauliFeatureMap
from qiskit.primitives import Sampler
from qiskit_algorithms.state_fidelities import ComputeUncompute
from qiskit_machine_learning.kernels import FidelityQuantumKernel

In [2]:
train = pd.read_csv("../data/train_fe.csv")
test = pd.read_csv("../data/test_fe.csv")

In [3]:
train

Unnamed: 0,international_plan,voice_mail_plan,area_code_408,area_code_415,area_code_510,region_Midwest,region_Northeast,region_South,region_West,account_length,...,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,customer_service_calls,churn
0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.524793,...,0.582353,0.542866,0.572161,0.436090,0.572152,0.500,0.15,0.500000,0.111111,0.0
1,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.438017,...,0.605882,0.537690,0.599772,0.526316,0.600000,0.685,0.15,0.685185,0.111111,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.561983,...,0.647059,0.333225,0.338457,0.533835,0.338608,0.610,0.25,0.609259,0.000000,0.0
3,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.342975,...,0.517647,0.170171,0.436095,0.421053,0.436076,0.330,0.35,0.329630,0.222222,0.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.305785,...,0.717647,0.407959,0.407629,0.661654,0.407595,0.505,0.15,0.505556,0.333333,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2661,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.322314,...,0.400000,0.521514,0.505835,0.714286,0.505696,0.590,0.25,0.590741,0.222222,0.0
2662,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.789256,...,0.741176,0.592688,0.670083,0.375940,0.670253,0.495,0.30,0.494444,0.222222,0.0
2663,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.276860,...,0.323529,0.421870,0.420154,0.676692,0.420253,0.480,0.20,0.479630,0.333333,0.0
2664,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.111570,...,0.341176,0.794241,0.421862,0.436090,0.422152,0.705,0.30,0.705556,0.222222,0.0


In [4]:
train.columns

Index(['international_plan', 'voice_mail_plan', 'area_code_408',
       'area_code_415', 'area_code_510', 'region_Midwest', 'region_Northeast',
       'region_South', 'region_West', 'account_length',
       'number_vmail_messages', 'total_day_minutes', 'total_day_calls',
       'total_day_charge', 'total_eve_minutes', 'total_eve_calls',
       'total_eve_charge', 'total_night_minutes', 'total_night_calls',
       'total_night_charge', 'total_intl_minutes', 'total_intl_calls',
       'total_intl_charge', 'customer_service_calls', 'churn'],
      dtype='object')

In [5]:
x_train = train.drop("churn", axis = 1)
y_train = train["churn"]
x_test = test.drop("churn", axis = 1)
y_test = test["churn"]

In [6]:
estimator = Lasso()

In [7]:
selector = RFECV(estimator, min_features_to_select = 8, step=10, cv = 5)
selector = selector.fit(x_train, y_train)

In [8]:
selector.get_feature_names_out()

array(['region_South', 'region_West', 'account_length',
       'number_vmail_messages', 'total_day_minutes', 'total_day_calls',
       'total_intl_charge', 'customer_service_calls'], dtype=object)

In [9]:
selector.feature_names_in_

array(['international_plan', 'voice_mail_plan', 'area_code_408',
       'area_code_415', 'area_code_510', 'region_Midwest',
       'region_Northeast', 'region_South', 'region_West',
       'account_length', 'number_vmail_messages', 'total_day_minutes',
       'total_day_calls', 'total_day_charge', 'total_eve_minutes',
       'total_eve_calls', 'total_eve_charge', 'total_night_minutes',
       'total_night_calls', 'total_night_charge', 'total_intl_minutes',
       'total_intl_calls', 'total_intl_charge', 'customer_service_calls'],
      dtype=object)

In [10]:
selector.ranking_

array([3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       1, 1])

In [11]:
x_train.shape

(2666, 24)

In [12]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2666 entries, 0 to 2665
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   international_plan      2666 non-null   float64
 1   voice_mail_plan         2666 non-null   float64
 2   area_code_408           2666 non-null   float64
 3   area_code_415           2666 non-null   float64
 4   area_code_510           2666 non-null   float64
 5   region_Midwest          2666 non-null   float64
 6   region_Northeast        2666 non-null   float64
 7   region_South            2666 non-null   float64
 8   region_West             2666 non-null   float64
 9   account_length          2666 non-null   float64
 10  number_vmail_messages   2666 non-null   float64
 11  total_day_minutes       2666 non-null   float64
 12  total_day_calls         2666 non-null   float64
 13  total_day_charge        2666 non-null   float64
 14  total_eve_minutes       2666 non-null   

### Regular PCA

In [13]:
pca = PCA(n_components=0.9)
pca.fit(x_train)

In [14]:
pca.n_components_

9

In [15]:
x_train_pca = pca.transform(x_train)
x_test_pca = pca.transform(x_test)

In [16]:
col = []
for i in range(pca.n_components_):
    col.append("V_" + str(i))

In [17]:
train_sav = pd.DataFrame(x_train_pca, columns=col)
train_sav["churn"] = y_train
test_sav = pd.DataFrame(x_test_pca, columns=col)
test_sav["churn"] = y_test

In [18]:
train_0 = train_sav[train_sav["churn"] == 0].drop_duplicates().reset_index(drop=True)
train_1 = train_sav[train_sav["churn"] == 1].reset_index(drop=True)

In [19]:
np.random.seed(42)
numbers = np.random.randint(0,high=len(train_0), size=len(train_1))

In [20]:
train_0 = train_0.iloc[numbers,:].reset_index(drop=True)
train_new = pd.concat([train_0, train_1], axis = 0).reset_index(drop=True)

In [21]:
train_new.to_csv("../data/train_pca_new.csv", index=False)

In [22]:
train_sav.to_csv("../data/train_pca.csv", index=False)

In [23]:
test_sav.to_csv("../data/test_pca.csv", index=False)

In [24]:
train_0 = train[train["churn"] == 0].drop_duplicates().reset_index(drop=True)
train_1 = train[train["churn"] == 1].reset_index(drop=True)

In [25]:
np.random.seed(42)
numbers = np.random.randint(0,high=len(train_0), size=len(train_1))

In [26]:
train_0 = train_0.iloc[numbers,:].reset_index(drop=True)
train = pd.concat([train_0, train_1], axis = 0).reset_index(drop=True)

In [27]:
train.to_csv("../data/train_fe_small.csv", index=False)

### Quantum PCA

In [28]:
x_train.shape[1]

24

In [29]:
#sampler = Sampler()
#fidelity = ComputeUncompute(sampler=sampler)
#feature_map = PauliFeatureMap(feature_dimension=x_train.shape[1], reps=2, entanglement="linear")
#qpca_kernel = FidelityQuantumKernel(fidelity=fidelity, feature_map=feature_map)

In [30]:
#matrix_train = qpca_kernel.evaluate(x_vec=x_train)
#matrix_test = qpca_kernel.evaluate(x_vec=x_test, y_vec=x_train)

In [31]:
#kernel_pca_q = KernelPCA(n_components=0.95, kernel="precomputed")
#train_q= kernel_pca_q.fit_transform(matrix_train)
#test_features_q = kernel_pca_q.transform(matrix_test)