In [9]:
import pandas as pd
import numpy as np
import haversine # pip install haversine
import category_encoders as ce # pip install category_encoders
import pygeohash

from datetime import timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import IsolationForest

# Load raw data
data = pd.read_csv('fraudTrain.csv')
data.columns

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

In [6]:


# 1. Convert transaction time to datetime type
data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time'])

# # 2. Treating missing values
# imputer = SimpleImputer(strategy='median')
# imputed_df = pd.DataFrame(imputer.fit_transform(df))
# imputed_df.columns = df.columns

# Label encode categorical columns  
label_encoder = ce.OrdinalEncoder(cols=['merchant', 'job','category', 'state']) 
df = label_encoder.fit_transform(data)

# Apply one-hot encoding
# categorical_cols = ['category', 'state']  
# df = pd.get_dummies(df, prefix=None, columns=categorical_cols)


# 3. Label encoding 
# label_encoder = ce.OrdinalEncoder(cols=['merchant', 'first', 'last', 'job'])
# data_label_encoded = label_encoder.fit_transform(df)
# data_label_encoded['first']
# # 4. One hot encoding
# data_one_hot_encoded = pd.get_dummies(data_label_encoded, prefix=['card_'], 
#                                         columns=['cc_num'])  
                                       
# 5. Standard scaling amount
amt_scaler = StandardScaler()
df['std_amt'] = amt_scaler.fit_transform(df[['amt']])


# 6. Time features
df['month'] = df['trans_date_trans_time'].dt.month
df['day'] = df['trans_date_trans_time'].dt.day




# # 7. Calculate time from last transaction per customer
# # grouped_df = data_one_hot_encoded.groupby('cc_num')
# # grouped_df['days_since_last'] = grouped_df.diff().dt.days 

# 8. Geospatial features 
df['lat_bin'] = pd.cut(df['lat'], bins=10)
df['geohash'] = df[['lat', 'long']].apply(lambda x: pygeohash.encode(x['lat'], x['long']), axis=1)
df.drop(columns=['amt','lat', 'long','trans_date_trans_time'],inplace=True)
print(df.head())
print(df.shape)
print(df.columns)


# print(data_one_hot_encoded)
# # 9. Distance between transaction points 
# data_one_hot_encoded['dist_last'] = grouped_df.apply(
#     lambda x: haversine.haversine(
#         (x['lat'].iloc[-2], x['long'].iloc[-2]),
#         (x['lat'].iloc[-1], x['long'].iloc[-1])
#     ))
  
# 10. Anomaly detection
# isolation_forest = IsolationForest(n_estimators=100) 
# isolation_forest.fit(data_one_hot_encoded)
# data_one_hot_encoded['anomaly_score'] = isolation_forest.decision_function(data_one_hot_encoded)


   Unnamed: 0            cc_num  merchant  category      first     last  \
0           0  2703186189652095         1         1   Jennifer    Banks   
1           1      630423337322         2         2  Stephanie     Gill   
2           2    38859492057661         3         3     Edward  Sanchez   
3           3  3534093764340240         4         4     Jeremy    White   
4           4   375534208663984         5         5      Tyler   Garcia   

  gender                        street            city  state  ...  \
0      F                561 Perry Cove  Moravian Falls      1  ...   
1      F  43039 Riley Greens Suite 393          Orient      2  ...   
2      M      594 White Dale Suite 530      Malad City      3  ...   
3      M   9443 Cynthia Court Apt. 038         Boulder      4  ...   
4      M              408 Bradley Rest        Doe Hill      5  ...   

                          trans_num   unix_time  merch_lat  merch_long  \
0  0b242abb623afc578575680df30655b9  1325376018  36.01

In [85]:
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the dataset 
df = pd.read_csv('fraudTrain.csv')
df['dob'] = pd.to_datetime(df['dob'])
# Convert DOB to numeric age 
df['age'] = df['dob'].apply(lambda x: datetime.now().year - x.year)

# Drop original dob column
df = df.drop('dob', axis=1)
# Convert transaction date-time to datetime
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])  
df['timestamp'] = df['trans_date_trans_time'].apply(lambda x: x.timestamp())
# Encode categorical features 
le = LabelEncoder()
# cat_cols = ['cc_num', 'merchant', 'category',  'gender', 
#             'street', 'city', 'state', 'job']
cat_cols = ['category',  'gender','job']
for col in cat_cols:
    df[col] = le.fit_transform(df[col])
 
# Standardize numerical features
num_cols = ['amt', 'lat', 'long', 'city_pop', 'age', 'unix_time', 'merch_lat', 'merch_long','timestamp']  
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Get time since last transaction 
# df['time_since_last'] = df['trans_date_trans_time'].diff().dt.total_seconds()

# Split into X (features) and y (target)
df = df.drop(['Unnamed: 0','trans_date_trans_time','first', 'last','trans_num',
              'cc_num', 'merchant', 
              'street', 'city', 'state',], axis=1)
X = df.drop('is_fraud', axis=1)  
y = df['is_fraud']
# X = X.sort_values(by='cc_num')
X

Unnamed: 0,category,amt,gender,zip,lat,long,city_pop,job,unix_time,merch_lat,merch_long,age,timestamp
0,8,-0.407826,0,28654,-0.484420,0.657620,-0.282589,370,-1.858664,-0.494354,0.593864,-0.848322,-1.851825
1,4,0.230039,0,99160,2.039120,-2.033870,-0.293670,428,-1.858662,2.078699,-2.030341,-0.272898,-1.851823
2,0,0.934149,1,83252,0.717754,-1.601537,-0.280406,307,-1.858662,0.902849,-1.592323,0.647781,-1.851822
3,2,-0.158132,1,59632,1.515617,-1.590766,-0.287742,328,-1.858660,1.662886,-1.621848,0.360069,-1.851820
4,9,-0.177094,1,24433,-0.023035,0.782279,-0.293835,116,-1.858651,0.026941,0.841909,-0.733237,-1.851812
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,0,-0.341769,1,84735,-0.161575,-1.617214,-0.293309,215,1.757855,-0.331926,-1.558650,0.705324,1.760708
1296671,1,-0.116339,1,21790,0.143638,0.924207,-0.293832,360,1.757855,0.072321,0.869934,-0.330440,1.760709
1296672,1,0.221930,1,88325,-1.102883,-1.133257,-0.291186,308,1.757856,-0.962433,-1.082272,0.360069,1.760710
1296673,1,0.028375,1,57756,0.948613,-0.895029,-0.290434,485,1.757861,0.832051,-0.945074,-0.387982,1.760715


In [86]:
# X['cc_num'].unique().shape

In [87]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Assuming X_train, X_test, y_train, y_test are your feature and target variable splits

# Apply RandomUnderSampler to undersample the majority class
undersampler = RandomUnderSampler(sampling_strategy='majority', random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X, y)
print(y_resampled.value_counts())




is_fraud
0    7506
1    7506
Name: count, dtype: int64


In [88]:
from sklearn.utils import shuffle

# Combine X and y into a single DataFrame
data_combined = pd.concat([pd.DataFrame(X_resampled), pd.DataFrame(y_resampled)],axis=1)

#shuffle data
shuffled_data = shuffle(data_combined, random_state=42)
shuffled_data.tail()
# Randomly sample 300 points from the combined data
sampled_data = shuffled_data.sample(n=600, random_state=42)

# # Separate the sampled data back into X and y
X_new = sampled_data.drop(columns=['is_fraud'])
y_new = sampled_data['is_fraud']
y_new.value_counts()

is_fraud
1    310
0    290
Name: count, dtype: int64

In [89]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.svm import SVC

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_new, y_new, test_size=0.2, random_state=42)
# Create the SVM with RBF kernel and tune hyperparameters
svm = SVC(kernel='rbf', C=1.0, gamma=0.1) 

# Fit SVM to the training data   
svm.fit(X_train, y_train)  

# Make predictions on test data
y_pred = svm.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("Accuracy: %.3f%%"%(accuracy*100.0))
print("Precision: %.3f%%"%(precision*100.0))
print("Recall: %.3f%%"%(recall*100.0))

Accuracy: 52.500%
Precision: 50.909%
Recall: 94.915%


In [90]:
classification_report_result = classification_report(y_test, y_pred)
print("Classification Report:\n", classification_report_result)

Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.11      0.20        61
           1       0.51      0.95      0.66        59

    accuracy                           0.53       120
   macro avg       0.60      0.53      0.43       120
weighted avg       0.61      0.53      0.43       120



In [91]:
from sklearn.metrics import classification_report, confusion_matrix

confusion_matrix_result = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", confusion_matrix_result)

Confusion Matrix:
 [[ 7 54]
 [ 3 56]]


In [92]:
# X_train.shape

(480, 13)

In [93]:
# Importing standard Qiskit libraries
from qiskit import QuantumCircuit, transpile, BasicAer, IBMQ, execute, Aer, assemble
# from qiskit.tools.jupyter import *
from qiskit.visualization import *
from qiskit import Aer
from qiskit.algorithms.optimizers import SPSA
from qiskit.circuit import ParameterVector
from qiskit.visualization import circuit_drawer



from qiskit.providers.aer import QasmSimulator, StatevectorSimulator, UnitarySimulator


# General libraries\
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pylab import cm
from sklearn import metrics

# QKT related
from qiskit.circuit.library import ZZFeatureMap,ZFeatureMap, PauliFeatureMap
from qiskit_machine_learning.algorithms import QSVC
from qiskit_machine_learning.kernels import FidelityQuantumKernel, TrainableFidelityQuantumKernel,QuantumKernel
# from qiskit_machine_learning.kernels.algorithms import QuantumKernelTrainer
from qiskit.algorithms.state_fidelities import ComputeUncompute
# from qiskit.primitives import Sampler

# Additional imports
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

from collections import Counter

In [94]:
data_map = lambda x: x[0]*x[0]+1
components=13
# components =17
# fm = ZFeatureMap(components, reps=1,data_map_func=data_map)
# print(circuit_drawer(fm, output='text'))

fm = ZZFeatureMap(components, entanglement='linear')
print(transpile(fm,optimization_level=3).depth())

46


In [95]:
# Create a quantum kernel using the feature map
quantum_kernel = QuantumKernel(feature_map=fm, quantum_instance=Aer.get_backend('qasm_simulator'))
# quantum_kernel = TrainableFidelityQuantumKernel(feature_map=fm, training_parameters=training_params)

# Create a QSVC model
qsvc = QSVC(quantum_kernel=quantum_kernel)

# Fit the QSVC model using the training data
qsvc.fit(X_train, y_train)

In [96]:
# Predict the labels
labels_test = qsvc.predict(X_test)



In [97]:
# Evalaute the test accuracy
accuracy_test = metrics.balanced_accuracy_score(y_true=y_test, y_pred=labels_test)
print(confusion_matrix(y_test, labels_test))
print(classification_report(y_test, labels_test))
print(f"Test Accuracy: {accuracy_test}")

[[ 0 61]
 [ 0 59]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        61
           1       0.49      1.00      0.66        59

    accuracy                           0.49       120
   macro avg       0.25      0.50      0.33       120
weighted avg       0.24      0.49      0.32       120

Test Accuracy: 0.5


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
