In [42]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import numpy as np

In [43]:
# Start the Spark session used by the rest of the notebook
# (getOrCreate reuses a session if one is already running).
ss = (
    SparkSession.builder
    .appName("MapReduceExample")
    .getOrCreate()
)

In [44]:
# Read the transactions CSV (first line is the header, column types are
# inferred) and expose it as an RDD of Row objects.
dataRDD = ss.read.csv(
    path="card_transdata.csv",
    inferSchema=True,
    header=True,
).rdd
# Show one record as a quick sanity check of the inferred schema.
print(dataRDD.take(1))

[Row(distance_from_home=57.87785658389723, distance_from_last_transaction=0.3111400080477545, ratio_to_median_purchase_price=1.9459399775518593, repeat_retailer=1.0, used_chip=1.0, used_pin_number=0.0, online_order=0.0, fraud=0.0)]


In [45]:
def calculateDistance(x1, x2):
    """Return the Euclidean (L2) distance between two points.

    Both arguments are passed through ``np.asarray`` before subtracting, so
    plain tuples and lists are accepted as well as NumPy arrays (subtracting
    two raw tuples would otherwise raise ``TypeError``).

    Args:
        x1: First point; a sequence or array of numbers.
        x2: Second point; same shape as ``x1`` (or broadcastable against it).

    Returns:
        The L2 norm of ``x1 - x2`` as returned by ``np.linalg.norm``.
    """
    difference = np.asarray(x1) - np.asarray(x2)
    return np.linalg.norm(difference)

In [46]:

# Hold out roughly 20% of the rows for evaluation; the fixed seed keeps the
# split reproducible between runs.
trainRDD, testRDD = dataRDD.randomSplit([0.8, 0.2], seed=42)
# The training set is scanned once per test point further down, so keep it
# cached instead of re-reading and re-splitting the CSV on every Spark action.
trainRDD = trainRDD.cache()
# Separate the label (last column) from the features of the test rows.
testlabels = testRDD.map(lambda x: x[-1])
testRDD = testRDD.map(lambda x: x[:-1])



In [47]:
# k-nearest-neighbour classification of a slice of the test set.
# For every selected test point, each training row is mapped to
# (distance to the test point, label of the training row); the k closest
# pairs are brought back to the driver and vote on the prediction.
# TODO (from the original notes): replace the fixed testPoints slice with
# the whole of testRDD.
from tqdm import tqdm

testPredictions = []
k = 3
# NOTE: collect() pulls the entire test set to the driver before slicing.
testPoints = testRDD.collect()[200:500]

for testPoint in tqdm(testPoints):
    # Convert the test point once, not once per training row.
    queryPoint = np.array(testPoint)
    # Emit only the distance and the label (last field of a training row),
    # so whole rows are not shipped back with the k winners.
    neighbours = trainRDD.map(
        lambda row: (calculateDistance(queryPoint, np.array(row[:-1])), row[-1])
    ).takeOrdered(k, key=lambda pair: pair[0])

    countFraud = sum(1 for _, label in neighbours if label == 1)

    # Predict fraud only when a strict majority of the k neighbours is fraud.
    testPredictions.append(1 if countFraud > k / 2 else 0)
        

100%|██████████| 300/300 [52:04<00:00, 10.42s/it]


In [48]:
# Line up the predictions with the ground-truth labels of the same rows.
# The [200:500] window has to match the one used to select the test points.
testPredictions = np.array(testPredictions)
testLabels = np.array(testlabels.collect()[200:500])
print(testPredictions)
print(testLabels)
# Accuracy = share of positions where prediction and label agree.
matchCount = np.sum(testPredictions == testLabels)
accuracy = matchCount / len(testLabels)
print("Accuracy: ", accuracy)

[0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0
 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1
 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0
 0 0 0 1]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 1. 0. 1. 0. 0. 0. 

In [49]:
# Save predictions and true labels side by side, one "<prediction> <label>"
# pair per line. The with-block closes the file even if a write fails.
with open("testPredictions2.txt", "w") as f:
    for i, prediction in enumerate(testPredictions):
        f.write(str(prediction) + " " + str(testLabels[i]) + "\n")


