In [21]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import numpy as np
from tqdm import tqdm
from numba import njit

In [2]:
# Build (or reuse) the Spark session used by every cell below
ss = (
    SparkSession.builder
    .appName("MapReduceExample")
    .getOrCreate()
)

In [3]:
# Read the transactions CSV (header row, inferred column types) and expose it as an RDD
dataRDD = (
    ss.read
    .csv("card_transdata.csv", header=True, inferSchema=True)
    .rdd
)
# Peek at the first record to sanity-check the columns
print(dataRDD.take(1))

[Row(distance_from_home=57.87785658389723, distance_from_last_transaction=0.3111400080477545, ratio_to_median_purchase_price=1.9459399775518593, repeat_retailer=1.0, used_chip=1.0, used_pin_number=0.0, online_order=0.0, fraud=0.0)]


In [37]:
def calculateDistance(x1, x2):
    """Return the Euclidean distance between two points.

    Args:
        x1: First point; a NumPy array or any numeric sequence (tuple, list, row).
        x2: Second point, with the same shape as ``x1``.

    Returns:
        The norm of ``x1 - x2`` as computed by ``np.linalg.norm``.
    """
    # Coerce both operands so plain tuples/lists work too; subtracting two
    # tuples directly would raise TypeError.
    difference = np.asarray(x1) - np.asarray(x2)
    return np.linalg.norm(difference)

In [5]:

# 80/20 train/test split; the fixed seed keeps the partition reproducible
trainRDD, testRDD = dataRDD.randomSplit([0.8, 0.2], seed=42)
# Keep the last column (fraud) of each test row as the ground-truth label ...
testlabels = testRDD.map(lambda row: row[-1])
# ... then strip that column so the test rows hold features only
testRDD = testRDD.map(lambda row: row[:-1])



## KNN MapReduce

In [6]:
def KNNMapReduce(k=3, start=200, stop=500):
    """Classify a slice of the test set with a k-nearest-neighbours vote over trainRDD.

    For every selected test point, one Spark job maps each training row to a
    ``(distance, label)`` pair, keeps the ``k`` closest pairs with
    ``takeOrdered`` and predicts fraud when strictly more than half of them
    carry the label 1.

    Args:
        k: Number of neighbours that vote (default 3).
        start: Index of the first collected test point to classify (default 200).
        stop: Index one past the last test point to classify (default 500);
            ``None`` means "up to the end".

    Returns:
        list[int]: One prediction per selected test point, 1 for fraud, 0 otherwise.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # The whole test set is collected before slicing, so the indices refer to
    # positions in testRDD.collect().
    testPoints = testRDD.collect()[start:stop]

    testPredictions = []
    for testPoint in tqdm(testPoints):
        # Convert the test point once, not once per training row
        query = np.array(testPoint)
        nearest = (
            trainRDD
            # The label is the last column of a training row; the rest are features
            .map(lambda row: (calculateDistance(query, np.array(row[:-1])), row[-1]))
            .takeOrdered(k, key=lambda pair: pair[0])
        )
        fraudVotes = sum(1 for _, label in nearest if label == 1)
        # Strict majority: a tie counts as "not fraud"
        testPredictions.append(1 if fraudVotes > k / 2 else 0)

    return testPredictions
        

In [7]:
    
def testKNN():
    testPredictions = np.array(KNNMapReduce)
    print(testPredictions)
    testLabels = np.array(testlabels.collect()[200:500])
    print(testLabels)
    accuracy = np.sum(testPredictions == testLabels)/len(testLabels)
    print("Accuracy: ", accuracy)

In [8]:
# Save testPredictions and test labels side by side in the same text file (currently disabled)
# #open file
# f = open("testPredictions2.txt", "w")
# #write testPredictions and test labels beside each other to file
# for i in range(len(testPredictions)):
#     f.write(str(testPredictions[i]) + " " + str(testLabels[i]) + "\n")
# #close file
# f.close()




## Clustering (K-means)

In [38]:
def calCentroid(x):
    """Return the centroid of ``x``, i.e. its mean taken along axis 0."""
    return np.mean(x, axis=0)

def initializeCentroids(dataRDD, k):
    """Pick ``k`` starting centroids by sampling ``dataRDD`` without replacement.

    The sample uses the fixed seed 42, so repeated calls request the same draw.
    """
    # First positional argument False = sample without replacement
    return dataRDD.takeSample(False, k, seed=42)

def assignCluster(x, centroids) -> int:
    """Return the index of the centroid closest to ``x``.

    Distances come from ``calculateDistance``; on a tie the lowest index wins
    because ``np.argmin`` reports the first minimum.
    """
    gaps = np.array([calculateDistance(x, centre) for centre in centroids], dtype=float)
    return np.argmin(gaps)

def KMeans(iter: int = 5, k: int = 2):
    """Run K-means on the module-level ``dataRDD`` and return the final centroids.

    Each iteration is one map/reduce pass: every row is mapped to
    ``(cluster index, (row vector, 1))``, the vectors and counts are summed
    per cluster with ``reduceByKey``, and each centroid becomes sum / count.
    Only one small (sum, count) pair per cluster reaches the driver.

    Note: the full row is used as the point (``np.array(row)``), so every
    column of ``dataRDD`` takes part in the distance computation.

    Args:
        iter: Number of refinement iterations (default 5).
        k: Number of clusters (default 2).

    Returns:
        np.ndarray: Array whose i-th row is the centroid of cluster ``i``.
        A cluster that receives no points in an iteration keeps its previous
        centroid.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    centroids = np.array(initializeCentroids(dataRDD, k), dtype=float)

    print(f"Training K-means... {iter} iterations")
    for _ in tqdm(range(iter)):

        # The default argument pins this iteration's centroids into the mapper
        def toClusterStat(row, current=centroids):
            point = np.array(row, dtype=float)
            return int(assignCluster(point, current)), (point, 1)

        clusterStats = (
            dataRDD
            .map(toClusterStat)
            # Combine partial (vector sum, count) pairs instead of grouping whole clusters
            .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
            .collect()
        )

        # Start from the old centroids so empty clusters are left unchanged,
        # and write by cluster index so collect() order does not matter.
        updated = centroids.copy()
        for cluster, (vectorSum, count) in clusterStats:
            updated[cluster] = vectorSum / count
        centroids = updated

    return centroids

def KMeansWSS(centroids, dataRDD):
    """Compute the within-cluster sum of squares (WSS) for ``centroids``.

    Every row of ``dataRDD`` is collected, assigned to its nearest centroid
    via ``assignCluster``, and its squared distance to that centroid is added
    to the total.

    Returns:
        The accumulated sum of squared distances (0 when there are no rows).
    """
    rows = dataRDD.collect()
    print("Calculating WSS...")

    def squaredError(row):
        # Squared distance from one row to the centroid it is assigned to
        point = np.array(row)
        nearest = assignCluster(point, centroids)
        return calculateDistance(point, centroids[nearest]) ** 2

    return sum(squaredError(row) for row in tqdm(rows))


        



In [19]:

# Fit the centroids with two K-means iterations
centroids = KMeans(iter=2)


Training KNN... 2 iterations


100%|██████████| 2/2 [02:42<00:00, 81.02s/it]


In [39]:
# Score the fitted centroids: total within-cluster sum of squared distances
WSS = KMeansWSS(centroids=centroids, dataRDD=dataRDD)
print("WSS: ", WSS)

Calculating WSS...
WSS:  3311305539.2410283
