## Part 1: Reading input file and storing to DataFrame

In [23]:
from pyspark.sql.types import *
import sys
import os
sqlContext = SQLContext(sc)

Read input file and store to RDD
1. Read from file system
2. split columns
3. filter header
4. filter empty rows

In [24]:
filename = '/vagrant/PS_20174392719_1491204439457_log.csv'

def convertToTransactionSchema(arr):
    res = arr
    res[0] = int(arr[0]) # step
    res[2] = float(arr[2]) # amount
    res[4] = float(arr[4]) # old balance
    res[5] = float(arr[5]) # new balance
    res[7] = float(arr[7]) # old balance destination
    res[8] = float(arr[8]) # new balance destination
    res[9] = int(arr[9]) # is fraud
    res[10] = int(arr[10]) # is flagged fraud
    return res

transactionsRDD = (sc.textFile(filename)
       .map(lambda line: line.split(","))
       .filter(lambda line: line[0] != "step")
       .filter(lambda line: len(line)>1)
       .map(convertToTransactionSchema))

Convert transactionsRDD into DataFrame and cache it

In [25]:
transactionSchema = StructType([
    StructField("step", IntegerType(), True),
    StructField("type", StringType(), True),
    StructField("amount", FloatType(), True),
    StructField("nameOrig", StringType(), True),
    StructField("oldbalanceOrig", FloatType(), True),
    StructField("newbalanceOrig", FloatType(), True),
    StructField("nameDest", StringType(), True),
    StructField("oldbalanceDest", FloatType(), True),
    StructField("newbalanceDest", FloatType(), True),
    StructField("isFraud", IntegerType(), True),
    StructField("isFlaggedFraud", IntegerType(), True)])

transactionsDF = sqlContext.createDataFrame(transactionsRDD, transactionSchema)
print transactionsDF.take(2)

[Row(step=1, type=u'PAYMENT', amount=9839.6396484375, nameOrig=u'C1231006815', oldbalanceOrig=170136.0, newbalanceOrig=160296.359375, nameDest=u'M1979787155', oldbalanceDest=0.0, newbalanceDest=0.0, isFraud=0, isFlaggedFraud=0), Row(step=1, type=u'PAYMENT', amount=1864.280029296875, nameOrig=u'C1666544295', oldbalanceOrig=21249.0, newbalanceOrig=19384.720703125, nameDest=u'M2044282225', oldbalanceDest=0.0, newbalanceDest=0.0, isFraud=0, isFlaggedFraud=0)]


## Part 2: Data Cleaning

TODO: Check if there is any invalid or missing values

Print the datatype for each column

In [27]:
print transactionsDF.printSchema()

root
 |-- step: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: float (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- oldbalanceOrig: float (nullable = true)
 |-- newbalanceOrig: float (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: float (nullable = true)
 |-- newbalanceDest: float (nullable = true)
 |-- isFraud: integer (nullable = true)
 |-- isFlaggedFraud: integer (nullable = true)

None


Check each column for invalid values. 

For columns with string type: empty or null values.
For integer or float: null or minus values.

In [43]:
from pyspark.sql.functions import *
transactionsDF.where(col("step").isNull()).count()

0L

In [50]:
transactionsDF.select(when(transactionsDF.step == 1,1))

NameError: name 'when' is not defined

In [49]:
from pyspark.sql import functions as F
transactionsDF.select([count(F.when(isNull(c), c)).alias(c) for c in transactionsDF.columns]).show()

AttributeError: 'module' object has no attribute 'when'

In [None]:
transactionsDF.registerTempTable("record")
sqlContext.sql("SELECT type FROM record WHERE ")

## Part 3: Exploring the dataset

From kaggle:
1. Types of fraudulent transactions
2. How many items are marked as fraud and not fraud?
3. What determines whether the feature isFlaggedFraud gets set or not?


3. Statistics of each column
4. Correllation analysis

## Part 4: Building the fraud detection model

## Part 5: Evaluating the fraud detection model

## Part 6: Analyzing the fraud detection model