In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('CreditCardFraudML').getOrCreate()
raw_data = spark.read.csv('fraudTrain.csv', header=True, inferSchema=True)

In [3]:
raw_data.printSchema()
raw_data.show(5)

root
 |-- _c0: integer (nullable = true)
 |-- trans_date_trans_time: timestamp (nullable = true)
 |-- cc_num: long (nullable = true)
 |-- merchant: string (nullable = true)
 |-- category: string (nullable = true)
 |-- amt: double (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- street: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- city_pop: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- trans_num: string (nullable = true)
 |-- unix_time: integer (nullable = true)
 |-- merch_lat: double (nullable = true)
 |-- merch_long: double (nullable = true)
 |-- is_fraud: integer (nullable = true)

+---+---------------------+----------------+--------------------+-------------+------+---------+-------+--

In [4]:
num_rows = raw_data.count()
num_columns = len(raw_data.columns)

print(num_rows)
print(num_columns)

1296675
23


In [5]:
last_rows = raw_data.tail(5)
for row in last_rows:
    print(row)

Row(_c0=1296670, trans_date_trans_time=datetime.datetime(2020, 6, 21, 12, 12, 8), cc_num=30263540414123, merchant='fraud_Reichel Inc', category='entertainment', amt=15.56, first='Erik', last='Patterson', gender='M', street='162 Jessica Row Apt. 072', city='Hatch', state='UT', zip=84735, lat=37.7175, long=-112.4777, city_pop=258, job='Geoscientist', dob=datetime.date(1961, 11, 24), trans_num='440b587732da4dc1a6395aba5fb41669', unix_time=1371816728, merch_lat=36.841266, merch_long=-111.69076499999998, is_fraud=0)
Row(_c0=1296671, trans_date_trans_time=datetime.datetime(2020, 6, 21, 12, 12, 19), cc_num=6011149206456997, merchant='fraud_Abernathy and Sons', category='food_dining', amt=51.7, first='Jeffrey', last='White', gender='M', street='8617 Holmes Terrace Suite 651', city='Tuscarora', state='MD', zip=21790, lat=39.2667, long=-77.5101, city_pop=100, job='Production assistant, television', dob=datetime.date(1979, 12, 11), trans_num='278000d2e0d2277d1de2f890067dcc0a', unix_time=137181673

In [6]:
raw_data.describe().show()

+-------+-----------------+--------------------+-------------------+-------------+-----------------+-------+-------+-------+--------------------+-------+-------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+------------------+------------------+--------------------+
|summary|              _c0|              cc_num|           merchant|     category|              amt|  first|   last| gender|              street|   city|  state|              zip|              lat|              long|         city_pop|               job|           trans_num|           unix_time|         merch_lat|        merch_long|            is_fraud|
+-------+-----------------+--------------------+-------------------+-------------+-----------------+-------+-------+-------+--------------------+-------+-------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+----------------

In [7]:
from pyspark.sql.functions import count, when, col

raw_data.select([count(when(col(c).isNull(), c)).alias(c) for c in raw_data.columns]).show()

+---+---------------------+------+--------+--------+---+-----+----+------+------+----+-----+---+---+----+--------+---+---+---------+---------+---------+----------+--------+
|_c0|trans_date_trans_time|cc_num|merchant|category|amt|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|trans_num|unix_time|merch_lat|merch_long|is_fraud|
+---+---------------------+------+--------+--------+---+-----+----+------+------+----+-----+---+---+----+--------+---+---+---------+---------+---------+----------+--------+
|  0|                    0|     0|       0|       0|  0|    0|   0|     0|     0|   0|    0|  0|  0|   0|       0|  0|  0|        0|        0|        0|         0|       0|
+---+---------------------+------+--------+--------+---+-----+----+------+------+----+-----+---+---+----+--------+---+---+---------+---------+---------+----------+--------+



In [8]:
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler

numeric_columns = ['cc_num', 'amt', 'zip','long','city_pop','unix_time','merch_lat','merch_long','is_fraud']

assembler = VectorAssembler(inputCols=numeric_columns, outputCol="features")
raw_data_vectorized = assembler.transform(raw_data)

correlation_matrix = Correlation.corr(raw_data_vectorized, 'features').head()
print(correlation_matrix[0])


DenseMatrix([[ 1.00000000e+00,  1.76939977e-03,  4.14589540e-02,
              -4.82778009e-02, -8.99106682e-03,  3.53664067e-04,
              -5.89421349e-02, -4.82519959e-02, -9.81455613e-04],
             [ 1.76939977e-03,  1.00000000e+00,  1.84340203e-03,
              -1.87475599e-04,  5.81828398e-03, -2.92997831e-04,
              -1.87254740e-03, -1.50995346e-04,  2.19403889e-01],
             [ 4.14589540e-02,  1.84340203e-03,  1.00000000e+00,
              -9.09732122e-01,  7.84670560e-02,  6.70469820e-04,
              -1.13561358e-01, -9.08924027e-01, -2.16190728e-03],
             [-4.82778009e-02, -1.87475599e-04, -9.09732122e-01,
               1.00000000e+00, -5.27146379e-02, -6.41788388e-04,
              -1.54518140e-02,  9.99119582e-01,  1.72081275e-03],
             [-8.99106682e-03,  5.81828398e-03,  7.84670560e-02,
              -5.27146379e-02,  1.00000000e+00, -1.71400250e-03,
              -1.54781442e-01, -5.26867904e-02,  2.13590242e-03],
             [ 3.536

In [9]:
# raw_data.groupBy('category').count().show()
# raw_data.groupBy('is_fraud').count().show()

# raw_data.printSchema()
# raw_data.describe().show()
raw_data.select("is_fraud").groupBy("is_fraud").count().show()


+--------+-------+
|is_fraud|  count|
+--------+-------+
|       1|   7506|
|       0|1289169|
+--------+-------+



In [10]:
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

In [11]:
# Only keeping relevant columns
data = raw_data.select(
    'trans_date_trans_time', 'category', 'amt',
    'lat', 'long', 'merch_lat', 'merch_long', 'is_fraud'
)

In [None]:
data.head(5)

[Row(trans_date_trans_time=datetime.datetime(2019, 1, 1, 0, 0, 18), category='misc_net', amt=4.97, lat=36.0788, long=-81.1781, merch_lat=36.011293, merch_long=-82.048315, is_fraud=0),
 Row(trans_date_trans_time=datetime.datetime(2019, 1, 1, 0, 0, 44), category='grocery_pos', amt=107.23, lat=48.8878, long=-118.2105, merch_lat=49.159046999999994, merch_long=-118.186462, is_fraud=0),
 Row(trans_date_trans_time=datetime.datetime(2019, 1, 1, 0, 0, 51), category='entertainment', amt=220.11, lat=42.1808, long=-112.262, merch_lat=43.150704, merch_long=-112.154481, is_fraud=0),
 Row(trans_date_trans_time=datetime.datetime(2019, 1, 1, 0, 1, 16), category='gas_transport', amt=45.0, lat=46.2306, long=-112.1138, merch_lat=47.034331, merch_long=-112.561071, is_fraud=0),
 Row(trans_date_trans_time=datetime.datetime(2019, 1, 1, 0, 3, 6), category='misc_pos', amt=41.96, lat=38.4207, long=-79.4629, merch_lat=38.674999, merch_long=-78.632459, is_fraud=0)]