In [2]:
from pyspark.sql import SparkSession

# 1. Data Preprocessing 

In [3]:
spark = SparkSession \
        .builder \
        .appName('FraudDet') \
        .getOrCreate()

In [4]:
spark

In [5]:
file_path = 'fraud_detection.csv'
df_fraud_det = spark.read.csv(file_path, header=True)

In [6]:
df_fraud_det.printSchema()

root
 |-- step: string (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- oldbalanceOrg: string (nullable = true)
 |-- newbalanceOrig: string (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: string (nullable = true)
 |-- newbalanceDest: string (nullable = true)
 |-- isFraud: string (nullable = true)
 |-- isFlaggedFraud: string (nullable = true)



In [7]:
num_rows = df_fraud_det.count()
num_cols = len(df_fraud_det.columns)

print(num_rows, num_cols)

6362620 11


In [8]:
df_fraud_det.na.drop().show()

+----+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|step|    type|   amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|   1| PAYMENT|  9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|             0|
|   1| PAYMENT|  1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|             0|
|   1|TRANSFER|    181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|      1|             0|
|   1|CASH_OUT|    181.0| C840083671|        181.0|           0.0|  C38997010|       21182.0|           0.0|      1|             0|
|   1| PAYMENT| 11668.14|C2048537720|      41554.0|      29885.86|M123070170

In [9]:
df_fraud_det = df_fraud_det.dropDuplicates()

In [11]:
from pyspark.sql.functions import col

old_cols = df_fraud_det.columns
new_cols = [col(col_name).alias(col_name.lower()) for col_name in old_cols]

In [12]:
df_fraud_det = df_fraud_det.select(*new_cols)

In [13]:
df_fraud_det.columns

['step',
 'type',
 'amount',
 'nameorig',
 'oldbalanceorg',
 'newbalanceorig',
 'namedest',
 'oldbalancedest',
 'newbalancedest',
 'isfraud',
 'isflaggedfraud']

In [14]:
df_fraud_det.show()

+----+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|step|    type|   amount|   nameorig|oldbalanceorg|newbalanceorig|   namedest|oldbalancedest|newbalancedest|isfraud|isflaggedfraud|
+----+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|   1| PAYMENT|  9753.55|C1048712791|      60829.0|      51075.45| M487792155|           0.0|           0.0|      0|             0|
|   1|CASH_OUT| 338767.1| C691691381|          0.0|           0.0| C453211571|     544481.28|    3461666.05|      0|             0|
|   1|TRANSFER|276460.62|C1871680329|        595.0|           0.0|C1360767589|    1105242.28|    2107965.39|      0|             0|
|   1| PAYMENT|  7649.41|C1129869771|     44435.75|      36786.34|M1620459733|           0.0|           0.0|      0|             0|
|   1| PAYMENT| 14895.51|C1527882132|          0.0|           0.0| M93516000

In [19]:
cols_to_convert = ["amount","oldbalanceorg","newbalanceorig", "oldbalancedest", "newbalancedest","isfraud", "isflaggedfraud"]

for c in cols_to_convert:
    if(c == "isfraud" or c == "isflaggedfraud"):
        df_fraud_det = df_fraud_det.withColumn(c, col(c).cast("int"))
    else:
        df_fraud_det = df_fraud_det.withColumn(c, col(c).cast("float"))

In [20]:
df_fraud_det.printSchema()

root
 |-- step: string (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: float (nullable = true)
 |-- nameorig: string (nullable = true)
 |-- oldbalanceorg: float (nullable = true)
 |-- newbalanceorig: float (nullable = true)
 |-- namedest: string (nullable = true)
 |-- oldbalancedest: float (nullable = true)
 |-- newbalancedest: float (nullable = true)
 |-- isfraud: integer (nullable = true)
 |-- isflaggedfraud: integer (nullable = true)



# 2. Exploration and Data analysis

In [26]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pyspark.sql.functions import rand

In [29]:
fraction = 0.01
df_sample = df_fraud_det.sample(withReplacement=False, fraction=fraction, seed=42)

In [25]:
numerical_features = cols_to_convert

df_fraud_det[numerical_features].hist(bins=30, color='skyblue')
plt.tight_layout()
plt.show()


AttributeError: 'DataFrame' object has no attribute 'hist'