# <center>Credit Card Fraud Prediction</center>
<br><center>Dataset source: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud</center>
<br><center>We will use PySpark API and MLlib library for predicting fraudulent credit card transactions.</center>
<br><center>Class 1 denotes frauds and Class 0 denotes normal transactions.</center>

## Create Spark session

In [37]:
import findspark
findspark.init('/home/shekhar/spark-3.2.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession

In [38]:
spark = SparkSession.builder.appName('cred').getOrCreate()

## Read and format the data

In [39]:
df = spark.read.csv('creditcard.csv',inferSchema=True,header=True)



In [40]:
df.columns

['Time',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'Amount',
 'Class']

In [11]:
from pyspark.sql.functions import col
df = df.drop(col("time"))

In [12]:
df.head(1)

[Row(V1=-1.3598071336738, V2=-0.0727811733098497, V3=2.53634673796914, V4=1.37815522427443, V5=-0.338320769942518, V6=0.462387777762292, V7=0.239598554061257, V8=0.0986979012610507, V9=0.363786969611213, V10=0.0907941719789316, V11=-0.551599533260813, V12=-0.617800855762348, V13=-0.991389847235408, V14=-0.311169353699879, V15=1.46817697209427, V16=-0.470400525259478, V17=0.207971241929242, V18=0.0257905801985591, V19=0.403992960255733, V20=0.251412098239705, V21=-0.018306777944153, V22=0.277837575558899, V23=-0.110473910188767, V24=0.0669280749146731, V25=0.128539358273528, V26=-0.189114843888824, V27=0.133558376740387, V28=-0.0210530534538215, Amount=149.62, Class=0)]

In [13]:
df.describe().select(['summary','Amount','Class']).show()



+-------+------------------+--------------------+
|summary|            Amount|               Class|
+-------+------------------+--------------------+
|  count|            284807|              284807|
|   mean| 88.34961925094233|0.001727485630620034|
| stddev|250.12010924018833| 0.04152718963546499|
|    min|               0.0|                   0|
|    max|          25691.16|                   1|
+-------+------------------+--------------------+



                                                                                

## MLlib Logistic Regression

In [14]:
from pyspark.ml.feature import VectorAssembler

In [17]:
assembler = VectorAssembler(inputCols=df.columns[:-1], outputCol='features')

In [18]:
from pyspark.ml.classification import LogisticRegression

In [19]:
lr = LogisticRegression(featuresCol='features',labelCol='Class')

In [21]:
final_data = assembler.transform(df).select("features","Class")

In [22]:
final_data.show()

+--------------------+-----+
|            features|Class|
+--------------------+-----+
|[-1.3598071336738...|    0|
|[1.19185711131486...|    0|
|[-1.3583540615982...|    0|
|[-0.9662717115720...|    0|
|[-1.1582330934952...|    0|
|[-0.4259658844124...|    0|
|[1.22965763450793...|    0|
|[-0.6442694423481...|    0|
|[-0.8942860822028...|    0|
|[-0.3382617524257...|    0|
|[1.44904378114715...|    0|
|[0.38497821518095...|    0|
|[1.249998742053,-...|    0|
|[1.0693735878819,...|    0|
|[-2.7918547659339...|    0|
|[-0.7524170429566...|    0|
|[1.10321543528383...|    0|
|[-0.4369050713606...|    0|
|[-5.4012576631582...|    0|
|[1.4929359769862,...|    0|
+--------------------+-----+
only showing top 20 rows



In [23]:
train_data,test_data = final_data.randomSplit([0.7,0.3])

In [24]:
train_data.describe().show()



+-------+--------------------+
|summary|               Class|
+-------+--------------------+
|  count|              199428|
|   mean|0.001750005014341...|
| stddev| 0.04179654598887494|
|    min|                   0|
|    max|                   1|
+-------+--------------------+



                                                                                

In [25]:
test_data.describe().show()

[Stage 19:>                                                         (0 + 6) / 6]

+-------+--------------------+
|summary|               Class|
+-------+--------------------+
|  count|               85379|
|   mean|0.001674884924864428|
| stddev|0.040891310443485096|
|    min|                   0|
|    max|                   1|
+-------+--------------------+





In [26]:
lrModel = lr.fit(train_data)

22/05/24 10:16:23 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/05/24 10:16:23 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
                                                                                

In [27]:
trainingSummary = lrModel.summary
trainingSummary.predictions.show()



+--------------------+-----+--------------------+--------------------+----------+
|            features|Class|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[-56.407509631329...|  0.0|[9.52387236537979...|[0.99992691924159...|       0.0|
|[-32.962809811697...|  0.0|[5.05457448802386...|[0.99366036639015...|       0.0|
|[-32.273469750819...|  0.0|[4.65184777410612...|[0.99054627531802...|       0.0|
|[-31.746662562458...|  0.0|[4.25224378785582...|[0.98596745106469...|       0.0|
|[-29.876365513976...|  1.0|[3.57277903953836...|[0.97268911130478...|       0.0|
|[-29.200328590574...|  1.0|[3.15262851033074...|[0.95901216758645...|       0.0|
|[-28.524267593840...|  1.0|[2.73247623434137...|[0.93891601044362...|       0.0|
|[-28.344757250015...|  0.0|[4.76840743870144...|[0.99157764218996...|       0.0|
|[-27.670568881696...|  0.0|[4.60639697071819...|[0.99011102879541...|       0.0|
|[-27.1436784229

[Stage 142:>                                                        (0 + 1) / 1]                                                                                

In [29]:
results = lrModel.transform(test_data)

In [30]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [31]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol='Class')

In [32]:
results.select('Class','prediction').show()

+-----+----------+
|Class|prediction|
+-----+----------+
|    1|       0.0|
|    1|       0.0|
|    0|       0.0|
|    1|       0.0|
|    0|       0.0|
|    1|       1.0|
|    1|       1.0|
|    0|       0.0|
|    1|       1.0|
|    1|       1.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    1|       1.0|
|    1|       1.0|
|    1|       1.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
+-----+----------+
only showing top 20 rows



In [33]:
AUC = my_eval.evaluate(results)

                                                                                

In [36]:
AUC

0.8111008201990885

### We have achieved an AUC score of 81.1% on the testing data.<br>This shows the ability of our model in separating the frauds and normal transactions.