In [1]:
! pip install kaggle



In [2]:

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
! mkdir ~/.kaggle

In [4]:
!cp /content/drive/MyDrive/kaggle/kaggle.json ~/.kaggle/kaggle.json

In [5]:
! chmod 600 ~/.kaggle/kaggle.json

In [6]:
! kaggle datasets download mlg-ulb/creditcardfraud

Downloading creditcardfraud.zip to /content
 94% 62.0M/66.0M [00:02<00:00, 36.8MB/s]
100% 66.0M/66.0M [00:02<00:00, 26.7MB/s]


In [7]:
! unzip creditcardfraud.zip

Archive:  creditcardfraud.zip
  inflating: creditcard.csv          


In [8]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop3.2.tgz
!tar xf spark-3.0.1-bin-hadoop3.2.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop3.2"
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()
sc = pyspark.SparkContext.getOrCreate();

In [9]:
sc

In [10]:
spark

In [11]:
from IPython.display import display, HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [12]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("example").config("spark.sql.legacy.timeParserPolicy", "LEGACY").getOrCreate()

In [13]:
df = spark.read.csv("/content/creditcard.csv",inferSchema=True, header=True)
df.show(5)

+----+------------------+-------------------+----------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+--------------------+-------------------+------------------+------------------+------------------+------------------+--------------------+-------------------+------+-----+
|Time|                V1|                 V2|              V3|                V4|                 V5|                 V6|                 V7|                V8|                V9|                V10|               V11|               V12|               V13|               V14|               V15|               V16|               V17|                V18|               V19|                V20|                 V21|                V22|     

In [14]:
analysis, val_data, test_data = df.randomSplit([0.1, 0.7, 0.2], seed=42)

In [16]:
import plotly.express as px
from pyspark.sql.functions import count
fraud_counts = analysis.groupBy("Class").agg(count("*").alias("count"))

print(fraud_counts.collect())
# collect counts as list
fraud_or_not = [int(row["count"]) for row in fraud_counts.collect()]
labels = ["Fraud","notFraud"]

# create pie chart
fig = px.pie(values=fraud_or_not, names=labels, width=700, height=400, color_discrete_sequence=["skyblue","black"], title="notFraud vs Fraud")
fig.show()


[Row(Class=1, count=63), Row(Class=0, count=28378)]


In [18]:
import pyspark.sql.functions as fn
from pyspark.sql.functions import *
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

In [19]:
assembler = VectorAssembler(inputCols=["V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11","V12","V13","V14","V15","V16","V17","V18","V19","V20","V21","V22","V23","V24","V25","V26","V27","V28","Amount"], outputCol="features")

X = assembler.transform(df)

In [20]:
X.show(5)

+----+------------------+-------------------+----------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+--------------------+-------------------+------------------+------------------+------------------+------------------+--------------------+-------------------+------+-----+--------------------+
|Time|                V1|                 V2|              V3|                V4|                 V5|                 V6|                 V7|                V8|                V9|                V10|               V11|               V12|               V13|               V14|               V15|               V16|               V17|                V18|               V19|                V20|                 V21|    

In [21]:
trainDF,testDF = X.randomSplit([.8,.2],seed=42)

In [22]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [23]:
rf = RandomForestClassifier(labelCol="Class", featuresCol="features", numTrees=10)


In [24]:
model = rf.fit(trainDF)


In [25]:
predictions = model.transform(testDF)

In [27]:
evaluator = MulticlassClassificationEvaluator(labelCol="Class", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g" % accuracy)

Accuracy = 0.999229


In [28]:
# Calculate precision
precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
print("Precision: ", precision)

# Calculate recall
recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})
print("Recall: ", recall)

# Calculate F1 score
f1score = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})
print("F1 Score: ", f1score)

Precision:  0.9991815467526846
Recall:  0.9992286250241055
F1 Score:  0.999189734425383
