In [None]:
!pip install pyspark
!pip install findspark

In [None]:
import findspark
import pyspark

import os
import functools as reduce
from pyspark.context import SparkContext
from pyspark.sql import DataFrame, SQLContext, SparkSession, Window
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Importing other MLlib Libraries
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Spark only accepts real master URLs (local, local[N], local[*], spark://...,
# yarn, ...). 'Kaggle' is not one and makes context creation fail, so run in
# local mode using every core available to the notebook.
conf = (
    pyspark.SparkConf()
    .setAppName('Credit Card Fraud Detection')
    .setMaster('local[*]')
)

# getOrCreate() reuses an already-running session, so re-executing this cell
# does not raise "Cannot run multiple SparkContexts at once".
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)

In [None]:
# Importing other relevant libraries
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Echo the SparkSession object so the notebook renders its representation.
spark

In [None]:
# Dataset Link: spark-practice-de.s3.amazonaws.com/creditcard_fraud.rar

In [None]:
# Load the credit card transactions CSV: the first row holds the column names
# and Spark infers each column's type from the data.
csv_path = "../input/creditcardfraud/creditcard.csv"
df = spark.read.csv(path=csv_path, inferSchema=True, header=True)

In [None]:
# Print the column names and types of the loaded DataFrame.
df.printSchema()

In [None]:
# Number of rows per value of the "Class" column
# (fraudulent vs. non-fraudulent transactions).
class_counts = df.groupBy("Class").count()
class_counts.show()

In [None]:
# List the column names of the loaded DataFrame.
df.columns

In [None]:
# Build one distinct-count aggregate per column (aliased back to the column
# name) and evaluate them all in a single aggregation.
distinct_count_exprs = [countDistinct(col(name)).alias(name) for name in df.columns]
df.agg(*distinct_count_exprs).show()

In [None]:
# Every column except the "Class" target becomes a model input; the assembler
# packs those inputs into a single vector column named "features".
feature_columns = [name for name in df.columns if name != "Class"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [None]:
# Run the assembler over df to produce the dataset used for modelling.
dataset = assembler.transform(df)

In [None]:
# Preview the first 5 rows of the assembled dataset.
dataset.show(5)

In [None]:
# Model building only needs the feature vector and the target, so keep just
# those two columns and rename the target "Class" to "label".
model_data = (
    dataset
    .select(["features", "Class"])
    .withColumnRenamed("Class", "label")
)
model_data.show()

In [None]:
# Model Building
# For every training fraction in p_train: split model_data, fit a logistic
# regression on the training part, plot the training ROC and precision-recall
# curves, and record the train/test area under the ROC curve.
p_train = [0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95]
train_ROC = []
test_ROC = []

# Fixed seed so the random train/test split is reproducible across runs.
SPLIT_SEED = 42

# The estimator and the evaluator do not depend on the split, so build them once.
lr = LogisticRegression(featuresCol="features", labelCol="label")
evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")

for p in p_train:

    # p is a fraction (0.50 .. 0.95), so format it as an actual percentage.
    print("Training Split at {:.0%}".format(p))

    # Splitting the dataset in train and test
    train, test = model_data.randomSplit([p, 1 - p], seed=SPLIT_SEED)
    model = lr.fit(train)

    # ROC curve on the training data: FPR on the x axis, TPR on the y axis.
    trainingSummary = model.summary
    roc = trainingSummary.roc.toPandas()
    plt.plot(roc["FPR"], roc["TPR"])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.show()

    print("Training set areaUnderCurve: " + str(trainingSummary.areaUnderROC))
    train_ROC.append(trainingSummary.areaUnderROC)

    # Precision-recall curve on the training data.
    pr = trainingSummary.pr.toPandas()
    plt.plot(pr["recall"], pr["precision"])
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("Precision-Recall Curve")
    plt.show()

    # Evaluating on test dataset. A bare `summary.accuracy` expression shows
    # nothing inside a loop, so print it explicitly.
    summary = model.evaluate(test)
    print("Test accuracy", summary.accuracy)

    output = model.transform(test)

    # Evaluate once and reuse the value: each evaluate() call runs a Spark job.
    test_auc = evaluator.evaluate(output)
    print("Test Area under ROC", test_auc)
    test_ROC.append(test_auc)

In [None]:
# Compare train and test area under the ROC curve across training fractions.
fig, ax = plt.subplots()
ax.plot(p_train, train_ROC, "ro-", label="Train")
ax.plot(p_train, test_ROC, "g", label="Test")
ax.set_xlabel("Training Size")
ax.set_ylabel("Area under ROC Curve")
ax.legend()
plt.show()