# Linear Regression Creadit Card Fraud Detection

In [1]:
# Must be included at the beginning of each new notebook. Remember to change the app name.
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('linear_regression_fraud').getOrCreate()

# If you're getting an error with numpy, please type 'sudo pip install numpy --user' into the EC2 console.
from pyspark.ml.regression import LinearRegression

In [2]:
# Use Spark to read in the Ecommerce Customers csv file. You can infer csv schemas. 
data = spark.read.csv("E:\INFOSYS 722\Assignments\Assignments\Iter4\Iteration4\CleanData.csv",inferSchema=True,header=True)

In [3]:
data.printSchema()

root
 |-- Time: double (nullable = true)
 |-- V1: double (nullable = true)
 |-- V2: double (nullable = true)
 |-- V3: double (nullable = true)
 |-- V4: double (nullable = true)
 |-- V5: double (nullable = true)
 |-- V6: double (nullable = true)
 |-- V7: double (nullable = true)
 |-- V8: double (nullable = true)
 |-- V9: double (nullable = true)
 |-- V10: double (nullable = true)
 |-- V11: double (nullable = true)
 |-- V12: double (nullable = true)
 |-- V13: double (nullable = true)
 |-- V14: double (nullable = true)
 |-- V15: double (nullable = true)
 |-- V16: double (nullable = true)
 |-- V17: double (nullable = true)
 |-- V18: double (nullable = true)
 |-- V19: double (nullable = true)
 |-- V20: double (nullable = true)
 |-- V21: double (nullable = true)
 |-- V22: double (nullable = true)
 |-- V23: double (nullable = true)
 |-- V24: double (nullable = true)
 |-- V25: double (nullable = true)
 |-- V26: double (nullable = true)
 |-- V27: double (nullable = true)
 |-- V28: double (nulla

In [4]:
# Let's focus on one row to make it easier to read.
data.head()

Row(Time=114781.0, V1=0.018689871, V2=0.7716607790000001, V3=-0.749187624, V4=-0.9511676490000001, V5=1.7773783890000001, V6=1.253345665, V7=0.572302829, V8=0.538009135, V9=-0.162268511, V10=-0.775618639, V11=0.980027323, V12=0.22255239899999998, V13=-0.7702471340000001, V14=-0.625276077, V15=-0.05020235, V16=-0.135271817, V17=0.7676895859999999, V18=-0.7785133000000001, V19=-0.7771844370000001, V20=-0.094226555, V21=-0.257252841, V22=-0.554723238, V23=0.10122239400000001, V24=-1.06849104, V25=-0.611554036, V26=0.22649766300000002, V27=0.262847327, V28=0.055777892, Amount=2.69, Class=0)

In [5]:
# A simple for loop allows us to make it even clearer. 
for item in data.head():
    print(item)

114781.0
0.018689871
0.7716607790000001
-0.749187624
-0.9511676490000001
1.7773783890000001
1.253345665
0.572302829
0.538009135
-0.162268511
-0.775618639
0.980027323
0.22255239899999998
-0.7702471340000001
-0.625276077
-0.05020235
-0.135271817
0.7676895859999999
-0.7785133000000001
-0.7771844370000001
-0.094226555
-0.257252841
-0.554723238
0.10122239400000001
-1.06849104
-0.611554036
0.22649766300000002
0.262847327
0.055777892
2.69
0


# Setting Up a DataFrame for Machine Learning (MLlib)

We need to do a few things before Spark can accept the data for machine learning. First of all, it needs to be in the form of two columns: label and features. Unlike the documentation example, this data is messy. We'll need to combine all of the features into a single vector. VectorAssembler simplifies the process.

In [6]:
# Import VectorAssembler and Vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [8]:
print(data.columns)

['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class']


In [9]:
# The input columns are the feature column names, and the output column is what you'd like the new column to be named.
assembler = VectorAssembler(
    inputCols=['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
    outputCol="features")

In [14]:
# Using print schema, you see that the features output column has been added. 
output = assembler.transform(data)

In [15]:
output.printSchema()

root
 |-- Time: double (nullable = true)
 |-- V1: double (nullable = true)
 |-- V2: double (nullable = true)
 |-- V3: double (nullable = true)
 |-- V4: double (nullable = true)
 |-- V5: double (nullable = true)
 |-- V6: double (nullable = true)
 |-- V7: double (nullable = true)
 |-- V8: double (nullable = true)
 |-- V9: double (nullable = true)
 |-- V10: double (nullable = true)
 |-- V11: double (nullable = true)
 |-- V12: double (nullable = true)
 |-- V13: double (nullable = true)
 |-- V14: double (nullable = true)
 |-- V15: double (nullable = true)
 |-- V16: double (nullable = true)
 |-- V17: double (nullable = true)
 |-- V18: double (nullable = true)
 |-- V19: double (nullable = true)
 |-- V20: double (nullable = true)
 |-- V21: double (nullable = true)
 |-- V22: double (nullable = true)
 |-- V23: double (nullable = true)
 |-- V24: double (nullable = true)
 |-- V25: double (nullable = true)
 |-- V26: double (nullable = true)
 |-- V27: double (nullable = true)
 |-- V28: double (nulla

In [16]:
# You can see that the features column is a dense vector that combines the various features as expected.
output.head(1)

[Row(Time=114781.0, V1=0.018689871, V2=0.7716607790000001, V3=-0.749187624, V4=-0.9511676490000001, V5=1.7773783890000001, V6=1.253345665, V7=0.572302829, V8=0.538009135, V9=-0.162268511, V10=-0.775618639, V11=0.980027323, V12=0.22255239899999998, V13=-0.7702471340000001, V14=-0.625276077, V15=-0.05020235, V16=-0.135271817, V17=0.7676895859999999, V18=-0.7785133000000001, V19=-0.7771844370000001, V20=-0.094226555, V21=-0.257252841, V22=-0.554723238, V23=0.10122239400000001, V24=-1.06849104, V25=-0.611554036, V26=0.22649766300000002, V27=0.262847327, V28=0.055777892, Amount=2.69, Class=0, features=DenseVector([114781.0, 0.0187, 0.7717, -0.7492, -0.9512, 1.7774, 1.2533, 0.5723, 0.538, -0.1623, -0.7756, 0.98, 0.2226, -0.7702, -0.6253, -0.0502, -0.1353, 0.7677, -0.7785, -0.7772, -0.0942, -0.2573, -0.5547, 0.1012, -1.0685, -0.6116, 0.2265, 0.2628, 0.0558, 2.69]))]

In [65]:
for item in output.head():
    print(item)

114781.0
0.018689871
0.7716607790000001
-0.749187624
-0.9511676490000001
1.7773783890000001
1.253345665
0.572302829
0.538009135
-0.162268511
-0.775618639
0.980027323
0.22255239899999998
-0.7702471340000001
-0.625276077
-0.05020235
-0.135271817
0.7676895859999999
-0.7785133000000001
-0.7771844370000001
-0.094226555
-0.257252841
-0.554723238
0.10122239400000001
-1.06849104
-0.611554036
0.22649766300000002
0.262847327
0.055777892
2.69
0
[114781.0,0.018689871,0.7716607790000001,-0.749187624,-0.9511676490000001,1.7773783890000001,1.253345665,0.572302829,0.538009135,-0.162268511,-0.775618639,0.980027323,0.22255239899999998,-0.7702471340000001,-0.625276077,-0.05020235,-0.135271817,0.7676895859999999,-0.7785133000000001,-0.7771844370000001,-0.094226555,-0.257252841,-0.554723238,0.10122239400000001,-1.06849104,-0.611554036,0.22649766300000002,0.262847327,0.055777892,2.69,0.0]


In [17]:
# Let's select two columns (the feature and predictor).
# This is now in the appropriate format to be processed by Spark.
final_data = output.select("features", "Class")
final_data.show()

+--------------------+-----+
|            features|Class|
+--------------------+-----+
|[114781.0,0.01868...|    0|
|[48085.0,-1.17863...|    0|
|[84380.0,1.116270...|    0|
|[80374.0,-0.42396...|    0|
|[68284.0,0.083971...|    0|
|[127819.0,2.04654...|    0|
|[80775.0,-0.66946...|    0|
|[124360.0,1.61669...|    0|
|[125685.0,1.94626...|    0|
|[142746.0,1.87460...|    0|
|[68322.0,-0.46750...|    0|
|[74903.0,1.243635...|    0|
|[62814.0,1.252808...|    0|
|[834.0,0.95624797...|    0|
|[73282.0,-0.33819...|    0|
|[22305.0,1.041925...|    0|
|[128772.0,1.31398...|    0|
|[146618.0,1.95809...|    0|
|[19912.0,1.362146...|    0|
|[157659.0,-0.3994...|    0|
+--------------------+-----+
only showing top 20 rows



In [18]:
final_data.groupBy('Class').count().show()

+-----+-----+
|Class|count|
+-----+-----+
|    1|  492|
|    0|  492|
+-----+-----+



In [21]:
# Let's do a randomised 70/30 split. 
# Remember, you can use other splits depending on how easy/difficult it is to train your model.
train_data, test_data = final_data.randomSplit([0.8, 0.2])

In [20]:
train_data.show()
test_data.show()

+--------------------+-----+
|            features|Class|
+--------------------+-----+
|[406.0,-2.3122265...|    1|
|[413.0,1.14659355...|    0|
|[472.0,-3.0435406...|    1|
|[925.0,1.00043338...|    0|
|[951.0,-1.2835989...|    0|
|[957.0,-0.9225089...|    0|
|[1658.0,-0.395424...|    0|
|[2923.0,-0.145408...|    0|
|[3749.0,1.4138528...|    0|
|[4462.0,-2.303349...|    1|
|[6986.0,-4.397974...|    1|
|[7526.0,0.0084303...|    1|
|[7535.0,0.0267792...|    1|
|[7543.0,0.3295943...|    1|
|[7610.0,0.7256457...|    1|
|[7672.0,0.7027099...|    1|
|[7740.0,1.0238738...|    1|
|[7891.0,-1.585505...|    1|
|[8090.0,-1.783228...|    1|
|[8169.0,0.8573210...|    1|
+--------------------+-----+
only showing top 20 rows

+--------------------+-----+
|            features|Class|
+--------------------+-----+
|[834.0,0.95624797...|    0|
|[1418.0,-0.814336...|    0|
|[2172.0,-0.799465...|    0|
|[7519.0,1.2342350...|    1|
|[7551.0,0.316459,...|    1|
|[8878.0,-2.661802...|    1|
|[11078.0,1.24738

Now we can create a Linear Regression Model object. Because the feature column is named 'features', we don't have to worry about it. However, as the labelCol isn't the default name, we have to specify it's name (Yearly Amount Spent).

In [22]:
lr = LinearRegression(labelCol="Class")

In [23]:
# Fit the model to the data.
lrModel = lr.fit(train_data)

In [24]:
# Print the coefficients and intercept for linear regression.
print("Coefficients: {} Intercept: {}".format(lrModel.coefficients, lrModel.intercept))

Coefficients: [-3.7701099941949353e-07,-0.02471338220092171,0.0012073644699758556,-0.006534378830713157,0.05306015417240854,0.010504402220235968,-0.019515275549974783,0.02557690512283267,-0.016381574075062535,0.014452093543413352,-0.024096399055767857,-0.007246818022490211,0.00827328153218933,-0.013318231729938878,-0.05656920525128712,-0.014872907588062702,0.005180593461493127,-0.0038881817961142046,0.018293533565858618,0.01052514823707217,-0.039329419243583096,0.0012669852704469047,0.04565410249125588,-0.011215062827843684,0.03437713759693678,-0.0028607694469831276,-0.07401713947937595,0.011205468531631435,0.1273984346221925,0.0001462745737649403] Intercept: 0.20325105182683417


In [25]:
test_results = lrModel.evaluate(test_data)

In [26]:
test_results.residuals.show()

+--------------------+
|           residuals|
+--------------------+
|-0.31072329264341086|
|-0.20544283227905868|
|-0.10201536500274666|
| -0.2909994893283203|
|-0.23848993038826682|
| -0.6202318984709281|
|-0.09525329942129801|
| -0.1040756559156586|
| 0.28149391650395605|
|-0.03437429821413196|
|-0.02086664233267649|
|-0.39557373262028794|
|-0.08715894396408691|
|  0.5762509076535609|
|-0.10319054560227726|
| 0.03423091880725937|
|-0.26999841076758024|
|   0.142016792598461|
| 0.15589739787536327|
| 0.22210047223859042|
+--------------------+
only showing top 20 rows



In [30]:
# Let's get some evaluation metrics (as discussed in the previous linear regression notebook).
print("RSME: {}".format(test_results.rootMeanSquaredError))
print("R2: {}".format(test_results.r2))

RSME: 0.28786647184360503
R2: 0.6657204703552018


Looking at RMSE and R2, we can see that the model is quite accurate. The RMSE shows that, on average, there's only a very tiny discrepancy between the actual and predicted results. Comparing this to the table below, the average amount spent (\\$499) and standard deviation (\\$79), a \\$10 error is surprisingly good.

The R2 also shows that the model accounts for 100% of the variance in the data.