## Fraud Consumer metric


In [None]:
import pandas as pd
import numpy as np
import os
import re

from pyspark.sql import SparkSession
from pyspark.shell import spark
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
import matplotlib.pyplot as plt

# Build (or reuse) the Spark session used by every cell below.
# NOTE(review): `spark` is also imported from `pyspark.shell` above, and
# getOrCreate() returns an already-active session if there is one -- confirm
# that these options (driver memory in particular) are really applied.
_session_builder = SparkSession.builder.appName("MAST30034 Project 2")
_session_options = [
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.parquet.cacheMetadata", "true"),
    ("spark.sql.session.timeZone", "Etc/UTC"),
    ("spark.driver.memory", "4g"),
]
for _option_key, _option_value in _session_options:
    _session_builder = _session_builder.config(_option_key, _option_value)

spark = _session_builder.getOrCreate()
# Only surface errors in the notebook output.
spark.sparkContext.setLogLevel("ERROR")

# Regression

In [None]:
# Consumer fraud probabilities, read through a parquet copy of the raw CSV.
# That parquet copy is otherwise only written by a cell near the end of this
# notebook, so on a fresh kernel (Restart & Run All) it may not exist yet:
# create it here from the CSV when it is missing.
consumer_fraud_path = "../data/temp/consumer_fraud"
if not os.path.exists(consumer_fraud_path):
    os.makedirs(os.path.dirname(consumer_fraud_path), exist_ok=True)
    pd.read_csv("../data/tables/consumer_fraud_probability.csv").to_parquet(consumer_fraud_path)
consumer_fraud_sdf = spark.read.parquet(consumer_fraud_path)

# The three transaction snapshots are kept individually (they are analysed one
# at a time below) and also stacked into a single frame.
transaction_sdf1 = spark.read.parquet("../data/tables/transactions_20210228_20210827_snapshot")
transaction_sdf2 = spark.read.parquet("../data/tables/transactions_20210828_20220227_snapshot")
transaction_sdf3 = spark.read.parquet("../data/tables/transactions_20220228_20220828_snapshot")
transaction = transaction_sdf1.union(transaction_sdf2).union(transaction_sdf3)

In [None]:
# Show the column names and types of the first transaction snapshot.
transaction_sdf1.printSchema()

### plot on transaction 1

In [None]:
from pyspark.sql import functions as F

# Per user and per day in snapshot 1: total spend and number of distinct orders.
user_day_keys_1 = ["user_id", "order_datetime"]
consumer_agg_daily_sdf1 = (
    transaction_sdf1
    .groupBy(user_day_keys_1)
    .agg(
        F.sum("dollar_value").alias('total_amount'),
        F.countDistinct("order_id").alias("total_order"),
    )
)

# Keep only the (user, day) pairs that have a fraud probability and bring the
# result to pandas for plotting.
fraud_labelled_sdf1 = consumer_agg_daily_sdf1.join(consumer_fraud_sdf, on=user_day_keys_1)
consumer_fraud_join_df1 = fraud_labelled_sdf1.toPandas()

In [None]:
import matplotlib.pyplot as plt
# Snapshot 1: daily spend against the reported fraud probability.
amount_axes = plt.gca()
amount_axes.scatter(
    x=consumer_fraud_join_df1["total_amount"],
    y=consumer_fraud_join_df1["fraud_probability"],
)
amount_axes.set_xlabel('total_amount')
amount_axes.set_ylabel('fraud_probability')
amount_axes.set_title('total_amount vs fraud_probability')


In [None]:
# Snapshot 1: daily order count against the reported fraud probability.
order_axes = plt.gca()
order_axes.scatter(
    x=consumer_fraud_join_df1["total_order"],
    y=consumer_fraud_join_df1["fraud_probability"],
)
order_axes.set_xlabel('total_order')
order_axes.set_ylabel('fraud_probability')
order_axes.set_title('total_order vs fraud_probability')

### plot on transaction 2

In [None]:
# Per user and per day in snapshot 2: total spend and number of distinct orders.
user_day_keys_2 = ["user_id", "order_datetime"]
consumer_agg_daily_sdf2 = (
    transaction_sdf2
    .groupBy(user_day_keys_2)
    .agg(
        F.sum("dollar_value").alias('total_amount'),
        F.countDistinct("order_id").alias("total_order"),
    )
)

# Keep only the (user, day) pairs that have a fraud probability and bring the
# result to pandas for plotting.
fraud_labelled_sdf2 = consumer_agg_daily_sdf2.join(consumer_fraud_sdf, on=user_day_keys_2)
consumer_fraud_join_df2 = fraud_labelled_sdf2.toPandas()

In [None]:
# Snapshot 2: daily spend against the reported fraud probability.
amount_axes_2 = plt.gca()
amount_axes_2.scatter(
    x=consumer_fraud_join_df2["total_amount"],
    y=consumer_fraud_join_df2["fraud_probability"],
)
amount_axes_2.set_xlabel('total_amount')
amount_axes_2.set_ylabel('fraud_probability')
amount_axes_2.set_title('total_amount vs fraud_probability')

In [None]:
# Snapshot 2: daily order count against the reported fraud probability.
order_axes_2 = plt.gca()
order_axes_2.scatter(
    x=consumer_fraud_join_df2["total_order"],
    y=consumer_fraud_join_df2["fraud_probability"],
)
order_axes_2.set_xlabel('total_order')
order_axes_2.set_ylabel('fraud_probability')
order_axes_2.set_title('total_order vs fraud_probability')

## check distribution of normal transaction

In [None]:
# Collect every (user, day) aggregate of snapshot 1 into a pandas frame so it
# can be plotted with matplotlib in the cells below.
consumer_agg_daily_df1 = consumer_agg_daily_sdf1.toPandas()

In [None]:
# Daily spend of all snapshot-1 user-days (drawn on the y = 0 line) against the
# fraud-labelled user-days (drawn at their fraud probability).
# NOTE(review): the fraud series comes from snapshot 2 while the baseline is
# snapshot 1 -- confirm this mix is intended.
fig, ax1 = plt.subplots()

all_amounts = consumer_agg_daily_df1['total_amount']
baseline_y = [0] * all_amounts.count()

ax1.scatter(x=all_amounts, y=baseline_y, label="transaction 1")
ax1.scatter(
    x=consumer_fraud_join_df2["total_amount"],
    y=consumer_fraud_join_df2["fraud_probability"],
    label="fraud transaction",
)
ax1.legend(loc='upper left')
plt.show()

# plt.scatter(x=consumer_agg_daily_df1['total_amount'], y=[np.mean(consumer_agg_daily_df1['total_amount'])]*consumer_agg_daily_df1['total_amount'].count())

In [None]:
# Overlay the distribution of daily spend for all snapshot-1 user-days with the
# distribution for fraud-labelled user-days.
# NOTE(review): the fraud series comes from snapshot 2 while the baseline is
# snapshot 1 -- confirm this mix is intended.
fig, ax1 = plt.subplots()

for amounts, series_label in (
    (consumer_agg_daily_df1['total_amount'], "transaction 1"),
    (consumer_fraud_join_df2["total_amount"], "fraud transaction"),
):
    ax1.hist(x=amounts, label=series_label)

ax1.legend(loc='upper left')
plt.show()

# Regression on fraud probability

In [None]:
from statsmodels.formula.api import ols, glm
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Stack the fraud-labelled daily aggregates of snapshots 1 and 2 into a single
# training frame. DataFrame.append was deprecated in pandas 1.4 and removed in
# pandas 2.0; pd.concat is the supported equivalent.
consumer_fraud_join_df_1_2 = pd.concat(
    [consumer_fraud_join_df1, consumer_fraud_join_df2],
    ignore_index=True,
)

# In a patsy formula `a/b` is not arithmetic division: it expands to
# `a + a:b`, i.e. the *product* interaction. Wrapping the expression in I(...)
# makes patsy evaluate it numerically, giving the average dollar value per
# order -- the same feature the PySpark model further down is trained on.
fit = ols(
    formula="fraud_probability ~ total_amount + total_order + I(total_amount / total_order)",
    data=consumer_fraud_join_df_1_2
).fit()

print(fit.summary())

`total_order` is no longer significant, but the interaction term is.
The interaction term explains why the scatter plot has a log-like shape:
for the same total amount, a lower average dollar value per order means a lower fraud probability.

### Formula: fraud_probability ~ total_amount + total_amount/total_order

## Train the model with PySpark
For scalability, we use the PySpark ML module instead of statsmodels.

In [None]:
# Move the pandas training frame back into Spark and add the engineered
# feature: average dollar value per order (total spend / number of orders).
train_base_sdf = spark.createDataFrame(consumer_fraud_join_df_1_2)
avg_order_value = train_base_sdf["total_amount"] / train_base_sdf["total_order"]
consumer_agg_daily_train = train_base_sdf.withColumn(
    "avg_dollar_value_per_order", avg_order_value
)

In [None]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import Interaction
# consumer_regression = consumer_agg_daily_all[[]].apply(lambda row:join_prob_p_val(row[0],row[1],row[2],row[3],row[4],row[5]),axis=1)
# Name of the assembled vector column and the numeric columns packed into it.
features = 'features'
input_cols = ['total_amount','avg_dollar_value_per_order']

# assembler = Interaction()
# assembler.setInputCols(["total_amount", "total_order"])
# assembler.setOutputCol("interaction")

# Spark ML estimators expect all predictors in a single vector column.
assembler = VectorAssembler(inputCols=input_cols, outputCol=features)

# Rows containing any null are dropped before assembling.
complete_rows_sdf = consumer_agg_daily_train.na.drop('any')
model_sdf = assembler.transform(complete_rows_sdf)

# Peek at the first few feature vectors and targets.
preview_features = model_sdf.select('features').head(5)
preview_targets = model_sdf.select('fraud_probability').head(5)
(preview_features, preview_targets)



In [None]:
# Linear regression of fraud_probability on the assembled feature vector.
fraud_regressor = LinearRegression(
    featuresCol='features',
    labelCol='fraud_probability',
    maxIter=1000,
)
lm = fraud_regressor.fit(model_sdf)


In [None]:
# Fitted slopes (in input_cols order) followed by the intercept.
# TODO: the coefficients differ from the statsmodels fit -- investigate. Note
# that this model has no total_order term, so the two fits do not share the
# same set of regressors.
print(lm.coefficients, lm.intercept, sep="\n")

In [None]:
# save model

# Persist the fitted model. A plain `lm.save(path)` raises when the path
# already exists, which breaks every re-run of this cell; going through the
# writer with overwrite() keeps the cell idempotent.
model_path = "../models" + "/lr_model"
lm.write().overwrite().save(model_path)


## apply the pyspark model to all data

In [None]:
# Same per-user, per-day aggregation as for training, over all three snapshots.
user_day_keys_all = ["user_id", "order_datetime"]
daily_totals_sdf = transaction.groupBy(user_day_keys_all).agg(
    F.sum("dollar_value").alias('total_amount'),
    F.countDistinct("order_id").alias("total_order"),
)

# Engineered feature: average dollar value per order.
consumer_agg_daily_all = daily_totals_sdf.withColumn(
    "avg_dollar_value_per_order",
    daily_totals_sdf["total_amount"] / daily_totals_sdf["total_order"],
)
# consumer_agg_daily_all.count()  # about 9 million rows per the original note
consumer_agg_daily_all.head(5)

In [None]:
from pyspark.ml.regression import LinearRegressionModel
# Reload the persisted regression model.
model_path = "../models" + "/lr_model"
lm = LinearRegressionModel.load(model_path)

# Rebuild the same feature vector that was used for training.
features = 'features'
input_cols = ['total_amount','avg_dollar_value_per_order']
# assembler = Interaction()
# assembler.setInputCols(["total_amount", "total_order"])
# assembler.setOutputCol("interaction")
assembler = VectorAssembler(inputCols=input_cols, outputCol=features)

# Only the feature vector is kept here, so result_sdf carries no
# user_id / order_datetime columns alongside the prediction.
assembled_all_sdf = assembler.transform(consumer_agg_daily_all)
predict_sdf = assembled_all_sdf.select(features)
result_sdf = lm.transform(predict_sdf)
result_sdf.head()

In [None]:
# Score the keyed frame directly so every prediction stays on the
# (user_id, order_datetime) row it was computed from.
# Joining consumer_agg_daily_all to a prediction-only frame without a join key
# is evaluated by Spark as a cartesian product: every row is paired with every
# prediction, which yields wrong fraud_prob values and a result far too large
# to write or collect.
scored_all_sdf = lm.transform(assembler.transform(consumer_agg_daily_all))
user_datetime_predict_sdf = (
    scored_all_sdf
    .drop(assembler.getOutputCol())  # the feature vector is not needed downstream
    .withColumnRenamed("prediction", "fraud_prob")
)

In [None]:
# Check that the per-user, per-day frame now carries the fraud_prob column.
user_datetime_predict_sdf.printSchema()

In [None]:
# cant save it
#user_datetime_predict_sdf.write.mode('overwrite').parquet('../data/curated/user_datetime_predict_sdf.parquet')

## 

In [None]:
#user_datetime_predict_sdf = spark.read.parquet("../data/curated/user_datetime_predict_sdf.parquet")

## calculate the discounted revenue

In [None]:
# Column names and types of the combined transaction frame.
transaction.printSchema()

In [None]:
from pyspark.sql.functions import udf, array
from pyspark.sql.types import FloatType
# Cut-off used by calculate_discounted_revenue: predicted fraud scores below
# the regression intercept are not discounted.
# NOTE(review): confirm that the intercept is the intended cut-off.
THRESHHOLD = lm.intercept

def calculate_discounted_revenue(rate, threshold=None):
    """Return the share of a transaction's value to keep given its fraud score.

    Args:
        rate: Predicted fraud score for a (user, day) pair, or None.
        threshold: Scores strictly below this value are not discounted.
            Defaults to the module-level ``THRESHHOLD``.

    Returns:
        ``1.0`` when ``rate`` is below the threshold, otherwise ``1 - rate``
        as a float. ``None`` when ``rate`` is None, so a null score stays null
        instead of raising inside the UDF.

    The result is always a Python float: a PySpark UDF declared with
    ``FloatType()`` yields null when the function returns an int, so the
    "no discount" branch must return ``1.0`` rather than ``1``.

    NOTE(review): ``1 - rate`` assumes the score is on a 0-1 scale -- confirm
    against the fraud_probability data.
    """
    if rate is None:
        return None
    if threshold is None:
        threshold = THRESHHOLD
    if rate < threshold:
        return 1.0
    return float(1 - rate)
       
# sdf = sdf.withColumn(
#     'transformed_col',
#     some_udf(F.col('raw_col'))
# )
# Spark UDF wrapper around calculate_discounted_revenue (float result).
convert_rate = udf(calculate_discounted_revenue, returnType=FloatType())

In [26]:
# run slice by slice
# Snapshot 1 only ("run slice by slice"): orders and spend per merchant, day and user.
merchant_user_day_keys = ["merchant_abn","order_datetime","user_id"]
merchant_user_agg_sdf = transaction_sdf1.groupBy(merchant_user_day_keys).agg(
    F.countDistinct("order_id").alias("no_order"),
    F.sum("dollar_value").alias("dollar_amount"),
)

# Attach each user's predicted fraud score for that day, turn it into a keep
# rate, and scale the spend by that rate.
spend_with_score_sdf = merchant_user_agg_sdf.join(
    user_datetime_predict_sdf, on=["user_id","order_datetime"]
)
spend_with_rate_sdf = spend_with_score_sdf.withColumn(
    "convert_rate", convert_rate(F.col("fraud_prob"))
)
user_discounted_spending_sdf = spend_with_rate_sdf.withColumn(
    "discounted_spending", F.col("convert_rate") * F.col("dollar_amount")
)


In [27]:
# Sum the fraud-discounted spend per merchant and day.
discounted_revenue_total = F.sum("discounted_spending").alias("discounted_daily_revenue")
merchant_discounted_daily_revenue = (
    user_discounted_spending_sdf
    .groupBy("merchant_abn", "order_datetime")
    .agg(discounted_revenue_total)
)

In [None]:
# Schema only; this does not run the aggregation itself.
merchant_discounted_daily_revenue.printSchema()

In [28]:
# head() is a Spark action: it forces evaluation of the whole lazy pipeline
# built above (aggregations, joins and the Python UDF).
merchant_discounted_daily_revenue.head() # original note: this could not be run to completion

[Stage 31:> (0 + 0) / 200][Stage 32:(0 + 10) / 40000][Stage 35:>  (0 + 0) / 30]

### Remove fraud data from the transaction set (discarded)
Since the fraud data is only a small subset of the data, consider removing all transaction entries that have a fraud probability.


In [None]:
# convert merchant fraud, consumer fraud to spark df
# Read both raw fraud-probability CSV files with pandas.
merchant_fraud_df = pd.read_csv("../data/tables/merchant_fraud_probability.csv")
consumer_fraud_df = pd.read_csv("../data/tables/consumer_fraud_probability.csv")

# Write each one out as parquet, then load that parquet as a Spark DataFrame.
for fraud_frame, parquet_target in (
    (merchant_fraud_df, "../data/temp/merchant_fraud"),
    (consumer_fraud_df, "../data/temp/consumer_fraud"),
):
    fraud_frame.to_parquet(parquet_target)

merchant_fraud_sdf = spark.read.parquet("../data/temp/merchant_fraud")
consumer_fraud_sdf = spark.read.parquet("../data/temp/consumer_fraud")

In [None]:
# save all transaction data to a spark dataframe: transaction
# Load the three transaction snapshots and stack them into `transaction`.
transaction_sdf1, transaction_sdf2, transaction_sdf3 = (
    spark.read.parquet(snapshot_path)
    for snapshot_path in (
        "../data/tables/transactions_20210228_20210827_snapshot",
        "../data/tables/transactions_20210828_20220227_snapshot",
        "../data/tables/transactions_20220228_20220828_snapshot",
    )
)
transaction = transaction_sdf1.union(transaction_sdf2).union(transaction_sdf3)

In [None]:
# find all transaction on merchant fraud day
# Transactions whose (merchant, day) pair appears in the merchant fraud table.
merchant_fraud_sdf = spark.read.parquet("../data/temp/merchant_fraud")
merchant_day_keys = ["merchant_abn","order_datetime"]
merchant_transaction_on_fraud_day = transaction.join(
    merchant_fraud_sdf.select(merchant_day_keys),
    on=merchant_day_keys,
)

In [None]:
# Column names and types of the combined transaction frame, for comparison
# with the joined frame printed in the next cell.
transaction.printSchema()

In [None]:
# Schema of the transactions that fall on a merchant fraud day.
merchant_transaction_on_fraud_day.printSchema()

In [None]:
# find all transaction on consumer fraud day
# Transactions whose (user, day) pair appears in the consumer fraud table.
consumer_fraud_sdf = spark.read.parquet("../data/temp/consumer_fraud")
consumer_day_keys = ["user_id","order_datetime"]
consumer_transaction_on_fraud_day = transaction.join(
    consumer_fraud_sdf.select(consumer_day_keys),
    on=consumer_day_keys,
)

In [None]:
# filter fraud transactions

# subtract() compares columns by position, so both fraud-day frames are first
# projected onto this fixed column order.
# NOTE(review): assumes `transaction` has its columns in this same order -- confirm.
order = ["user_id","merchant_abn","dollar_value","order_id","order_datetime"]
merchant_fraud_rows = merchant_transaction_on_fraud_day.select(order)
consumer_fraud_rows = consumer_transaction_on_fraud_day.select(order)

# Remove every transaction that falls on a merchant or consumer fraud day.
transaction_fraud_rm = transaction.subtract(merchant_fraud_rows).subtract(consumer_fraud_rows)

transaction_fraud_rm can be used for further analysis

### Derive fraud rate and other features

#### Definition:

Merchant Fraud rate: weighted fraud orders / total orders
Fraud 
