# Imputation

This notebook imputes the missing values of the consumer and merchant fraud probabilities, using self-training gradient boosting.

---

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import*
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml.regression import GBTRegressor
import pandas as pd
from pyspark.sql.functions import col

In [2]:
# Spark settings for this notebook, applied to the builder in the order listed.
# The two LD_LIBRARY_PATH entries point at a Homebrew OpenBLAS directory, which
# is machine-specific.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.sql.debug.maxToStringFields": 3000,
    "spark.network.timeout": "300s",
    "spark.driver.maxResultSize": "8g",
    "spark.rpc.askTimeout": "300s",
    "spark.driver.memory": "8G",
    "spark.executor.memory": "8G",
    "spark.executorEnv.LD_LIBRARY_PATH": "/opt/homebrew/opt/openblas/lib",
    "spark.driverEnv.LD_LIBRARY_PATH": "/opt/homebrew/opt/openblas/lib",
}

session_builder = SparkSession.builder.appName("Imputation")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# reuse an existing session if one is already running, otherwise start one
spark = session_builder.getOrCreate()

24/09/20 02:23:08 WARN Utils: Your hostname, Cocos-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 172.16.33.67 instead (on interface en0)
24/09/20 02:23:08 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/20 02:23:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/09/20 02:23:10 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# Read datasets

In [3]:
# curated tables are stored as parquet
consumer_full_expanded = spark.read.parquet('../data/curated/consumer_full_expanded')
merchant_full_expanded = spark.read.parquet('../data/curated/merchant_full_expanded')

# fraud probability tables are CSV files with a header row; column types are inferred
consumer_fraud = spark.read.csv(
    '../data/tables/tables 1/consumer_fraud_probability.csv',
    header=True,
    inferSchema=True,
)
merchant_fraud = spark.read.csv(
    '../data/tables/tables 1/merchant_fraud_probability.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# preview the first five rows of the curated consumer table
consumer_full_expanded.show(5)

+--------+-------+--------------+-----------+------------------+-----+------------+------------------+--------------------+-----------------+------------------+---------------+--------------------------+------------------------+---------------+----------------------+------------------------+----------------------+------------------+---------------------+--------+---------+--------+---------+--------+---------+---------+---------+------------------+-------------+-----------+---------------+-------------------+
|postcode|user_id|order_datetime|consumer_id|              name|state|merchant_abn|      dollar_value|            order_id|fraud_probability|average_population|avg_age_persons|avg_mortgage_repay_monthly|avg_tot_prsnl_inc_weekly|avg_rent_weekly|avg_tot_fam_inc_weekly|avg_num_psns_per_bedroom|avg_tot_hhd_inc_weekly|avg_household_size|avg_unemployment_rate|state_NT|state_ACT|state_SA|state_TAS|state_WA|state_QLD|state_VIC|state_NSW|gender_Undisclosed|gender_Female|gender_Male|order_t

                                                                                

In [5]:
# preview the first five rows of the curated merchant table
merchant_full_expanded.show(5)

+--------------------+------------+--------------+------------------+---------+-----------------+-------------------+------------------+---------------+---------------+---------------+---------------+---------------+----------------+------------+-------------------+--------------+--------------+---------------+--------------+---------------+---------------+-----------------+---------------+----------------+----------------+-------------------+------------+----------------+------------------+-------------+-------------+---------------+---------------+-------------+--------------+--------------------+--------------+-------------------+-----------------+--------------+-----------------+--------------+-------------------+--------------+---------------+---------------+-------------+--------------+-------------------+--------------+--------------+----------------+-------------+-------------+---------------+---------------+----------------+----------------+---------------+------------------+-

In [6]:
# preview the raw merchant fraud table (merchant_abn, order_datetime, fraud_probability)
merchant_fraud.show(5)

+------------+--------------+------------------+
|merchant_abn|order_datetime| fraud_probability|
+------------+--------------+------------------+
| 19492220327|    2021-11-28|44.403658647495355|
| 31334588839|    2021-10-02| 42.75530083865367|
| 19492220327|    2021-12-22|38.867790051131095|
| 82999039227|    2021-12-19|  94.1347004808891|
| 90918180829|    2021-09-02| 43.32551731714902|
+------------+--------------+------------------+
only showing top 5 rows



# Consumer fraud probability imputation

### Impute NULLs for consumers that already have a fraud probability

We average the existing fraud probabilities of each consumer, then fill the missing values of consumers who already have a fraud probability history with that consumer's average.

In [4]:
# one row per user_id holding the mean of that user's recorded fraud probabilities
consumer_fraud_avg_df = (
    consumer_fraud
    .groupBy('user_id')
    .agg(avg('fraud_probability').alias('avg_fraud_probability'))
)
consumer_fraud_avg_df.show(5)


+-------+---------------------+
|user_id|avg_fraud_probability|
+-------+---------------------+
|  15790|    43.91710485358618|
|   5803|   30.845402987676184|
|  16386|    61.39390975781492|
|  12027|   28.783838366838232|
|   9427|    44.94234071844568|
+-------+---------------------+
only showing top 5 rows



In [5]:
# number of distinct user_id values that appear in consumer_fraud
consumer_fraud_avg_df.count()

20128

In [6]:
# total number of rows in consumer_full_expanded (baseline for the join below)
consumer_full_expanded.count()

11372745

Perform a left join of the `consumer_full_expanded` DataFrame with the `consumer_fraud_avg_df` DataFrame:

In [7]:
# 1. left join on `user_id`, so every row of consumer_full_expanded is kept
# 2. `coalesce` returns the first non-null value: the existing `fraud_probability`
#    if present, otherwise the consumer's `avg_fraud_probability`
# 3. drop the helper `avg_fraud_probability` column once it has been used
imputed_consumer = (
    consumer_full_expanded
    .join(consumer_fraud_avg_df, on='user_id', how='left')
    .withColumn(
        'fraud_probability',
        coalesce(col('fraud_probability'), col('avg_fraud_probability')),
    )
    .drop('avg_fraud_probability')
)

In [8]:
# row count after the left join; it should match consumer_full_expanded.count(),
# otherwise the join has duplicated rows
imputed_consumer.count()

11372745

Check nulls:

In [9]:
# build one aggregate expression per column that counts its nulls
# (`sum` is the Spark aggregate from the wildcard pyspark.sql.functions import,
# not Python's built-in)
null_count_dict = {}
for column_name in imputed_consumer.columns:
    null_flag = col(column_name).isNull().cast("int")
    null_count_dict[column_name] = sum(null_flag).alias(column_name)

# evaluate all null counts in a single aggregation
null_counts_df = imputed_consumer.agg(*null_count_dict.values())
null_counts_df.show()



+-------+--------+--------------+-----------+----+-----+------------+------------+--------+-----------------+------------------+---------------+--------------------------+------------------------+---------------+----------------------+------------------------+----------------------+------------------+---------------------+--------+---------+--------+---------+--------+---------+---------+---------+------------------+-------------+-----------+---------------+---------------+
|user_id|postcode|order_datetime|consumer_id|name|state|merchant_abn|dollar_value|order_id|fraud_probability|average_population|avg_age_persons|avg_mortgage_repay_monthly|avg_tot_prsnl_inc_weekly|avg_rent_weekly|avg_tot_fam_inc_weekly|avg_num_psns_per_bedroom|avg_tot_hhd_inc_weekly|avg_household_size|avg_unemployment_rate|state_NT|state_ACT|state_SA|state_TAS|state_WA|state_QLD|state_VIC|state_NSW|gender_Undisclosed|gender_Female|gender_Male|order_timestamp|hashed_postcode|
+-------+--------+--------------+---------

                                                                                

This means there are 1,880,923 consumer fraud probabilities that still need to be imputed by a machine learning model.

### Impute NULLs for the consumers without fraud probability

Impute NULLs for the consumers without fraud probability using Self-training Gradient Boosting.

Select the labeled and unlabeled data:

In [10]:
# rows whose fraud_probability is still NULL: these are the ones to be imputed
consumer_unlabeled_data = imputed_consumer.where(col("fraud_probability").isNull())

# rows with a known fraud_probability: used to train the model
consumer_labeled_data = imputed_consumer.where(col("fraud_probability").isNotNull())

Compute the correlation of selected features with `fraud_probability`:

In [11]:
# candidate features: numeric columns plus the one-hot gender_* and state_* columns
feature_columns = [
    'postcode', 'order_timestamp', 'dollar_value', 'average_population',
    'avg_age_persons', 'avg_mortgage_repay_monthly', 'avg_tot_prsnl_inc_weekly',
    'avg_rent_weekly', 'avg_tot_fam_inc_weekly', 'avg_num_psns_per_bedroom',
    'avg_tot_hhd_inc_weekly', 'avg_household_size', 'avg_unemployment_rate'
] + [name for name in imputed_consumer.columns if name.startswith('gender_')] + [
    name for name in imputed_consumer.columns if name.startswith('state_')]

# calculate the correlation between each feature and fraud_probability.
# All correlations are requested in ONE aggregation so the table is scanned
# once, instead of launching a separate Spark job per feature.
correlation_row = imputed_consumer.agg(
    *[corr(col(feature), col("fraud_probability")).alias(feature)
      for feature in feature_columns]
).first()

# the aggregated row holds the values in the same order as feature_columns
correlations = list(zip(feature_columns, correlation_row))

# print correlation table
correlation_df = pd.DataFrame(correlations, columns=['Feature',
                                                     'Correlation']).set_index('Feature')
print(correlation_df)

                            Correlation
Feature                                
postcode                      -0.003491
order_timestamp                0.000373
dollar_value                   0.016485
average_population            -0.005810
avg_age_persons               -0.000764
avg_mortgage_repay_monthly    -0.003567
avg_tot_prsnl_inc_weekly      -0.011556
avg_rent_weekly               -0.003091
avg_tot_fam_inc_weekly        -0.007922
avg_num_psns_per_bedroom       0.005687
avg_tot_hhd_inc_weekly        -0.004246
avg_household_size             0.004015
avg_unemployment_rate          0.009363
gender_Undisclosed             0.002739
gender_Female                 -0.011553
gender_Male                    0.009862
state_NT                       0.001789
state_ACT                     -0.009079
state_SA                      -0.001510
state_TAS                      0.004775
state_WA                      -0.010130
state_QLD                      0.002624
state_VIC                      0.004411


Train the initial model to do feature selection based on feature importances:

In [12]:
# features for the initial model: numeric columns plus the one-hot
# gender_* and state_* columns (postcode is not included here)
feature_columns = (
    [
        'order_timestamp', 'dollar_value', 'average_population',
        'avg_age_persons', 'avg_mortgage_repay_monthly', 'avg_tot_prsnl_inc_weekly',
        'avg_rent_weekly', 'avg_tot_fam_inc_weekly', 'avg_num_psns_per_bedroom',
        'avg_tot_hhd_inc_weekly', 'avg_household_size', 'avg_unemployment_rate',
    ]
    + [name for name in imputed_consumer.columns if name.startswith('gender_')]
    + [name for name in imputed_consumer.columns if name.startswith('state_')]
)

# assemble the feature vector on a 10% sample (fixed seed) of each dataset
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
labeled_data = assembler.transform(
    consumer_labeled_data.sample(fraction=0.1, seed=28)
)
unlabeled_data = assembler.transform(
    consumer_unlabeled_data.sample(fraction=0.1, seed=28)
)

# standardise features; the scaler is fitted on the labeled sample only and
# then applied to both samples
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
    withMean=True,
    withStd=True,
)
scaler_model = scaler.fit(labeled_data)
labeled_data = scaler_model.transform(labeled_data)
unlabeled_data = scaler_model.transform(unlabeled_data)

# train the initial gradient-boosted tree regressor
gbt = GBTRegressor(
    featuresCol='scaled_features',
    labelCol='fraud_probability',
    maxIter=100,
)
model = gbt.fit(labeled_data)

# map each feature name to its importance (same order as the assembler inputs)
feature_importances = model.featureImportances
feature_importance_dict = dict(zip(feature_columns, feature_importances))

# report the importances from most to least important
sorted_feature_importances = sorted(
    feature_importance_dict.items(),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)
print("Feature importance:")
for feature, importance in sorted_feature_importances:
    print(f"{feature}: {importance}")

                                                                                

Feature importance:
average_population: 0.13608788803926591
avg_age_persons: 0.09967358332007537
avg_unemployment_rate: 0.08050185291571943
avg_mortgage_repay_monthly: 0.0803406703336747
avg_rent_weekly: 0.07844855656281224
avg_tot_hhd_inc_weekly: 0.07219214830497706
avg_tot_prsnl_inc_weekly: 0.06830037934007921
avg_tot_fam_inc_weekly: 0.05991186033483644
avg_household_size: 0.05976023978916807
gender_Female: 0.05690937060915915
gender_Undisclosed: 0.05509439583313023
gender_Male: 0.05487266996259641
avg_num_psns_per_bedroom: 0.04536731130800019
state_TAS: 0.010374273000643532
state_VIC: 0.009201425972263106
state_QLD: 0.008071944420716895
state_SA: 0.007892753840143254
state_WA: 0.005690241801330983
state_NSW: 0.0042889056075008595
order_timestamp: 0.002668222894765487
state_NT: 0.0018247151912945765
dollar_value: 0.0018124929694942422
state_ACT: 0.0007140976483525273


Select the features whose feature importance is greater than 0.005, then re-train the model:

In [13]:
# select features with importance greater than the cut-off
importance_cutoff = 0.005
selected_features = [
    feature
    for feature, importance in feature_importance_dict.items()
    if importance > importance_cutoff
]

# create feature vectors with a unique name
assembler = VectorAssembler(inputCols=selected_features, outputCol='selected_features')

# update datasets with the selected features (full data, not the 10% sample)
labeled_data = assembler.transform(consumer_labeled_data)
unlabeled_data = assembler.transform(consumer_unlabeled_data)

# standardize features
scaler = StandardScaler(inputCol="selected_features",
                        outputCol="selected_scaled_features", withMean=True, withStd=True)

# fit on the labeled data only, then transform both labeled and unlabeled datasets
scaler_model = scaler.fit(labeled_data)
labeled_data = scaler_model.transform(labeled_data)
unlabeled_data = scaler_model.transform(unlabeled_data)

# initialize the GBT model
gbt = GBTRegressor(featuresCol='selected_scaled_features',
                   labelCol='fraud_probability', maxIter=100)

# train the initial model
model = gbt.fit(labeled_data)

# self-training process
max_iterations = 5
# NOTE(review): `prediction` is the regressor's predicted fraud_probability, not
# a confidence score. The fraud probabilities shown earlier in this notebook are
# on a 0-100 scale, so `prediction > 0.9` probably selects almost every row in
# the first pass. Confirm the intended selection criterion.
confidence_threshold = 0.9
is_confident = col("prediction") > confidence_threshold

for iteration in range(max_iterations):
    # make predictions on unlabeled data
    predictions = model.transform(unlabeled_data)

    # filter out high-confidence prediction samples to be added to labeled data
    confident_predictions = predictions.filter(is_confident)

    # stop iteration if no more high-confidence samples are available
    if confident_predictions.count() == 0:
        break

    # add high-confidence samples to labeled data, using the prediction as label
    confident_predictions = confident_predictions.withColumn("fraud_probability",
                                                             col("prediction"))
    labeled_data = labeled_data.union(confident_predictions.select(labeled_data.columns))

    # remove predicted samples from unlabeled dataset.
    # This keeps exactly the rows that were NOT selected above. The previous
    # anti-join on `merchant_abn` also discarded every other unlabeled row that
    # shared a merchant with a selected row; those rows then ended up in neither
    # the labeled nor the unlabeled set.
    remaining_unlabeled_data = predictions.filter(
        col("prediction").isNull() | ~is_confident
    ).drop("prediction")

    # retrain the model
    model = gbt.fit(labeled_data)

    # update unlabeled data
    unlabeled_data = remaining_unlabeled_data

# final model predicts the remaining unlabeled data
final_predictions = model.transform(unlabeled_data)

24/09/19 16:16:36 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

In [14]:
# use the final model's prediction as the fraud_probability of the remaining
# unlabeled rows, align their columns with labeled_data, and append them
final_imputed_consumer = labeled_data.union(
    final_predictions
    .withColumn('fraud_probability', col('prediction'))
    .select(labeled_data.columns)
)
final_imputed_consumer.show(5)



+-------+--------+--------------+-----------+------------------+-----+------------+------------------+--------------------+------------------+------------------+---------------+--------------------------+------------------------+---------------+----------------------+------------------------+----------------------+------------------+---------------------+--------+---------+--------+---------+--------+---------+---------+---------+------------------+-------------+-----------+---------------+-------------------+--------------------+------------------------+
|user_id|postcode|order_datetime|consumer_id|              name|state|merchant_abn|      dollar_value|            order_id| fraud_probability|average_population|avg_age_persons|avg_mortgage_repay_monthly|avg_tot_prsnl_inc_weekly|avg_rent_weekly|avg_tot_fam_inc_weekly|avg_num_psns_per_bedroom|avg_tot_hhd_inc_weekly|avg_household_size|avg_unemployment_rate|state_NT|state_ACT|state_SA|state_TAS|state_WA|state_QLD|state_VIC|state_NSW|gend

                                                                                

In [15]:
# the assembled/scaled feature vectors were only needed for modelling
final_imputed_consumer = final_imputed_consumer.drop(
    'selected_features',
    'selected_scaled_features',
)

# save as a parquet file, replacing any previous output at this path
(
    final_imputed_consumer.write
    .mode('overwrite')
    .parquet('../data/curated/final_imputed_consumer')
)

                                                                                

Confirm there are no nulls:

In [16]:
# build one aggregate expression per column that counts its nulls
# (`sum` is the Spark aggregate from the wildcard pyspark.sql.functions import,
# not Python's built-in)
null_count_dict = {}
for column_name in final_imputed_consumer.columns:
    null_flag = col(column_name).isNull().cast("int")
    null_count_dict[column_name] = sum(null_flag).alias(column_name)

# evaluate all null counts in a single aggregation
null_counts_df = final_imputed_consumer.agg(*null_count_dict.values())
null_counts_df.show()



+-------+--------+--------------+-----------+----+-----+------------+------------+--------+-----------------+------------------+---------------+--------------------------+------------------------+---------------+----------------------+------------------------+----------------------+------------------+---------------------+--------+---------+--------+---------+--------+---------+---------+---------+------------------+-------------+-----------+---------------+---------------+
|user_id|postcode|order_datetime|consumer_id|name|state|merchant_abn|dollar_value|order_id|fraud_probability|average_population|avg_age_persons|avg_mortgage_repay_monthly|avg_tot_prsnl_inc_weekly|avg_rent_weekly|avg_tot_fam_inc_weekly|avg_num_psns_per_bedroom|avg_tot_hhd_inc_weekly|avg_household_size|avg_unemployment_rate|state_NT|state_ACT|state_SA|state_TAS|state_WA|state_QLD|state_VIC|state_NSW|gender_Undisclosed|gender_Female|gender_Male|order_timestamp|hashed_postcode|
+-------+--------+--------------+---------

                                                                                

In [17]:
# sanity check: should equal consumer_full_expanded.count() if no rows were
# lost or duplicated during imputation
final_imputed_consumer.count()

                                                                                

11372745

# Merchant fraud probability imputation

### Impute NULLs for merchants that already have a fraud probability

We average the existing fraud probabilities of each merchant, then fill the missing values of merchants who already have a fraud probability history with that merchant's average.

In [4]:
# one row per merchant_abn holding the mean of that merchant's recorded
# fraud probabilities
merchant_fraud_avg_df = (
    merchant_fraud
    .groupBy('merchant_abn')
    .agg(avg('fraud_probability').alias('avg_fraud_probability'))
)
merchant_fraud_avg_df.show(5)


+------------+---------------------+
|merchant_abn|avg_fraud_probability|
+------------+---------------------+
| 99989036621|    18.21089142894488|
| 90568944804|    30.72298492113958|
| 29674997261|    44.43787807900268|
| 27093785141|    28.88064813052203|
| 19492220327|   31.958306675667547|
+------------+---------------------+
only showing top 5 rows



In [5]:
# keep only the join key and the per-merchant average
merchant_fraud_avg_df = merchant_fraud_avg_df.select('merchant_abn', 'avg_fraud_probability')

# 1. left join merchant_full_expanded with the averages on `merchant_abn`
# 2. fill null `fraud_probability` values with `avg_fraud_probability`
#    (`coalesce` returns the first non-null value)
# 3. drop the temporary `avg_fraud_probability` column
imputed_merchant = (
    merchant_full_expanded
    .join(merchant_fraud_avg_df, on='merchant_abn', how='left')
    .withColumn(
        'fraud_probability',
        coalesce(col('fraud_probability'), col('avg_fraud_probability')),
    )
    .drop('avg_fraud_probability')
)
imputed_merchant.show(5)

+------------+--------------------+--------------+------------------+---------+-----------------+-------------------+------------------+---------------+---------------+---------------+---------------+---------------+----------------+------------+-------------------+--------------+--------------+---------------+--------------+---------------+---------------+-----------------+---------------+----------------+----------------+-------------------+------------+----------------+------------------+-------------+-------------+---------------+---------------+-------------+--------------+--------------------+--------------+-------------------+-----------------+--------------+-----------------+--------------+-------------------+--------------+---------------+---------------+-------------+--------------+-------------------+--------------+--------------+----------------+-------------+-------------+---------------+---------------+----------------+----------------+---------------+------------------+-

                                                                                

In [6]:
# row count after the left join with the per-merchant averages
imputed_merchant.count()

11372745

In [7]:
# build one aggregate expression per column that counts its nulls
# (`sum` is the Spark aggregate from the wildcard pyspark.sql.functions import,
# not Python's built-in)
null_count_dict = {}
for column_name in imputed_merchant.columns:
    null_flag = col(column_name).isNull().cast("int")
    null_count_dict[column_name] = sum(null_flag).alias(column_name)

# evaluate all null counts in a single aggregation
null_counts_df = imputed_merchant.agg(*null_count_dict.values())
null_counts_df.show()



+------------+--------+--------------+------------+---------+-----------------+-------------------+------------+---------------+---------------+---------------+---------------+---------------+----------------+------------+-------------------+--------------+--------------+---------------+--------------+---------------+---------------+-----------------+---------------+----------------+----------------+-------------------+------------+----------------+------------------+-------------+-------------+---------------+---------------+-------------+--------------+--------------------+--------------+-------------------+-----------------+--------------+-----------------+--------------+-------------------+--------------+---------------+---------------+-------------+--------------+-------------------+--------------+--------------+----------------+-------------+-------------+---------------+---------------+----------------+----------------+---------------+------------------+----------------+--------

                                                                                

This means there are 10,884,814 merchant fraud probabilities that still need to be imputed by a machine learning model.

### Impute NULLs for the merchants without fraud probability

Impute NULLs for the merchants without fraud probability using Self-training Gradient Boosting.

Select the labeled and unlabeled data:

In [8]:
# rows whose fraud_probability is still NULL: these are the ones to be imputed
merchant_unlabeled_data = imputed_merchant.where(col("fraud_probability").isNull())

# rows with a known fraud_probability: used to train the model
merchant_labeled_data = imputed_merchant.where(col("fraud_probability").isNotNull())

Train the initial model to do feature selection based on feature importances:

In [9]:
# features for the initial model: revenue, timestamp, the revenue_level_*
# indicator columns and every category_* column
feature_columns = (
    [
        'transaction_revenue', 'order_timestamp',
        'revenue_level_e', 'revenue_level_d', 'revenue_level_c',
        'revenue_level_b', 'revenue_level_a',
    ]
    + [name for name in imputed_merchant.columns if name.startswith('category_')]
)

# assemble the feature vector for both datasets
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
labeled_data = assembler.transform(merchant_labeled_data)
unlabeled_data = assembler.transform(merchant_unlabeled_data)

# standardise features; the scaler is fitted on the labeled data only and
# then applied to both datasets
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
    withMean=True,
    withStd=True,
)
scaler_model = scaler.fit(labeled_data)
labeled_data = scaler_model.transform(labeled_data)
unlabeled_data = scaler_model.transform(unlabeled_data)

# train the initial gradient-boosted tree regressor
gbt = GBTRegressor(
    featuresCol='scaled_features',
    labelCol='fraud_probability',
    maxIter=100,
)
model = gbt.fit(labeled_data)

# map each feature name to its importance (same order as the assembler inputs)
feature_importances = model.featureImportances
feature_importance_dict = dict(zip(feature_columns, feature_importances))

# report the importances from most to least important
sorted_feature_importances = sorted(
    feature_importance_dict.items(),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)
print("Feature importance:")
for feature, importance in sorted_feature_importances:
    print(f"{feature}: {importance}")

                                                                                

Feature importance:
revenue_level_c: 0.3196450588025708
revenue_level_b: 0.13121213251244154
category_beauty: 0.10602245269747404
revenue_level_a: 0.06354516794501573
category_furniture: 0.056126046388914295
category_supply: 0.04819241719980726
category_software: 0.03977970141294704
category_souvenir: 0.03933244773156999
category_service: 0.025136680391322965
revenue_level_e: 0.024533076405067522
category_optical: 0.02285272993284839
revenue_level_d: 0.021836405462819005
category_stock: 0.02010631938555155
category_equipment: 0.016039690297983106
category_jewelry: 0.012911985700468866
category_sale: 0.009681213265161732
category_newspaper: 0.008320891215423301
category_shoe: 0.008111756751337358
category_art: 0.00529404153014528
category_television: 0.00492584646273396
transaction_revenue: 0.004460131611061774
category_telecom: 0.002684266885595962
category_craft: 0.002620912195595103
order_timestamp: 0.0025872842274343804
category_silverware: 0.0025806677023705563
category_repair: 0.0

Select the features whose feature importance is greater than 0, then re-train the model:

In [10]:
# select features with importance greater than the cut-off
importance_cutoff = 0
feature_columns = [
    feature
    for feature, importance in feature_importance_dict.items()
    if importance > importance_cutoff
]

# create feature vectors with a unique name
assembler = VectorAssembler(inputCols=feature_columns, outputCol='selected_features')

# update datasets with the selected features
labeled_data = assembler.transform(merchant_labeled_data)
unlabeled_data = assembler.transform(merchant_unlabeled_data)

# standardize features
scaler = StandardScaler(inputCol="selected_features", outputCol="selected_scaled_features",
                        withMean=True, withStd=True)

# fit on the labeled data only, then transform both labeled and unlabeled datasets
scaler_model = scaler.fit(labeled_data)
labeled_data = scaler_model.transform(labeled_data)
unlabeled_data = scaler_model.transform(unlabeled_data)

# initialize the GBT model
gbt = GBTRegressor(featuresCol='selected_scaled_features', labelCol='fraud_probability',
                   maxIter=100)

# train the initial model
model = gbt.fit(labeled_data)

# self-training process
max_iterations = 5
# NOTE(review): `prediction` is the regressor's predicted fraud_probability, not
# a confidence score. The merchant fraud probabilities shown earlier in this
# notebook are on a 0-100 scale, so `prediction > 0.9` probably selects almost
# every row in the first pass. Confirm the intended selection criterion.
confidence_threshold = 0.9
is_confident = col("prediction") > confidence_threshold

for iteration in range(max_iterations):
    # make predictions on unlabeled data
    predictions = model.transform(unlabeled_data)

    # filter out high-confidence prediction samples to be added to labeled data
    confident_predictions = predictions.filter(is_confident)

    # stop iteration if no more high-confidence samples are available
    if confident_predictions.count() == 0:
        break

    # add high-confidence samples to labeled data, using the prediction as label
    confident_predictions = confident_predictions.withColumn("fraud_probability",
                                                             col("prediction"))
    labeled_data = labeled_data.union(confident_predictions.select(labeled_data.columns))

    # remove predicted samples from unlabeled dataset.
    # This keeps exactly the rows that were NOT selected above. The previous
    # anti-join on `merchant_abn` also discarded every other unlabeled row of a
    # merchant that had at least one selected row; those rows then ended up in
    # neither the labeled nor the unlabeled set.
    remaining_unlabeled_data = predictions.filter(
        col("prediction").isNull() | ~is_confident
    ).drop("prediction")

    # retrain the model
    model = gbt.fit(labeled_data)

    # update unlabeled data
    unlabeled_data = remaining_unlabeled_data

# final model predicts the remaining unlabeled data
final_predictions = model.transform(unlabeled_data)

24/09/19 16:52:09 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

In [11]:
# use the final model's prediction as the fraud_probability of the remaining
# unlabeled rows, align their columns with labeled_data, and append them
final_imputed_merchant = labeled_data.union(
    final_predictions
    .withColumn('fraud_probability', col('prediction'))
    .select(labeled_data.columns)
)
final_imputed_merchant.show(5)



+------------+--------------------+--------------+------------------+---------+------------------+-------------------+------------------+---------------+---------------+---------------+---------------+---------------+----------------+------------+-------------------+--------------+--------------+---------------+--------------+---------------+---------------+-----------------+---------------+----------------+----------------+-------------------+------------+----------------+------------------+-------------+-------------+---------------+---------------+-------------+--------------+--------------------+--------------+-------------------+-----------------+--------------+-----------------+--------------+-------------------+--------------+---------------+---------------+-------------+--------------+-------------------+--------------+--------------+----------------+-------------+-------------+---------------+---------------+----------------+----------------+---------------+------------------+

                                                                                

In [12]:
# the assembled/scaled feature vectors were only needed for modelling
final_imputed_merchant = final_imputed_merchant.drop(
    'selected_features',
    'selected_scaled_features',
)

# save as a parquet file, replacing any previous output at this path
(
    final_imputed_merchant.write
    .mode('overwrite')
    .parquet('../data/curated/final_imputed_merchant')
)

                                                                                

Confirm there are no nulls:

In [13]:
# create a dictionary with column names and their respective null counts
null_count_dict = {col_name: sum(col(col_name).isNull().cast("int")).alias(col_name) for col_name in final_imputed_merchant.columns}

# use agg() to calculate null counts for each column
null_counts_df = final_imputed_merchant.agg(*null_count_dict.values())
null_counts_df.show()



+------------+--------+--------------+------------+---------+-----------------+-------------------+------------+---------------+---------------+---------------+---------------+---------------+----------------+------------+-------------------+--------------+--------------+---------------+--------------+---------------+---------------+-----------------+---------------+----------------+----------------+-------------------+------------+----------------+------------------+-------------+-------------+---------------+---------------+-------------+--------------+--------------------+--------------+-------------------+-----------------+--------------+-----------------+--------------+-------------------+--------------+---------------+---------------+-------------+--------------+-------------------+--------------+--------------+----------------+-------------+-------------+---------------+---------------+----------------+----------------+---------------+------------------+----------------+--------

                                                                                

In [4]:
# sanity check: should equal imputed_merchant.count() if no rows were lost or
# duplicated during imputation
final_imputed_merchant.count()

11372745