In [None]:
import pandas as pd
from pyspark.sql.functions import *

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook.
# The settings are collected in one mapping and applied to the builder in order.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.executor.memory": "2g",
    "spark.driver.memory": "4g",
}

session_builder = SparkSession.builder.appName("MAST30034 Project 2")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

## Adding postcode income as feature 

This notebook finds each merchant's main consumer postcode and adds that postcode's median weekly personal income as a feature of the merchant for further ranking and testing.

First, reduce the transactions to those whose postcode appears in the income dataframe, since the income dataframe is missing some postcodes.

In [None]:
# Load the curated transaction details into a pandas DataFrame
transaction_df = pd.read_parquet(
    path='../data/curated/transactions_detail.parquet'
)

In [None]:
# Display the transaction dataframe as this cell's output
transaction_df

In [None]:
# Load the 2021 income table and cast its postcode key (POA_CODE_2021) to int
income_df = (
    pd.read_csv("../data/income/2021_income.csv")
    .astype({'POA_CODE_2021': int})
)

In [None]:
# Keep only the transactions whose postcode appears in the income dataframe.
# Cast the key to int so it has the same dtype as income_df['POA_CODE_2021'].
transaction_df['postcode'] = transaction_df['postcode'].astype(int)

# The inner merge is used purely as a filter (the income key column is dropped
# again straight away). De-duplicate the income postcodes first: a postcode that
# is listed more than once would otherwise repeat every matching transaction and
# inflate the per-postcode transaction counts computed from this frame.
merged_df = transaction_df \
    .merge(income_df[['POA_CODE_2021']].drop_duplicates(),
           left_on='postcode', right_on='POA_CODE_2021') \
    .drop('POA_CODE_2021', axis=1)
merged_df

In [None]:
# Persist the reduced transactions as parquet
merged_df.to_parquet(
    path=r'../data/curated/reduced_transactions.parquet'
)

In [None]:
# Reload the reduced transactions as a Spark DataFrame and display it
transaction_df = (
    spark.read
    .format('parquet')
    .load('../data/curated/reduced_transactions.parquet')
)
transaction_df

Now count and see which postcode of each merchant has the highest number of transactions.

In [None]:
# Number of transactions for every (merchant_name, postcode) pair,
# stored in a column named transaction_count
transaction_count_df = (
    transaction_df
    .groupBy('merchant_name', 'postcode')
    .count()
    .withColumnRenamed("count", "transaction_count")
)

In [None]:
from pyspark.sql import Window
import pyspark.sql.functions as f

# For each merchant keep exactly one row: the postcode with the most transactions.
# Rows are ranked inside each merchant partition by transaction_count (descending)
# and then by postcode (ascending), and only the first-ranked row is kept.
# The explicit ordering makes the result deterministic: when several postcodes tie
# on transaction_count the smallest postcode is chosen, instead of leaving the
# choice among tied rows to dropDuplicates.
w = Window.partitionBy('merchant_name')\
          .orderBy(f.col('transaction_count').desc(), f.col('postcode').asc())
max_count_df = transaction_count_df.withColumn('count_rank', f.row_number().over(w))\
                .where(f.col('count_rank') == 1)\
                .drop('count_rank')


Then include the postcode's income into the merchant detail, as a new feature.

In [None]:
# Load the curated merchant details as a Spark DataFrame
merchant_detail = (
    spark.read
    .format('parquet')
    .load('../data/curated/merchant_detail.parquet')
)

In [None]:
# Attach each merchant's selected postcode (the transaction count itself is not kept).
# Inner join: only merchants that have a row in max_count_df remain.
merchant_detail = merchant_detail.join(
    max_count_df.drop("transaction_count"),
    on='merchant_name',
    how='inner',
)

In [None]:
# Collect the joined merchant details into pandas and cast the postcode key to int
merchant_detail = merchant_detail.toPandas().astype({'postcode': int})

In [None]:
# Add the median weekly personal income of each merchant's postcode,
# then drop the income table's key column, which duplicates 'postcode'
merged_df = pd.merge(
    merchant_detail,
    income_df[['POA_CODE_2021', 'Median_tot_prsnl_inc_weekly']],
    left_on='postcode',
    right_on='POA_CODE_2021',
).drop(columns='POA_CODE_2021')


In [None]:
# Display the merchant details with the income column merged in
merged_df

In [None]:
# Write the merchant details with the income feature to CSV (header row, no index column)
merged_df.to_csv(
    path_or_buf=r'../data/curated/merchant_detail_with_income.csv',
    header=True,
    index=False,
)