In [None]:
from pyspark.sql import DataFrame, Row
from functools import reduce
from pyspark.sql.types import *
from pyspark.sql.functions import *
from graphframes import *
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext
import uuid

In [None]:
# Resource settings for the Spark application used by this notebook.
conf = (
    SparkConf()
    .set("spark.executor.memory", "2g")
    .set("spark.driver.memory", "12g")
    .set("spark.cores.max", "6")
)

# Obtain the SparkContext (an existing one is reused if already running).
sc = SparkContext.getOrCreate(conf)

# SQL entry point built on top of the SparkContext.
spark = SQLContext(sc)

In [None]:
class MatchPruner:
    """Split match suggestions into kept and pruned sets, driven by a config dict.

    The methods of this class read three config keys:

    * ``seller_type``: compared against '1p', '3p', '1p_or_3p', '1p_over_3p'
    * ``match_type``: compared against 'exact', 'similar', 'exact_or_similar',
      'exact_over_similar'
    * ``cardinality``: compared against '1' and 'n'
    """

    def __init__(self, mdw: DataFrame, match_suggestion: DataFrame, config: dict) -> None:
        """Store the inputs; no processing happens at construction time.

        Args:
            mdw: DataFrame joined against on ``base_sku_uuid`` / ``pair_id``
                (presumably the already-audited matches -- confirm with caller).
            match_suggestion: DataFrame of candidate matches.
            config: dict with the keys described on the class.
        """
        self.mdw = mdw
        self.match_suggestion = match_suggestion
        self.config = config

    def preprocessing_matches(self, matches: DataFrame):
        """Return ``matches`` with ``seller_type`` and ``match_type`` lower-cased.

        ``match_type`` is only lower-cased where it is not null; null values
        are passed through unchanged.
        """
        lowered_match_type = when(
            col("match_type").isNotNull(), lower(col("match_type"))
        ).otherwise(col("match_type"))
        return (
            matches
            .withColumn("seller_type", lower(col("seller_type")))
            .withColumn("match_type", lowered_match_type)
        )

    def prune_1p_3p_match_suggestions(self, match_suggestion: DataFrame):
        """Split suggestions by seller type according to ``config['seller_type']``.

        Returns:
            ``(filtered_matches, pruned_matches)``. For a config of '1p' or
            '3p', rows of that seller type are kept and rows of the other type
            are returned as pruned with a ``STATUS`` column of
            ``pruned_by_seller_type_<configured type>``. For any other config
            value the input is returned unchanged and ``pruned_matches`` is
            ``None``.
        """
        wanted = self.config['seller_type']
        if wanted not in ('1p', '3p'):
            return match_suggestion, None

        # The original second branch re-tested '1p', so a '3p' config was
        # never honoured; both directions are handled symmetrically here.
        unwanted = '3p' if wanted == '1p' else '1p'
        filtered_matches = match_suggestion.filter(col('seller_type') == wanted)
        pruned_matches = (
            match_suggestion
            .filter(col('seller_type') == unwanted)
            .withColumn("STATUS", lit(f"pruned_by_seller_type_{wanted}"))
        )
        return filtered_matches, pruned_matches

    def separate_audited_unaudited_base_product_matches(self, match_suggestion: DataFrame, mdw: DataFrame):
        """Split suggestions by whether their ``base_sku_uuid`` occurs in ``mdw``.

        Returns:
            ``(unaudited_base_product_matches, audited_base_product_matches)``:
            the left-anti join and the inner join of ``match_suggestion`` with
            ``mdw`` on ``base_sku_uuid``.
        """
        # NOTE(review): the inner join also carries over mdw's columns and
        # repeats a suggestion once per matching mdw row -- confirm intended.
        unaudited_base_product_matches = match_suggestion.join(mdw, 'base_sku_uuid', 'left_anti')
        audited_base_product_matches = match_suggestion.join(mdw, 'base_sku_uuid', 'inner')
        return unaudited_base_product_matches, audited_base_product_matches

    def separate_audited_and_unaudited_matches(self, audited_base_product_matches: DataFrame, mdw: DataFrame):
        """Split suggestions by whether their ``pair_id`` occurs in ``mdw``.

        Suggestions whose ``pair_id`` is found in ``mdw`` are divided by
        ``match_type`` and tagged with a ``status`` column holding
        ``"pruned_by_" + pair_id``.

        Returns:
            ``(similar_matches, exact_matches,
            audited_base_product_unaudited_matches)`` where the last element
            is the left-anti join (``pair_id`` not present in ``mdw``).
        """
        audited_matches = audited_base_product_matches.join(mdw, 'pair_id', 'inner')

        # Build the label per row from the column value. Formatting a Column
        # object into a Python string would only embed its repr.
        status = concat(lit("pruned_by_"), col("pair_id").cast("string"))

        # Column membership must use isin(); Python's `in` forces bool(Column),
        # which PySpark rejects.
        similar_matches = (
            audited_matches
            .filter(col('match_type').isin(['equivalent', 'similar']))
            .withColumn('status', status)
        )
        exact_matches = (
            audited_matches
            .filter(col('match_type').isin(['exact_match']))
            .withColumn('status', status)
        )
        audited_base_product_unaudited_matches = audited_base_product_matches.join(mdw, 'pair_id', 'left_anti')
        return similar_matches, exact_matches, audited_base_product_unaudited_matches

    def prune_match_based_on_match_type(self, similar_matches: DataFrame, exact_matches: DataFrame, audited_base_product_unaudited_matches: DataFrame):
        """Decide which unaudited suggestions survive given the audited matches.

        Only ``config['match_type'] == 'exact'`` is implemented. For every
        other match type nothing is computed.

        Args:
            similar_matches: audited similar/equivalent matches (currently
                unused by the implemented branch).
            exact_matches: audited exact matches.
            audited_base_product_unaudited_matches: suggestions to be split.

        Returns:
            ``(match_suggestion, pruned_matches)``; either element is ``None``
            when the selected config combination does not produce it.
        """
        if self.config['match_type'] == 'exact':
            return self._prune_for_exact_match_type(
                exact_matches, audited_base_product_unaudited_matches
            )

        # TODO: 'similar', 'exact_or_similar' and 'exact_over_similar' had only
        # placeholder branches for every seller_type/cardinality combination;
        # no pruning rules are defined for them yet.
        return None, None

    def _prune_for_exact_match_type(self, exact_matches: DataFrame, unaudited: DataFrame):
        """Apply the seller_type/cardinality rules for the 'exact' match type."""
        seller_type = self.config['seller_type']
        cardinality = self.config['cardinality']
        match_suggestion = None
        pruned_matches = None

        # NOTE(review): the 'left' joins below keep every row of `unaudited`,
        # including base products without an exact match. If the intent is
        # "suggestions whose base product already has an exact match",
        # 'left_semi' is probably what is wanted -- confirm before changing.
        if seller_type in ('1p', '3p'):
            # `unaudited` is expected to contain only suggestions of the
            # configured seller type at this point.
            potential_match_suggestion = unaudited.join(exact_matches, 'base_sku_uuid', 'left')
            if seller_type == '3p' and cardinality == 'n':
                # One 3p exact match exists, but more 3p products from the
                # same source may still be found.
                match_suggestion = potential_match_suggestion
            elif cardinality in ('1', 'n'):
                # An exact match already exists for the customer product, so
                # no further exact match is accepted.
                pruned_matches = potential_match_suggestion
        elif seller_type == '1p_or_3p':
            if cardinality == '1':
                # One exact match already exists for the customer product.
                pruned_matches = unaudited.join(exact_matches, 'base_sku_uuid', 'left')
            elif cardinality == 'n':
                # More 3p exact matches are allowed, but not where a 1p exact
                # match already exists for the product.
                exact_matches_1p = exact_matches.filter(col("seller_type") == '1p')
                pruned_matches = unaudited.join(exact_matches_1p, 'base_sku_uuid', 'left')
                match_suggestion = unaudited.join(exact_matches_1p, 'base_sku_uuid', 'left_anti')
        elif seller_type == '1p_over_3p':
            if cardinality in ('1', 'n'):
                # A 1p exact match prunes all further suggestions for that
                # product. Products with only a 3p exact match stay open.
                # Both cardinalities used the same joins in the original code.
                exact_matches_1p = exact_matches.filter(col("seller_type") == '1p')
                # Drop 3p matches whose product also has a 1p exact match.
                exact_matches_3p = exact_matches.filter(col("seller_type") == '3p').join(
                    exact_matches_1p, 'base_sku_uuid', 'left_anti'
                )
                pruned_matches = unaudited.join(exact_matches_1p, 'base_sku_uuid', 'left')
                match_suggestion = unaudited.join(exact_matches_3p, 'base_sku_uuid', 'left')

        return match_suggestion, pruned_matches
        
    