In [None]:
from pyspark.sql import DataFrame, Row
from functools import reduce
from pyspark.sql.types import *
from pyspark.sql.functions import *
from graphframes import *
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext
import uuid

In [None]:
# Spark settings for this notebook session (executor memory, driver memory, max cores).
conf = (
    SparkConf()
    .set("spark.executor.memory", "2g")
    .set("spark.driver.memory", "12g")
    .set("spark.cores.max", "6")
)

# Obtain the SparkContext for this session, handing it the settings above.
sc = SparkContext.getOrCreate(conf)

# SQL entry point built on top of the context.
spark = SQLContext(sc)

In [None]:
class MatchPruner:
    def __init__(self, mdw: DataFrame, match_suggestion: DataFrame, config: dict) -> None:
        """Keep references to the inputs the pruning steps operate on.

        Args:
            mdw: DataFrame stored as ``self.mdw``.
            match_suggestion: DataFrame stored as ``self.match_suggestion``.
            config: Settings dict stored as ``self.config``; the methods of
                this class look up keys such as ``'seller_type'``,
                ``'match_type'`` and ``'cardinality'`` in it.
        """
        self.mdw, self.match_suggestion, self.config = mdw, match_suggestion, config
        
    def preprocessing_matches(self, matches: DataFrame):
        """Return ``matches`` with ``seller_type`` and ``match_type`` lower-cased.

        A null ``match_type`` is passed through unchanged.
        """
        match_type = col("match_type")
        lowered_seller_type = lower(col("seller_type"))
        lowered_match_type = when(match_type.isNotNull(), lower(match_type)).otherwise(match_type)
        normalised = matches.withColumn("seller_type", lowered_seller_type)
        return normalised.withColumn("match_type", lowered_match_type)
    
    def prune_1p_3p_match_suggestions(self, match_suggestion: DataFrame):
        """Split suggestions by seller type according to ``config['seller_type']``.

        When the configured seller type is ``'1p'`` or ``'3p'``, rows whose
        ``seller_type`` column equals the configured value are kept and rows
        of the other seller type are pruned and tagged with a ``STATUS``
        column (``pruned_by_seller_type_1p`` / ``pruned_by_seller_type_3p``).
        For any other configured value nothing is pruned.

        Args:
            match_suggestion: DataFrame with a ``seller_type`` column.

        Returns:
            Tuple ``(filtered_matches, pruned_matches)``. ``pruned_matches``
            is ``None`` when the configured seller type is neither ``'1p'``
            nor ``'3p'``.
        """
        wanted = self.config['seller_type']
        # Seller type that gets pruned for each single-seller configuration.
        # The original second branch re-tested '1p', so '3p' was never handled.
        pruned_seller_type = {'1p': '3p', '3p': '1p'}
        if wanted not in pruned_seller_type:
            return match_suggestion, None

        seller_type = col('seller_type')
        filtered_matches = match_suggestion.filter(seller_type == wanted)
        pruned_matches = match_suggestion.filter(seller_type == pruned_seller_type[wanted])
        pruned_matches = pruned_matches.withColumn("STATUS", lit(f"pruned_by_seller_type_{wanted}"))
        return filtered_matches, pruned_matches
    
    def separate_audited_unaudited_base_product_matches(self, match_suggestion:DataFrame, mdw:DataFrame):
        """Split suggestions by whether their ``base_sku_uuid`` occurs in ``mdw``.

        Args:
            match_suggestion: DataFrame with a ``base_sku_uuid`` column.
            mdw: DataFrame with a ``base_sku_uuid`` column.

        Returns:
            Tuple ``(unaudited_base_product_matches, audited_base_product_matches)``:
            the left-anti join (suggestions with no ``mdw`` row for the key)
            and the inner join (suggestions with one, carrying ``mdw``'s
            columns as well).
        """
        join_key = 'base_sku_uuid'
        # NOTE(review): an inner join repeats a suggestion once per matching
        # mdw row and adds mdw's columns - confirm that is intended.
        audited_base_product_matches = match_suggestion.join(mdw, join_key, 'inner')
        unaudited_base_product_matches = match_suggestion.join(mdw, join_key, 'left_anti')
        return unaudited_base_product_matches, audited_base_product_matches
    
    def separate_audited_and_unaudited_matches(self, audited_base_product_matches:DataFrame, mdw:DataFrame):
        """Summarise audited matches per base product and isolate unaudited pairs.

        Rows of ``audited_base_product_matches`` whose ``pair_id`` occurs in
        ``mdw`` are treated as audited; the remaining rows are returned as
        unaudited suggestions.

        Args:
            audited_base_product_matches: DataFrame with a ``pair_id`` column.
            mdw: DataFrame with a ``pair_id`` column. The joined result must
                expose ``base_sku_uuid``, ``base_seller_type``,
                ``match_type`` and ``audited_match_comp_seller_type``.

        Returns:
            Tuple ``(similar_matches, exact_matches,
            audited_base_product_unaudited_matches)``.

            * ``similar_matches``: one row per ``(base_sku_uuid,
              base_seller_type, match_type)`` for match types
              ``equivalent`` / ``similar``; ``audited_match_comp_seller_type``
              holds the sorted distinct seller types joined with ``_``.
            * ``exact_matches``: same grouping for match type ``exact``;
              ``audited_match_comp_seller_type`` is ``1p`` if any audited
              exact match is 1p, otherwise ``3p``.
            * ``audited_base_product_unaudited_matches``: input rows whose
              ``pair_id`` is not present in ``mdw``.

            Both summary frames carry a ``status`` column of the form
            ``pruned_by_<base_sku_uuid>``.
        """
        group_cols = ['base_sku_uuid', 'base_seller_type', 'match_type']
        seller_type_col = "audited_match_comp_seller_type"
        audited_seller_types = collect_set(col(seller_type_col))
        # Build the status from the column *value*; the original f-string
        # formatted the Column object itself (and does not parse before 3.12).
        status = concat(lit("pruned_by_"), col("base_sku_uuid"))

        audited_matches = audited_base_product_matches.join(mdw, 'pair_id', 'inner')

        # Column.isin builds a SQL membership test; Python's `in` operator on a
        # Column tries to turn the comparison into a bool and raises.
        similar_matches = (
            audited_matches.filter(col('match_type').isin('equivalent', 'similar'))
            .select(*group_cols, seller_type_col)
            .groupBy(*group_cols)
            .agg(concat_ws('_', array_sort(audited_seller_types)).alias(seller_type_col))
            .withColumn('status', status)
        )

        exact_matches = (
            audited_matches.filter(col('match_type').isin('exact'))
            .select(*group_cols, seller_type_col)
            .groupBy(*group_cols)
            .agg(
                when(array_contains(audited_seller_types, "1p"), lit("1p"))
                .otherwise(lit("3p"))
                .alias(seller_type_col)
            )
            .withColumn('status', status)
        )

        audited_base_product_unaudited_matches = audited_base_product_matches.join(mdw, 'pair_id', 'left_anti')
        return similar_matches, exact_matches, audited_base_product_unaudited_matches
    
    # def seggregate_1p_suggestions_for_1p_exact_matched_base_product_from_match_suggestion(self, match_suggetion:DataFrame, exact_matches:DataFrame):
    #     # exact_match_1p = exact_match.filter(col("audited_match_comp_seller_type").contains("1p"))
    #     # pruned_matches = match_suggetion.filter(col('comp_seller_type') == '1p').join(exact_match_1p, 'base_sku_uuid', 'inner')
    #     # potential_match_suggestion = match_suggetion.join(pruned_matches, 'base_sku_uuid', 'left_anti')

    #     audited_base_product_unaudited_matches = audited_base_product_unaudited_matches.join(exact_matches, "base_sku_uuid", "left")
    #     pruned_matches = audited_base_product_unaudited_matches.filter( (col("audited_match_comp_seller_type").contains("1p")) & ( col('comp_seller_type') == '1p' ) )
    #     match_suggestion = audited_base_product_unaudited_matches.filter( ~((col("audited_match_comp_seller_type").contains("1p")) & ( col('comp_seller_type') == '1p' )) ) audited_match_comp_seller_type = 3p , comp_seller_type = 3p
    #     return pruned_matches, match_suggestion
    
    def prune_match_based_on_match_type(self, similar_matches:DataFrame , exact_matches:DataFrame , audited_base_product_unaudited_matches:DataFrame ):
        """Decide which unaudited suggestions of audited base products are pruned.

        The decision depends on ``config['match_type']``,
        ``config['seller_type']`` and ``config['cardinality']``.

        Args:
            similar_matches: Per-base-product summary of audited similar
                matches; needs ``base_sku_uuid`` and must be union-compatible
                with ``exact_matches``.
            exact_matches: Per-base-product summary of audited exact matches
                with ``base_sku_uuid`` and ``audited_match_comp_seller_type``.
            audited_base_product_unaudited_matches: Unaudited suggestions with
                ``base_sku_uuid`` and ``comp_seller_type`` columns.

        Returns:
            Tuple ``(pruned_matches, match_suggestion)``. Either element is
            ``None`` when the selected configuration produces no frame of
            that kind; both are ``None`` for an unrecognised configuration.
            (The original ended in a bare ``return`` and discarded every
            frame it had computed.)

        NOTE(review): where a frame is split with a condition and its
        negation, rows for which the condition evaluates to NULL (for example
        no audited exact match while ``comp_seller_type`` is ``1p``) end up
        in neither output. This matches the original filters - confirm it is
        intended.
        """
        match_type = self.config['match_type']
        seller_type = self.config['seller_type']
        cardinality = self.config['cardinality']
        unaudited = audited_base_product_unaudited_matches

        audited_seller = col("audited_match_comp_seller_type")
        suggested_seller = col('comp_seller_type')
        # 1p suggestion for a base product whose audited exact match is already 1p.
        redundant_1p = (audited_seller == "1p") & (suggested_seller == '1p')
        # 1p suggestion for a base product whose audited exact match is only 3p.
        upgrade_to_1p = (audited_seller == "3p") & (suggested_seller == '1p')

        pruned_matches = None
        match_suggestion = None

        if match_type == 'exact':
            # Joined exactly once; the original joined exact_matches a second
            # time in the 1p_or_3p / 1p_over_3p branches and referred to an
            # undefined `potential_match_suggestion` in the 1p / 3p branches.
            with_exact = unaudited.join(exact_matches, "base_sku_uuid", "left")
            if seller_type == '1p':
                if cardinality in ('1', 'n'):
                    pruned_matches = with_exact
            elif seller_type == '3p':
                if cardinality == '1':
                    pruned_matches = with_exact
                elif cardinality == 'n':
                    match_suggestion = with_exact
            elif seller_type == '1p_or_3p':
                if cardinality == '1':
                    pruned_matches = with_exact
                elif cardinality == 'n':
                    pruned_matches = with_exact.filter(redundant_1p)
                    match_suggestion = with_exact.filter(~redundant_1p)
            elif seller_type == '1p_over_3p':
                if cardinality == '1':
                    pruned_matches = with_exact.filter(~upgrade_to_1p)
                    match_suggestion = with_exact.filter(upgrade_to_1p)
                elif cardinality == 'n':
                    pruned_matches = with_exact.filter(redundant_1p)
                    match_suggestion = with_exact.filter(~redundant_1p)

        # The original tested self.config['similar'], a key no other code reads.
        elif match_type in ('similar', 'exact_or_similar'):
            exact_and_similar_match = exact_matches.union(similar_matches)
            if seller_type == '1p':
                # Suggestions are expected to be 1p only here.
                if cardinality == '1':
                    # One 1p exact/similar match already exists, so further
                    # 1p suggestions are not needed.
                    pruned_matches = unaudited.join(exact_and_similar_match, 'base_sku_uuid', 'left')
                elif cardinality == 'n':
                    # More matches are allowed, but not another 1p match for
                    # a base product that already has a 1p exact match.
                    pruned_matches, match_suggestion = self._segregate_1p_suggestions_for_1p_exact_matches(unaudited, exact_matches)
            elif seller_type == '3p':
                # Suggestions are expected to be 3p only here.
                potential_match_suggestion = unaudited.join(exact_and_similar_match, 'base_sku_uuid', 'left')
                if cardinality == '1':
                    pruned_matches = potential_match_suggestion
                elif cardinality == 'n':
                    match_suggestion = potential_match_suggestion
            elif seller_type == '1p_or_3p':
                if cardinality == '1':
                    pruned_matches = unaudited.join(exact_and_similar_match, 'base_sku_uuid', 'left')
                elif cardinality == 'n':
                    pruned_matches, match_suggestion = self._segregate_1p_suggestions_for_1p_exact_matches(unaudited, exact_matches)
            elif seller_type == '1p_over_3p':
                # NOTE(review): both cardinalities were unfinished in the
                # original (dead intermediate frames followed by `pass`).
                # The rules below are taken from the original inline comments
                # and need confirmation from the owner.
                if cardinality == '1':
                    # - base product has a 1p exact match: prune every suggestion
                    # - base product has a 3p exact match: keep only 1p suggestions
                    # - base product has only similar matches: keep everything
                    with_exact = unaudited.join(exact_matches, 'base_sku_uuid', 'left')
                    keep = audited_seller.isNull() | upgrade_to_1p
                    match_suggestion = with_exact.filter(keep)
                    pruned_matches = with_exact.filter(~keep)
                elif cardinality == 'n':
                    # Only 1p suggestions for base products that already have
                    # a 1p exact match are pruned.
                    pruned_matches, match_suggestion = self._segregate_1p_suggestions_for_1p_exact_matches(unaudited, exact_matches)

        return pruned_matches, match_suggestion

    def _segregate_1p_suggestions_for_1p_exact_matches(self, unaudited_matches: DataFrame, exact_matches: DataFrame):
        """Split off 1p suggestions for base products that already have a 1p exact match.

        Re-instates the helper that exists only as commented-out code in this
        class but is still called by ``prune_match_based_on_match_type``.

        Returns:
            Tuple ``(pruned_matches, match_suggestion)``, both taken from
            ``unaudited_matches`` left-joined with ``exact_matches`` on
            ``base_sku_uuid``.
        """
        with_exact = unaudited_matches.join(exact_matches, "base_sku_uuid", "left")
        redundant_1p = col("audited_match_comp_seller_type").contains("1p") & (col('comp_seller_type') == '1p')
        return with_exact.filter(redundant_1p), with_exact.filter(~redundant_1p)
        
    