In [1]:
import pandas as pd
import math
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Read the three parquet inputs (transactions, customers, articles) from the
# current working directory, in that order.
transactions, customers, articles = (
    pd.read_parquet(path)
    for path in ('transactions_train.parquet', 'customers.parquet', 'articles.parquet')
)

In [3]:
# The week to predict is the one immediately after the newest week in the data.
latest_week = transactions.week.max()
test_week = latest_week + 1

# Keep only rows from the 10 most recent weeks (week > latest_week - 10).
is_recent = transactions.week > latest_week - 10
transactions = transactions[is_recent]

In [4]:
# Customers whose transaction count lies strictly above this quantile of the
# per-customer transaction counts are labelled "frequent buyers".
FREQUENT_BUYER_QUANTILE = 0.90

# Number of transactions per customer_id
transaction_counts = transactions['customer_id'].value_counts()

# The 90th-percentile transaction count is the cut-off
threshold = transaction_counts.quantile(FREQUENT_BUYER_QUANTILE)

# Label every transaction row with whether its customer is a frequent buyer.
# - map(transaction_counts) is a vectorised lookup of each row's customer count
#   (replaces a per-row Python lambda).
# - assign() builds a new frame, so no column is written into the filtered
#   slice of the original data (avoids SettingWithCopyWarning).
transactions = transactions.assign(
    frequent_buyer=transactions['customer_id'].map(transaction_counts) > threshold
)

# Transactions made by frequent buyers only
frequent_buyer_transactions = transactions[transactions['frequent_buyer']]

# Counting unique frequent buyers
num_frequent_buyers = frequent_buyer_transactions['customer_id'].nunique()
print(f"Number of frequent buyers: {num_frequent_buyers}")

# Attach the article attributes used later (left join keeps every transaction row)
article_columns = ['article_id', 'product_group_name', 'product_type_name', 'colour_group_name']
frequent_transactions = frequent_buyer_transactions.merge(
    articles[article_columns], on='article_id', how='left'
)

print("merged: \n", frequent_transactions)

# Unique frequent buyer IDs, in order of first appearance
unique_frequent_buyers = list(frequent_buyer_transactions['customer_id'].unique())

Number of frequent buyers: 39460
merged: 
             t_dat           customer_id  article_id     price  \
0      2020-07-15       272412481300040   778064028  0.008458   
1      2020-07-15       272412481300040   816592008  0.016932   
2      2020-07-15       272412481300040   621381021  0.033881   
3      2020-07-15       272412481300040   817477003  0.025407   
4      2020-07-15       272412481300040   899088002  0.025407   
...           ...                   ...         ...       ...   
949985 2020-09-22  18421675981536870956   749699002  0.025407   
949986 2020-09-22  18426621781275797575   572998013  0.042356   
949987 2020-09-22  18426621781275797575   788575004  0.042356   
949988 2020-09-22  18426621781275797575   914441003  0.033881   
949989 2020-09-22  18426621781275797575   896848001  0.030492   

        sales_channel_id  week  frequent_buyer  product_group_name  \
0                      1    95            True                   0   
1                      1    95      

In [5]:
# Flag transactions whose price lies far outside the bulk of frequent-buyer prices.
# NOTE(review): the names Q1/Q3/IQR follow the usual box-plot rule, but the code
# uses the 20th and 80th percentiles rather than the 25th/75th quartiles.
prices = frequent_transactions['price']

Q1 = prices.quantile(0.20)
Q3 = prices.quantile(0.80)
IQR = Q3 - Q1

# Fences at 1.5 * spread beyond each percentile
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

too_cheap = prices < lower_bound
too_expensive = prices > upper_bound
outlier_price_candidates = frequent_transactions[too_cheap | too_expensive]

print(outlier_price_candidates)

            t_dat           customer_id  article_id     price  \
596    2020-07-15    867806996788384472   773471010  0.125847   
723    2020-07-15   1002684848125350632   851993001  0.101678   
724    2020-07-15   1002684848125350632   851993001  0.101678   
1124   2020-07-15   1485995793756406162   859874001  0.122017   
1287   2020-07-15   1632437192178381894   707075008  0.101678   
...           ...                   ...         ...       ...   
949785 2020-09-22  18063942235003628498   887464003  0.101678   
949787 2020-09-22  18063942235003628498   887464002  0.101678   
949788 2020-09-22  18063942235003628498   887464002  0.101678   
949789 2020-09-22  18063942235003628498   887464002  0.101678   
949966 2020-09-22  18394381115614748074   901318001  0.101678   

        sales_channel_id  week  frequent_buyer  product_group_name  \
596                    2    95            True                   1   
723                    2    95            True                   2   
724      

In [6]:
def _least_common_per_customer(column):
    """Per customer_id, list the least frequent value of `column` in frequent_transactions.

    Returns a Series indexed by customer_id whose values are lists holding at
    most one entry: the value with the smallest count for that customer.
    """
    return frequent_transactions.groupby('customer_id')[column].apply(
        lambda values: values.value_counts().nsmallest(1).index.tolist()
    )


# Least common product group, product type and colour for each customer
# (one value each, since nsmallest(1) is used).
least_common_categories = _least_common_per_customer('product_group_name')
least_common_product_types = _least_common_per_customer('product_type_name')
least_common_colors = _least_common_per_customer('colour_group_name')

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

# One-hot encode the categorical article attributes used to compare articles.
categorical_columns = ['product_group_name', 'product_type_name', 'colour_group_name']
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(articles[categorical_columns])

# Pairwise cosine similarity between every pair of articles.
# NOTE(review): the result has one row and one column per article, so its size
# grows quadratically with len(articles) - confirm it fits in memory.
similarity_matrix = cosine_similarity(encoded_features)

# Label rows and columns with article ids so scores can be looked up by id.
article_ids = articles['article_id']
similarity_df = pd.DataFrame(similarity_matrix, index=article_ids, columns=article_ids)

def find_outlier_like_items(customer_id, transactions, similarity_df, threshold=0.5):
    # Get items interacted with by the customer
    interacted_items = transactions[transactions['customer_id'] == customer_id]['article_id'].unique()

    # Dictionary to hold potential outlier-like items
    outlier_like_items = {}

    for item in interacted_items:
        # Get similarity scores for the item with all other items
        sim_scores = similarity_df[item]

        # Filter items based on threshold and exclude items already interacted with
        similar_items = sim_scores[(sim_scores > threshold) & (sim_scores < 1.0) & (~sim_scores.index.isin(interacted_items))].index.tolist()

        outlier_like_items[item] = similar_items

    return outlier_like_items

# Demo: run the lookup for a single customer id and show the resulting mapping.
customer_id = 867806996788384472
outlier_like_items = find_outlier_like_items(
    customer_id,
    transactions,
    similarity_df,
)

print(outlier_like_items)


In [26]:
article_candidate_ids = {}

# Many customers share the same (category, colour, product type) triple, so the
# matching article ids are cached per triple instead of re-filtering `articles`
# three times for every customer.
_candidates_by_triple = {}


def _articles_matching_two_of_three(category, color, product_type):
    """Return the set of article ids that match at least two of the three attributes."""
    key = (category, color, product_type)
    if key not in _candidates_by_triple:
        is_category = articles['product_group_name'] == category
        is_color = articles['colour_group_name'] == color
        is_type = articles['product_type_name'] == product_type
        # Union of the three pairwise combinations
        two_of_three = (is_category & is_color) | (is_category & is_type) | (is_color & is_type)
        _candidates_by_triple[key] = set(articles.loc[two_of_three, 'article_id'].unique())
    return _candidates_by_triple[key]


for customer in unique_frequent_buyers:
    # Candidate article ids for this customer, gathered over every combination
    # of their least common category, colour and product type.
    candidates = set()

    for category in least_common_categories[customer]:
        for color in least_common_colors[customer]:
            for product_type in least_common_product_types[customer]:
                candidates.update(
                    _articles_matching_two_of_three(category, color, product_type)
                )

    article_candidate_ids[customer] = candidates


'''
    For each customer, we now have several candidate articles that we would consider outliers for them.

    Next, we want to find transactions for these articles in the full transaction database: ideally transactions made by that customer, otherwise a transaction made by another customer. Just take the latest one and maybe set its week to the test week?

'''

'\n    For each customer, we now have several candidate articles that we would consider outliers for them.\n\n    Next, we want to find transactions for these articles in the full transaction database: ideally transactions made by that customer, otherwise a transaction made by another customer. Just take the latest one and maybe set its week to the test week?\n\n'

In [None]:
# Build, for every (customer, candidate article) pair, the transaction rows to
# use as "outlier" examples: the customer's own purchases of the article if
# any, otherwise the single latest purchase of that article by anyone else.
# Reads `transactions`, `article_candidate_ids` and `test_week` from earlier cells.

# Split the transactions once by article so each lookup below is a dict access
# instead of a scan over the whole frame. groupby keeps the original row order
# inside each group, so every per-article frame equals a boolean filter on
# article_id.
transactions_by_article = {
    article_id: article_rows
    for article_id, article_rows in transactions.groupby('article_id')
}

# article_id -> latest single transaction of that article (computed on demand)
latest_by_article = {}

# Collect the pieces in a list and concatenate once at the end; concatenating
# inside the loop copies the growing frame on every iteration.
outlier_frames = []

# The loop variable is deliberately NOT called `articles`: that name holds the
# articles DataFrame and must not be overwritten.
for customer, candidate_article_ids in article_candidate_ids.items():
    for article_id in candidate_article_ids:
        article_rows = transactions_by_article.get(article_id)
        if article_rows is None:
            # Nobody bought this article in the retained transactions.
            continue

        # Check if the customer has transactions for this article
        customer_trans = article_rows[article_rows['customer_id'] == customer]
        if not customer_trans.empty:
            outlier_frames.append(customer_trans)
            continue

        # The customer never bought this article, so the latest transaction by
        # "any other customer" is simply the latest transaction of the article.
        if article_id not in latest_by_article:
            latest_by_article[article_id] = article_rows.nlargest(1, 't_dat')
        outlier_frames.append(latest_by_article[article_id])

if outlier_frames:
    outlier_transactions = pd.concat(outlier_frames)
else:
    # No matches at all: keep the transaction columns but no rows.
    outlier_transactions = transactions.iloc[0:0].copy()

# Set the week to test week
outlier_transactions['week'] = test_week

# Display the result
print(outlier_transactions.head(10))