# Read input files

In [1]:
%%time
import pandas as pd

# Directory that holds the parquet versions of the three input tables.
pad = "/kaggle/input/makeparquet"
transactions = pd.read_parquet(f"{pad}/transactions_train.parquet")
customers = pd.read_parquet(f"{pad}/customers.parquet")
articles = pd.read_parquet(f"{pad}/articles.parquet")

CPU times: user 4.03 s, sys: 4.1 s, total: 8.13 s
Wall time: 6.93 s


In [2]:
import pandas as pd

# Step 1: Attach each transaction's product_code by joining on 'article_id'.
# The inner join keeps only transactions whose article_id also appears in `articles`.
merged_df = pd.merge(
    transactions,
    articles[['article_id', 'product_code']],
    how='inner',
    on='article_id',
)

# Step 2: For every (customer_id, product_code) pair, count the distinct article_ids bought.
# After reset_index the 'article_id' column holds that distinct count, not an id.
customer_product_counts = (
    merged_df
    .groupby(['customer_id', 'product_code'])['article_id']
    .nunique()
    .reset_index()
)

# Step 3: Keep the pairs where more than one distinct article shares the product code.
customers_with_multiple_purchases = customer_product_counts.loc[
    customer_product_counts['article_id'].gt(1)
]

# Display the result
print(customers_with_multiple_purchases)

                   customer_id  product_code  article_id
11              23962613628581        732842           3
12              23962613628581        739618           2
13              23962613628581        752512           2
56              28847241659200        759871           2
59              28847241659200        762846           2
...                        ...           ...         ...
24414665  18446737527580148316        699923           2
24414666  18446737527580148316        701134           3
24414675  18446737527580148316        762600           2
24414676  18446737527580148316        763988           2
24414687  18446737527580148316        845790           2

[2338909 rows x 3 columns]


In [3]:
import pandas as pd

# Step 1: Transactions joined with their product_code.
# The preceding cell builds this exact frame as `merged_df`; reuse it when it is
# available rather than materialising a second full copy of the joined table.
# The merge is only recomputed when this cell is run without that frame in scope.
if 'merged_df' in globals():
    merged_df2 = merged_df
else:
    merged_df2 = transactions.merge(articles[['article_id', 'product_code']], on='article_id', how='inner')

# Step 2: Group by 'customer_id' and 'product_code', collecting one list entry per
# transaction row for both the article_ids and the purchase dates.
# Lists keep duplicates, so a repeat purchase of the same article appears more than once.
grouped = (
    merged_df2
    .groupby(['customer_id', 'product_code'])
    .agg({'article_id': list, 't_dat': list})
    .reset_index()
)

# Display the result
print(grouped)


                   customer_id  product_code              article_id  \
0                4245900472157        715624             [715624010]   
1                4245900472157        803757             [803757011]   
2               23962613628581        594264             [594264006]   
3               23962613628581        602540             [602540001]   
4               23962613628581        638629             [638629002]   
...                        ...           ...                     ...   
24414685  18446737527580148316        832481             [832481001]   
24414686  18446737527580148316        840566             [840566001]   
24414687  18446737527580148316        845790  [845790001, 845790004]   
24414688  18446737527580148316        850015             [850015002]   
24414689  18446737527580148316        859416             [859416011]   

                                               t_dat  
0                              [2020-04-10 00:00:00]  
1                        

In [4]:
# Keep the (customer_id, product_code) groups whose article_id list has two or more entries.
# The list length counts every transaction row, so repeat purchases of the same
# article_id (e.g. [721966002, 721966002]) also pass this filter.
filtered_grouped = grouped.loc[grouped['article_id'].map(len).ge(2)]

# Show the rows that passed the filter
print(filtered_grouped)

                   customer_id  product_code  \
8               23962613628581        721966   
11              23962613628581        732842   
12              23962613628581        739618   
13              23962613628581        752512   
36              28847241659200        672598   
...                        ...           ...   
24414666  18446737527580148316        701134   
24414675  18446737527580148316        762600   
24414676  18446737527580148316        763988   
24414683  18446737527580148316        827968   
24414687  18446737527580148316        845790   

                                            article_id  \
8                               [721966002, 721966002]   
11        [732842001, 732842001, 732842002, 732842004]   
12                              [739618001, 739618002]   
13                              [752512006, 752512008]   
36                              [672598002, 672598002]   
...                                                ...   
24414666         

In [5]:
# Row counts before and after the length filter, one per line.
print(len(grouped), len(filtered_grouped), sep="\n")

24414690
4946329


In [6]:
# Predicate: does the list contain two or more distinct dates?
def has_at_least_two_different_dates(date_list):
    """Return True when `date_list` holds at least two distinct values.

    Duplicates are collapsed before counting, so an empty list or a list
    that only repeats a single value yields False.
    """
    distinct_dates = {*date_list}
    return len(distinct_dates) > 1

# Keep only the groups whose t_dat list contains two or more distinct dates
filtered_grouped_with_diff_dates = filtered_grouped[
    filtered_grouped['t_dat'].map(has_at_least_two_different_dates)
]

# Show the rows that passed the date filter
print(filtered_grouped_with_diff_dates)

                   customer_id  product_code  \
11              23962613628581        732842   
36              28847241659200        672598   
59              28847241659200        762846   
72              28847241659200        838825   
76              28847241659200        855080   
...                        ...           ...   
24414650  18446737527580148316        596400   
24414651  18446737527580148316        608776   
24414656  18446737527580148316        636587   
24414658  18446737527580148316        678942   
24414683  18446737527580148316        827968   

                                                 article_id  \
11             [732842001, 732842001, 732842002, 732842004]   
36                                   [672598002, 672598002]   
59                        [762846001, 762846001, 762846007]   
72                                   [838825001, 838825003]   
76                        [855080001, 855080004, 855080002]   
...                                          

In [7]:
# Summary of the three row counts.
# The labels describe what the filters actually check: `filtered_grouped` keeps groups
# whose article_id list has length >= 2, and that list has one entry per transaction
# row, so repeat purchases of the same article_id are counted. The earlier wording
# ("2 different article_ids") did not match that filter.
print("All used combinations of customer_id and product_code:")
print(len(grouped))
print("Filtered at least 2 transactions (repeat purchases of the same article_id included) in these combinations: ")
print(len(filtered_grouped))
print("Filtered at least 2 transactions on at least 2 different dates: ")
print(len(filtered_grouped_with_diff_dates))

All used combinations of customer_id and product_code:
24414690
Filtered at least 2 different article_ids in these combinations: 
4946329
Filtered 2 different article_ids in at least 2 different dates: 
1708722
