In [11]:
import pandas as pd

# Step 1: Load the four datasets
groceries_df = pd.read_csv("groceries_dataset.csv")
online_retail_df = pd.read_csv("online_retail.csv", encoding='ISO-8859-1')
winequality_df = pd.read_csv("winequality-red.csv")
order_products_prior_df = pd.read_csv("order_products__prior.csv")

In [12]:
# Step 2: Modify the item_column and transaction_column for each dataset
groceries_item_column = 'itemDescription'  # Replace with actual column name in dataset
groceries_transaction_column = 'Member_number'  # Replace with actual column name in dataset

online_retail_item_column = 'StockCode'
online_retail_transaction_column = 'InvoiceNo'

# Winequality-red dataset: Create a new column as transaction ID
winequality_df['transaction_id'] = winequality_df.index  # Use index as transaction ID
winequality_item_column = 'quality'
winequality_transaction_column = 'transaction_id'  # Use the new column as transaction column

order_products_prior_item_column = 'product_id'
order_products_prior_transaction_column = 'order_id'



In [13]:
# Step 3: Define the functions for adjusting unique items and transactions
def adjust_unique_items(df, item_column, transaction_column, target_unique_items):
    unique_items = df[item_column].nunique()
    if unique_items > target_unique_items:
        item_counts = df[item_column].value_counts().head(target_unique_items).index
        df = df[df[item_column].isin(item_counts)]
    return df

def adjust_transactions(df, transaction_column, target_transactions):
    if len(df[transaction_column].unique()) > target_transactions:
        sampled_transactions = df[transaction_column].drop_duplicates().sample(target_transactions)
        df = df[df[transaction_column].isin(sampled_transactions)]
    return df

def control_transaction_width(df, item_column, transaction_column, target_width):
    transaction_widths = df.groupby(transaction_column)[item_column].count()
    for transaction, width in transaction_widths.items():
        if width > target_width:
            items_to_keep = df[df[transaction_column] == transaction].sample(target_width)
            df = df[~((df[transaction_column] == transaction) & (~df.index.isin(items_to_keep.index)))]
        elif width < target_width:
            items_to_add = df[df[transaction_column] == transaction].sample(target_width - width, replace=True)
            df = pd.concat([df, items_to_add])
    return df



In [15]:
# Step 4: Determine the target number of unique items based on the minimum dataset
unique_counts = {
    'groceries': groceries_df[groceries_item_column].nunique(),
    'online_retail': online_retail_df[online_retail_item_column].nunique(),
    'winequality': winequality_df[winequality_item_column].nunique(),
    'order_products': order_products_prior_df[order_products_prior_item_column].nunique()
}

target_unique_items = min(unique_counts.values())

# Adjust unique items for each dataset
groceries_df = adjust_unique_items(groceries_df, groceries_item_column, groceries_transaction_column, target_unique_items)
online_retail_df = adjust_unique_items(online_retail_df, online_retail_item_column, online_retail_transaction_column, target_unique_items)
winequality_df = adjust_unique_items(winequality_df, winequality_item_column, winequality_transaction_column, target_unique_items)
order_products_prior_df = adjust_unique_items(order_products_prior_df, order_products_prior_item_column, order_products_prior_transaction_column, target_unique_items)

# 计算平均宽度，并取整
average_width = int(np.round((
    groceries_df.groupby(groceries_transaction_column)[groceries_item_column].count().mean() +
    online_retail_df.groupby(online_retail_transaction_column)[online_retail_item_column].count().mean() +
    winequality_df.groupby(winequality_transaction_column).size().mean() +
    order_products_prior_df.groupby(order_products_prior_transaction_column)[order_products_prior_item_column].count().mean()
) / 4))

# 控制每个数据集的事务宽度
groceries_df = control_transaction_width(groceries_df, groceries_item_column, groceries_transaction_column, target_width=average_width)
online_retail_df = control_transaction_width(online_retail_df, online_retail_item_column, online_retail_transaction_column, target_width=average_width)
winequality_df = control_transaction_width(winequality_df, winequality_item_column, winequality_transaction_column, target_width=average_width)
order_products_prior_df = control_transaction_width(order_products_prior_df, order_products_prior_item_column, order_products_prior_transaction_column, target_width=average_width)


KeyboardInterrupt: 

In [18]:
# Step 4: Determine the target number of unique items based on the minimum dataset
target_unique_items = min(
    groceries_df[groceries_item_column].nunique(),
    online_retail_df[online_retail_item_column].nunique(),
    winequality_df[winequality_item_column].nunique(),
    order_products_prior_df[order_products_prior_item_column].nunique()
)

# Adjust unique items for each dataset with progress bar
for df in tqdm([groceries_df, online_retail_df, winequality_df, order_products_prior_df], desc="Adjusting Unique Items"):
    if df is groceries_df:
        groceries_df = adjust_unique_items(groceries_df, groceries_item_column, groceries_transaction_column, target_unique_items)
    elif df is online_retail_df:
        online_retail_df = adjust_unique_items(online_retail_df, online_retail_item_column, online_retail_transaction_column, target_unique_items)
    elif df is winequality_df:
        winequality_df = adjust_unique_items(winequality_df, winequality_item_column, winequality_transaction_column, target_unique_items)
    elif df is order_products_prior_df:
        order_products_prior_df = adjust_unique_items(order_products_prior_df, order_products_prior_item_column, order_products_prior_transaction_column, target_unique_items)

# 计算平均宽度，并取整
average_width = int(np.round((
    groceries_df.groupby(groceries_transaction_column)[groceries_item_column].count().mean() +
    online_retail_df.groupby(online_retail_transaction_column)[online_retail_item_column].count().mean() +
    winequality_df.groupby(winequality_transaction_column).size().mean() +
    order_products_prior_df.groupby(order_products_prior_transaction_column)[order_products_prior_item_column].count().mean()
) / 4))

# 控制每个数据集的事务宽度
groceries_df = control_transaction_width(groceries_df, groceries_item_column, groceries_transaction_column, target_width=average_width)
online_retail_df = control_transaction_width(online_retail_df, online_retail_item_column, online_retail_transaction_column, target_width=average_width)
winequality_df = control_transaction_width(winequality_df, winequality_item_column, winequality_transaction_column, target_width=average_width)
order_products_prior_df = control_transaction_width(order_products_prior_df, order_products_prior_item_column, order_products_prior_transaction_column, target_width=average_width)


Adjusting Unique Items: 100%|██████████| 4/4 [00:00<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Step 5: Preview the modified datasets
print("Groceries Dataset:")
print(groceries_df.head())
print("\nOnline Retail Dataset:")
print(online_retail_df.head())
print("\nWine Quality Dataset:")
print(winequality_df.head())
print("\nOrder Products Prior Dataset:")
print(order_products_prior_df.head())