In [35]:
import os
os.chdir("/Users/karol/Desktop/Antwerp/ai_project")
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from scipy.sparse import csr_matrix
from data_reader import load_data_mf, matrix_representation, load_customers_articles, customers_diversification, articles_diversification
from model import TwoTowerCustomer, TwoTowerFinal
from helper import train_two_tower
from recommenders import recommender_two_towers_final, recommender_two_towers_customer

In [36]:
# Load the preprocessed customers, transactions and articles tables
# (relative paths; they resolve against the working directory set in the imports cell).
customers, transactions, articles = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/preprocessed/customers.csv",
        "data/preprocessed/transactions.csv",
        "data/preprocessed/articles.csv",
    )
)

# Get seasonal sales information

In [33]:
def assign_season(x):
    """Map a month number to a season code.

    Returns 1 for winter (Dec-Feb), 2 for spring (Mar-May), 3 for summer
    (Jun-Aug) and 4 for everything else (Sep-Nov, but also any value that is
    not one of the listed months).
    """
    season_months = (
        (1, (12, 1, 2)),
        (2, (3, 4, 5)),
        (3, (6, 7, 8)),
    )
    for season_code, months in season_months:
        if x in months:
            return season_code
    # Fallback branch: autumn months and any unrecognised input.
    return 4

In [13]:
def seasonal_sales(a, t):
    """Add each article's share of sales per season to the articles table.

    Parameters
    ----------
    a : pandas.DataFrame
        Articles table; must contain an ``article_id`` column.
    t : pandas.DataFrame
        Transactions table with ``t_dat`` (anything ``pd.to_datetime`` can
        parse), ``article_id`` and ``customer_id`` columns. It is NOT modified.

    Returns
    -------
    pandas.DataFrame
        ``a`` left-merged with four new columns, in this order:
        ``winter_sale``, ``spring_sale``, ``summer_sale``, ``autumn_sale``.
        Each value is the fraction of the article's transactions that fall in
        that season (0 when the article has no transactions in the season or
        no transactions at all). The result has a fresh default index.

    Notes
    -----
    Transactions whose date is missing are ignored (they have no season).
    """
    season_columns = {1: "winter_sale", 2: "spring_sale", 3: "summer_sale", 4: "autumn_sale"}
    # Same month -> season coding as assign_season, expressed as a lookup so it
    # can be applied vectorised instead of row by row.
    month_to_season = {12: 1, 1: 1, 2: 1,
                       3: 2, 4: 2, 5: 2,
                       6: 3, 7: 3, 8: 3,
                       9: 4, 10: 4, 11: 4}

    # Derive the season as a standalone Series so the caller's frame is untouched.
    season = pd.to_datetime(t["t_dat"]).dt.month.map(month_to_season).rename("season")

    counts = t.groupby(["article_id", season])["customer_id"].count()
    # Normalise by the article's total so the four shares sum to 1 per article.
    shares = counts / counts.groupby(level=0).transform("sum")

    # One row per article, one column per season; seasons never observed in the
    # data still get a column (all NaN, filled with 0 below).
    wide = shares.unstack("season").reindex(columns=list(season_columns))
    wide = wide.rename(columns=season_columns)
    wide.columns.name = None
    wide = wide.reset_index()

    a = a.merge(wide, how="left", on="article_id")
    sale_columns = list(season_columns.values())
    a[sale_columns] = a[sale_columns].fillna(0)
    return a



# Get average prices

In [14]:
def get_avg_price(a, t):
    """Attach the mean transaction price of every article as ``avg_price``.

    ``a`` is the articles table (needs ``article_id``); ``t`` is the
    transactions table (needs ``article_id`` and ``price``). Articles without
    any transaction get the sentinel value -1.
    """
    mean_price = t.groupby("article_id")["price"].mean().rename("avg_price")
    return (
        a.merge(mean_price, how="left", on="article_id")
        .assign(avg_price=lambda merged: merged["avg_price"].fillna(-1))
    )

# Yearly seasonal bestsellers

In [15]:
def seasonal_bestseller_ranking(a, t):
    """Add one sales-rank column per (year, season) to the articles table.

    Parameters
    ----------
    a : pandas.DataFrame
        Articles table; must contain an ``article_id`` column.
    t : pandas.DataFrame
        Transactions table with ``t_dat``, ``article_id`` and ``customer_id``
        columns. It is NOT modified.

    Returns
    -------
    pandas.DataFrame
        ``a`` left-merged with columns named ``rank_<season>_<year>``, ordered
        by year and then season. Within each (year, season) the rank is a
        dense rank of the transaction count in ascending order, so rank 1 is
        the least-sold article and larger values mean more sales. Articles
        with no sales in that period get 0. If there are no usable
        transactions, ``a`` is returned unchanged.
    """
    # Same month -> season coding as assign_season, applied vectorised.
    month_to_season = {12: 1, 1: 1, 2: 1,
                       3: 2, 4: 2, 5: 2,
                       6: 3, 7: 3, 8: 3,
                       9: 4, 10: 4, 11: 4}

    # Derive the grouping keys as standalone Series so the caller's frame is
    # left untouched.
    dates = pd.to_datetime(t["t_dat"])
    year = dates.dt.year.rename("year")
    season = dates.dt.month.map(month_to_season).rename("season")

    # Number of transactions for each (year, season, article_id) combination.
    transaction_counts = (
        t.groupby([year, season, "article_id"])["customer_id"]
        .count()
        .rename("transaction_count")
        .reset_index()
    )
    if transaction_counts.empty:
        return a

    # Rank articles within each (year, season) by their transaction count.
    transaction_counts["article_rank"] = (
        transaction_counts.groupby(["year", "season"])["transaction_count"]
        .rank(ascending=True, method="dense")
    )

    # Pivot to one row per article and one column per period, so a single merge
    # replaces one merge per (year, season).
    ranks = (
        transaction_counts
        .set_index(["article_id", "year", "season"])["article_rank"]
        .unstack(["year", "season"])
        .sort_index(axis=1)
    )
    # int() keeps the names stable even if the keys were promoted to float.
    ranks.columns = [
        "rank_" + str(int(season_code)) + "_" + str(int(year_value))
        for year_value, season_code in ranks.columns
    ]
    rank_columns = list(ranks.columns)

    a = a.merge(ranks.reset_index(), how="left", on="article_id")
    a[rank_columns] = a[rank_columns].fillna(0)
    return a


# Article preferences based on the customers' age group

In [16]:
def age_articles_preference(a,t,c):
    """Add, per article, the share of its purchases made by each age group.

    Parameters
    ----------
    a : pandas.DataFrame
        Articles table; must contain an ``article_id`` column.
    t : pandas.DataFrame
        Transactions table with ``article_id`` and ``customer_id`` columns.
    c : pandas.DataFrame
        Customers table with ``customer_id`` and ``age`` columns. It is NOT
        modified.

    Returns
    -------
    pandas.DataFrame
        ``a`` left-merged with one column per age group: ``young_preference``
        (age in [0, 25)), ``adult_preferences`` ([25, 40)),
        ``middle_aged_preference`` ([40, 55)) and ``senior_preference``
        (55 and above). Values are fractions of the article's purchases by
        customers with a known age group; articles without such purchases
        get 0 everywhere.

    Side effects
    ------------
    Prints the number of customers per age group.
    """
    bins = [0,25,40,55,float("inf")]
    # NOTE: "adult_preferences" is plural unlike the other labels; the spelling
    # is kept because these labels become column names used elsewhere.
    labels = ["young_preference","adult_preferences","middle_aged_preference","senior_preference"]

    # Build the age groups on a separate frame so the caller's customers table
    # does not gain an "age_group" column. right=False makes bins left-closed.
    age_groups = c[["customer_id"]].assign(
        age_group=pd.cut(c["age"], bins=bins, labels=labels, right=False)
    )
    print("AGE GROUP DISTRIBUTION\n")
    print(age_groups["age_group"].value_counts())

    purchases = t.merge(age_groups, how="left", on="customer_id")
    # observed=False is the behaviour the code relied on implicitly; stating it
    # avoids the pandas FutureWarning about the changing default for
    # categorical groupers.
    grouped = purchases.groupby(["article_id", "age_group"], observed=False)["customer_id"].count()
    percentages = grouped / grouped.groupby(level=0).transform("sum")

    group_level = percentages.index.get_level_values("age_group")
    for label in labels:
        # Share of this age group, indexed by article_id only.
        preference = percentages[group_level == label].droplevel("age_group").rename(label)
        a = a.merge(preference, how="left", on="article_id")
        a[label] = a[label].fillna(0)
    return a
    

# Article preferences depending on the sales channel

In [17]:
def articles_sales_channel(a,t):
    """Add, per article, the share of its sales made through each sales channel.

    ``a`` is the articles table (needs ``article_id``); ``t`` is the
    transactions table (needs ``article_id``, ``sales_channel_id`` and
    ``customer_id``). One column ``sales_channel_<id>`` is appended per channel,
    in the order the channel ids first appear in ``t``; articles with no sales
    in a channel get 0.
    """
    per_channel = t.groupby(["article_id", "sales_channel_id"])["customer_id"].count()
    article_totals = per_channel.groupby(level=0).transform("sum")
    share = per_channel / article_totals
    channel_level = share.index.get_level_values('sales_channel_id')

    for channel in t["sales_channel_id"].unique():
        column = "sales_channel_"+str(channel)
        channel_share = share[channel_level == channel].rename(column)
        a = a.merge(channel_share, how="left", on="article_id")
        a[column] = a[column].fillna(0)
    return a


# Generate Features

In [18]:
# Build the article feature table by running every feature generator in turn;
# the call order determines the order of the appended columns.
articles = (
    articles
    .pipe(seasonal_sales, transactions)
    .pipe(get_avg_price, transactions)
    .pipe(seasonal_bestseller_ranking, transactions)
    .pipe(age_articles_preference, transactions, customers)
    .pipe(articles_sales_channel, transactions)
)
articles

AGE GROUP DISTRIBUTION

age_group
adult_preferences         492701
young_preference          357169
middle_aged_preference    339444
senior_preference         182666
Name: count, dtype: int64


  grouped = t.groupby(["article_id", "age_group"])["customer_id"].count()


Unnamed: 0,article_id,product_type_name,graphical_appearance_name,perceived_colour_master_name,department_name,index_name,section_name,garment_group_name,winter_sale,spring_sale,...,rank_1_2020,rank_2_2020,rank_3_2020,rank_4_2020,young_preference,adult_preferences,middle_aged_preference,senior_preference,sales_channel_2,sales_channel_1
0,0,0,0,0,0,0,0,0,0.366756,0.258094,...,936.0,1409.0,1467.0,105541.0,0.181902,0.528826,0.218153,0.071119,0.770778,0.229222
1,1,0,0,1,0,0,0,0,0.256966,0.420552,...,909.0,1286.0,1329.0,603.0,0.168690,0.484690,0.244138,0.102483,0.710207,0.289793
2,2,0,1,1,0,0,0,0,0.027907,0.004651,...,105541.0,105541.0,105541.0,105541.0,0.158140,0.534884,0.190698,0.116279,0.995349,0.004651
3,3,1,0,0,1,1,1,1,0.252874,0.125479,...,926.0,1422.0,1458.0,105541.0,0.136015,0.405172,0.363985,0.094828,0.375479,0.624521
4,4,1,0,1,1,1,1,1,0.543599,0.081633,...,940.0,1422.0,1464.0,105541.0,0.152134,0.319109,0.419295,0.109462,0.654917,0.345083
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105537,105537,3,18,0,60,3,4,2,0.000000,0.000000,...,105541.0,105541.0,105541.0,596.0,0.117647,0.647059,0.176471,0.058824,1.000000,0.000000
105538,105538,0,0,0,5,0,24,3,0.000000,0.000000,...,105541.0,105541.0,105541.0,578.0,0.114286,0.628571,0.200000,0.057143,1.000000,0.000000
105539,105539,25,0,0,5,0,28,3,0.000000,0.000000,...,105541.0,105541.0,105541.0,592.0,0.095238,0.142857,0.285714,0.476190,1.000000,0.000000
105540,105540,8,0,0,30,7,13,4,0.000000,0.000000,...,105541.0,105541.0,105541.0,105541.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [229]:
# Encode the article and customer tables into sparse feature matrices.
# NOTE(review): this cell rebinds `articles` and `customers` from DataFrames to
# sparse matrices, so it cannot be re-run without regenerating the DataFrames
# first (a csr_matrix has no `set_index`).
# one hot encoding 
articles = articles.set_index("article_id")
customers = customers.set_index("customer_id")

# Article columns that are one-hot encoded.
articles_categorical = ["product_type_name","graphical_appearance_name",
                        "perceived_colour_master_name","department_name",
                        "index_name","section_name","garment_group_name"]

# Article columns handed to the model unchanged (no scaling is applied here).
# Only the two listed rank columns are used; any other rank_* column falls
# under remainder='drop'.
articles_cont = ['winter_sale', 'spring_sale','summer_sale', 'autumn_sale',
                'avg_price','rank_3_2020', 'rank_4_2020', 'young_preference', 
                'adult_preferences', 'middle_aged_preference','senior_preference', 
                'sales_channel_2', 'sales_channel_1']


preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(sparse_output=True), articles_categorical)
        ,('cont', 'passthrough', articles_cont)  # 'passthrough' means no transformation for continuous variables
    ],
    remainder='drop'  # Drop any columns not explicitly transformed
)

# The fitted article preprocessor is not kept: `preprocessor` is rebound for
# the customers below.
articles = csr_matrix(preprocessor.fit_transform(articles))

# Customer columns: four one-hot encoded categoricals plus the raw age.
customers_categorical = ["FN",'Active',"club_member_status", "fashion_news_frequency"]
customers_cont = ["age"]

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(sparse_output=True), customers_categorical),
        ('cont', 'passthrough', customers_cont)  # 'passthrough' means no transformation for continuous variables
    ],
    remainder='drop'  # Drop any columns not explicitly transformed
)
customers = csr_matrix(preprocessor.fit_transform(customers))

# Train the model

In [231]:
# deep architecture
# Train the TwoTowerFinal model on the encoded article/customer matrices.
transactions_negatives = pd.read_csv("data/preprocessed/transactions_negatives.csv")
# `test_customers` is reused by the evaluation cell further down.
train_dataloader, val_dataloader, test_customers = load_data_mf(transactions_negatives, batch_size=1000)
# Tower input widths = number of encoded feature columns.
input_article_dim = articles.shape[1]
input_customer_dim = customers.shape[1]
model = TwoTowerFinal(input_article_dim, input_customer_dim, output_dim=10)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# NOTE(review): despite its name, `save_dir` is a checkpoint file path (.pt),
# not a directory; the "Final Article model" training cell uses the same path.
save_dir = "AI_project/RQ1/models/TwoTowerArticles.pt"
# NOTE(review): the recorded run reports identical train/validation losses in
# all three epochs, i.e. no visible learning progress -- presumably related to
# the unscaled continuous inputs (the final experiment min-max scales the rank
# columns); verify. No random seed is set in this cell.
val_loss_tower = train_two_tower(model, customers, articles, train_dataloader, val_dataloader, criterion, optimizer, save_dir, num_epochs=3)

100%|██████████| 55595/55595 [14:37<00:00, 63.39it/s]


Epoch [1/3] - Train Loss: 0.5242, Validation Loss: 0.4850


100%|██████████| 55595/55595 [14:44<00:00, 62.87it/s]


Epoch [2/3] - Train Loss: 0.5242, Validation Loss: 0.4850


100%|██████████| 55595/55595 [14:36<00:00, 63.41it/s]


Epoch [3/3] - Train Loss: 0.5242, Validation Loss: 0.4850


# Evaluate Recommendations

In [232]:
# load model
# NOTE(review): torch.load unpickles the stored object; only load checkpoints
# from a trusted source.
TwoTower = torch.load("AI_project/RQ1/models/TwoTowerArticles.pt")
matrix_full = matrix_representation(transactions, train_test=False)
# Select the test customers (presumably the matrix rows are customers --
# confirm against matrix_representation) and binarise: any value > 1 becomes 1.
targets = matrix_full[test_customers]
targets[targets>1] = 1
# dataloader
dataloader_cust, dataloader_art = load_customers_articles(customers, articles, test_customers=test_customers, batch_size=100)
# get restrictions
# Latest transaction date per article; keep articles sold after 2020-08-22.
last_sold = transactions.groupby("article_id")["t_dat"].max()
# NOTE(review): the id list is wrapped in an outer one-element list --
# presumably the shape the recommender expects; confirm.
articles_recently_sold = [last_sold[last_sold > '2020-08-22'].index.tolist()]
# generate recommendations
# The recommender returns recall before precision.
recommendations, recall, precision = recommender_two_towers_final(TwoTower, dataloader_cust, dataloader_art, targets, articles_recently_sold, evaluate=True, top_k=12)
print(f"Precision: {precision}\nRecall: {recall}")

Generate Customer Embeddings...


100%|██████████| 1363/1363 [00:10<00:00, 126.94it/s]


Generate Articles Embeddings...


100%|██████████| 1056/1056 [00:04<00:00, 211.51it/s]


Get recommendations...


100%|██████████| 137/137 [00:27<00:00,  4.91it/s]


Precision: 0.0024297322890133527
Recall: 0.0014536421017498234


# Add customer features

In [3]:
# Reload the preprocessed customers, transactions and articles tables so the
# experiment starts again from the raw DataFrames.
customers, transactions, articles = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/preprocessed/customers.csv",
        "data/preprocessed/transactions.csv",
        "data/preprocessed/articles.csv",
    )
)

In [4]:
# Generate the engineered customer and article features with the project
# helpers, then encode both tables into sparse matrices.
# The article helper receives the already-diversified customers table.
# NOTE(review): not idempotent -- `articles`/`customers` end up as sparse
# matrices, so the raw tables must be reloaded before re-running this cell.
customers = customers_diversification(customers, transactions, articles)
articles = articles_diversification(articles, transactions, customers)

# one hot encoding 
articles = articles.set_index("article_id")
customers = customers.set_index("customer_id")

# Article columns that are one-hot encoded.
articles_categorical = ["product_type_name","graphical_appearance_name",
                        "perceived_colour_master_name","department_name",
                        "index_name","section_name","garment_group_name"]

# Article columns handed to the model unchanged (no scaling is applied here).
articles_cont = ['winter_sale', 'spring_sale','summer_sale', 'autumn_sale',
                'avg_price','rank_3_2020', 'rank_4_2020', 'young_preference', 
                'adult_preferences', 'middle_aged_preference','senior_preference', 
                'sales_channel_2', 'sales_channel_1']


preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(sparse_output=True), articles_categorical)
        ,('cont', 'passthrough', articles_cont)  # 'passthrough' means no transformation for continuous variables
    ],
    remainder='drop'  # Drop any columns not explicitly transformed
)

articles = csr_matrix(preprocessor.fit_transform(articles))

# Extended customer feature set for this experiment: two extra categoricals
# and seven extra continuous columns, all passed through without scaling.
customers_categorical = ["FN",'Active',"club_member_status", "fashion_news_frequency", "favourite_color", "preferred_garment"]
customers_cont = ["age","first_channel", "second_channel", "avg_price", "amount_purchases","manswear","ladieswear", "kids"]

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(sparse_output=True), customers_categorical),
        ('cont', 'passthrough', customers_cont)  # 'passthrough' means no transformation for continuous variables
    ],
    remainder='drop'  # Drop any columns not explicitly transformed
)
customers = csr_matrix(preprocessor.fit_transform(customers))

AGE GROUP DISTRIBUTION

age_group
adult_preferences         492701
young_preference          357169
middle_aged_preference    339444
senior_preference         182666
Name: count, dtype: int64


  grouped = t.groupby(["article_id", "age_group"])["customer_id"].count()


In [7]:
# deep architecture
# Train the TwoTowerCustomer model on the extended customer feature set.
transactions_negatives = pd.read_csv("data/preprocessed/transactions_negatives.csv")
# `test_customers` is reused by the evaluation cell that follows.
train_dataloader, val_dataloader, test_customers = load_data_mf(transactions_negatives, batch_size=1000)
# Tower input widths = number of encoded feature columns.
input_article_dim = articles.shape[1]
input_customer_dim = customers.shape[1]
model = TwoTowerCustomer(input_article_dim, input_customer_dim, output_dim=10)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# NOTE(review): despite its name, `save_dir` is a checkpoint file path (.pt).
save_dir = "AI_project/RQ1/models/TwoTowerArtCust.pt"
# NOTE(review): the recorded run again reports the same train/validation loss
# in every epoch, so this model shows no visible learning progress either --
# presumably for the same reason as the first experiment (unscaled inputs); verify.
val_loss_tower = train_two_tower(model, customers, articles, train_dataloader, val_dataloader, criterion, optimizer, save_dir, num_epochs=3)

100%|██████████| 55595/55595 [16:21<00:00, 56.62it/s]


Epoch [1/3] - Train Loss: 0.5242, Validation Loss: 0.4850


100%|██████████| 55595/55595 [16:26<00:00, 56.36it/s]


Epoch [2/3] - Train Loss: 0.5242, Validation Loss: 0.4850


100%|██████████| 55595/55595 [16:37<00:00, 55.72it/s]


Epoch [3/3] - Train Loss: 0.5242, Validation Loss: 0.4850


In [8]:
# load model
# NOTE(review): torch.load unpickles the stored object; only load checkpoints
# from a trusted source.
TwoTower = torch.load("AI_project/RQ1/models/TwoTowerArtCust.pt")
matrix_full = matrix_representation(transactions, train_test=False)
# Select the test customers (presumably the matrix rows are customers --
# confirm against matrix_representation) and binarise: any value > 1 becomes 1.
targets = matrix_full[test_customers]
targets[targets>1] = 1
# dataloader
dataloader_cust, dataloader_art = load_customers_articles(customers, articles, test_customers=test_customers, batch_size=100)
# get restrictions
# Latest transaction date per article; keep articles sold after 2020-08-22.
last_sold = transactions.groupby("article_id")["t_dat"].max()
# NOTE(review): the id list is wrapped in an outer one-element list --
# presumably the shape the recommender expects; confirm.
articles_recently_sold = [last_sold[last_sold > '2020-08-22'].index.tolist()]
# generate recommendations
# NOTE(review): the recorded precision/recall are digit-for-digit identical to
# the first experiment's, which suggests both models rank articles the same
# way (e.g. near-constant scores); investigate before comparing experiments.
recommendations, recall, precision = recommender_two_towers_customer(TwoTower, dataloader_cust, dataloader_art, targets, articles_recently_sold, evaluate=True, top_k=12)
print(f"Precision: {precision}\nRecall: {recall}")

Generate Customer Embeddings...


100%|██████████| 1363/1363 [00:10<00:00, 126.03it/s]


Generate Articles Embeddings...


100%|██████████| 1056/1056 [00:04<00:00, 224.17it/s]


Get recommendations...


100%|██████████| 137/137 [00:27<00:00,  4.92it/s]


Precision: 0.0024297322890133527
Recall: 0.0014536421017498234


# Further Tests

In [34]:
# Same encoding as the first experiment, but without the rank_* columns.
# NOTE(review): this cell needs `articles` and `customers` to be DataFrames
# that already contain the engineered columns. The recorded output (the
# age-group printout) suggests a feature-generation call was part of this cell
# when it was executed but has since been removed -- restore it or run it
# beforehand, otherwise the cell fails on a fresh kernel.
# one hot encoding 
articles = articles.set_index("article_id")
customers = customers.set_index("customer_id")

# Article columns that are one-hot encoded.
articles_categorical = ["product_type_name","graphical_appearance_name",
                        "perceived_colour_master_name","department_name",
                        "index_name","section_name","garment_group_name"]

# Continuous article columns passed through unchanged; the rank columns are
# deliberately absent from this list and are dropped by remainder='drop'.
articles_cont = ['winter_sale', 'spring_sale','summer_sale', 'autumn_sale',
                'avg_price', 'young_preference', 
                'adult_preferences', 'middle_aged_preference','senior_preference', 
                'sales_channel_2', 'sales_channel_1']


preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(sparse_output=True), articles_categorical)
        ,('cont', 'passthrough', articles_cont)  # 'passthrough' means no transformation for continuous variables
    ],
    remainder='drop'  # Drop any columns not explicitly transformed
)

articles = csr_matrix(preprocessor.fit_transform(articles))

# Customer columns: four one-hot encoded categoricals plus the raw age.
customers_categorical = ["FN",'Active',"club_member_status", "fashion_news_frequency"]
customers_cont = ["age"]

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(sparse_output=True), customers_categorical),
        ('cont', 'passthrough', customers_cont)  # 'passthrough' means no transformation for continuous variables
    ],
    remainder='drop'  # Drop any columns not explicitly transformed
)
customers = csr_matrix(preprocessor.fit_transform(customers))

AGE GROUP DISTRIBUTION

age_group
adult_preferences         492701
young_preference          357169
middle_aged_preference    339444
senior_preference         182666
Name: count, dtype: int64


  grouped = t.groupby(["article_id", "age_group"])["customer_id"].count()


In [20]:
# deep architecture
# Train TwoTowerFinal on the feature set without the rank columns.
transactions_negatives = pd.read_csv("data/preprocessed/transactions_negatives.csv")
train_dataloader, val_dataloader, test_customers = load_data_mf(transactions_negatives, batch_size=1000)
# Tower input widths = number of encoded feature columns.
input_article_dim = articles.shape[1]
input_customer_dim = customers.shape[1]
model = TwoTowerFinal(input_article_dim, input_customer_dim, output_dim=10)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Separate checkpoint file for this test run (`save_dir` is a file path, not a directory).
save_dir = "AI_project/RQ1/models/TwoTowerArticlesTest.pt"
# In the recorded run the validation loss decreases from epoch to epoch,
# unlike the runs that include the unscaled rank columns.
val_loss_tower = train_two_tower(model, customers, articles, train_dataloader, val_dataloader, criterion, optimizer, save_dir, num_epochs=3)

100%|██████████| 55595/55595 [14:19<00:00, 64.68it/s]


Epoch [1/3] - Train Loss: 0.1434, Validation Loss: 0.1402


100%|██████████| 55595/55595 [15:59<00:00, 57.92it/s]


Epoch [2/3] - Train Loss: 0.1402, Validation Loss: 0.1388


100%|██████████| 55595/55595 [17:14<00:00, 53.74it/s] 


Epoch [3/3] - Train Loss: 0.1420, Validation Loss: 0.1382


# Final Article model

In [37]:
# Final article model: engineered article features with the two rank columns
# min-max scaled instead of passed through raw.
# NOTE(review): not idempotent -- `articles`/`customers` end up as sparse
# matrices, so the raw tables must be reloaded before re-running this cell.
articles = articles_diversification(articles, transactions, customers)

# one hot encoding 
articles = articles.set_index("article_id")
customers = customers.set_index("customer_id")

# Article columns that are one-hot encoded.
articles_categorical = ["product_type_name","graphical_appearance_name",
                        "perceived_colour_master_name","department_name",
                        "index_name","section_name","garment_group_name"]

# Continuous article columns passed through unchanged.
articles_cont = ['winter_sale', 'spring_sale','summer_sale', 'autumn_sale',
                'avg_price','young_preference', 'adult_preferences', 
                'middle_aged_preference','senior_preference', 
                'sales_channel_2', 'sales_channel_1']

# Rank columns that are rescaled with MinMaxScaler.
min_max_cols = ['rank_3_2020', 'rank_4_2020']


preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(sparse_output=True), articles_categorical),
        ('cont', 'passthrough', articles_cont),  # 'passthrough' means no transformation for continuous variables
        ("min_max", MinMaxScaler(), min_max_cols)  # rank columns are min-max scaled
    ],
    remainder='drop'  # Drop any columns not explicitly transformed
)

articles = csr_matrix(preprocessor.fit_transform(articles))

# Customer columns: four one-hot encoded categoricals plus the raw age.
customers_categorical = ["FN",'Active',"club_member_status", "fashion_news_frequency"]
customers_cont = ["age"]

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(sparse_output=True), customers_categorical),
        ('cont', 'passthrough', customers_cont)  # 'passthrough' means no transformation for continuous variables
    ],
    remainder='drop'  # Drop any columns not explicitly transformed
)
customers = csr_matrix(preprocessor.fit_transform(customers))

AGE GROUP DISTRIBUTION

age_group
adult_preferences         492701
young_preference          357169
middle_aged_preference    339444
senior_preference         182666
Name: count, dtype: int64


  grouped = t.groupby(["article_id", "age_group"])["customer_id"].count()


In [38]:
# deep architecture
# Train the final TwoTowerFinal model on the min-max scaled feature set.
transactions_negatives = pd.read_csv("data/preprocessed/transactions_negatives.csv")
# `test_customers` is reused by the evaluation cell that follows.
train_dataloader, val_dataloader, test_customers = load_data_mf(transactions_negatives, batch_size=1000)
# Tower input widths = number of encoded feature columns.
input_article_dim = articles.shape[1]
input_customer_dim = customers.shape[1]
model = TwoTowerFinal(input_article_dim, input_customer_dim, output_dim=10)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# NOTE(review): same checkpoint path as the first experiment, so that earlier
# model file is overwritten by this run.
save_dir = "AI_project/RQ1/models/TwoTowerArticles.pt"
val_loss_tower = train_two_tower(model, customers, articles, train_dataloader, val_dataloader, criterion, optimizer, save_dir, num_epochs=3)

100%|██████████| 55595/55595 [14:31<00:00, 63.80it/s]


Epoch [1/3] - Train Loss: 0.1449, Validation Loss: 0.1399


100%|██████████| 55595/55595 [14:25<00:00, 64.24it/s]


Epoch [2/3] - Train Loss: 0.1429, Validation Loss: 0.1386


100%|██████████| 55595/55595 [14:23<00:00, 64.37it/s]


Epoch [3/3] - Train Loss: 0.1403, Validation Loss: 0.1381


In [40]:
# load model
# NOTE(review): torch.load unpickles the stored object; only load checkpoints
# from a trusted source.
TwoTower = torch.load("AI_project/RQ1/models/TwoTowerArticles.pt")
matrix_full = matrix_representation(transactions, train_test=False)
# Select the test customers (presumably the matrix rows are customers --
# confirm against matrix_representation) and binarise: any value > 1 becomes 1.
targets = matrix_full[test_customers]
targets[targets>1] = 1
# dataloader
dataloader_cust, dataloader_art = load_customers_articles(customers, articles, test_customers=test_customers, batch_size=100)
# get restrictions
# Latest transaction date per article; keep articles sold after 2020-08-22.
last_sold = transactions.groupby("article_id")["t_dat"].max()
# NOTE(review): the id list is wrapped in an outer one-element list --
# presumably the shape the recommender expects; confirm.
articles_recently_sold = [last_sold[last_sold > '2020-08-22'].index.tolist()]
# generate recommendations
# The recommender returns recall before precision.
recommendations, recall, precision = recommender_two_towers_final(TwoTower, dataloader_cust, dataloader_art, targets, articles_recently_sold, evaluate=True, top_k=12)
print(f"Precision: {precision}\nRecall: {recall}")

Generate Customer Embeddings...


100%|██████████| 1363/1363 [00:11<00:00, 118.36it/s]


Generate Articles Embeddings...


100%|██████████| 1056/1056 [00:04<00:00, 224.53it/s]


Get recommendations...


100%|██████████| 137/137 [00:34<00:00,  3.96it/s]


Precision: 0.011841580476012204
Recall: 0.007084492398205711
