In [1]:
import pandas as pd

# 1. Reading Training Data

## 1.1 Users

In [2]:
train_users = pd.read_csv("train_users.csv")

In [3]:
train_users.head(2)

Unnamed: 0,user_id
0,1
1,2


## 1.2 Items

In [4]:
train_items = pd.read_csv("train_items.csv")

In [5]:
train_items.head(2)

Unnamed: 0,item_ID,item_name,brand,channel,unit_price,category,size,color,discount,gender,...,brand_location_map,materials_map,sex,country_size,country_in_stock,tag,price_range,country,style_50,color_50
0,1001920,Fendi女子皮制鞋12184-X3474,FENDI,17,299.0,单鞋,39,,0.059443,女,...,,,,意大利 (IT),,奢华&经典&现代,5.0,意大利 (IT),奢华,
1,1003191,【自营】BCBG-MAXAZRIA女士皮制手提书包 【特卖货品，缺货退款，延迟发货见谅】,BCBG,15,89.0,手提包,30*26*22,西瓜红色,0.032014,女,...,,,,,,高端&时尚&女性化,,,优雅,红色系列


## 1.3 Interactions

In [6]:
train_interactions = pd.read_csv("train_interactions.csv")

In [7]:
train_interactions.head()

Unnamed: 0,user_id,item_ID,order_time
0,209227,1595587,2024-03-14
1,243637,1639001,2024-03-14
2,265503,1675222,2024-03-14
3,183614,1662640,2024-03-14
4,188753,1662540,2024-03-14


# 2. Featurization

## 2.1 Item Features

In this tutorial, we only use unit_price and discount as features. You can try more.

In [8]:
train_item_feas = train_items[["item_ID", "unit_price", "discount"]]

Missing value imputation.

In [9]:
train_item_feas = train_item_feas.fillna(train_item_feas.mean())

In [10]:
train_item_feas.head(2)

Unnamed: 0,item_ID,unit_price,discount
0,1001920,299.0,0.059443
1,1003191,89.0,0.032014


## 2.2 User Features

An user can be represented with the averaged price and discount of the items he/she purchased.

In [11]:
train_user_feas = train_interactions.drop("order_time", axis=1)

In [12]:
train_user_feas.head(2)

Unnamed: 0,user_id,item_ID
0,209227,1595587
1,243637,1639001


In [13]:
train_user_feas = train_user_feas.merge(train_item_feas, on="item_ID")

In [14]:
train_user_feas.head(2)

Unnamed: 0,user_id,item_ID,unit_price,discount
0,209227,1595587,2880.0,0.6
1,243637,1639001,519.0,0.524242


In [15]:
train_user_feas = train_user_feas.drop("item_ID", axis=1)

In [16]:
train_user_feas.head(2)

Unnamed: 0,user_id,unit_price,discount
0,209227,2880.0,0.6
1,243637,519.0,0.524242


In [17]:
train_user_feas = train_user_feas.groupby("user_id", as_index=False).mean()

In [18]:
train_user_feas.head(2)

Unnamed: 0,user_id,unit_price,discount
0,1,1211.494,0.64371
1,2,895.027108,0.310057


# 3. Labeling

## 3.1 Positive Labels

All the happened purchases are positive.

In [19]:
pos_samples = train_interactions.drop("order_time", axis=1)

In [20]:
pos_samples.head(2)

Unnamed: 0,user_id,item_ID
0,209227,1595587
1,243637,1639001


In [21]:
pos_samples["label"] = 1

In [22]:
pos_samples.head(2)

Unnamed: 0,user_id,item_ID,label
0,209227,1595587,1
1,243637,1639001,1


## 3.2 Negative Sampling

We can randomly pair an user and an item and assume this purchase will not happen (negative data). Definitly some of these negatively labeled data would be the same as some positive data. We assume the amount would be limited and it would not hurt the result.

In [23]:
pos_num = len(pos_samples)

In [24]:
import random

In [25]:
neg_samples = {"user_id":[], "item_ID":[]}
for i in range(pos_num):
    uid = random.choice(train_users["user_id"])
    neg_samples["user_id"].append(uid)
    
    iid = random.choice(train_items["item_ID"])
    neg_samples["item_ID"].append(iid)

In [26]:
neg_samples = pd.DataFrame(neg_samples)

In [27]:
neg_samples["label"] = 0

In [28]:
neg_samples.head(2)

Unnamed: 0,user_id,item_ID,label
0,215561,1453386,0
1,176886,1656047,0


# 4. Training Data Preparation

## 4.1 Concatenation

In [29]:
train_data = pd.concat([pos_samples, neg_samples]).reset_index(drop=True)

In [30]:
train_data.head()

Unnamed: 0,user_id,item_ID,label
0,209227,1595587,1
1,243637,1639001,1
2,265503,1675222,1
3,183614,1662640,1
4,188753,1662540,1


## 4.2 Adding features

In [31]:
train_data = train_data.merge(train_item_feas, on="item_ID")

In [32]:
train_data.head()

Unnamed: 0,user_id,item_ID,label,unit_price,discount
0,209227,1595587,1,2880.0,0.6
1,243637,1639001,1,519.0,0.524242
2,265503,1675222,1,899.0,0.3596
3,183614,1662640,1,985.0,0.299392
4,188753,1662540,1,199.0,0.406122


In [33]:
train_data = train_data.merge(train_user_feas, on="user_id", suffixes=("_item", "_user"))

In [34]:
train_data.head()

Unnamed: 0,user_id,item_ID,label,unit_price_item,discount_item,unit_price_user,discount_user
0,209227,1595587,1,2880.0,0.6,2533.333333,0.682828
1,243637,1639001,1,519.0,0.524242,519.0,0.524242
2,265503,1675222,1,899.0,0.3596,643.0,0.2337
3,183614,1662640,1,985.0,0.299392,632.941364,0.393443
4,188753,1662540,1,199.0,0.406122,199.0,0.406122


## 4.3 Splitting Features and Labels

In [35]:
y_train = train_data["label"]

In [36]:
features = list(train_data.columns)
features.remove("label")
features.remove("user_id")
features.remove("item_ID")

In [37]:
X_train = train_data[features]

# 5. Modeling

In [38]:
from sklearn.linear_model import LogisticRegression

In [39]:
lr = LogisticRegression()

In [40]:
model = lr.fit(X_train, y_train)

# 6. Test Dataset Preparation

## 6.1 Item Features

In [41]:
test_items = pd.read_csv("test_items.csv")

In [42]:
test_items.head(2)

Unnamed: 0,item_ID,item_name,brand,channel,unit_price,category,size,color,discount,gender,...,brand_location_map,materials_map,sex,country_size,country_in_stock,tag,price_range,country,style_50,color_50
0,1657680,Alexia Sandra 牛仔系列 女士简约松紧腰牛仔长裤【477948NZ4K9605 ...,ALEXIA SANDRA,54,871.46,牛仔裤,S,蓝色,0.738525,女,...,,,,,,现代&女性化&精致,,,优雅,蓝色系列
1,1525546,"CAMPER 女士经典黑色简约日常圆头针扣系带休闲凉鞋，【K200573-012 36】,",CAMPER,54,635.34,凉鞋,36,黑色,0.635976,女,...,SPAIN,"牛皮革, 真皮, 其它",女,,SPAIN,后现代&返璞归真&地中海&艺术,3.0,意大利 (IT),户外风,黑色系列


In [43]:
test_item_feas = test_items[["item_ID", "unit_price", "discount"]]

In [44]:
test_item_feas = test_item_feas.fillna(train_item_feas.mean())

In [45]:
test_item_feas.head(2)

Unnamed: 0,item_ID,unit_price,discount
0,1657680,871.46,0.738525
1,1525546,635.34,0.635976


## 6.2 User Features

In [46]:
test_users = pd.read_csv("test_users.csv")

In [47]:
test_users.head(2)

Unnamed: 0,user_id
0,1
1,2


In [48]:
test_user_feas = test_users.merge(train_user_feas, how="right", on="user_id")

In [49]:
test_user_feas.head()

Unnamed: 0,user_id,unit_price,discount
0,1,1211.494,0.64371
1,2,895.027108,0.310057
2,3,3002.775556,0.461823
3,4,858.140791,0.251688
4,5,1453.97443,0.33743


# 7. Recommend items for one user

## 7.1 Create interaction data for an user

In [50]:
uid = 1

In [51]:
test_interactions = pd.DataFrame()
test_interactions["item_ID"] = test_item_feas["item_ID"]
test_interactions["user_id"] = uid

In [52]:
test_interactions.head(2)

Unnamed: 0,item_ID,user_id
0,1657680,1
1,1525546,1


## 7.2 Add features.

In [53]:
test_data = test_interactions.merge(test_item_feas, on="item_ID")

In [54]:
test_data = test_data.merge(test_user_feas, on="user_id", suffixes=("_item", "_user"))

In [55]:
test_data.head()

Unnamed: 0,item_ID,user_id,unit_price_item,discount_item,unit_price_user,discount_user
0,1657680,1,871.46,0.738525,1211.494,0.64371
1,1525546,1,635.34,0.635976,1211.494,0.64371
2,1525194,1,400.26,0.588618,1211.494,0.64371
3,1523948,1,358.62,0.587902,1211.494,0.64371
4,1534750,1,1432.53,0.761984,1211.494,0.64371


## 7.3 Prepare for model predicting

In [56]:
features = list(test_data.columns)
features.remove("item_ID")
features.remove("user_id")

In [57]:
test_X = test_data[features]

In [58]:
test_X.head(2)

Unnamed: 0,unit_price_item,discount_item,unit_price_user,discount_user
0,871.46,0.738525,1211.494,0.64371
1,635.34,0.635976,1211.494,0.64371


## 7.4 Predict the probabilities that the purchases would happen

In [59]:
y_pred = lr.predict_proba(test_X)[:, 1]

In [60]:
y_pred

array([0.52153351, 0.52429167, 0.53149692, ..., 0.45639645, 0.39551747,
       0.42569783])

## 7.5 Get the 20 items which would most possibily purchased

In [61]:
test_data["score"] = y_pred

In [62]:
test_data.head(2)

Unnamed: 0,item_ID,user_id,unit_price_item,discount_item,unit_price_user,discount_user,score
0,1657680,1,871.46,0.738525,1211.494,0.64371,0.521534
1,1525546,1,635.34,0.635976,1211.494,0.64371,0.524292


In [63]:
test_data = test_data.sort_values(by=["score"], axis=0, ascending=False)

In [64]:
test_data.head(2)

Unnamed: 0,item_ID,user_id,unit_price_item,discount_item,unit_price_user,discount_user,score
823,1664858,1,88.9,0.998876,1211.494,0.64371,0.579118
2223,1604949,1,99.0,1.0,1211.494,0.64371,0.578742


In [65]:
list(test_data["item_ID"][:20])

[1664858,
 1604949,
 1604950,
 1604872,
 1610209,
 1610210,
 1674976,
 1674959,
 1674958,
 1604952,
 1604909,
 1604951,
 1674981,
 1690433,
 1654976,
 1671026,
 1664845,
 1676199,
 1676196,
 1607974]

# 8. Get predictions for all users

## 8.1 Recommend items to all the users

Put all the codes relevant to one-user recommendation into a function.

In [66]:
def get_top_K(uid, model, test_user_feas, test_item_feas, features, K=20):
    test_feas = test_user_feas[test_user_feas["user_id"] == uid]
    
    test_interactions = pd.DataFrame()
    test_interactions["item_ID"] = test_item_feas["item_ID"]
    test_interactions["user_id"] = uid
    
    test_data = test_interactions
    test_data = test_data.merge(test_item_feas, on="item_ID")
    test_data = test_data.merge(test_user_feas, on="user_id", suffixes=("_item", "_user"))
    
    test_X = test_data[features]
    
    y_pred = model.predict_proba(test_X)[:, 1]
    
    test_data["score"] = y_pred
    
    top_K = test_data.sort_values(by=["score"], axis=0, ascending=False)["item_ID"][:K]
    
    return list(top_K)

In [67]:
results = []
for uid in test_users["user_id"]:
    result = get_top_K(uid, lr, test_user_feas, test_item_feas, features)
    result = [uid] + result
    results.append(result)

## 8.2 Load the results into a dataframe

In [68]:
df = pd.DataFrame(results)

In [69]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,1,1664858,1604949,1604950,1604872,1610209,1610210,1674976,1674959,1674958,...,1604909,1604951,1674981,1690433,1654976,1671026,1664845,1676199,1676196,1607974
1,2,1664858,1604949,1604950,1604872,1610209,1610210,1674976,1674959,1674958,...,1604909,1604951,1674981,1690433,1654976,1671026,1664845,1676199,1676196,1607974
2,4,1664858,1604949,1604950,1604872,1610209,1610210,1674976,1674959,1674958,...,1604909,1604951,1674981,1690433,1654976,1671026,1664845,1676199,1676196,1607974
3,50,1664858,1604949,1604950,1604872,1610209,1610210,1674976,1674959,1674958,...,1604909,1604951,1674981,1690433,1654976,1671026,1664845,1676199,1676196,1607974
4,65,1664858,1604949,1604950,1604872,1610209,1610210,1674976,1674959,1674958,...,1604909,1604951,1674981,1690433,1654976,1671026,1664845,1676199,1676196,1607974


In [70]:
cols = ["user_id"]
for i in range(20):
    cols.append("item_{}".format(i + 1))

In [71]:
df.columns = cols

In [72]:
df.head(2)

Unnamed: 0,user_id,item_1,item_2,item_3,item_4,item_5,item_6,item_7,item_8,item_9,...,item_11,item_12,item_13,item_14,item_15,item_16,item_17,item_18,item_19,item_20
0,1,1664858,1604949,1604950,1604872,1610209,1610210,1674976,1674959,1674958,...,1604909,1604951,1674981,1690433,1654976,1671026,1664845,1676199,1676196,1607974
1,2,1664858,1604949,1604950,1604872,1610209,1610210,1674976,1674959,1674958,...,1604909,1604951,1674981,1690433,1654976,1671026,1664845,1676199,1676196,1607974


## 8.3 Save the results into a file

In [73]:
df.to_csv("submission.csv", index=False)