In [None]:
# environment set up
!pip install implicit sentence-transformers ckip-transformers
import os
os.chdir("/content/drive/MyDrive/ADL/final_proj")

In [2]:
import csv
import pandas as pd
import numpy as np
from tqdm import trange
from scipy.sparse import csr_matrix
from implicit.gpu.als import AlternatingLeastSquares
from implicit.gpu.bpr import BayesianPersonalizedRanking

from sklearn.neighbors import NearestNeighbors

from utils import (
    mapk,
    predict,
    predict_topic_from_course,
    knn_course_predict,
    mix1_course_predict,
    mix2_course_predict
)

In [3]:
data_dir = './data'
df_train = pd.read_csv(f'{data_dir}/train.csv')
df_course = pd.read_csv(f'{data_dir}/courses.csv')
df_subgroup = pd.read_csv(f'{data_dir}/subgroups.csv')
df_val_seen = pd.read_csv(f'{data_dir}/val_seen.csv')
df_val_seen_group = pd.read_csv(f'{data_dir}/val_seen_group.csv')
df_test_seen = pd.read_csv(f'{data_dir}/test_seen.csv')
df_test_seen_group = pd.read_csv(f'{data_dir}/test_seen_group.csv')

course2idx = {
    id: idx
    for (idx, id) in enumerate(df_course['course_id'])
}

idx2course = {
    idx: id
    for (idx, id) in enumerate(df_course['course_id'])
}

user2idx = {
    id: idx
    for (idx, id) in enumerate(df_train['user_id'])
}

idx2user = {
    idx: id
    for (idx, id) in enumerate(df_train['user_id'])
}
num_items = len(course2idx)
num_users = len(user2idx)

num_records = 0
purch_hists = []
row_ind = []
col_ind = []
for i in range(num_users):
    purch_hist = df_train.iloc[i]['course_id'].split(' ')
    purch_hist = [course2idx[j] for j in purch_hist]
    purch_hists.append(purch_hist)
    num_purch = len(purch_hist)
    row_ind += [i]*num_purch
    col_ind += purch_hist
    num_records += num_purch

user_item_data = csr_matrix(
    (np.ones(num_records),
     (row_ind, col_ind)),
     shape=(num_users, num_items)
)

In [4]:
subgroup2idx = {
    name: idx
    for (name, idx) in zip(df_subgroup['subgroup_name'], df_subgroup['subgroup_id'])
}

course2subgroup = {}
for (idx, subgroup) in enumerate(df_course['sub_groups']):
    try:
        course2subgroup[idx] = [subgroup2idx[i] for i in subgroup.split(',')]
    except:
        course2subgroup[idx] = [0]

actuals = []
actuals_g = []
user_ids = []
for i in range(df_val_seen.shape[0]):
    sample = df_val_seen.iloc[i]
    actuals.append([course2idx[i] for i in sample['course_id'].split(' ')])
    try:
        sub = df_val_seen_group.iloc[i]['subgroup'].split(' ')
        actuals_g.append([int(i) for i in sub])
    except:
        actuals_g.append([0])
    user_ids.append(user2idx[sample['user_id']])

user_ids_test = [user2idx[i] for i in df_test_seen['user_id']]

## ALS

### validation - course

In [5]:
model_als = AlternatingLeastSquares(
    factors=1000,
    regularization=200,
    # alpha=1000,
    iterations=1,
    random_state=11112224
)
model_als.fit(user_item_data)

  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
pred_als, score_als = model_als.recommend(user_ids, user_item_data[user_ids], N=50)
mapk(actuals, pred_als)

0.07454095354448023

### prediction - course

In [None]:
pred_als_test, score_als_test = model_als.recommend(
    user_ids_test,
    user_item_data[user_ids_test],
    N=50
)

pred_als_test = [[idx2course[j] for j in i] for i in pred_als_test]

predict(
    result=pred_als_test, 
    path='res/test_course_als.csv',
    user_ids=user_ids_test,
    idx2user=idx2user
)

### validation - topic

In [None]:
predg_als, scoreg_als = model_als.recommend(
    user_ids,
    user_item_data[user_ids],
    filter_already_liked_items=False,
    N=50
)

predg_als = predict_topic_from_course(
    result=predg_als,
    user_ids=user_ids,
    purch_hists=purch_hists,
    course2subgroup=course2subgroup
)

mapk(actuals_g, predg_als)

0.24260217496184097

### prediction - topic

In [None]:
predg_als_test, scoreg_als_test = model_als.recommend(
    user_ids_test,
    user_item_data[user_ids_test],
    filter_already_liked_items=False,
    N=50
)

predg_als_test = predict_topic_from_course(
    result=predg_als_test,
    user_ids=user_ids_test,
    purch_hists=purch_hists,
    course2subgroup=course2subgroup
)

predict(
    result=predg_als_test, 
    path='res/test_group_als.csv',
    user_ids=user_ids_test,
    idx2user=idx2user,
    domain='topic'
)

# create csr matrix (user-topic)
- note: the type of previous csr matrix is user-course

In [None]:
row_ind = []
col_ind = []
data = []
topic_hists = []
for i in range(num_users):
    topic_rec = dict()
    for j in purch_hists[i]:
        for k in course2subgroup[j]:
            if k in topic_rec:
                topic_rec[k] += 1
            else:
                topic_rec[k] = 1
    topics = list(topic_rec.keys())
    weights = list(topic_rec.keys())
    topic_hists.append([
        topics[j]
        for j in np.argsort(np.array(weights))[::-1]
    ])
    

    row_ind += [i]*len(topic_rec)
    col_ind += topics
    data += weights

user_topic_data = csr_matrix((data, (row_ind, col_ind)), shape=(num_users, len(subgroup2idx)+1))

In [None]:
model_als = AlternatingLeastSquares(
    factors=40,
    regularization=80,
    alpha=5,
    iterations=50,
    random_state=11112224
)
model_als.fit(user_topic_data)

  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
predg_als, scoreg_als = model_als.recommend(
    user_ids, 
    user_topic_data[user_ids], 
    filter_already_liked_items=False, 
    N=50
)

mapk(actuals_g, predg_als)

0.21194703711104512

In [None]:
model_bayes = BayesianPersonalizedRanking(
    factors=600,
    learning_rate=3e-5,
    iterations=4000,
    random_state=11112224,
)
model_bayes.fit(user_topic_data)

  0%|          | 0/4000 [00:00<?, ?it/s]

In [None]:
predg_bayes, scoreg_bates = model_bayes.recommend(
    user_ids,
    user_topic_data[user_ids],
    filter_already_liked_items=False,
    N=50
)
mapk(actuals_g, predg_bayes)

0.20341035442168504

## BPR

### validation - course

In [None]:
model_bayes = BayesianPersonalizedRanking(
    factors=500,
    learning_rate=1e-5,
    iterations=10000,
    random_state=11112224,
)
model_bayes.fit(user_item_data)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [None]:
 pred_bayes, score_bayes = model_bayes.recommend(user_ids, user_item_data[user_ids], N=50)
 mapk(actuals, pred_bayes)

0.06610585445800089

### prediction - course

In [None]:
pred_bayes_test, score_bayes_test = model_bayes.recommend(
    user_ids_test,
    user_item_data[user_ids_test],
    N=50
)

pred_bayes_test = [[idx2course[j] for j in i] for i in pred_bayes_test]

predict(
    result=pred_bayes_test, 
    path='res/test_course_bayes.csv',
    user_ids=user_ids_test,
    idx2user=idx2user,
    domain='course'
)

### validation - topic

In [None]:
predg_bayes, scoreg_bayes = model_bayes.recommend(
    user_ids,
    user_item_data[user_ids],
    filter_already_liked_items=False,
    N=50
)

predg_bayes = predict_topic_from_course(
    result=predg_bayes,
    user_ids=user_ids,
    purch_hists=purch_hists,
    course2subgroup=course2subgroup
)

mapk(actuals_g, predg_bayes)

0.20731654453088663

### prediction - topic

In [None]:
predg_bayes_test, scoreg_bayes_test = model_bayes.recommend(
    user_ids_test,
    user_item_data[user_ids_test],
    filter_already_liked_items=False,
    N=50
)

predg_bayes_test = predict_topic_from_course(
    result=predg_bayes_test,
    user_ids=user_ids_test,
    purch_hists=purch_hists,
    course2subgroup=course2subgroup
)

predict(
    result=predg_bayes_test, 
    path='res/test_group_bayes.csv',
    user_ids=user_ids_test,
    idx2user=idx2user,
    domain='topic'
)

## KNN

### vaidation - course

In [None]:
k = 2500
nbrs = NearestNeighbors(n_neighbors=k, algorithm='brute').fit(user_item_data)
distances, indices = nbrs.kneighbors(user_item_data)

pred_knn = knn_course_predict(indices, purch_hists, user_ids)
mapk(actuals, pred_knn)

0.057348942439641996

### prediction - course

In [None]:
pred_knn_test = knn_course_predict(indices, purch_hists, user_ids_test)
pred_knn_test = [[idx2course[j] for j in i] for i in pred_knn_test]

predict(
    result=pred_knn_test,
    path='res/test_course_knn.csv',
    user_ids=user_ids_test,
    idx2user=idx2user,
    domain='course'
)

### validation - topic

In [None]:
predg_knn = predict_topic_from_course(
    result=pred_knn,
    user_ids=user_ids,
    purch_hists=purch_hists,
    course2subgroup=course2subgroup
)

mapk(actuals_g, predg_knn)

0.229043020518245

### prediction - topic

In [None]:
pred_knn_test = knn_course_predict(indices, purch_hists, user_ids_test)
predg_knn_test = predict_topic_from_course(
    result=pred_knn_test,
    user_ids=user_ids_test,
    purch_hists=purch_hists,
    course2subgroup=course2subgroup
)

predict(
    result=predg_knn_test,
    path='res/test_topic_knn.csv',
    user_ids=user_ids_test,
    idx2user=idx2user,
    domain='course'
)

# mixture

### validation - course

In [None]:
pred_knn_als = mix1_course_predict(pred_knn, pred_als)
mapk(actuals, pred_knn_als)

0.0715778181752492

In [None]:
pred_als_knn = mix1_course_predict(pred_als, pred_knn)
mapk(actuals, pred_als_knn)

0.0717445442289698

In [None]:
pred_knn_bayes = mix1_course_predict(pred_knn, pred_bayes)
mapk(actuals, pred_knn_bayes)

0.06805847244464315

In [None]:
pred_bayes_knn = mix1_course_predict(pred_bayes, pred_knn)
mapk(actuals, pred_bayes_knn)

0.06404686450794868

In [None]:
pred_als_bayes = mix1_course_predict(pred_als, pred_bayes)
mapk(actuals, pred_als_bayes)

0.07106045243827187

In [None]:
pred_bayes_als = mix1_course_predict(pred_bayes, pred_als)
mapk(actuals, pred_bayes_als)

0.06642191857299873

In [None]:
pred_knn_als_bayes = mix2_course_predict(pred_knn, pred_als, pred_bayes)
mapk(actuals, pred_knn_als_bayes)

0.0704282667378825

In [None]:
pred_als_bayes_knn = mix2_course_predict(pred_als, pred_bayes, pred_knn)
mapk(actuals, pred_als_bayes_knn)

0.06954050550819296

### prediction - course

In [None]:
pred_als_test, score_als_test = model_als.recommend(
    user_ids_test,
    user_item_data[user_ids_test],
    N=50
)
pred_bayes_test, score_bayes_test = model_bayes.recommend(
    user_ids_test,
    user_item_data[user_ids_test],
    N=50
)
pred_knn_test = knn_course_predict(indices, purch_hists, user_ids_test)

In [None]:
pred_als_bayes_test = mix1_course_predict(pred_als_test, pred_bayes_test)

pred_als_bayes_test = [[idx2course[j] for j in i] for i in pred_als_bayes_test]

predict(
    result=pred_als_bayes_test,
    path='res/test_course_alsbayes.csv',
    user_ids=user_ids_test,
    idx2user=idx2user,
    domain='course'
)

### validation - topic

In [None]:
predg_alsknn = predict_topic_from_course(
    result=pred_als_knn,
    user_ids=user_ids,
    purch_hists=purch_hists,
    course2subgroup=course2subgroup
)
mapk(actuals_g, predg_alsknn)

0.23840315131209286

### prediction - topic

In [None]:
pred_als_knn_test = mix1_course_predict(pred_als_test, pred_knn_test)

predg_alsknn_test = predict_topic_from_course(
    result=pred_als_knn_test,
    user_ids=user_ids_test,
    purch_hists=purch_hists,
    course2subgroup=course2subgroup
)

predict(
    result=predg_alsknn_test,
    path='res/test_group_alsknn.csv',
    user_ids=user_ids_test,
    idx2user=idx2user,
    domain='topic'
)

## sentence similarity

In [None]:
from sentence_transformers import SentenceTransformer, util
import re
import torch
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker

In [6]:
df_users = pd.read_csv(f'{data_dir}/users.csv')
train_users = df_train['user_id']
total_users = list(df_users['user_id'])
filter = []
for i in train_users:
        filter.append(total_users.index(i))
df_users_train = df_users.iloc[filter].copy()
df_users_train.reset_index(drop=True, inplace=True)

In [None]:
def filter1(x):
    string = ""
    try:
        for i in x.split(','):
            for j in i.split('_'):
                string += (j + ' ')
        return string.strip()
    except:
        return string
df_users_train['interests'] = df_users_train['interests'].apply(filter1)

def filter2(x):
    string = ""
    try:
        x = x.replace('、', '')
        for i in x.split(','):
            string += (i + ' ')
        return string.strip()
    except:
        return string
df_users_train['occupation_titles'] = df_users_train['occupation_titles'].apply(filter2)

def filter3(x):
    try:
        x_split = x.split(',')
        string = ""
        for i in x_split:
            string += (i + ' ')
        return string.strip()
    except:  # nan
        return ""
df_users_train['recreation_names'] = df_users_train['recreation_names'].apply(filter2)

def filter4(x):
    if x == 'female':
        return '女'
    elif x == 'male':
        return '男'
    elif x == 'other':
        return '其他性別'
    else:
        return ''
df_users_train['gender'] = df_users_train['gender'].apply(filter4)

df_users_train['info'] = [
    df_users_train['interests'][i]+df_users_train['recreation_names'][i]
    # df_users_train['occupation_titles'][i]+' '+df_users_train['gender'][i]
    for i in range(len(df_users_train))
]
df_users_train['info'] = df_users_train['info'].apply(lambda x: re.sub(' +',' ', x))
df_users_train = df_users_train[['user_id', 'info']]

In [None]:
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

In [None]:
df_course.columns

In [None]:
def find_chinese(x):
    x_split = re.findall("[\u4e00-\u9fa5\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]+", x)
    string = ""
    for i in x_split:
        string += i
    return string.strip()

df_course['info'] = df_course['description'].apply(find_chinese).copy()

df_course_train = df_course[['course_id', 'info']].copy()

In [None]:
def clean(sentence_ws, sentence_pos):
    short_sentence = []
    stop_pos = set(['Nep', 'Nh', 'Nb'])
    for word_ws, word_pos in zip(sentence_ws, sentence_pos):
        is_N = word_pos.startswith("N")  # retain noun
        is_not_stop_pos = word_pos not in stop_pos  # retain some pos
        is_not_one_charactor = not (len(word_ws) == 1)  # kick out one character 
    
        if is_N and is_not_stop_pos and is_not_one_charactor:
            short_sentence.append(f"{word_ws}")

    return short_sentence

### course - validation

In [None]:
df_user_val = df_users_train.iloc[user_ids].copy()

### als boost with similarity

In [None]:
ws_driver  = CkipWordSegmenter(model="albert-tiny", device=0)
pos_driver  = CkipPosTagger(model="albert-tiny", device=0)

docs = df_course_train['info']
ws = ws_driver(docs)
pos = pos_driver(ws)

Downloading:   0%|          | 0.00/832 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/15.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/301 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.89k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/16.0M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/301 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Tokenization: 100%|██████████| 728/728 [00:03<00:00, 198.80it/s]
Inference: 100%|██████████| 16/16 [00:03<00:00,  4.68it/s]
Tokenization: 100%|██████████| 728/728 [00:02<00:00, 304.12it/s]
Inference: 100%|██████████| 430/430 [01:41<00:00,  4.22it/s]


In [None]:
preds_sim = []
for i in trange(len(user_ids)):
    j = user_ids[i]
    query = model.encode(df_user_val.loc[j]['info'].split(' '))

    k = pred_als[i]
    cos_sim = np.zeros(len(k))
    for idx, l in enumerate(k):
        short = clean(ws[l], pos[l])
        cos_sim[idx] = torch.mean(util.cos_sim(query, model.encode(short)))
    indices = np.argsort(cos_sim)[::-1]
    preds_sim.append([k[l] for l in indices])

100%|██████████| 7748/7748 [12:14:48<00:00,  5.69s/it]


In [None]:
 mapk(actuals, preds_sim)

0.042896397759697134

### als