In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse


In [2]:
# Training playlists; later cells read its `tags` and `songs` columns.
raw_tr = pd.read_json("./raw_data/train.json")
# Playlists to produce recommendations for; later cells read its `tags`,
# `songs` and `id` columns.
raw_te = pd.read_json("./태그, 노래 채우기/validation.json")
# raw_te = pd.read_json("./raw_data/val.json")

## playlist X tag binary matrix

In [3]:
from sklearn.preprocessing import MultiLabelBinarizer
# NOTE: despite the `test_` / `te_` prefixes, everything in this cell is built
# from the TRAINING frame (`raw_tr`).
test_df = raw_tr
raw_te_tags_mlb = MultiLabelBinarizer(sparse_output=True)
raw_te_tags = list(test_df.tags)
# Sparse indicator matrix: one row per playlist, one column per distinct tag.
te_tag_df = raw_te_tags_mlb.fit_transform(raw_te_tags)
# Tag label belonging to each column of te_tag_df.
te_tag_classes = raw_te_tags_mlb.classes_

In [4]:
# Dense playlist x tag DataFrame with the tag labels as column names.
# NOTE(review): .todense() materialises every cell of the matrix; presumably
# memory-hungry when there are many playlists and tags - confirm it fits in RAM.
data_items = pd.DataFrame(te_tag_df.todense(), columns=te_tag_classes)

In [5]:
#------------------------
# ITEM-ITEM CALCULATIONS
#------------------------

# As a first step we normalize every playlist (row) vector to a unit vector.

# magnitude = sqrt(x2 + y2 + z2 + ...)
magnitude = np.sqrt(np.square(data_items).sum(axis=1))
# A playlist without any tag has magnitude 0. Dividing by it would turn the
# whole row into NaN (0 / 0) and make the similarity computation fail, so
# divide such rows by 1 instead: they simply stay all-zero.
magnitude = magnitude.where(magnitude > 0, 1.0)
# unitvector = (x / magnitude, y / magnitude, z / magnitude, ...)
data_items = data_items.divide(magnitude, axis='index')

def calculate_similarity(data_items):
    """Compute the cosine similarity between every pair of columns.

    ``data_items`` is a DataFrame; it is converted to a CSR matrix and
    transposed so that each original column becomes one vector. The result
    is a square DataFrame whose index and columns are both the column
    labels of ``data_items``.
    """
    labels = data_items.columns
    column_vectors = sparse.csr_matrix(data_items).transpose()
    return pd.DataFrame(
        data=cosine_similarity(column_vectors),
        index=labels,
        columns=labels,
    )

# Build the tag x tag similarity matrix. The tag lookup helpers defined in the
# following cells read `data_matrix` as a global.
data_matrix = calculate_similarity(data_items)

In [307]:
def get_similar_tags_top10(tags, top_n=10, sim_matrix=None):
    """Return the ``top_n`` tags most similar to the first known tag in ``tags``.

    Parameters
    ----------
    tags : sequence of str
        Tags already attached to a playlist. Only the first tag that exists
        in the similarity matrix is used as the seed.
    top_n : int, default 10
        Maximum number of tags to return.
    sim_matrix : pandas.DataFrame, optional
        Square tag x tag similarity matrix (index and columns are tag
        labels). Defaults to the notebook-level ``data_matrix``.

    Returns
    -------
    list of str
        Tags ordered from most to least similar to the seed. Because a tag
        is maximally similar to itself, the seed normally comes first. An
        empty list is returned when ``tags`` is empty or none of its tags
        is known to the matrix (previously these cases raised IndexError /
        KeyError).
    """
    if sim_matrix is None:
        sim_matrix = data_matrix
    # Tags that never occurred in the training data have no row in the
    # matrix, so skip them and seed on the first tag that does.
    seed = next((tag for tag in tags if tag in sim_matrix.index), None)
    if seed is None:
        return []
    # argsort is ascending; reverse it to rank the most similar tags first.
    order = np.argsort(sim_matrix.loc[seed].values)[::-1][:top_n]
    return sim_matrix.index[order].tolist()
# list(itertools.chain([i[: 10 // len(tags)+1] for i in ls]))

In [304]:
def get_similar_tags(tags, sim_matrix=None):
    """Merge the nearest neighbours of every tag in ``tags`` into one list.

    Each known input tag contributes its ``10 // n + 3`` most similar tags
    (``n`` = number of known input tags). The contributions are concatenated
    in input order and de-duplicated, keeping the first occurrence.

    Parameters
    ----------
    tags : sequence of str
        Tags already attached to a playlist.
    sim_matrix : pandas.DataFrame, optional
        Square tag x tag similarity matrix (index and columns are tag
        labels). Defaults to the notebook-level ``data_matrix``.

    Returns
    -------
    list of str
        De-duplicated similar tags; the length is not capped at 10. Empty
        when ``tags`` is empty or contains no known tag (unknown tags used
        to raise KeyError and are now skipped).
    """
    from collections import OrderedDict

    if sim_matrix is None:
        sim_matrix = data_matrix
    # Tags that never occurred in the training data have no row to look up.
    known = [tag for tag in tags if tag in sim_matrix.index]
    if not known:
        return []
    # Per-tag quota; it does not change inside the loop, so compute it once.
    per_tag = 10 // len(known) + 3
    merged = []
    for tag in known:
        # argsort is ascending; reverse it to rank the most similar first.
        order = np.argsort(sim_matrix.loc[tag].values)[::-1][:per_tag]
        merged.extend(sim_matrix.index[order].tolist())
    # Order-preserving de-duplication.
    return list(OrderedDict.fromkeys(merged))
# list(itertools.chain([i[: 10 // len(tags)+1] for i in ls]))

In [305]:
from tqdm import tqdm
# Registers tqdm's progress-bar variants of apply (e.g. `progress_apply`) on
# pandas objects.
tqdm.pandas()

In [309]:
# One list of recommended tags per target playlist, in the row order of raw_te.
tag_ret = [get_similar_tags_top10(playlist_tags) for playlist_tags in raw_te.tags]

## playlist X songs list in list 만들기

In [17]:
from implicit.evaluation import  *
from implicit.als import AlternatingLeastSquares as ALS
from implicit.bpr import BayesianPersonalizedRanking as BPR
import numpy as np
import os
# NOTE(review): presumably works around the "duplicate OpenMP runtime" abort
# that can occur when several libraries each load their own OpenMP copy -
# confirm it is still needed in this environment.
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from sklearn.utils import shuffle
# Wildcard import: this is where the bare `csr_matrix` name used in later
# cells comes from.
from scipy.sparse import *
# import rec_util
import pandas as pd
# Show floats with two decimals whenever pandas renders them.
pd.options.display.float_format = '{:.2f}'.format


In [18]:
# One list of song ids per training playlist.
train_songs = raw_tr['songs'].tolist()
# Same for the target playlists, with every id cast to float. A float still
# matches an int dict key of equal value, so the later lookups in iid_to_idx
# are unaffected by the cast.
test_songs = raw_te['songs'].apply(lambda x : [float(i)for i in x]).tolist()
# train_tags = raw_tr['tags'].tolist()
# test_tags = raw_te['tags'].tolist()
# Playlist ids of the target playlists, in the same row order as test_songs.
test_ids = raw_te['id'].tolist()

### train 데이터 list to list 로 만들기

from itertools import groupby

tr = []
iid_to_idx = {}
idx = 0

# Give every song id a dense index in order of first appearance and rewrite
# each training playlist as a list of those indices.
for playlist_songs in train_songs:
    encoded = []
    for song_id in playlist_songs:
        if song_id not in iid_to_idx:
            iid_to_idx[song_id] = idx
            idx += 1
        encoded.append(iid_to_idx[song_id])
    tr.append(encoded)

# Disabled experiment: index tags after the song index range.
# idx = 0
# n_items = 615142
# tag_to_idx = {} 
# for i, tags in enumerate(train_tags):
#     for tag in tags:
#         if tag not in tag_to_idx:
#             tag_to_idx[tag] = n_items + idx
#             idx += 1
#     tr[i].extend([tag_to_idx[x] for x in tags])

# Number of distinct songs seen in the training playlists.
n_items = len(iid_to_idx)
# n_tags = len(tag_to_idx)
# n_tags = len(tag_to_idx)

### test 데이터 list to list 로 만들기

te = []

idx = 0
# Encode each target playlist with the training song index; songs that never
# appeared in a training playlist have no index and are dropped.
for playlist_songs in test_songs:
    te.append([
        iid_to_idx[song_id]
        for song_id in playlist_songs
        if song_id in iid_to_idx
    ])
# idx = 0
# for i, tags in enumerate(test_tags):
#     ret = []
#     for tag in tags:
#         if tag not in tag_to_idx:
#             continue
#         ret.append(tag)
#     te[i].extend([tag_to_idx[x] for x in ret])

# 데이터 쓰까

# tr = shuffle(tr, random_state=10)



## 노래 및 태그 인덱싱 하기


In [19]:
# Reverse lookup: dense song index -> original song id.
idx_to_iid = {index: song_id for song_id, index in iid_to_idx.items()}
# idx_to_tag = {(x - n_items):y for(y,x) in tag_to_idx.items()}

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
# Identity tokenizer: every "document" in `tr` is already a list of song
# indices, so the vectorizer only has to count them per playlist.
cv = CountVectorizer(tokenizer=lambda x: x, lowercase=False)
# Sparse playlist x song count matrix for the training playlists.
tr_array = cv.fit_transform(tr)

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
# Encode the target playlists with the vectorizer that was fitted on the
# training playlists (`cv`, from the previous cell). Fitting a fresh
# vectorizer on `te` would build its vocabulary from the target songs only,
# so column k would mean "k-th distinct song seen in te" rather than the same
# song as column k of tr_array, and the two matrices could not be stacked
# column-for-column.
te_array = cv.transform(te)

In [22]:
# Convert both count matrices to CSR with an explicit (n_playlists, n_items)
# shape so they can be stacked.
# NOTE(review): the shape argument only declares the dimensions, it does not
# remap columns - both inputs must already use the same column <-> song index
# mapping; verify for te_array.
tr_csr = csr_matrix(tr_array, (len(tr), n_items))
te_csr = csr_matrix(te_array, (len(te), n_items))

In [23]:
import scipy.sparse
# Stack the training rows on top of the target rows: rows [0, len(tr)) are
# training playlists, the remaining rows are target playlists.
r = scipy.sparse.vstack([tr_csr,te_csr])
r = csr_matrix(r)
# Display the matrix summary as the cell output.
r

<138086x615142 sparse matrix of type '<class 'numpy.int64'>'
	with 5779814 stored elements in Compressed Sparse Row format>

In [None]:
# ALS factorisation: 256 latent factors, 50 iterations.
als_model = ALS(factors=256, regularization=0.08, iterations=50)
# Fit on the transposed matrix (songs x playlists) with every count scaled by 15.0.
# NOTE(review): presumably 15.0 acts as the confidence weight and the transpose
# matches the item-user orientation expected by the installed implicit
# version - confirm against that version's `fit` documentation.
als_model.fit(r.T * 15.0)

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

In [None]:
# CPU-only ALS instance that is never fitted itself; it just reuses the
# factors learned by als_model.
item_model = ALS(use_gpu=False)
# tag_model = ALS(use_gpu=False)
item_model.user_factors = als_model.user_factors
# tag_model.user_factors = als_model.user_factors

item_model.item_factors = als_model.item_factors
# tag_model.item_factors = als_model.item_factors

In [None]:
# First n_items columns of the training matrix. tr_csr was created with
# exactly n_items columns, so every column is kept; the rows cover the
# training playlists only.
item_rec_csr = tr_csr[:, :n_items]

In [None]:
item_ret = []
# `tag_ret` is intentionally NOT reset here: it already holds the tag
# recommendations computed from the tag-similarity matrix earlier in the
# notebook. Re-initialising it to [] would make the final zip() over
# (test_ids, item_ret, tag_ret) produce no rows at all.
from tqdm.auto import tqdm
# The model was fitted on `r`, which stacks the training playlists first and
# the target playlists after them, so target playlist u is user row
# n_train + u. Using u directly would return recommendations for training
# playlist u instead.
n_train = tr_csr.shape[0]
for u in tqdm(range(te_csr.shape[0])):
    # Pass the full matrix `r` (it contains the target rows) so the songs
    # already in this target playlist are the ones treated as known.
    item_rec = item_model.recommend(n_train + u, r, N=100)
    # Each recommendation is an (item index, score) pair; map back to song ids.
    item_rec = [idx_to_iid[x[0]] for x in item_rec]
    # tag_rec = tag_model.recommend(u, tag_rec_csr, N=100)
    # tag_rec = [idx_to_tag[x[0]] for x in tag_rec if x[0] in idx_to_tag]
    item_ret.append(item_rec)
    # tag_ret.append(tag_rec)

In [244]:
# Print the position of every playlist that did not get exactly 100 songs.
for i, songs in enumerate(item_ret):
    if len(songs) != 100:
        print(i)

In [314]:
# Check for duplicate tags: every playlist should have exactly 10 distinct
# recommended tags. Offending lists are printed (by distinct count) and
# collected in check_list.
check_list = []
for tags in tag_ret:
    distinct_count = len(set(tags))
    if distinct_count == 10:
        continue
    print(distinct_count)
    check_list.append(tags)

In [None]:
# zip() stops at the shortest input, so a length mismatch would silently drop
# playlists from the submission. Fail loudly instead.
if not (len(test_ids) == len(item_ret) == len(tag_ret)):
    raise ValueError(
        "misaligned recommendations: {} ids, {} song lists, {} tag lists".format(
            len(test_ids), len(item_ret), len(tag_ret)
        )
    )

# One record per target playlist: at most 100 songs and 10 tags each.
returnval = [
    {
        "id": _id,
        "songs": rec[:100],
        "tags": tag_rec[:10]
    }
    for _id, rec, tag_rec in zip(test_ids, item_ret, tag_ret)
]

In [None]:
import json
# Write the records as UTF-8 JSON; ensure_ascii=False keeps non-ASCII text
# readable instead of \u-escaping it.
with open('results.json', 'w', encoding='utf-8') as f:
    json.dump(returnval, f, ensure_ascii=False)