In [0]:
%%capture
# Special libraries used
# The %%capture cell magic above hides the installers' console output.
# NOTE(review): neither package is version-pinned, so a fresh runtime may resolve
# different versions than the ones the recorded outputs were produced with.
!pip install pyreclab                     # For ALS based candidate generation
!pip install tf-nightly-gpu-2.0-preview   # For GPU usage, neural networks

In [2]:
!curl -L -o efficientnetb5.zip https://www.dropbox.com/s/lnwqi1ledieejk0/efficientnetb5.zip?dl=1
# Get EfficientNetB5 embeddings
!unzip -q efficientnetb5.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  172M  100  172M    0     0   783k      0  0:03:46  0:03:46 --:--:-- 14.2M
replace features/201408068.npy? [y]es, [n]o, [A]ll, [N]one, [r]ename: A


In [3]:
# Get quality for each Wikimedia picture
!curl -L -o quality.csv https://www.dropbox.com/s/rskwcrxg1gyhbw1/quality.csv?dl=1
# Get train and test file
!curl -L -o train_test.json https://www.dropbox.com/s/jt2nqszbctaw9ec/train_test.json?dl=1
# Get training data
!curl -L -o training.csv https://www.dropbox.com/s/incixyku24e8hxx/training.csv?dl=1

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  912k  100  912k    0     0   273k      0  0:00:03  0:00:03 --:--:--  454k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 4976k  100 4976k    0     0   441k      0  0:00:11  0:00:11 --:--:--  501k
  % Total    % Received % Xferd  Average Speed   Ti

In [0]:
from sklearn.metrics.pairwise import cosine_similarity
import pyreclab as rec
import tensorflow as tf
import pandas as pd
import numpy as np
import json
import os

In [0]:
# Load the train/test split and the per-image quality table

with open("train_test.json", "r") as split_file:
  train = json.load(split_file)

# Image ids are stored as strings in the quality table
quality = pd.read_csv("quality.csv").astype({"id": str})

# Rewrite training.csv in the layout handed to pyreclab: a constant rating of 1,
# no timestamp column (skipped silently when it is already gone, e.g. on a re-run)
# and no duplicated rows.
training = (
  pd.read_csv("training.csv")
  .assign(rating=1)
  .drop(columns=["timestamp"], errors="ignore")
  .drop_duplicates()
)
training.to_csv("training.csv", index=False)

# training.head()

In [0]:
# Dump the held-out interactions to testset.csv (header: user,image_id,rating).
# Only the first held-out image of every test user is written, always with rating 1.
with open("testset.csv", "w") as testset_file:
  testset_file.write("user,image_id,rating\n")
  testset_file.writelines(
    "{},{},1\n".format(user, held_out[0])
    for user, held_out in train["test"].items()
  )

In [0]:
# Create dictionary containing features for each image: {image id (int) -> embedding}
# Features appear in individual files named "<image id>.npy" inside "features/"
# Depending on file amount, this may take a while
# Only .npy entries are read, so stray directory entries (e.g. notebook checkpoint
# folders) no longer break the integer conversion or np.load.
# Element [0] of each stored array is kept — presumably the single embedding row
# saved per image; TODO confirm against how the files were produced.
features = {
  int(os.path.splitext(file_name)[0]): np.load(os.path.join("features", file_name))[0]
  for file_name in os.listdir("features")
  if file_name.endswith(".npy")
}

In [0]:
# Candidate generation: fit ALS (a collaborative-filtering matrix factorization)
# on the rewritten training.csv. Fitting may take a while.
# Column positions 0/1/2 are the user, item and observation columns of that file.
als_reader_options = dict(
  dataset="training.csv",
  dlmchar=b',',
  header=True,
  usercol=0,
  itemcol=1,
  observationcol=2,
)
ALS = rec.IFAlsConjugateGradient(**als_reader_options)

# NOTE(review): the positional arguments are presumably
# (latent factors, ALS iterations, regularisation) — confirm against the pyreclab docs.
ALS.train(100, 20, 10)

In [10]:
# Offline evaluation of the raw ALS recommendations.
# Every test user has a held-out image; the user scores a hit when that image
# shows up in their top-n recommendation list.
topn = 100
base_total = 0   # recommendation slots issued (evaluated users * topn)
base_ret = 0     # evaluated users
total = 0        # hits
skipped = 0      # users whose recommendation call raised ValueError

for item in train["test"]:
  try:
    recs = ALS.recommend(item, topn)
  except ValueError:
    # Best effort: the user is left out of the metrics, but counted so the
    # omission stays visible (presumably users the model cannot score).
    skipped += 1
    continue
  if str(train["test"][item][0]) in recs:
    total += 1
  base_total += topn
  base_ret += 1

# Guard every ratio: with no evaluated users or no hits the denominators are 0.
pak = total / base_total if base_total else 0.0
rak = total / base_ret if base_ret else 0.0
f1ak = 2 * (pak * rak) / (pak + rak) if (pak + rak) else 0.0
print("Precision: {}\nRecall: {}\nF1-Score: {}\n".format(pak, rak, f1ak))
if skipped:
  print("Skipped {} user(s) that raised ValueError".format(skipped))

Precision: 0.0012974976830398518
Recall: 0.12974976830398516
F1-Score: 0.0025693023426531718



In [0]:
def get_user_likes(user, data, features):
  """Stack the embeddings of every image a user interacted with.

  Args:
    user: value matched against the "user" column of ``data``.
    data: DataFrame with "user" and "image_id" columns.
    features: mapping from image id to its embedding array.

  Returns:
    2-D ``numpy.ndarray`` with one flattened embedding per liked image.
    Images without an entry in ``features`` are skipped. When nothing is
    left, an array of shape (0, 0) is returned.

  A plain ndarray is returned rather than ``numpy.matrix``: the matrix class is
  discouraged by NumPy and rejected by recent scikit-learn input validation.
  """
  image_ids = data.loc[data["user"] == user, "image_id"]
  rows = [
    np.asarray(features[image_id]).ravel()
    for image_id in image_ids
    if image_id in features
  ]
  if not rows:
    return np.empty((0, 0))
  return np.vstack(rows)

def get_corpus(CF, user, features, amount):
  """Ask the collaborative filter for candidates and attach their embeddings.

  Args:
    CF: object exposing ``recommend(user, amount)`` returning item ids.
    user: user identifier forwarded to ``CF.recommend``.
    features: mapping from integer image id to embedding.
    amount: number of candidates requested from ``CF``.

  Returns:
    List of ``(item_id, embedding)`` tuples in the order ``CF`` returned them;
    ``item_id`` is kept exactly as ``CF`` produced it.
  """
  candidates = []
  for item_id in CF.recommend(user, amount):
    candidates.append((item_id, features[int(item_id)]))
  return candidates

def get_features_from_corpus(corpus):
  """Stack the embeddings of a candidate corpus into one 2-D array.

  Args:
    corpus: sequence of ``(item_id, embedding)`` pairs.

  Returns:
    2-D ``numpy.ndarray`` with one flattened embedding per corpus entry, in
    corpus order; shape (0, 0) for an empty corpus.

  A plain ndarray is returned rather than ``numpy.matrix``: the matrix class is
  discouraged by NumPy and rejected by recent scikit-learn input validation.
  """
  rows = [np.asarray(entry[1]).ravel() for entry in corpus]
  if not rows:
    return np.empty((0, 0))
  return np.vstack(rows)

def similarity(item, user_items):
  """Cosine similarity between one embedding and each of the user's embeddings.

  ``item`` is wrapped in a list so it is treated as a single sample; the result
  of ``cosine_similarity`` is returned as a NumPy array.
  """
  single_sample = [item]
  scores = cosine_similarity(single_sample, user_items)
  return np.asarray(scores)

def average(available_items, user_items):
  """Mean cosine similarity of each available item to the user's items.

  Computes the pairwise similarities (rows follow ``available_items``, columns
  follow ``user_items``) and averages each row, giving one score per
  available item.
  """
  pairwise = cosine_similarity(available_items, user_items)
  per_candidate = np.mean(pairwise, axis=1)
  return per_candidate

In [0]:
def recommend(CF, user_id, data, embeddings, corpus_amount, amount):
  """Re-rank collaborative-filtering candidates by visual similarity.

  Args:
    CF: recommender queried through ``get_corpus`` for the candidates.
    user_id: user identifier; passed unchanged to ``CF`` and converted to
      ``int`` for the lookup in ``data``.
    data: interactions DataFrame handed to ``get_user_likes``.
    embeddings: mapping from image id to embedding.
    corpus_amount: number of candidates requested from ``CF``.
    amount: number of recommendations to return.

  Returns:
    Array with the ids of the ``amount`` candidates whose mean cosine
    similarity to the user's liked images is highest.
  """
  liked = get_user_likes(int(user_id), data, embeddings)
  candidates = get_corpus(CF, user_id, embeddings, corpus_amount)
  scores = average(get_features_from_corpus(candidates), liked)

  # Candidate id -> score; scores line up with the candidates by position
  by_item = {}
  for position, entry in enumerate(candidates):
    by_item[entry[0]] = scores[position]

  ranking = pd.DataFrame.from_dict(by_item, orient='index')
  best = ranking.nlargest(amount, columns=0)
  return best.index.values

#recommend(ALS, "11", training, features, 2000, 100)

In [133]:
# Offline evaluation of the similarity re-ranking (recommend) over 2000 ALS candidates.
# Every test user has a held-out image; the user scores a hit when that image
# shows up in their top-n recommendation list.
topn = 100
base_total = 0   # recommendation slots issued (evaluated users * topn)
base_ret = 0     # evaluated users
total = 0        # hits
skipped = 0      # users whose recommendation call raised ValueError

for item in train["test"]:
  try:
    recs = recommend(ALS, item, training, features, 2000, topn)
  except ValueError:
    # Best effort: the user is left out of the metrics, but counted so the
    # omission stays visible (presumably users that cannot be scored).
    skipped += 1
    continue
  if str(train["test"][item][0]) in recs:
    total += 1
  base_total += topn
  base_ret += 1

# Guard every ratio: with no evaluated users or no hits the denominators are 0.
pak = total / base_total if base_total else 0.0
rak = total / base_ret if base_ret else 0.0
f1ak = 2 * (pak * rak) / (pak + rak) if (pak + rak) else 0.0
print("Precision: {}\nRecall: {}\nF1-Score: {}\n".format(pak, rak, f1ak))
if skipped:
  print("Skipped {} user(s) that raised ValueError".format(skipped))

Precision: 0.00026876737720111214
Recall: 0.026876737720111215
F1-Score: 0.0005322126281210141



In [0]:
def recommend_quality(CF, user_id, data, embeddings, corpus_amount, amount, quality):
  """Re-rank collaborative-filtering candidates by similarity times image quality.

  Args:
    CF: recommender queried through ``get_corpus`` for the candidates.
    user_id: user identifier; passed unchanged to ``CF`` and converted to
      ``int`` for the lookup in ``data``.
    data: interactions DataFrame handed to ``get_user_likes``.
    embeddings: mapping from image id to embedding.
    corpus_amount: number of candidates requested from ``CF``.
    amount: number of recommendations to return.
    quality: DataFrame with an "id" column (matched against the candidate ids
      as strings) and a "quality" column.

  Returns:
    Array with the "id" values of the ``amount`` candidates with the largest
    ``similarity * quality`` product.
  """
  liked = get_user_likes(int(user_id), data, embeddings)
  candidates = get_corpus(CF, user_id, embeddings, corpus_amount)
  scores = average(get_features_from_corpus(candidates), liked)

  # Candidate id (as string) -> similarity; scores line up by position
  by_item = {}
  for position, entry in enumerate(candidates):
    by_item[str(entry[0])] = scores[position]

  ranking = pd.DataFrame.from_dict(by_item, orient='index', columns=["value"])
  # Left join keeps every candidate, including those without a quality row
  scored = ranking.merge(quality, how="left", left_index=True, right_on="id")
  scored["final"] = scored["value"] * scored["quality"]
  best = scored.nlargest(amount, columns=["final"])
  return best.id.values

#recommend_quality(ALS, "11", training, features, 2000, 100, quality)

In [218]:
# Offline evaluation of the quality-weighted re-ranking (recommend_quality)
# over 2000 ALS candidates.
# Every test user has a held-out image; the user scores a hit when that image
# shows up in their top-n recommendation list.
topn = 100
base_total = 0   # recommendation slots issued (evaluated users * topn)
base_ret = 0     # evaluated users
total = 0        # hits
skipped = 0      # users whose recommendation call raised ValueError

for item in train["test"]:
  try:
    recs = recommend_quality(ALS, item, training, features, 2000, topn, quality)
  except ValueError:
    # Best effort: the user is left out of the metrics, but counted so the
    # omission stays visible (presumably users that cannot be scored).
    skipped += 1
    continue
  if str(train["test"][item][0]) in recs:
    total += 1
  base_total += topn
  base_ret += 1

# Guard every ratio: with no evaluated users or no hits the denominators are 0.
pak = total / base_total if base_total else 0.0
rak = total / base_ret if base_ret else 0.0
f1ak = 2 * (pak * rak) / (pak + rak) if (pak + rak) else 0.0
print("Precision: {}\nRecall: {}\nF1-Score: {}\n".format(pak, rak, f1ak))
if skipped:
  print("Skipped {} user(s) that raised ValueError".format(skipped))

Precision: 0.0002502316960148285
Recall: 0.025023169601482854
F1-Score: 0.0004955083089402545



In [161]:
# Show the quality table (columns: id, quality); the notebook display elides the middle rows
quality

Unnamed: 0,id,quality
0,201403060,16.543049
1,201412090,29.536209
2,201705034,6.660452
3,201402226,8.518917
4,201309046,26.670128
...,...,...
32947,201711147,29.262001
32948,201105170,16.012442
32949,201606045,33.806690
32950,201011105,11.696966
