In [4]:
import pandas as pd
import os

TRAIN_SUFFIX = '.train.csv'
VALIDATION_SUFFIX = '.validation.csv'
TEST_SUFFIX = '.test.csv'  # 测试集文件后缀

INFO_SUFFIX = '.info.json'  # 数据集统计信息文件后缀
USER_SUFFIX = '.user.csv'  # 数据集用户特征文件后缀
ITEM_SUFFIX = '.item.csv'  # 数据集物品特征文件后缀

TRAIN_POS_SUFFIX = '.train_pos.csv'  # 训练集用户正向交互按uid合并之后的文件后缀
VALIDATION_POS_SUFFIX = '.validation_pos.csv'  # 验证集用户正向交互按uid合并之后的文件后缀
TEST_POS_SUFFIX = '.test_pos.csv'  # 测试集用户正向交互按uid合并之后的文件后缀

TRAIN_NEG_SUFFIX = '.train_neg.csv'  # 训练集用户负向交互按uid合并之后的文件后缀
VALIDATION_NEG_SUFFIX = '.validation_neg.csv'  # 验证集用户负向交互按uid合并之后的文件后缀
TEST_NEG_SUFFIX = '.test_neg.csv'  # 测试集用户负向交互按uid合并之后的文件后缀


class DataLoader(object):

    def __init__(self, path, dataset, label, sep='\t'):
        self.dataset = dataset
        self.path = os.path.join(path, dataset)

        self.train_file = os.path.join(self.path, dataset + TRAIN_SUFFIX)
        self.validation_file = os.path.join(self.path, dataset + VALIDATION_SUFFIX)
        self.test_file = os.path.join(self.path, dataset + TEST_SUFFIX)

        self.info_file = os.path.join(self.path, dataset + INFO_SUFFIX)
        self.user_file = os.path.join(self.path, dataset + USER_SUFFIX)
        self.item_file = os.path.join(self.path, dataset + ITEM_SUFFIX)

        self.train_pos_file = os.path.join(self.path, dataset + TRAIN_POS_SUFFIX)
        self.validation_pos_file = os.path.join(self.path, dataset + VALIDATION_POS_SUFFIX)
        self.test_pos_file = os.path.join(self.path, dataset + TEST_POS_SUFFIX)

        self.train_neg_file = os.path.join(self.path, dataset + TRAIN_NEG_SUFFIX)
        self.validation_neg_file = os.path.join(self.path, dataset + VALIDATION_NEG_SUFFIX)
        self.test_neg_file = os.path.join(self.path, dataset + TEST_NEG_SUFFIX)

        self.label = label

        self.train_df, self.validation_df, self.test_df = None, None, None
        self.load_user_item()
        self.load_data()
        self.load_his()
        self.load_info()
        # self.save_info()

    def load_user_item(self):
        self.user_df, self.item_df = None, None
        if os.path.exists(self.user_file):
            self.user_df = pd.read_csv(self.user_file, sep='\t')
        if os.path.exists(self.item_file):
            self.item_df = pd.read_csv(self.item_file, sep='\t')

    def load_data(self):
        self.train_df = pd.read_csv(self.train_file, sep='\t')
        self.validation_df = pd.read_csv(self.validation_file, sep='\t')
        self.test_df = pd.read_csv(self.test_file, sep='\t')

    def load_his(self):
        # 把 df [uid, iids] 变成 dict {1: [iid, iid, ...] 2: [iid, iid, ...]}
        def build_his(df):
            uids = df['uid'].tolist()

            iids = df['iids'].astype(str).str.split(',').values

            iids = [[int(j) for j in i] for i in iids]
            user_his = dict(zip(uids, iids))
            return user_his

        # 把 df [uid, iids] 变成 dict {1: [iid, iid, ...] 2: [iid, iid, ...]}
        self.train_pos_df = pd.read_csv(self.train_pos_file, sep='\t')
        self.train_user_pos = build_his(self.train_pos_df)

        self.validation_pos_df = pd.read_csv(self.validation_pos_file, sep='\t')
        self.validation_user_pos = build_his(self.validation_pos_df)

        self.test_pos_df = pd.read_csv(self.test_pos_file, sep='\t')
        self.test_user_pos = build_his(self.test_pos_df)

        self.train_neg_df = pd.read_csv(self.train_neg_file, sep='\t')
        self.train_user_neg = build_his(self.train_neg_df)

        self.validation_neg_df = pd.read_csv(self.validation_neg_file, sep='\t')
        self.validation_user_neg = build_his(self.validation_neg_df)

        self.test_neg_df = pd.read_csv(self.test_neg_file, sep='\t')
        self.test_user_neg = build_his(self.test_neg_df)

    def append_his(self, max_his=10):
        # 包含了 train, validation, test 中的所有的 uid 和 其对应的 iids
        # 正样本是 iid, 负样本是 -iid
        his_dict = {}

        for df in [self.train_df, self.validation_df, self.test_df]:

            history = []  # 最后加入到 df 中

            uids, iids, labels = df['uid'].tolist(), df['iid'].tolist(), df['label'].tolist()

            for i, uid in enumerate(uids):
                iid, label = iids[i], labels[i]

                if uid not in his_dict:
                    his_dict[uid] = []

                tmp_his = his_dict[uid] if max_his <= 0 else his_dict[uid][-max_his:]
                #                 print(tmp_his)
                # 去除 [] 第一个元素是 ‘’， history中的元素是 str 类型
                history.append(str(tmp_his).replace(' ', '')[1:-1])

                if label <= 0:
                    his_dict[uid].append(-iid)
                else:
                    his_dict[uid].append(iid)
            df['history'] = history

    def load_info(self):
        max_dict, min_dict = {}, {}
        for df in [self.train_df, self.validation_df, self.test_df]:
            for c in df.columns:
                if c not in max_dict:
                    max_dict[c] = df[c].max()
                else:
                    max_dict[c] = max(df[c].max(), max_dict[c])

                if c not in min_dict:
                    min_dict[c] = df[c].min()
                else:
                    min_dict[c] = min(df[c].min(), min_dict[c])

        self.column_max = max_dict
        self.column_min = min_dict

        self.user_num, self.item_num = 0, 0
        if 'uid' in self.column_max:
            self.user_num = self.column_max['uid'] + 1
        if 'iid' in self.column_max:
            self.item_num = self.column_max['iid'] + 1

    def drop_neg(self):
        self.train_df = self.train_df[self.train_df['label'] > 0].reset_index(drop=True)
        self.validation_df = self.validation_df[self.validation_df['label'] > 0].reset_index(drop=True)
        self.test_df = self.test_df[self.test_df['label'] > 0].reset_index(drop=True)

In [126]:
import numpy as np
from collections import defaultdict
from tqdm import tqdm
import torch

class DataProcessor(object):
    data_columns = ['uid', 'iid', 'x']
    info_columns = ['sample_id', 'time']

    def __init__(self, data_loader):
        self.data_loader = data_loader
        #         self.train_sample_n = train_sample_n
        #         self.test_sample_n = test_sample_n
        self.train_data = None

        self.pos_user_item_set = defaultdict(set)
        for uid in data_loader.train_user_pos.keys():
            self.pos_user_item_set[uid] = set(data_loader.train_user_pos[uid])

        for uid in data_loader.validation_user_pos.keys():
            if uid in self.pos_user_item_set:
                self.pos_user_item_set[uid] = self.pos_user_item_set[uid] | set(data_loader.validation_user_pos[uid])
            else:
                self.pos_user_item_set[uid] = set(data_loader.validation_user_pos[uid])

        for uid in data_loader.test_user_pos.keys():
            if uid in self.pos_user_item_set:
                self.pos_user_item_set[uid] = self.pos_user_item_set[uid] | set(data_loader.test_user_pos[uid])
            else:
                self.pos_user_item_set[uid] = set(data_loader.test_user_pos[uid])

            
        self.neg_user_item_set = defaultdict(set)
        for uid in data_loader.train_user_neg.keys():
            self.neg_user_item_set[uid] = set(data_loader.train_user_neg[uid])

        for uid in data_loader.validation_user_neg.keys():
            if uid in self.neg_user_item_set:
                self.neg_user_item_set[uid] = self.neg_user_item_set[uid] | set(data_loader.validation_user_neg[uid])
            else:
                self.neg_user_item_set[uid] = set(data_loader.validation_user_neg[uid])

        for uid in data_loader.test_user_neg.keys():
            if uid in self.neg_user_item_set:
                self.neg_user_item_set[uid] = self.neg_user_item_set[uid] | set(data_loader.test_user_neg[uid])
            else:
                self.neg_user_item_set[uid] = set(data_loader.test_user_neg[uid])

    def prepare_batches(self, data, batch_size):
        num_example = len(data['y'])

        total_batch = int((num_example + batch_size - 1) / batch_size)

        batches = []
        for batch in tqdm(range(total_batch)):
            batches.append(self.get_feed_dict(data=data, batch_start = batch * batch_size,
                                              batch_size=batch_size))
        return batches
    
    def numpy_to_torch(self, d, gpu=True, requires_grad=True):
        t = torch.from_numpy(d)
        if d.dtype is np.float:
            t.requires_grad = requires_grad
        if gpu and torch.cuda.device_count() > 0:
            t = t.cuda()
        return t

    def get_feed_dict(self, data, batch_start, batch_size):
        total_data_num = len(data['y'])
        batch_end = min(len(data['y']), batch_start + batch_size)
        real_batch_size = batch_end - batch_start

        feed_dict = {
            'real_batch_size': real_batch_size,
            'x_sample_num': 5,
        }
        
        feed_dict['y'] = self.numpy_to_torch(data['y'][batch_start:batch_start + real_batch_size])
        
        feed_dict['x'] = self.numpy_to_torch(data['x'][batch_start:batch_start + real_batch_size])
        
        feed_dict['iid'] = self.numpy_to_torch(data['iid'][batch_start:batch_start + real_batch_size])
        
        feed_dict['uid'] = self.numpy_to_torch(data['uid'][batch_start:batch_start + real_batch_size])
        
        return feed_dict

        # x: batch_size, x_sample_num * 2
        # iid
        # uid

    def get_validation_data(self):
        df = self.generate_x_samples(self.data_loader.test_df)
        self.test_data = self.format_data_dict(df)
        return self.test_data

    def get_test_data(self):
        df = self.generate_x_samples(self.data_loader.validation_df)
        self.validation_data = self.format_data_dict(df)
        return self.validation_data

    def get_train_data(self, epoch):
        if self.train_data is None:
            df = self.generate_x_samples(self.data_loader.train_df)
            self.train_data = self.format_data_dict(df)
        #             self.train_data['sample_id'] = np.arange(0, len(self.train_data['y']))

        if epoch >= 0:
            np.random.seed(10)  # 保证所有的column shuffle的顺序一样
            rng_state = np.random.get_state()
            for d in self.train_data:
                np.random.set_state(rng_state)
                np.random.shuffle(self.train_data[d])

        return self.train_data

    # 每个 uid iid 产生 5 个 正样本 和 5 个负样本
    def generate_x_samples(self, df, x_number=5, stage='train'):
        x_samples_list = []
        idx_del = []

        for i, uid in enumerate(df['uid'].values):

            iid = df['iid'].values[i]

            # get x_number positive samples
            pos_iids = self.pos_user_item_set[uid]
            if len(pos_iids) == 0:
                idx_del.append(i)
                continue
            
            if iid in pos_iids:
                pos_iids.remove(iid)
                if x_number < len(pos_iids):
                    pos_samples = np.random.choice(list(pos_iids), x_number, replace=False)
                else:
                    pos_samples = np.random.choice(list(pos_iids), x_number, replace=True)
                pos_iids.add(iid)
            else:
                if x_number < len(pos_iids):
                    pos_samples = np.random.choice(list(pos_iids), x_number, replace=False)
                else:
                    pos_samples = np.random.choice(list(pos_iids), x_number, replace=True)

            # get x_number negative samples (true negative and non-seen movies)
            neg_samples = set()
            for i in range(x_number):
                neg_iid = np.random.randint(1, self.data_loader.item_num)
                while neg_iid in neg_samples or neg_iid in pos_iids or neg_iid == iid:
                    neg_iid = np.random.randint(1, self.data_loader.item_num)
                neg_samples.add(neg_iid)
            neg_samples = list(neg_samples)
            
            
            x_samples = np.concatenate([pos_samples, neg_samples]).tolist()
            x_samples_list.append(x_samples)
            
#         print('uid delete')
#         for uid in uids_del:
#             print(uid)
        
        df = df.drop(idx_del)
        df['x'] = x_samples_list

        return df

    def format_data_dict(self, df):

        #df = df[df['history'].apply(lambda x: len(x) > 0)]
        data_loader = self.data_loader
        data = {}

        if 'uid' in df:
            data['uid'] = df['uid'].values
        if 'iid' in df:
            data['iid'] = df['iid'].values
        if 'time' in df:
            data['time'] = df['time'].values
        if 'label' in df:
            data['y'] = np.array(df['label'], dtype=np.float32)
        if 'x' in df:
            data['x'] = np.array(df['x'].values.tolist())

        return data

In [127]:
dataLoader = DataLoader('./dataset', 'ml100k01-1-5', 'label')
dataprocessor = DataProcessor(dataLoader)

In [128]:
train_data = dataprocessor.get_train_data(-1)

In [129]:
batches = dataprocessor.prepare_batches(train_data, 128)

100%|██████████| 741/741 [00:00<00:00, 27369.33it/s]


In [133]:
batches[-1]

{'real_batch_size': 40,
 'x_sample_num': 5,
 'y': tensor([1., 1., 1., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 1., 1., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1.,
         1., 1., 0., 0.]),
 'x': tensor([[ 316,  289,  511,  317,  607, 1157, 1480,  200,  855, 1305],
         [ 511,   62,  133,  127,   56,  578, 1574, 1549, 1234,  375],
         [ 272,  511,  268,  906,  321,  266,  749, 1075,  792, 1467],
         [ 127,  690,   56,   62,  321,  449,  807,  872,   16,  470],
         [  22,  132,  133,  887,  321, 1636,  170,   76,  598,  792],
         [ 354,  333,  689,  362,  322,   97, 1638,  216,  348, 1565],
         [ 305,  513,   56,  690,   22,   44,  282,  923, 1308, 1021],
         [ 322,  689,  300,  362,  748,  609,  464, 1041, 1046, 1565],
         [ 322,  300,  333,  748,  354,   64, 1247, 1583,  276,  607],
         [ 272,  322,  333,  362,  354, 1508,  558,  724,  507, 1532],
         [ 513,  127,  346,  315,  887,  867,  

In [9]:
train_df = dataLoader.train_df

In [40]:
train_df[train_df['uid'] == 685]

Unnamed: 0,uid,iid,label,time
24062,685,872,0,879447443
24063,685,286,0,879447443
24239,685,333,0,879451147
24240,685,288,0,879451147
24242,685,334,0,879451168
24244,685,886,0,879451211
24245,685,327,0,879451234
24246,685,289,0,879451253
24247,685,991,0,879451282
24249,685,873,0,879451401


In [42]:
test_df[test_df['uid'] == 685]

Unnamed: 0,uid,iid,label,time


In [22]:
dataLoader.column_max

{'uid': 943, 'iid': 1682, 'label': 1, 'time': 893286638}

In [23]:
dataLoader.column_min

{'uid': 1, 'iid': 1, 'label': 0, 'time': 874724710}

In [12]:
dataLoader.item_num

1683

In [20]:
dataLoader.user_num

944

In [18]:
test_df = dataLoader.test_df
validation_df = dataLoader.validation_df

In [147]:
prediction = np.load('./prediction.npy')
y = np.load('./y.npy')
data = np.load('./test_data.npy')

ValueError: Object arrays cannot be loaded when allow_pickle=False

In [137]:
len(prediction)

2253

In [145]:
np.around(prediction)

array([[1.],
       [1.],
       [1.],
       ...,
       [1.],
       [1.],
       [0.]], dtype=float32)

In [143]:
prediction = np.where(prediction > 0.5, 1, 0)

In [146]:
from sklearn.metrics import *
accuracy_score(y, np.around(prediction))

0.46515756768752775

In [173]:
uids = np.array([1,1, 1, 1, 2, 2, 2, 3, 3, 3])
p = np.array([0.5, 0.6, 0.7, 0.8, 0.7, 0.5, 0.6, 0.9, 0.7, 0.7])
l = np.array([1, 1, 1, 0, 0, 0, 1, 0, 1, 1])

In [174]:
sorted_idx = np.lexsort((-l, -p, uids))

In [175]:
sorted_uid = uids[sorted_idx]

In [176]:
sorted_key, sorted_spl = np.unique(sorted_uid, return_index=True)

In [177]:
sorted_key

array([1, 2, 3])

In [178]:
sorted_spl

array([0, 4, 7])

In [179]:
sorted_l, sorted_p = l[sorted_idx], p[sorted_idx]

In [180]:
split_l = np.split(sorted_l, sorted_spl[1:])

In [165]:
split_l_sum = [np.sum((d > 0).astype(float)) for d in split_l]

In [166]:
split_l_sum

[2.0, 1.0, 0.0, 0.0, 1.0, 2.0]

In [181]:
for d in split_l:
    print((d > 0).astype(float))

[0. 1. 1. 1.]
[0. 1. 0.]
[0. 1. 1.]


In [183]:
k_data = [d[:3] for d in split_l]
k_data_dict = defaultdict(list)
for d in k_data:
    k_data_dict[len(d)].append(d)

In [184]:
k_data_dict

defaultdict(list, {3: [array([0, 1, 1]), array([0, 1, 0]), array([0, 1, 1])]})

In [190]:
precisions = [np.average((np.array(d) > 0).astype(float), axis=1) for d in k_data_dict.values()]

In [192]:
np.average(np.concatenate(precisions))

0.5555555555555555

In [17]:
import pandas as pd
df = pd.read_excel('./Thesis_Test.xlsx', index_col=False)

In [18]:
df.columns

Index(['Unnamed: 0', 'Image', 'Gender', 'Emotion', 'Race', 'Age'], dtype='object')

In [19]:
df = df.drop(columns=['Unnamed: 0'])

In [20]:
from pathlib import Path
import cv2
import numpy as np

In [21]:
def yield_images_from_dir(image_dir):
    image_dir = Path(image_dir)
    for image_path in image_dir.glob("*.jpg"):
        img = cv2.imread(str(image_path))
        if img is not None:
            h, w, _ = img.shape
            r = 640/max(w, h)
            yield cv2.resize(img, (int(w * r), int(h * r))), str(image_path)

def draw_label(image, point, label, font=cv2.FONT_HERSHEY_SIMPLEX, font_scale=0.8, thickness=1):
    size = cv2.getTextSize(label, font, font_scale, thickness)[0]
    x, y = point
    cv2.rectangle(image, (x, y-size[1]), (x + size[0], y), (255, 0, 0), cv2.FILLED)
    cv2.putText(image, label, point, font, font_scale, (255, 255, 255), thickness, lineType=cv2.LINE_AA)
    

In [25]:
img_generator = yield_images_from_dir('./Images')

In [26]:
for img, path in img_generator:
    input_img = img
    img_name = path.split('/')[-1]
    age = df[df['Image']==img_name]['Age'].values[0]
    gender = df[df['Image']==img_name]['Gender'].values[0]
    emotion = df[df['Image'] == img_name]['Emotion'].values[0]
    race = df[df['Image'] == img_name]['Race'].values[0]
    
    label = "{}, {}, {}, {}".format(age, gender, emotion, race)
    draw_label(input_img, (20, 20), label)
    
    print('./Images/' + img_name)
    cv2.imwrite('./Images/' + img_name, input_img)
    print('Successgully saved')
#     cv2.imshow('window', input_img)
#     cv2.waitKey(0)
    
# cv2.destroyAllWindows()
    

./Images/001818.jpg
Successgully saved
./Images/001597.jpg
Successgully saved
./Images/001557.jpg
Successgully saved
./Images/18500461.jpg
Successgully saved
./Images/001487.jpg
Successgully saved
