In [1]:
import json
import os
import pickle
from urllib.parse import quote_plus

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sqlalchemy import bindparam, text
from tqdm.auto import tqdm

In [2]:
from sqlalchemy import create_engine

# Connection settings for the MySQL database.  Each value can be overridden
# through an environment variable.  The password deliberately has no in-source
# default so the secret is never stored in the notebook; export
# YOUTUBE_DB_PASSWD before starting the kernel.
db = dict(
    host=os.environ.get('YOUTUBE_DB_HOST', 'rostam.idav.ucdavis.edu'),
    dbname=os.environ.get('YOUTUBE_DB_NAME', 'youtube'),
    user=os.environ.get('YOUTUBE_DB_USER', 'ytuser'),
    passwd=os.environ.get('YOUTUBE_DB_PASSWD'),
)

def get_engine():
    """Create a SQLAlchemy engine for the database described by `db`.

    Returns:
        The object returned by `create_engine` for a `mysql+pymysql` URL
        assembled from the entries of `db`.

    Raises:
        RuntimeError: if `db['passwd']` is None, i.e. no password was supplied
            through the YOUTUBE_DB_PASSWD environment variable.
    """
    if db['passwd'] is None:
        raise RuntimeError(
            'Database password is not configured; set the YOUTUBE_DB_PASSWD '
            'environment variable before calling get_engine().'
        )
    # Percent-encode the credentials so characters such as '@', ':' or '/'
    # inside them cannot be mistaken for URL delimiters.
    return create_engine('mysql+pymysql://%s:%s@%s/%s' % (
        quote_plus(db['user']),
        quote_plus(db['passwd']),
        db['host'],
        db['dbname'],
    ))

In [3]:
# Load the whole `sock-puppets` table into a DataFrame (one row per puppet).
sockpuppets = pd.read_sql(
    con=get_engine(),
    sql='sock-puppets',
)

In [4]:
# Accumulators filled by the labelling loop below:
#   users / items -> distinct raw ids seen so far
#   X / y         -> parallel lists of (user, item) pairs and their labels
users, items = set(), set()
X, y = [], []

In [5]:
# Turn every sock puppet's record into labelled (user, item) pairs:
#   1 -> the item is in `viewed`
#   0 -> the item is in a recommendation trail but not in `viewed`
#   2 -> the item is in `homepage`
# NOTE: this cell appends to the accumulators created in the previous cell;
# re-run that cell first, otherwise every pair is added a second time.
for user in tqdm(sockpuppets.itertuples(), total=len(sockpuppets)):
    userId = user.puppet_id
    viewed = user.viewed
    recommendation_trail = user.recommendation_trail
    homepage = user.homepage

    users.add(userId)

    # Hash-based copy of `viewed`: the "already viewed?" test below runs once
    # per recommended item, so a linear scan of the original sequence each
    # time would be needlessly quadratic.
    viewed_lookup = set(viewed)

    for item in viewed:
        items.add(item)
        X.append((userId, item))
        y.append(1)

    for trace in recommendation_trail:
        for item in trace:
            if item in viewed_lookup:
                continue
            items.add(item)
            X.append((userId, item))
            y.append(0)

    for item in homepage:
        items.add(item)
        X.append((userId, item))
        y.append(2)

0it [00:00, ?it/s]

## Train-Test Split

In [6]:
# Hold out 20% of the labelled pairs for testing; the fixed seed keeps the
# shuffle identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [7]:
# Replace the id sets with {raw id -> contiguous integer index} maps.
# Set iteration order is not guaranteed to be the same in two interpreter runs
# (string hashing is randomised per process), so the ids are sorted first to
# make the encoding reproducible.  Re-running the cell is harmless: sorting the
# resulting dict iterates its keys and yields the same mapping again.
users = {u: ind for ind, u in enumerate(sorted(users))}
items = {i: ind for ind, i in enumerate(sorted(items))}

In [8]:
def encode(collection, val):
    """Look up `val` in `collection` and return what is stored under it.

    Args:
        collection: any object supporting subscription, e.g. the `users` or
            `items` index map.
        val: the key to look up.

    Returns:
        `collection[val]`.

    Raises:
        Whatever the subscription raises (KeyError for a missing dict key).
    """
    encoded = collection[val]
    return encoded

## userItemDict, userRatings, itemUserDict, itemRatings

In [9]:
# Interaction history keyed by encoded index, filled from the training split:
#   userItemDict[u] / userRatings[u] -> items seen by user u and their labels
#   itemUserDict[v] / itemRatings[v] -> users who saw item v and their labels
userItemDict, userRatings = {}, {}
itemUserDict, itemRatings = {}, {}

In [10]:
# Record each training interaction twice: under its user and under its item.
for pair, label in tqdm(zip(X_train, y_train), total=len(X_train)):
    raw_user, raw_item = pair
    u = encode(users, raw_user)
    v = encode(items, raw_item)

    # First time this user shows up: start both of its parallel lists.
    if u not in userItemDict:
        userItemDict[u], userRatings[u] = [], []

    # Same for an item seen for the first time.
    if v not in itemUserDict:
        itemUserDict[v], itemRatings[v] = [], []

    # The user-side and item-side lists stay aligned position by position.
    userItemDict[u].append(v)
    itemUserDict[v].append(u)
    userRatings[u].append(label)
    itemRatings[v].append(label)

  0%|          | 0/709423 [00:00<?, ?it/s]

In [11]:
# Users and items that ended up only in the test split have no entry yet;
# give each of them empty lists so every encoded index is present as a key.
for u in users.values():
    if u not in userItemDict:
        userItemDict[u], userRatings[u] = [], []

for v in items.values():
    if v not in itemUserDict:
        itemUserDict[v], itemRatings[v] = [], []

## Test set

In [12]:
# Parallel lists holding the encoded test split (user index, item index, label).
testUsers, testItems, testRatings = [], [], []

In [13]:
# Translate every held-out pair from raw ids to encoded indices.
for pair, label in tqdm(zip(X_test, y_test), total=len(X_test)):
    raw_user, raw_item = pair
    testUsers.append(encode(users, raw_user))
    testItems.append(encode(items, raw_item))
    testRatings.append(label)

  0%|          | 0/177356 [00:00<?, ?it/s]

## Item-Item

In [14]:
# Fetch the (author, video) pair of every comment left on a known video.
# The ids travel as an expanding bound parameter instead of being spliced into
# the SQL text: quoting is no longer done by hand, and an empty `items` does
# not produce the invalid clause `IN ()`.
video_ids = list(items)
sql = text(
    'SELECT author_id, video_id FROM comments WHERE video_id IN :video_ids'
).bindparams(bindparam('video_ids', expanding=True))
comments = pd.read_sql(sql, con=get_engine(), params={'video_ids': video_ids})

In [15]:
# Containers for the co-commenter graph:
#   videoAuthor[video]  -> set of authors who commented on it
#   authorVideo[author] -> set of videos the author commented on
#   videoVideo[video]   -> {other video: number of shared commenters}
videoAuthor, authorVideo, videoVideo = {}, {}, {}

In [16]:
# Pass 1 - video <-> commenter edges, keeping only videos present in `items`.
for row in tqdm(comments.itertuples(), total=len(comments)):
    vid, commenter = row.video_id, row.author_id
    if vid in items:
        videoAuthor.setdefault(vid, set()).add(commenter)
        authorVideo.setdefault(commenter, set()).add(vid)

# Pass 2 - for each video, count how many of its commenters also commented on
# each other video.  A video's commenters have all commented on the video
# itself, so it also appears in its own tally (once per commenter).
for vid in tqdm(videoAuthor):
    shared = videoVideo[vid] = {}
    for commenter in videoAuthor[vid]:
        for other in authorVideo[commenter]:
            shared[other] = shared.get(other, 0) + 1

  0%|          | 0/19480897 [00:00<?, ?it/s]

  0%|          | 0/90470 [00:00<?, ?it/s]

In [17]:
# Adjacency list between encoded item indices, filled by the next cell.
itemItemDict = dict()

In [18]:
# Convert the co-commenter counts into an adjacency list over encoded indices.
for v1, shared_counts in tqdm(videoVideo.items()):
    neighbours = itemItemDict.setdefault(encode(items, v1), [])

    for v2, n_shared in shared_counts.items():
        item2 = encode(items, v2)
        # Keep only pairs linked by at least two distinct commenters.
        if n_shared > 1:
            neighbours.append(item2)

  0%|          | 0/90470 [00:00<?, ?it/s]

## Dump results

In [19]:
# Serialise the dataset as consecutive pickles in one file.  A reader has to
# call pickle.load once per object, in exactly the order written here.
# open() does not create missing directories, so make sure the target exists.
os.makedirs('data', exist_ok=True)
with open('data/youtube-dataset.pickle', 'wb') as f:
    # Per-user and per-item training histories with their labels.
    pickle.dump(userItemDict, f)
    pickle.dump(userRatings, f)
    pickle.dump(itemUserDict, f)
    pickle.dump(itemRatings, f)

    # Training split as three parallel lists: user index, item index, label.
    pickle.dump([encode(users, i[0]) for i in X_train], f)
    pickle.dump([encode(items, i[1]) for i in X_train], f)
    pickle.dump(y_train, f)

    # Test split in the same layout.
    pickle.dump([encode(users, i[0]) for i in X_test], f)
    pickle.dump([encode(items, i[1]) for i in X_test], f)
    pickle.dump(y_test, f)

    # Item-item adjacency list.
    pickle.dump(itemItemDict, f)
    # Rating vocabulary, derived from the labels actually present.  The
    # labelling cell emits 0, 1 and 2, so a hard-coded [0, 1] leaves out 2.
    pickle.dump(sorted(set(y_train) | set(y_test)), f)

    # Number of distinct users and items (size of each index space).
    pickle.dump(len(users), f)
    pickle.dump(len(items), f)