In [1]:
import json
from tqdm.auto import tqdm
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder

In [2]:
from sqlalchemy import create_engine

import os
from urllib.parse import quote

# Connection settings for the database used by this notebook.
# The password is intentionally NOT stored here: notebooks get shared and committed,
# and a literal secret leaks with them. Export YOUTUBE_DB_PASSWD before starting the
# kernel. The remaining fields keep their previous values as defaults and can be
# overridden through the environment in the same way.
# NOTE(review): the password that used to be hardcoded in this cell should be rotated.
db = dict(
    host=os.environ.get('YOUTUBE_DB_HOST', 'rostam.idav.ucdavis.edu'),
    dbname=os.environ.get('YOUTUBE_DB_NAME', 'youtube'),
    user=os.environ.get('YOUTUBE_DB_USER', 'ytuser'),
    passwd=os.environ.get('YOUTUBE_DB_PASSWD', ''),
)


def get_engine():
    """Create a SQLAlchemy engine (MySQL via PyMySQL) from the settings in ``db``.

    ``db`` is read at call time, so changes made to it after this cell ran are honoured.

    Returns:
        Whatever ``create_engine`` returns for the assembled URL.

    Raises:
        RuntimeError: if no password is configured.
    """
    if not db['passwd']:
        raise RuntimeError(
            'Database password is not configured; set the YOUTUBE_DB_PASSWD '
            'environment variable before running this notebook.'
        )
    # Percent-encode the credentials so characters such as '@', ':' or '/' cannot
    # corrupt the URL structure.
    return create_engine('mysql+pymysql://%s:%s@%s/%s' % (
        quote(db['user'], safe=''),
        quote(db['passwd'], safe=''),
        db['host'],
        db['dbname'],
    ))

In [3]:
# Read `sock-puppets` from the database into a DataFrame; later cells use its
# puppet_id, viewed and recommendation_trail columns.
sockpuppets = pd.read_sql(sql='sock-puppets', con=get_engine())

In [4]:
# Distinct puppet ids and distinct video ids, filled while the ratings are built.
users, items = set(), set()

## User-Item-Rating

In [5]:
# Accumulates (user, video, rating) triples.
ratings = list()

In [23]:
# Start from empty accumulators on every run of this cell. They used to be created only
# in earlier cells, so re-executing the loop appended every rating a second time.
users, items, ratings = set(), set(), []

# Emit one (user, video, rating) triple per occurrence: rating 1 for each entry of the
# puppet's `viewed` list, rating 0 for each entry of its `recommendation_trail` that is
# not in `viewed`. Repeated videos still produce repeated triples, exactly as before.
for user in tqdm(sockpuppets.itertuples(), total=len(sockpuppets)):
    userId = user.puppet_id
    viewed = user.viewed
    recommendation_trail = user.recommendation_trail

    users.add(userId)

    for item in viewed:
        items.add(item)
        ratings.append((userId, item, 1))

    # Set copy of the viewed list: constant-time membership instead of rescanning the
    # list for every recommended item.
    viewedLookup = set(viewed)
    for trace in recommendation_trail:
        for item in trace:
            if item in viewedLookup:
                continue
            items.add(item)
            ratings.append((userId, item, 0))

0it [00:00, ?it/s]

In [7]:
# Sanity check: number of distinct users and distinct videos collected so far.
print(f'{len(users)} {len(items)}')

100 5486


## Item-Item

In [24]:
from sqlalchemy import bindparam, text

# Fetch every (author_id, video_id) row of `comments` for the videos collected above.
# The ids are sent as a bound "expanding" parameter instead of being spliced into the
# SQL text: values are never interpreted as SQL, and an empty `items` set still gives a
# valid statement (a literal `IN ()` is a syntax error).
video_ids = sorted(items)  # sorted only to make the statement deterministic
sql = text(
    'SELECT author_id, video_id FROM comments WHERE video_id IN :video_ids'
).bindparams(bindparam('video_ids', expanding=True))
comments = pd.read_sql(sql, con=get_engine(), params={'video_ids': video_ids})

In [25]:
# Two-way index between videos and the authors who commented on them:
# video id -> set of author ids, and author id -> set of video ids.
videoAuthor, authorVideo = {}, {}

In [26]:
# Fill both indexes from the comment rows, ignoring videos that are not in `items`.
for row in tqdm(comments.itertuples(), total=len(comments)):
    commenter, video = row.author_id, row.video_id

    if video in items:
        # setdefault creates the empty set the first time a key is seen.
        videoAuthor.setdefault(video, set()).add(commenter)
        authorVideo.setdefault(commenter, set()).add(video)

  0%|          | 0/19480897 [00:00<?, ?it/s]

In [27]:
# video id -> {related video id -> number of authors who commented on both}.
videoVideo = dict()

In [28]:
# For each video, count how many of its commenters also commented on every other video.
# The video itself appears among its own related videos, because each of its commenters
# has it in authorVideo; that self-count equals its number of commenters.
for video, commenters in tqdm(videoAuthor.items()):
    sharedCounts = {}
    for commenter in commenters:
        for otherVideo in authorVideo[commenter]:
            sharedCounts[otherVideo] = sharedCounts.get(otherVideo, 0) + 1
    videoVideo[video] = sharedCounts

  0%|          | 0/90470 [00:00<?, ?it/s]

In [47]:
# Edge list of (video1, video2, mutual) tuples.
graph = list()

In [48]:
# Minimum number of shared commenters for an edge to be kept. Every stored count is at
# least 1, so 0 keeps all pairs (the same result as the previous `>= 0` test); raise it
# to prune weak edges.
MIN_MUTUAL_AUTHORS = 0

# Flatten the nested counts into (video1, video2, mutual) edges. The list is rebuilt
# from scratch here, so re-running this cell cannot append a second copy of every edge.
graph = [
    (v1, v2, mutualCount)
    for v1, related in tqdm(videoVideo.items())
    for v2, mutualCount in related.items()
    if mutualCount >= MIN_MUTUAL_AUTHORS
]

  0%|          | 0/90470 [00:00<?, ?it/s]

## Saving

In [98]:
# Tabular form of the two collections built above: one row per tuple.
ratingsDf = pd.DataFrame(
    data=ratings,
    columns=['user', 'video', 'rating'],
)
graphDf = pd.DataFrame(
    data=graph,
    columns=['video1', 'video2', 'mutual'],
)

In [16]:
# Separate label encoders for user ids and video ids.
userEncoder, videoEncoder = LabelEncoder(), LabelEncoder()

In [17]:
# Fit each encoder on the full id vocabulary collected earlier.
userLabels, videoLabels = list(users), list(items)
userEncoder.fit(userLabels)
videoEncoder.fit(videoLabels)

LabelEncoder()

In [18]:
# Replace the raw ids with their encoded labels in both frames.
ratingsDf = ratingsDf.assign(
    user=userEncoder.transform(ratingsDf['user']),
    video=videoEncoder.transform(ratingsDf['video']),
)
graphDf = graphDf.assign(
    video1=videoEncoder.transform(graphDf['video1']),
    video2=videoEncoder.transform(graphDf['video2']),
)

In [21]:
import os

# to_csv does not create missing directories, so make sure the output folder exists;
# on a fresh checkout this cell would otherwise fail.
os.makedirs('data', exist_ok=True)

# Write both tables without the DataFrame index column.
ratingsDf.to_csv('data/ratings.csv', index=False)
graphDf.to_csv('data/graph.csv', index=False)

In [22]:
# Number of distinct users and distinct videos behind the saved files.
print(f'{len(users)} {len(items)}')

100 5486


## Plotting

In [96]:
import plotly.express as px
import plotly.graph_objects as go

In [99]:
# Keep only the edges whose mutual count is greater than 1, and record the smallest and
# largest remaining counts.
mutual = graphDf.loc[graphDf['mutual'] > 1, 'mutual']
mini, maxi = mutual.min(), mutual.max()

In [100]:
# Cumulative histogram of the mutual counts on log-log axes, saved as standalone HTML.
cumulativeHistogram = go.Histogram(x=mutual, cumulative_enabled=True)
fig = go.Figure(data=[cumulativeHistogram])
fig.update_layout(
    yaxis_type='log',
    xaxis_type='log',
)
fig.write_html('cdf-histo.html')

In [86]:
# Number of entries in `mutual`; used as the CDF denominator.
total = mutual.shape[0]

In [87]:
# Empirical CDF of `mutual`: for every integer between its minimum and maximum, the
# fraction of entries that are <= that integer.
# The denominator and range are derived from `mutual` right here, so the result cannot be
# skewed by a stale `total`, `mini` or `maxi` left over from cells run in another order.
total = len(mutual)
if total == 0:
    # Nothing to plot; the previous loop raised on an empty series.
    x, y = [], []
else:
    support = range(int(mutual.min()), int(mutual.max()) + 1)
    # One pass over the data: count each value once, fill the gaps with 0 and accumulate,
    # instead of rescanning the whole series for every integer in the range.
    cumulativeCounts = (
        mutual.value_counts()
        .reindex(support, fill_value=0)
        .cumsum()
    )
    x = list(support)
    y = (cumulativeCounts / total).tolist()

  0%|          | 0/894 [00:00<?, ?it/s]

In [91]:
# Line plot of the CDF points computed above, saved as standalone HTML.
axisTitles = dict(
    xaxis_title='# of mutual authors',
    yaxis_title='CDF',
)
fig = px.line(x=x, y=y)
fig.update_layout(**axisTitles)
fig.write_html('cdf.html')