In [1]:
import boto3
import pickle as pkl
import pandas as pd
import csv
import time
# Name of the S3 bucket passed to the S3 helper calls in the cells below.
bucket = 'youtube-commenters'

In [2]:
def see_all_files_s3(bucket):
    """Return the keys of all objects stored in an S3 bucket.

    The listing is paginated, so buckets whose listing spans more than one
    ``list_objects_v2`` response are returned in full rather than cut off
    after the first page.

    Args:
        bucket: Name of the S3 bucket to list.

    Returns:
        A list of object keys, in the order S3 returns them. The list is
        empty when the bucket holds no objects.
    """
    paginator = boto3.client('s3').get_paginator('list_objects_v2')
    files = []
    for page in paginator.paginate(Bucket=bucket):
        # A page carries no 'Contents' entry when it has no objects to report.
        files.extend(obj['Key'] for obj in page.get('Contents', []))
    return files

def load_pkl_obj_s3(file_name, bucket, max_attempts=5, retry_delay=5):
    """Download an object from S3 and unpickle it.

    The download is retried a bounded number of times. Once the attempts
    are exhausted the last error is re-raised, so a permanent failure
    (missing key, denied access, ...) surfaces instead of retrying forever.

    Args:
        file_name: Key of the object inside the bucket.
        bucket: Name of the S3 bucket.
        max_attempts: Maximum number of download attempts (at least 1).
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        The unpickled Python object.

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1.
        Exception: Whatever the final failed download attempt raised, or
            any error raised while unpickling the payload.
    """
    if max_attempts < 1:
        raise ValueError('max_attempts must be at least 1')

    for attempt in range(1, max_attempts + 1):
        try:
            s3 = boto3.resource('s3')
            payload = s3.Bucket(bucket).Object(file_name).get()['Body'].read()
        except Exception:
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
            if attempt == max_attempts:
                raise
            time.sleep(retry_delay)
        else:
            break

    # SECURITY: unpickling runs arbitrary code embedded in the payload; only
    # use this on buckets whose contents are trusted.
    # Unpickling is deliberately outside the retry loop: downloading the same
    # bytes again cannot repair a payload that fails to unpickle.
    return pkl.loads(payload)

In [3]:
# Keys in the bucket that are left out when collecting the commenter files.
utility_files = ['ChannelIdMap.pkl', 'CurrentChannel.pkl', 'YoutubeUsernames.pkl']

# Every other key in the bucket is kept.
commenter_files = list(
    filter(lambda key: key not in utility_files, see_all_files_s3(bucket))
)

# Map the second '_'-separated token of each key (used as the channel name in
# the comparison cell) to the full key.
channel_file_dict = dict((key.split('_')[1], key) for key in commenter_files)
len(channel_file_dict)


106

In [4]:
# Channel table; create_labels_csv() reads its 'username', 'displayname' and
# 'subscribers' columns.
channel_df = pd.read_csv('Top20kYoutubeChannels.csv')

# Edge list written by create_edges_csv() on an earlier run of this notebook.
# On a first run the file does not exist yet, so start from an empty edge list
# instead of failing.
try:
    edges_df = pd.read_csv('youtube_edges.csv')
except FileNotFoundError:
    edges_df = pd.DataFrame(columns=['Source', 'Target', 'Weight'])

# (Source, Target) pairs already present in the edge file.
computer_edges = set(zip(edges_df['Source'], edges_df['Target']))

In [5]:
overlap_dict = {}
processed_channels = set()

# Weights already stored in youtube_edges.csv, keyed by (Source, Target); this
# is the same set of pairs that computer_edges holds.
# create_edges_csv() rewrites that file from overlap_dict alone, so a pair that
# is not recomputed here must have its stored weight copied forward, otherwise
# it disappears when the file is rewritten.
previous_weights = {
    (source, target): int(weight)
    for source, target, weight in zip(
        edges_df['Source'], edges_df['Target'], edges_df['Weight']
    )
}

for primary_channel, _file in channel_file_dict.items():
    primary_channel_commenter_dict = load_pkl_obj_s3(_file, bucket)
    overlap_dict[primary_channel] = {}

    for comparison_channel, comparison_file in channel_file_dict.items():
        if comparison_channel == primary_channel:
            continue
        # The pair was already handled when comparison_channel was the primary.
        if comparison_channel in processed_channels:
            continue

        pair = (primary_channel, comparison_channel)
        if pair in previous_weights:
            # Already computed on an earlier run: reuse the stored weight and
            # skip the S3 download.
            overlap_dict[primary_channel][comparison_channel] = previous_weights[pair]
            continue

        print(f'comparisons for {primary_channel} and {comparison_channel}')

        comparison_channel_commenter_dict = load_pkl_obj_s3(comparison_file, bucket)

        # Each loaded dict is indexed by its own channel name; the two values
        # are combined with `&` and the size of the result is the edge weight.
        shared_commenters = primary_channel_commenter_dict[primary_channel] & comparison_channel_commenter_dict[comparison_channel]
        shared_commenter_count = len(shared_commenters)
        overlap_dict[primary_channel][comparison_channel] = shared_commenter_count

    processed_channels.add(primary_channel)


comparisons for AddictedA1 and Chloesaddiction
comparisons for AddictedA1 and Chrisbrowntv
comparisons for AddictedA1 and DisneyMusicVEVO
comparisons for AddictedA1 and DrakeOfficial
comparisons for AddictedA1 and DrossRotzank
comparisons for AddictedA1 and Eminemvevo
comparisons for AddictedA1 and Imaginedragons
comparisons for AddictedA1 and JYPentertainment
comparisons for AddictedA1 and Jellyyt
comparisons for AddictedA1 and Karolgmusic
comparisons for AddictedA1 and Lady16makeup
comparisons for AddictedA1 and Michaeljackson
comparisons for AddictedA1 and NickiMinajTelevision
comparisons for AddictedA1 and NinjasHyper
comparisons for AddictedA1 and Rihannavevo
comparisons for AddictedA1 and SMOSH
comparisons for AddictedA1 and SongsOfIndianCinema
comparisons for AddictedA1 and Sonypalindia
comparisons for AddictedA1 and TwiNboTzVids
comparisons for AddictedA1 and Wwrbhojpuri
comparisons for AddictedA1 and adelelondon
comparisons for AddictedA1 and ashchanchlani
comparisons for Addi

In [8]:
def create_edges_csv(overlap_dict, path='youtube_edges.csv'):
    """Write the channel-overlap mapping as a Source/Target/Weight edge list.

    Args:
        overlap_dict: Mapping ``{source: {target: weight}}``; one CSV row is
            written per inner entry.
        path: Destination file; defaults to ``'youtube_edges.csv'``. An
            existing file is overwritten.
    """
    # newline='' lets the csv module control line endings (otherwise extra
    # blank rows appear on platforms that translate '\n'); the explicit
    # encoding matches create_labels_csv and the pandas default used when this
    # file is read back.
    with open(path, 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(['Source', 'Target', 'Weight'])
        writer.writerows(
            [primary_channel, comparison_channel, shared_overlap]
            for primary_channel, primary_overlap_dict in overlap_dict.items()
            for comparison_channel, shared_overlap in primary_overlap_dict.items()
        )

def create_labels_csv(overlap_dict, channel_df, path='youtube_labels.csv'):
    """Write one Id/Label/Count row for every key of ``overlap_dict``.

    Args:
        overlap_dict: Mapping whose keys are channel usernames; only the keys
            are used.
        channel_df: DataFrame with 'username', 'displayname' and
            'subscribers' columns. The first row whose 'username' equals the
            key supplies the label and the count.
        path: Destination file; defaults to ``'youtube_labels.csv'``. An
            existing file is overwritten.

    Raises:
        IndexError: If a key of ``overlap_dict`` has no matching 'username'
            row in ``channel_df``.
    """
    # newline='' lets the csv module control line endings (otherwise extra
    # blank rows appear on platforms that translate '\n').
    with open(path, 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(['Id', 'Label', 'Count'])
        for primary_channel in overlap_dict:
            # Filter the frame once and read both fields from the same row.
            channel_row = channel_df[channel_df['username'] == primary_channel].iloc[0]
            writer.writerow(
                [primary_channel, channel_row['displayname'], channel_row['subscribers']]
            )

In [9]:
# Write youtube_edges.csv and youtube_labels.csv from the overlap mapping
# built in the comparison cell.
create_edges_csv(overlap_dict)
create_labels_csv(overlap_dict, channel_df)