In [None]:
import ast

import pandas as pd

from settings import API_KEY, OUTLIERS
from methods import *

# Enable tqdm pandas methods (this registers `progress_apply`, which the
# commenter-collection step further down relies on).
# NOTE(review): `tqdm` is not imported explicitly in this notebook; presumably
# it arrives through `from methods import *` - confirm.
tqdm.pandas()

# Test api connection
test_api()

# Gather and export all current data

In [None]:
# Collect candidate channel ids from a search for "seimas"
channel_ids = search_videos_get_channels("seimas")

# Alternative starting point: every channel gathered by earlier searches
# channel_ids = get_all_channels()

# Leave out the channels listed in OUTLIERS (imported from settings)
channel_ids = list(filter(lambda channel_id: channel_id not in OUTLIERS, channel_ids))
print("Channels found:", len(channel_ids))

In [None]:
# Fetch metadata for the candidate channel ids, then apply the channel filter.
# NOTE(review): the filtering criteria live inside filter_channels (not visible
# here). Its result is used below as a DataFrame whose index is passed on as
# the list of channels to fetch videos for.
all_channels = get_channels_metadata(channel_ids)
df_channels = filter_channels(all_channels)

print("Channels after initial filtering:", len(df_channels))

In [None]:
# Fetch the videos for every channel in the table's index and pass them
# straight through the video filter.
videos = filter_videos(
    get_all_videos(df_channels.index.to_list())
)
print("Videos found:", len(videos))

In [None]:
# Aggregate the filtered videos (presumably one row per channel - confirm in
# methods) and attach the result to the channel table.
videos_aggregated = aggregate_and_filter_videos(videos)
# how="inner" keeps only channels present in both frames, so channels without
# a row in videos_aggregated are dropped here.
# NOTE(review): df_channels is rebound to the joined frame; re-running this
# cell on its own joins the same columns a second time, and DataFrame.join
# raises on overlapping column names.
df_channels = df_channels.join(videos_aggregated, how="inner")
print("Channels after video filtering:", len(df_channels))

In [None]:
# Persist the channel table; the later sections reload it from this path.
# NOTE(review): CSV stores non-scalar cells as text, which is why columns such
# as video_ids are parsed again after each read_csv further down.
df_channels.to_csv("data/df_channels.csv")

# Get all commenters

In [None]:
# Reload the channel table saved by the export step and restore its index.
df_channels = pd.read_csv("data/df_channels.csv")
df_channels = df_channels.set_index("channelId")

# video_ids comes back from the CSV as text. Parse it with ast.literal_eval,
# which accepts only Python literals and therefore cannot execute code that
# ended up in the file (eval would run it).
df_channels["video_ids"] = df_channels["video_ids"].apply(ast.literal_eval)

# Make sure the column exists; it is filled in by the commenter step below.
if "commenters" not in df_channels:
    df_channels["commenters"] = None

len(df_channels)

In [None]:
# Run get_channel_commenters on every channel row (axis=1) with a progress bar
# and store what it returns per channel.
df_channels["commenters"] = df_channels.progress_apply(get_channel_commenters, axis=1)
# Size of each channel's commenter collection.
df_channels["commenter_count"] = df_channels["commenters"].apply(len)

# Create co-commenter matrix

In [None]:
# Similarity between channels based on the "commenters" column, computed once
# per method. NOTE(review): the exact "jaccard" / "overlap" definitions are
# implemented in get_similarity, outside this notebook.
j_matrix = get_similarity(df_channels, target_col="commenters", method="jaccard")
o_matrix = get_similarity(df_channels, target_col="commenters", method="overlap")

# Overwrite the channel table (it now carries the commenter columns) and write
# both matrices next to it.
df_channels.to_csv("data/df_channels.csv")
o_matrix.to_csv("data/comment_overlap_matrix.csv")
j_matrix.to_csv("data/comment_jaccard_matrix.csv")

# Cache subscribers

In [None]:
# Reload the channel table (including the saved commenters) and restore its index.
df_channels = pd.read_csv("data/df_channels.csv")
df_channels = df_channels.set_index("channelId")

# commenters comes back from the CSV as text. Parse it with ast.literal_eval,
# which accepts only Python literals and therefore cannot execute code that
# ended up in the file (eval would run it).
# NOTE(review): if commenters are stored as sets, an empty one is written as
# "set()", which literal_eval accepts on Python 3.9+ only.
df_channels["commenters"] = df_channels["commenters"].apply(ast.literal_eval)

# Add the flag column with a default of False when the file does not have it yet.
if "subs_processed" not in df_channels:
    df_channels["subs_processed"] = False

len(df_channels)

In [None]:
# Cache subscribers for the loaded channels.
# NOTE(review): what is cached, and where, is decided inside cache_subscribers
# (not visible here); the subs_processed column is presumably its progress
# marker - confirm.
cache_subscribers(df_channels)

# Get new channels through subscribers

In [None]:
# Reload the channel table and restore its index.
df_channels = pd.read_csv("data/df_channels.csv")
df_channels = df_channels.set_index("channelId")

# commenters comes back from the CSV as text. Parse it with ast.literal_eval,
# which accepts only Python literals and therefore cannot execute code that
# ended up in the file (eval would run it).
# NOTE(review): if commenters are stored as sets, an empty one is written as
# "set()", which literal_eval accepts on Python 3.9+ only.
df_channels["commenters"] = df_channels["commenters"].apply(ast.literal_eval)
len(df_channels)

In [None]:
# Per-channel tally produced by count_subs_per_channel. It is consumed below
# as a mapping: iterated with .items() and its values compared to a threshold.
sub_count = count_subs_per_channel(df_channels)

In [None]:
# Keep only the entries of sub_count whose value is above 100
filt_subs = {channel_id: count for channel_id, count in sub_count.items() if count > 100}
print("channels left", len(filt_subs))

# Look up metadata for the surviving ids and run the channel filter on it
new_channel_ids = list(filt_subs)
all_channels_new = get_channels_metadata(new_channel_ids)
df_channels_new = filter_channels(all_channels_new)

print("AFter filtering channels left", len(df_channels_new))

# Get a list of subs per channel

In [None]:
# Subscribers grouped per channel; joined onto df_channels as the "subs"
# column in the next step.
# NOTE(review): presumably a dict keyed by channelId with one collection of
# subscribers per channel - confirm against get_subs_per_channel.
subs_per_channel = get_subs_per_channel(df_channels)
len(subs_per_channel)

In [None]:
# Attach each channel's subscriber collection as the "subs" column.
# A "subs" column may already be present: it is written to
# data/df_channels.csv at the end of the notebook and comes back on reload,
# and it also exists if this cell is run twice. DataFrame.join raises
# ValueError on overlapping column names, so drop the stale copy first.
df_channels = df_channels.drop(columns=["subs"], errors="ignore").join(
    pd.Series(subs_per_channel).rename("subs")
)

# join() is a left join, so a channel missing from subs_per_channel gets NaN
# in "subs". Count those as 0 instead of letting len() raise TypeError on a
# float. NOTE(review): "subs" itself is left as NaN for such channels - check
# that get_similarity tolerates that.
df_channels["sub_count"] = df_channels["subs"].apply(
    lambda subs: len(subs) if hasattr(subs, "__len__") else 0
)

# Create co-subscriber matrix

In [None]:
# Similarity between channels based on the "subs" column, computed once per
# method. These rebind j_matrix / o_matrix, replacing the commenter-based
# matrices held under the same names earlier in the notebook.
j_matrix = get_similarity(df_channels, target_col="subs", method="jaccard")
o_matrix = get_similarity(df_channels, target_col="subs", method="overlap")

In [None]:
# Overwrite the channel table (it now includes the "subs" and "sub_count"
# columns) and write the subscriber-based matrices to their own files.
df_channels.to_csv("data/df_channels.csv")
o_matrix.to_csv("data/subs_overlap_matrix.csv")
j_matrix.to_csv("data/subs_jaccard_matrix.csv")