In [None]:
from json.decoder import JSONDecodeError
import requests
import json
import pandas as pd
import time
import collections
import os
import time

#Specify API Credentials in .env file
#load_dotenv() must run before the lookups below so values from that file are visible in os.environ
from dotenv import load_dotenv
load_dotenv()

#Twitch credentials used to build the request headers for the Helix calls further down.
#os.environ.get returns None when a variable is unset, so a missing .env entry is not reported here.
CLIENT_ID = os.environ.get('CLIENT_ID')
ACCESS_TOKEN = os.environ.get('ACCESS_TOKEN')


In [None]:
# --- Getting Clips from past Month ---

def find_all_lsf_clips_past_month(end_range: int = 1622256944, max_retries: int = 5) -> pd.DataFrame:
    """Collect recent r/LivestreamFail submissions that link to Twitch clips.

    Pages backwards through Pushshift (newest first, up to 100 posts per
    request) until a post older than ``end_range`` is seen or Pushshift stops
    returning data.

    Args:
        end_range: UTC epoch seconds of the oldest post to keep. Set this to
            roughly one month before the run; the default is the value that
            used to be hard-coded here.
        max_retries: Number of consecutive unparseable responses that are
            retried before the crawl stops and returns what it has so far.

    Returns:
        DataFrame with the columns ``Title``, ``Created_UTC``, ``URL`` and
        ``Post_Link``, one row per clip post that was not removed or deleted.
    """
    #Parameters for query
    endpoint = "https://api.pushshift.io/reddit/search/submission/"
    #User-Agent is an HTTP header, not a search parameter
    headers = {"User-Agent": "LSF Analysis by u/Kgersh"}
    before = 9999999999
    #NOTE(review): Pushshift documents the suffixes s/m/h/d for relative times, so "1m" may be
    #read as one minute rather than one month -- confirm against the API docs
    after = "1m"

    columns = ["Title", "Created_UTC", "URL", "Post_Link"]
    records = [] #Collected as plain dicts and turned into a DataFrame once at the end
    failed_attempts = 0

    #Loop because max return is 100 and we need to query until we hit the month end
    while True:
        #Query pushshift for reddit submissions in timeframe
        params = {"subreddit": "LivestreamFail", "before": before, "after": after, "size": 100, "sort": "desc"}
        request = requests.get(endpoint, params=params, headers=headers)
        time.sleep(1) #Pushshift has ratelimits

        #Attempt to parse response, if not wait and try again (a bounded number of times)
        try:
            response = request.json()
        except ValueError: #json.JSONDecodeError is a ValueError subclass
            failed_attempts += 1
            if failed_attempts > max_retries:
                print("Too many unparseable responses, breaking")
                break
            time.sleep(1)
            continue
        failed_attempts = 0

        #Conditions for when there are no remaining posts, break loop
        if "data" not in response:
            print("Data not found, breaking")
            break

        objects = response["data"]
        if len(objects) == 0:
            print("Length of objects is 0, breaking")
            break

        #For each clip check if it is in range and make sure it was not deleted or removed
        reached_end = False
        for obj in objects:
            if obj["created_utc"] < end_range:
                print(obj["created_utc"], "End of date range, returning")
                reached_end = True
                break
            if "https://clips.twitch.tv" not in obj.get("url", ""):
                continue
            if obj.get("selftext") in ("[removed]", "[deleted]") or "removed_by_category" in obj:
                #Skip only this post; the rest of the page may still hold valid clips
                print("post check failed...")
                continue
            records.append({
                "Title": obj["title"],
                "Created_UTC": obj["created_utc"],
                "URL": obj["url"],
                "Post_Link": obj.get("full_link"),
            })

        if reached_end:
            break

        #The next query will use this before value to paginate through posts
        before = objects[-1]["created_utc"] - 1

    return pd.DataFrame(records, columns=columns)

#Run the crawl once and keep the result for the cells below (one Pushshift request per page, each followed by a 1 second pause)
clips_df = find_all_lsf_clips_past_month()

#Write df to csv if needed
#clips_df.to_csv("lsf_clips_month.csv")

In [None]:
# --- Getting Channels for each Clip ---

#Read df back from csv if needed
#clips_df = pd.read_csv("lsf_clips_month.csv")

#Queries Twitch API for channel from a clip -- Used in df.apply()
def _clip_slug_from_url(clip_url):
    """Return the clip slug from a clips.twitch.tv URL, ignoring query, fragment and trailing slash."""
    from urllib.parse import parse_qs

    tail = clip_url.split(".tv/")[-1]
    tail = tail.split("#")[0]
    path, _, query = tail.partition("?")
    slug = path.strip("/")

    #Embed links carry the slug in the query string: .../embed?clip=<slug>&parent=...
    if slug == "embed":
        clip_ids = parse_qs(query).get("clip")
        if clip_ids:
            slug = clip_ids[0]
    return slug


def find_channel_from_clip(x):
    """Look up the broadcaster name of the clip linked in a row -- used in df.apply().

    Args:
        x: Row (or any mapping) whose ``"URL"`` entry is a clips.twitch.tv link.

    Returns:
        The ``broadcaster_name`` reported by Twitch for the clip, or the
        sentinel string ``"Deleted"`` when Twitch returns an empty result.

    Raises:
        KeyError: If the response has no ``"data"`` field (for example an
            error payload); the message includes the payload for diagnosis.
    """
    #Parameters for query
    slug = _clip_slug_from_url(x["URL"])
    endpoint = "https://api.twitch.tv/helix/clips"
    HEADERS = {"Client-Id": CLIENT_ID, "Authorization": f"Bearer {ACCESS_TOKEN}"}
    PARAMS = {"id": slug}

    #Ask Twitch for info from clip
    request = requests.get(endpoint, headers=HEADERS, params=PARAMS)
    time.sleep(1) #Space out queries, optional but good practice

    payload = request.json()
    if "data" not in payload:
        #Error payloads are not deleted clips; surface them with context instead of a bare KeyError('data')
        raise KeyError(f"Twitch returned no clip data for {slug!r}: {payload}")

    #Attempt to read the channel name, otherwise clip must have been deleted
    try:
        return payload["data"][0]["broadcaster_name"] #Json path of channel name from response
    except IndexError:
        print("Clip Deleted")
        return "Deleted"

#Add channels to each clip entry
#Calls find_channel_from_clip once per row, so this costs one Twitch request plus a 1 second pause per clip
clips_df["Channel"] = clips_df.apply(find_channel_from_clip, axis=1)

In [None]:
# %%
# --- Getting Followers for each Channel ---
#Ideally this lookup would run once per unique channel, after the channels dataframe is built further down, rather than once per clip row
#Save channels we have already hit to minimize API queries
found_channels = {}

#Queries Twitch API for followers from a channel -- Used in df.apply()
def get_followers_for_channel(x):
    """Return the follower total for the channel named in a row.

    Results are memoized in the module-level ``found_channels`` dict so each
    channel is queried at most once per successful (or definitively failed)
    lookup.

    Args:
        x: Row (or any mapping) whose ``"Channel"`` entry is the channel name.

    Returns:
        The ``"total"`` value reported by Twitch, or 0 when the channel is the
        ``"Deleted"`` sentinel, the user cannot be found, or the response does
        not contain the expected fields.
    """
    channel = x["Channel"]

    #Check if we've already found the followers for that channel
    if channel in found_channels:
        return found_channels[channel]

    #"Deleted" is the placeholder written for clips that no longer exist, not a login to look up
    if channel == "Deleted":
        found_channels[channel] = 0
        return 0

    #Before getting followers we need the user_id of the channel
    #NOTE(review): the channel value comes from the clip's broadcaster_name; if that is a display
    #name that differs from the login, this lookup will come back empty -- confirm
    HEADERS = {"Client-Id": CLIENT_ID, "Authorization": f"Bearer {ACCESS_TOKEN}"}
    request = requests.get("https://api.twitch.tv/helix/users", headers=HEADERS, params={"login": channel})

    time.sleep(0.2) #Again not required but good practice

    #Attempt to parse request for the user_id of the channel otherwise just return 0 followers
    user_payload = request.json()
    try:
        user_id = user_payload["data"][0]["id"]
    except IndexError:
        #Empty result list: Twitch knows no such login, so remember that and stop asking
        print(user_payload)
        print("Id not found")
        found_channels[channel] = 0
        return 0
    except KeyError:
        #Unexpected payload (e.g. an error response): do not cache, a later row may succeed
        print(user_payload)
        print("Id not found")
        return 0

    #Now that we have the user_id we can ask for their follower count
    #NOTE(review): Twitch has been replacing this endpoint with "Get Channel Followers" -- confirm it is still served
    request = requests.get("https://api.twitch.tv/helix/users/follows", headers=HEADERS, params={"to_id": user_id})

    #Attempt to parse request for follower count otherwise return 0 followers
    follow_payload = request.json()
    if "total" not in follow_payload:
        print(follow_payload)
        print("Follower count not found")
        return 0

    found_channels[channel] = follow_payload["total"]
    return found_channels[channel]

#Add followers to each clip entry
#Calls get_followers_for_channel once per row; channels already stored in found_channels are answered from that dict instead of the API
clips_df["Follower Count"] = clips_df.apply(get_followers_for_channel, axis=1)

In [None]:
# --- Generating Final Data ----

#Now that we have all the clip and channel info, we can do the 'by channel' analysis

#Find the number of times each channel appears in the clips df
#(Counter keeps channels in order of first appearance)
occurences = collections.Counter(clips_df["Channel"].tolist())
del occurences["Deleted"] #Counter ignores a missing key, so this is safe when no clip was deleted

#Follower count comes from the first clip row of each channel; it is looked up in one pass
#here instead of re-filtering clips_df once per channel
first_clip_per_channel = clips_df.drop_duplicates(subset="Channel", keep="first").set_index("Channel")
channels = list(occurences)

#Create new df to store channels, clip #, and follower count
channel_df = pd.DataFrame(
    {
        "LSF Clip Count": [occurences[name] for name in channels],
        "Follower Count": first_clip_per_channel["Follower Count"].reindex(channels).tolist(),
    },
    index=pd.Index(channels, name="Channel"),
)

#Dump to new csv
channel_df.to_csv("lsf_analysis.csv")
