In [1]:
import googleapiclient.discovery

import googleapiclient.errors

In [2]:
def get_channel_videos(channel_id):
    youtube = googleapiclient.discovery.build("youtube", "v3", developerKey="redacted")
    

    # Retrieve the first page of videos uploaded by the channel
    request = youtube.search().list(
        part="id",
        channelId=channel_id,
        maxResults=50,
        order="date",
        type="video"
    )
    response = request.execute()

    # Extract the video IDs from the search results
    video_ids = []
    for item in response["items"]:
        video_ids.append(item["id"]["videoId"])

    # Retrieve the details for each video using the videos().list() method
    videos = []
    while True:
        # Retrieve the details of up to 50 videos at a time
        video_request = youtube.videos().list(
            part="snippet, statistics, contentDetails",
            id=",".join(video_ids[:50])
        )
        video_response = video_request.execute()

        # Extract the title, view count, and length of each video
        for item in video_response["items"]:
            title = item["snippet"]["title"]
            view_count = item["statistics"]["viewCount"]
            length = item["contentDetails"]["duration"]
            
            # Convert the length to a more readable format (HH:MM:SS)
            length = length.replace("PT", "").replace("H", ":").replace("M", ":").replace("S", "")
            if length.count(":") == 1:
                length = "00:" + length
            videos.append({
                "title": title,
                "view_count": view_count,
                "length": length
            })

        # Check if there are more pages of results
        if "nextPageToken" in response:
            next_page_token = response["nextPageToken"]

            # Retrieve the next page of results
            request = youtube.search().list(
                part="id",
                channelId=channel_id,
                maxResults=50,
                order="date",
                type="video",
                pageToken=next_page_token
            )
            response = request.execute()

            # Extract the video IDs from the search results
            video_ids = []
            for item in response["items"]:
                video_ids.append(item["id"]["videoId"])

        else:
            # No more pages of results, break out of the loop
            break

    return videos

In [3]:
channel_id = "UCrPseYLGpNygVi34QpGNqpA"
videos = get_channel_videos(channel_id)


In [4]:
videos = [video for video in videos if video['length'].split(':')[0] == '00']
for video in videos:
    print(video)

{'title': 'Every Dot is a YouTuber. Can I Win?', 'view_count': '695488', 'length': '00:8:37'}
{'title': 'I Hired a Pro Gamer To Secretly Destroy My Chat', 'view_count': '1605691', 'length': '00:8:2'}
{'title': 'Proximity Chat Valorant is Insane.', 'view_count': '622666', 'length': '00:8:32'}
{'title': 'Mario 64 Speedrun But Every 5 Minutes I Add An Overlay to the HUD', 'view_count': '579112', 'length': '00:16:41'}
{'title': 'I Tried Driving Drunk. Legally.', 'view_count': '1164453', 'length': '00:14:21'}
{'title': 'You Laugh You Lose But it’s My Girlfriends Tik Toks', 'view_count': '923519', 'length': '00:14:43'}
{'title': 'Cooking Expensive Steak in a Toaster', 'view_count': '681365', 'length': '00:8:45'}
{'title': "I Challenged Japan's Best Competitive Eater. It Didn't Go Well...", 'view_count': '1136394', 'length': '00:12:41'}
{'title': 'Doing Whatever Stream Tells Me To Do... Again.', 'view_count': '998776', 'length': '00:29:44'}
{'title': 'He Fumbled the Bag', 'view_count': '75671

In [5]:
import pandas as pd

In [6]:
data = videos
df = pd.DataFrame(data)
df = df.astype({'view_count':'int'})


In [7]:
df = df.sort_values(by=['view_count'], ascending=False)

In [8]:
df

Unnamed: 0,title,view_count,length
61,"I Buried $100,000, Go Find It",12470056,00:11:43
286,jschlatt ruins my gameshow | Mogul Money,6034505,00:37:23
210,I made a secret YouTube channel to prove it's ...,5276814,00:16:56
136,I streamed until I beat Elden Ring. It was a m...,5047875,00:59:28
329,Using Magnus Carlsen to CHEAT in Chess,4764838,00:28:5
...,...,...,...
255,Ludwig Hair Tutorial,281587,00:8:14
217,Ludwig Clones Himself ft. William Osman,279983,00:2:31
222,Playing Runescape for the First Time in 20 Years!,278795,00:10:38
354,MOVING IN WITH THE BOYS,277731,00:12:31


In [20]:
import math
general_classifier = math.floor(df.shape[0]/5)
last_classifier = general_classifier + (df.shape[0] - (general_classifier*5))

In [22]:
percentiles = [0, 0.2, 0.4, 0.6, 0.8, 1]
ranges = pd.qcut(df['view_count'], percentiles, labels=False)

In [23]:
df['percentile_range'] = ranges

In [24]:
df

Unnamed: 0,title,view_count,length,percentile_range
61,"I Buried $100,000, Go Find It",12470056,00:11:43,4
286,jschlatt ruins my gameshow | Mogul Money,6034505,00:37:23,4
210,I made a secret YouTube channel to prove it's ...,5276814,00:16:56,4
136,I streamed until I beat Elden Ring. It was a m...,5047875,00:59:28,4
329,Using Magnus Carlsen to CHEAT in Chess,4764838,00:28:5,4
...,...,...,...,...
255,Ludwig Hair Tutorial,281587,00:8:14,0
217,Ludwig Clones Himself ft. William Osman,279983,00:2:31,0
222,Playing Runescape for the First Time in 20 Years!,278795,00:10:38,0
354,MOVING IN WITH THE BOYS,277731,00:12:31,0


In [64]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer


In [78]:
corpus = df['title'].dropna().tolist()


In [165]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

In [166]:
X.toarray()

array([[0.2924925 , 0.        , 0.43481088, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [167]:
X.shape

(441, 820)

In [168]:
y = df['percentile_range'].dropna().tolist()
y = np.array(y)


In [169]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   test_size=0.3,
                                                   random_state=1234)


In [170]:
lr_model = LogisticRegression(random_state=1234)
param_dict = {'C': [0.001, 0.01, 0.1, 1, 10],
             'solver': ['sag', 'lbfgs', 'saga']}

In [171]:
from sklearn.model_selection import GridSearchCV


In [172]:
grid_search = GridSearchCV(lr_model, param_grid=param_dict)


In [173]:
def get_best_params(grid):
    grid.fit(X_train, y_train)
    best_params = grid.best_params_
    return best_params
    

In [174]:
best_params = get_best_params(grid_search)




In [175]:
best_params

{'C': 10, 'solver': 'saga'}

In [176]:
lr=LogisticRegression(C=10, solver ='sag')
lr.fit(X_train, y_train)
lr_preds = lr.predict(X_test)

In [177]:
score = accuracy_score(y_test, lr_preds)


In [178]:
score

0.23308270676691728