In [24]:
import csv
import os

import googleapiclient.discovery
import pandas as pd
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import requests
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
# Set up the YouTube API client.
# The key is read from the YOUTUBE_API_KEY environment variable so that it is
# never stored in the notebook; the fallback is the empty string this cell
# used before, so behaviour is unchanged when the variable is not set.
api_key = os.environ.get("YOUTUBE_API_KEY", "")
youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=api_key)

# Specify the channel custom URL
channel_custom_url = "https://www.youtube.com/@yougotfam"

# Extract the channel handle: the last path segment of the URL ("@yougotfam").
# Trailing slashes are stripped first, otherwise the last segment would be "".
channel_username = channel_custom_url.rstrip("/").split("/")[-1]

# Search for the channel by handle and keep the single top-ranked result.
search_response = youtube.search().list(part="id", q=channel_username, type="channel", maxResults=1).execute()
search_items = search_response.get("items", [])
if not search_items:
    # Fail with a readable message instead of an IndexError on items[0].
    raise LookupError(f"No YouTube channel found for {channel_username!r}")
channel_id = search_items[0]["id"]["channelId"]

In [28]:
# Retrieve the channel's metadata (snippet + statistics) and print a summary.
channel_response = youtube.channels().list(part="snippet,statistics", id=channel_id).execute()
channel_items = channel_response.get("items", [])
if not channel_items:
    # Fail with a readable message instead of a KeyError/IndexError on ["items"][0].
    raise LookupError(f"The API returned no channel for id {channel_id!r}")
channel = channel_items[0]
channel_stats = channel.get("statistics", {})
print("Channel name:", channel["snippet"]["title"])
print("Channel description:", channel["snippet"]["description"])
# Individual statistics can be absent from the response (presumably e.g. when
# a channel hides its subscriber count), so fall back to "N/A" — the same
# placeholder the per-video export uses — instead of raising KeyError.
print("Number of subscribers:", channel_stats.get("subscriberCount", "N/A"))
print("Total number of views:", channel_stats.get("viewCount", "N/A"))


Channel name: Fam India
Channel description: FamX by Fam, is India’s first Spending Account made for everyone.

Number of subscribers: 56300
Total number of views: 2612678


In [29]:
# Retrieve the channel's videos and their metadata.
# search().list is only asked for IDs here, so every page of IDs is followed
# by one videos().list call that fetches snippet + statistics for that page.
videos = []
next_page_token = None
while True:
    request = youtube.search().list(part="id", channelId=channel_id, maxResults=50, pageToken=next_page_token, type="video")
    response = request.execute()
    video_ids = [item["id"]["videoId"] for item in response.get("items", [])]
    # Skip the detail lookup for an empty page: joining no IDs would send
    # videos().list a request with an empty `id` filter.
    if video_ids:
        video_request = youtube.videos().list(part="snippet,statistics", id=",".join(video_ids))
        video_response = video_request.execute()
        videos.extend(video_response.get("items", []))
    # The absence of nextPageToken marks the last page.
    next_page_token = response.get("nextPageToken")
    if not next_page_token:
        break

In [30]:
def _build_video_row(video):
    """Flatten one ``videos().list`` item into a CSV row dict (caption excluded).

    Statistics that are missing from the item are written as "N/A". The upload
    date is taken from the snippet, so it is filled in even when the item has
    no "statistics" section.
    """
    stats = video.get("statistics", {})
    video_id = video["id"]
    return {
        "video_title": video["snippet"]["title"],
        "video_id": video_id,
        "video_url": f"https://www.youtube.com/watch?v={video_id}",
        "views": stats.get("viewCount", "N/A"),
        "likes": stats.get("likeCount", "N/A"),
        "comments": stats.get("commentCount", "N/A"),
        "upload_date": video["snippet"]["publishedAt"],
    }


def _fetch_caption_text(video_id):
    """Return the raw srv3 text of the English caption track of ``video_id``.

    Returns "No caption available." when the video lists no caption tracks,
    or when the timedtext download fails, is not a success response, or has
    an empty body.
    """
    caption_response = youtube.captions().list(part="id", videoId=video_id).execute()
    if not caption_response.get("items"):
        return "No caption available."

    caption_download_url = f"https://www.youtube.com/api/timedtext?v={video_id}&lang=en&fmt=srv3"
    try:
        # The timeout keeps one stalled download from hanging the whole export.
        download = requests.get(caption_download_url, timeout=30)
    except requests.RequestException:
        return "No caption available."
    # Only keep the body of a successful, non-empty response so that error
    # pages or blank replies are not stored as if they were captions.
    if download.ok and download.text.strip():
        return download.text
    return "No caption available."


with open("video_data.csv", "w", newline='', encoding='utf-8') as csvfile:
    fieldnames = ["video_title", "video_id", "video_url", "views", "likes", "comments", "caption", "upload_date"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Retrieve captions and video data, and save them to the CSV file
    for video in videos:
        video_data = _build_video_row(video)
        video_data["caption"] = _fetch_caption_text(video_data["video_id"])

        # Write the data to the CSV file
        writer.writerow(video_data)

        # Print the data to the console
        print("Video title:", video_data["video_title"])
        print("Video ID:", video_data["video_id"])
        print("Video URL:", video_data["video_url"])
        print("Views:", video_data["views"])
        print("Likes:", video_data["likes"])
        # This value is the upload date; it was previously printed under a "Dislikes:" label.
        print("Upload date:", video_data["upload_date"])
        print("Comments:", video_data["comments"])
        print("Caption:\n", video_data["caption"])

Video title: Happy Diwali 2020 | FamPay
Video ID: hiQ4nLVbqY4
Video URL: https://www.youtube.com/watch?v=hiQ4nLVbqY4
Views: 2497
Likes: 156
Dislikes: 2020-11-14T05:05:55Z
Comments: N/A
Caption:
 No caption available.
Video title: Get Featured In Tanmay Bhat’s Next Video #shorts
Video ID: YEK2T3YLT9M
Video URL: https://www.youtube.com/watch?v=YEK2T3YLT9M
Views: 14725
Likes: 786
Dislikes: 2021-03-11T03:20:57Z
Comments: N/A
Caption:
 No caption available.
Video title: FamPay Explained To Parents | All about India's First Neobank for Teens
Video ID: BTcwwlx5Xqk
Video URL: https://www.youtube.com/watch?v=BTcwwlx5Xqk
Views: 371303
Likes: 11755
Dislikes: 2021-01-22T12:25:56Z
Comments: N/A
Caption:
 No caption available.
Video title: WE USED A METAL DETECTOR AND FOUND THIS 😳🤯
Video ID: X9gwJqLJ6nE
Video URL: https://www.youtube.com/watch?v=X9gwJqLJ6nE
Views: 4272
Likes: N/A
Dislikes: 2022-10-20T06:43:15Z
Comments: 14
Caption:
 No caption available.
Video title: How to activate your FamCard: st

In [36]:
# Reload the exported per-video metadata from disk into a DataFrame.
df = pd.read_csv(filepath_or_buffer="video_data.csv")
# Preview the first five rows as the cell's output.
df.head(n=5)

Unnamed: 0,video_title,video_id,video_url,views,likes,comments,caption,upload_date
0,Happy Diwali 2020 | FamPay,hiQ4nLVbqY4,https://www.youtube.com/watch?v=hiQ4nLVbqY4,2497,156.0,,No caption available.,2020-11-14T05:05:55Z
1,Get Featured In Tanmay Bhat’s Next Video #shorts,YEK2T3YLT9M,https://www.youtube.com/watch?v=YEK2T3YLT9M,14725,786.0,,No caption available.,2021-03-11T03:20:57Z
2,FamPay Explained To Parents | All about India'...,BTcwwlx5Xqk,https://www.youtube.com/watch?v=BTcwwlx5Xqk,371303,11755.0,,No caption available.,2021-01-22T12:25:56Z
3,WE USED A METAL DETECTOR AND FOUND THIS 😳🤯,X9gwJqLJ6nE,https://www.youtube.com/watch?v=X9gwJqLJ6nE,4272,,14.0,No caption available.,2022-10-20T06:43:15Z
4,How to activate your FamCard: step by step tut...,SKqJ2QFw3_I,https://www.youtube.com/watch?v=SKqJ2QFw3_I,19857,1397.0,,No caption available.,2022-06-06T07:34:37Z


In [41]:
# Bar chart with one bar per video title: bar height is the view count and
# the bar colour encodes the like count.
fig = px.bar(data_frame=df, x="video_title", y="views", color="likes").update_layout(
    title="Views and Likes for Each Video",
    xaxis_title="Video Title",
    yaxis_title="Count",
)
# Export a standalone HTML copy (auto_open also opens it in the browser),
# then render the figure inline.
pio.write_html(fig, file="../../fam-report-site/public/Social-Analytics/views_likes.html", auto_open=True)
fig.show()

In [43]:
# Prepare the data
df['video_title'] = df['video_title'].fillna('')  # Fill NaN values with empty string
video_titles = df['video_title']

# Vectorize the video titles (one TF-IDF row per title)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(video_titles)

# Apply K-means clustering.
# n_init is pinned explicitly: its default differs between scikit-learn
# releases, so leaving it implicit can change the clusters (and emit a
# FutureWarning) depending on the installed version. random_state fixes the
# seed so reruns give the same labels.
num_clusters = 5
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(X)

# Assign cluster labels to the dataframe
df['cluster_label'] = kmeans.labels_

# Calculate the average likes for each cluster
cluster_likes = df.groupby('cluster_label')['likes'].mean()

# Plotting: one bar per cluster, bar height is the cluster's average likes.
# (plotly.graph_objects is already imported as `go` in the imports cell.)
fig = go.Figure(data=go.Bar(x=cluster_likes.index, y=cluster_likes.values))
fig.update_layout(
    title='Average Likes by Video Cluster',
    xaxis_title='Cluster',
    yaxis_title='Average Likes'
)
# Export a standalone HTML copy (auto_open also opens it in the browser),
# then render the figure inline.
pio.write_html(fig, file='../../fam-report-site/public/Social-Analytics/likes_by_cluster.html', auto_open=True)
fig.show()






In [35]:
# Clusters with the lowest and the highest average like count.
worst_cluster, best_cluster = cluster_likes.idxmin(), cluster_likes.idxmax()

# Titles of every video assigned to the worst-performing cluster.
print("Topics in the Worst Performing Cluster (Cluster {}):".format(worst_cluster))
worst_cluster_topics = df.loc[df["cluster_label"] == worst_cluster, "video_title"]
for topic in worst_cluster_topics:
    print(topic)

# Blank separator between the two listings.
print("\n")

# Titles of every video assigned to the best-performing cluster.
print("Topics in the Best Performing Cluster (Cluster {}):".format(best_cluster))
best_cluster_topics = df.loc[df["cluster_label"] == best_cluster, "video_title"]
for topic in best_cluster_topics:
    print(topic)


Topics in the Worst Performing Cluster (Cluster 2):
It's Debatable Finals LIVE - Day 2 FamJam 2.0
Day1 Part 2 - Workshop on Finding The Lost Cause - FamJam 2.0 India's largest digi-fest for teens!
Pitch Please Finale LIVE - Day 2 FamJam 2.0
Presenting FamJam 2.0 - India's Largest Digi-Fest for Teens By FamPay!
Day1 Part 1 - FamJam 2.0 - FamPageant & Workshop on The Art of Journaling
Workshop on Writing A Comedy Sketch LIVE - Day 2 FamJam 2.0
F for Fame and Comicsteen Finale LIVE - Day 2 FamJam 2.0


Topics in the Best Performing Cluster (Cluster 4):
FamPay Explained To Parents | All about India's First Neobank for Teens
How to activate your FamCard: step by step tutorial | India's first numberless card by FamPay
What’s Alina’s type? ft. FamCard
FamCard- The Only Numberless Card For Teens by FamPay
BREAKING NEWS FOR TEENS! ft. @Saurabh_Ghadge | FamPay | Payments app for Teens
Aryan couldn't stop talking about THIS...| Aryan Katariya, Krisha Jain | FamPay India
FamCard Me- The Only Card 

In [52]:
# Store each video's K-means cluster label on the dataframe.
df['cluster_label'] = kmeans.labels_

# One marker per video: x = views, y = likes, marker colour = cluster label,
# with the video title shown on hover.
cluster_scatter = go.Scatter(
    x=df["views"],
    y=df["likes"],
    mode="markers",
    marker={"color": df["cluster_label"]},
    hovertext=df["video_title"],
)
fig = go.Figure(data=cluster_scatter).update_layout(
    title="Views and Likes for Each Video",
    xaxis_title="Views",
    yaxis_title="Likes",
    hovermode="closest",
)
# Export a standalone HTML copy (auto_open also opens it in the browser),
# then render the figure inline.
pio.write_html(fig, file="../../fam-report-site/public/Social-Analytics/views_likes_by_cluster.html", auto_open=True)
fig.show()
