In [1]:
# !pip uninstall gensim
# !pip uninstall scipy
# !pip install scipy==1.10.1
# !pip install gensim==4.3.2


In [1]:
import pandas as pd
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
# from gensim import corpora
# from gensim.models import LdaModel
# from sklearn.feature_extraction.text import CountVectorizer

In [2]:
video_details_file = "Video_Transcripts.csv"
video_transcripts = pd.read_csv(video_details_file)
video_transcripts.columns

Index(['channel_name', 'video_id', 'video_title', 'published_datetime',
       'duration', 'view_count', 'like_count', 'dislike_count',
       'comment_count', 'description', 'thumbnail_url', 'transcript'],
      dtype='object')

In [3]:
class_df = pd.read_csv('videos_selected.csv', encoding='ISO-8859-1')
class_df.columns

channel_class_map = class_df.set_index('channel_name')['youtube_tier'].to_dict()

video_transcripts['class'] = video_transcripts['channel_name'].map(channel_class_map)

columns = video_transcripts.columns.tolist()
channel_index = columns.index('channel_name')
columns.remove('class')  # Remove 'class' from current position if it exists
columns.insert(channel_index + 1, 'class')  # Insert 'class' after 'channel_name'

video_transcripts = video_transcripts[columns]

video_transcripts.columns


Index(['channel_name', 'class', 'video_id', 'video_title',
       'published_datetime', 'duration', 'view_count', 'like_count',
       'dislike_count', 'comment_count', 'description', 'thumbnail_url',
       'transcript'],
      dtype='object')

In [4]:
video_transcripts = video_transcripts[['channel_name', 'class','video_id', 'video_title', 'transcript']].copy()

video_transcripts['hook'] = video_transcripts['transcript'].apply(
    lambda x: ' '.join(x.split()[:75]) if isinstance(x, str) else ''
)

video_transcripts.head(3)

Unnamed: 0,channel_name,class,video_id,video_title,transcript,hook
0,@Mrwhosetheboss,Diamond,neIYdLysqlk,I tested the Craziest Xiaomi Gadgets!,you probably know xiaomi for their suspiciousl...,you probably know xiaomi for their suspiciousl...
1,@Mrwhosetheboss,Diamond,YX8ks42Azn8,The TRIPLE FOLDING phone has a Problem.,this right here is the Huawei mate XT I spent ...,this right here is the Huawei mate XT I spent ...
2,@Mrwhosetheboss,Diamond,4RcThoRG46c,I tested every Celebrity Tech product!,in front of me right now are VTech products ma...,in front of me right now are VTech products ma...


In [5]:
sia = SentimentIntensityAnalyzer()

# Perform sentiment analysis on a given text
def analyze_sentiment(text):
    if isinstance(text, str) and text.strip():
        scores = sia.polarity_scores(text)
        return scores['compound']  # Use compound score as overall sentiment
    else:
        return None  # Handle missing or non-string text

# Perform sentiment analysis for title, transcript, and hook
video_transcripts['title_sentiment'] = video_transcripts['video_title'].apply(analyze_sentiment)
video_transcripts['transcript_sentiment'] = video_transcripts['transcript'].apply(analyze_sentiment)
video_transcripts['hook_sentiment'] = video_transcripts['hook'].apply(analyze_sentiment)

# Save the results to a new CSV file
video_transcripts.to_csv("Video_Transcripts_With_Sentiment.csv", index=False)

print("Sentiment analysis complete. Results saved to 'Video_Transcripts_With_Sentiment.csv'.")


Sentiment analysis complete. Results saved to 'Video_Transcripts_With_Sentiment.csv'.


In [6]:
video_transcripts = pd.read_csv("Video_Transcripts_With_Sentiment.csv")
video_transcripts.head()

Unnamed: 0,channel_name,class,video_id,video_title,transcript,hook,title_sentiment,transcript_sentiment,hook_sentiment
0,@Mrwhosetheboss,Diamond,neIYdLysqlk,I tested the Craziest Xiaomi Gadgets!,you probably know xiaomi for their suspiciousl...,you probably know xiaomi for their suspiciousl...,-0.126,1.0,0.9598
1,@Mrwhosetheboss,Diamond,YX8ks42Azn8,The TRIPLE FOLDING phone has a Problem.,this right here is the Huawei mate XT I spent ...,this right here is the Huawei mate XT I spent ...,-0.4019,0.9999,-0.8691
2,@Mrwhosetheboss,Diamond,4RcThoRG46c,I tested every Celebrity Tech product!,in front of me right now are VTech products ma...,in front of me right now are VTech products ma...,0.0,1.0,0.9552
3,@Mrwhosetheboss,Diamond,vSIbvJB4WdI,iPhone 16 Pro Max vs Samsung S24 Ultra Camera ...,this is the iPhone 16 Pro Max this is the Sams...,this is the iPhone 16 Pro Max this is the Sams...,-0.4389,0.9999,0.7184
4,@Mrwhosetheboss,Diamond,cRPBp2tRxFY,iPhone 16 / 16 Pro Unboxing - Testing every ne...,this is the iPhone 16 the iPhone 16 plus the 1...,this is the iPhone 16 the iPhone 16 plus the 1...,0.0,0.9999,0.6124


In [7]:
video_transcripts.columns

Index(['channel_name', 'class', 'video_id', 'video_title', 'transcript',
       'hook', 'title_sentiment', 'transcript_sentiment', 'hook_sentiment'],
      dtype='object')

## TF-IDF

In [8]:
transcripts = video_transcripts['transcript']

# Step 2: TF-IDF Vectorization
vectorizer = TfidfVectorizer(
    stop_words='english',  # Remove common English words
    max_features=5000,    # Limit to 5000 most important terms
    ngram_range=(1, 2)    # Consider unigrams and bigrams
)
tfidf_matrix = vectorizer.fit_transform(transcripts)

# Step 3: Apply NMF for Topic Modeling
n_topics = 5  # Specify the number of topics to extract
nmf_model = NMF(n_components=n_topics, random_state=42)
nmf_matrix = nmf_model.fit_transform(tfidf_matrix)


## NMF Percentage Topic Modeling

In [9]:

# Step 3: Apply NMF for Topic Modeling
n_topics = 5  # Specify the number of topics to extract
nmf_model = NMF(n_components=n_topics, random_state=42)
nmf_matrix = nmf_model.fit_transform(tfidf_matrix)

# Step 4: Normalize contributions (optional: make them sum to 1 for each transcript)
nmf_matrix_normalized = nmf_matrix / nmf_matrix.sum(axis=1, keepdims=True)

# Step 5: Add topic contributions to the dataframe
for topic_idx in range(n_topics):
    column_name = f"topic_{topic_idx + 1}_contribution"
    video_transcripts[column_name] = nmf_matrix_normalized[:, topic_idx]

# Step 6: (Optional) Extract Topic Keywords for Interpretation
def display_topics(model, feature_names, n_top_words):
    topics = {}
    for topic_idx, topic in enumerate(model.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        topics[topic_idx] = top_words
        print(f"Topic {topic_idx + 1}: {', '.join(top_words)}")
    return topics

n_top_words = 10
feature_names = vectorizer.get_feature_names_out()
topics = display_topics(nmf_model, feature_names, n_top_words)

# Save the updated DataFrame to a new file
#video_transcripts.to_csv("Video_Transcripts_With_Topics_And_Contributions.csv", index=False)

print("Topic modeling complete. Results saved to 'Video_Transcripts_With_Topics_And_Contributions.csv'.")


Topic 1: like, just, phone, m4, mac, actually, oh, thing, screen, really
Topic 2: tv, box, 4k, dolby, google, got, ve got, remote, hdr, streaming
Topic 3: watch, apple watch, apple, series, series 10, watches, like, watch ultra, watch series, 10
Topic 4: airpods, ear, sound, airpods pro, like, noise, pro, ear design, earbuds, cancellation
Topic 5: iphone, camera, apple, iphone 16, 16, like, phones, phone, new, pro
Topic modeling complete. Results saved to 'Video_Transcripts_With_Topics_And_Contributions.csv'.


In [12]:
# Create a mapping of old column names to new ones
column_mapping = {
    'topic_1_contribution': 'tt_1_genimpressions',
    'topic_2_contribution': 'tt_2_tvs_streaming',
    'topic_3_contribution': 'tt_3_wearables',
    'topic_4_contribution': 'tt_4_audio_accessories',
    'topic_5_contribution': 'tt_5_iphone_cameras'
}

# Rename columns in the dataframe
video_transcripts.rename(columns=column_mapping, inplace=True)

video_transcripts.head()


Unnamed: 0,channel_name,class,video_id,video_title,transcript,hook,title_sentiment,transcript_sentiment,hook_sentiment,tt_1_genimpressions,tt_2_tvs_streaming,tt_3_wearables,tt_4_audio_accessories,tt_5_iphone_cameras
0,@Mrwhosetheboss,Diamond,neIYdLysqlk,I tested the Craziest Xiaomi Gadgets!,you probably know xiaomi for their suspiciousl...,you probably know xiaomi for their suspiciousl...,-0.126,1.0,0.9598,0.781448,0.0,0.150271,0.068281,0.0
1,@Mrwhosetheboss,Diamond,YX8ks42Azn8,The TRIPLE FOLDING phone has a Problem.,this right here is the Huawei mate XT I spent ...,this right here is the Huawei mate XT I spent ...,-0.4019,0.9999,-0.8691,0.912694,0.0,0.0,0.0,0.087306
2,@Mrwhosetheboss,Diamond,4RcThoRG46c,I tested every Celebrity Tech product!,in front of me right now are VTech products ma...,in front of me right now are VTech products ma...,0.0,1.0,0.9552,0.708494,0.023958,0.064661,0.202886,0.0
3,@Mrwhosetheboss,Diamond,vSIbvJB4WdI,iPhone 16 Pro Max vs Samsung S24 Ultra Camera ...,this is the iPhone 16 Pro Max this is the Sams...,this is the iPhone 16 Pro Max this is the Sams...,-0.4389,0.9999,0.7184,0.050387,0.020822,0.028909,0.020463,0.879418
4,@Mrwhosetheboss,Diamond,cRPBp2tRxFY,iPhone 16 / 16 Pro Unboxing - Testing every ne...,this is the iPhone 16 the iPhone 16 plus the 1...,this is the iPhone 16 the iPhone 16 plus the 1...,0.0,0.9999,0.6124,0.124643,0.013059,0.0,0.1171,0.745198


In [13]:
video_details_df = pd.read_csv('Video_Details_2.csv')
video_details_df.columns

Index(['channel_name', 'subscriber_count', 'video_id', 'video_title',
       'published_datetime', 'duration', 'view_count', 'like_count',
       'dislike_count', 'comment_count', 'description', 'thumbnail_url'],
      dtype='object')

In [15]:
video_transcripts.columns

Index(['channel_name', 'class', 'video_id', 'video_title', 'transcript',
       'hook', 'title_sentiment', 'transcript_sentiment', 'hook_sentiment',
       'tt_1_genimpressions', 'tt_2_tvs_streaming', 'tt_3_wearables',
       'tt_4_audio_accessories', 'tt_5_iphone_cameras'],
      dtype='object')

In [18]:
merged_df = pd.merge(video_details_df, video_transcripts, 
                     on=['channel_name', 'video_id', 'video_title'], 
                     how='inner')

merged_df.head()

Unnamed: 0,channel_name,subscriber_count,video_id,video_title,published_datetime,duration,view_count,like_count,dislike_count,comment_count,...,transcript,hook,title_sentiment,transcript_sentiment,hook_sentiment,tt_1_genimpressions,tt_2_tvs_streaming,tt_3_wearables,tt_4_audio_accessories,tt_5_iphone_cameras
0,@Mrwhosetheboss,20200000,neIYdLysqlk,I tested the Craziest Xiaomi Gadgets!,2024-11-13T12:04:54Z,PT27M53S,6679521,219704,0,8890,...,you probably know xiaomi for their suspiciousl...,you probably know xiaomi for their suspiciousl...,-0.126,1.0,0.9598,0.781448,0.0,0.150271,0.068281,0.0
1,@Mrwhosetheboss,20200000,YX8ks42Azn8,The TRIPLE FOLDING phone has a Problem.,2024-10-26T14:06:50Z,PT12M54S,3464013,111736,0,5932,...,this right here is the Huawei mate XT I spent ...,this right here is the Huawei mate XT I spent ...,-0.4019,0.9999,-0.8691,0.912694,0.0,0.0,0.0,0.087306
2,@Mrwhosetheboss,20200000,4RcThoRG46c,I tested every Celebrity Tech product!,2024-10-05T10:52:18Z,PT27M15S,5872358,185517,0,5921,...,in front of me right now are VTech products ma...,in front of me right now are VTech products ma...,0.0,1.0,0.9552,0.708494,0.023958,0.064661,0.202886,0.0
3,@Mrwhosetheboss,20200000,vSIbvJB4WdI,iPhone 16 Pro Max vs Samsung S24 Ultra Camera ...,2024-09-19T11:44:30Z,PT16M44S,3852255,116115,0,10790,...,this is the iPhone 16 Pro Max this is the Sams...,this is the iPhone 16 Pro Max this is the Sams...,-0.4389,0.9999,0.7184,0.050387,0.020822,0.028909,0.020463,0.879418
4,@Mrwhosetheboss,20200000,cRPBp2tRxFY,iPhone 16 / 16 Pro Unboxing - Testing every ne...,2024-09-18T12:00:50Z,PT21M40S,4292702,125122,0,8474,...,this is the iPhone 16 the iPhone 16 plus the 1...,this is the iPhone 16 the iPhone 16 plus the 1...,0.0,0.9999,0.6124,0.124643,0.013059,0.0,0.1171,0.745198


In [19]:
merged_df.to_csv("Metadata_Sentiment_Topic.csv", index=False)


# Extra Modeling for Topic Analysis

## NMF

In [10]:
# Step 4: Extract Topic Keywords
def display_topics(model, feature_names, n_top_words):
    topics = {}
    for topic_idx, topic in enumerate(model.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        topics[topic_idx] = top_words
        print(f"Topic {topic_idx + 1}: {', '.join(top_words)}")
    return topics

n_top_words = 10
feature_names = vectorizer.get_feature_names_out()
topics = display_topics(nmf_model, feature_names, n_top_words)

# Step 5: Assign Topics to Transcripts
video_transcripts['topic'] = nmf_matrix.argmax(axis=1)

# Save the updated DataFrame to a new file
video_transcripts.to_csv("Video_Transcripts_With_Topics.csv", index=False)

print("Topic modeling complete. Results saved to 'Video_Transcripts_With_Topics.csv'.")


Topic 1: like, just, phone, m4, mac, actually, oh, thing, screen, really
Topic 2: tv, box, 4k, dolby, google, got, ve got, remote, hdr, streaming
Topic 3: watch, apple watch, apple, series, series 10, watches, like, watch ultra, watch series, 10
Topic 4: airpods, ear, sound, airpods pro, like, noise, pro, ear design, earbuds, cancellation
Topic 5: iphone, camera, apple, iphone 16, 16, like, phones, phone, new, pro
Topic modeling complete. Results saved to 'Video_Transcripts_With_Topics.csv'.


In [11]:
#video_transcripts['topic']

In [12]:
# Topic 1: Smartphones and Apple Products
# Topic 2: TVs and Home Entertainment
# Topic 3: Wearables (Apple Watch)

In [13]:
# Topic 1: General Impressions and Screens
# Topic 2: TVs and Streaming Devices
# Topic 3: Apple Watch and Wearables
# Topic 4: AirPods and Audio Accessories
# Topic 5: iPhone and Cameras

In [14]:
video_transcripts.groupby('topic')['class'].value_counts()

topic  class  
0      Diamond    10
       Gold        8
       Silver      1
1      Silver      5
2      Gold        4
       Silver      2
3      Silver      4
       Diamond     1
4      Diamond     4
       Gold        3
       Silver      3
Name: count, dtype: int64

## LDA

In [15]:
# vectorizer = CountVectorizer(stop_words='english', max_features=5000)
# doc_term_matrix = vectorizer.fit_transform(transcripts).toarray()

# # Create a dictionary and corpus
# dictionary = corpora.Dictionary([vectorizer.get_feature_names_out()])
# corpus = [dictionary.doc2bow(doc) for doc in doc_term_matrix]

# # Train LDA model
# lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, random_state=42)

# # Display topics
# topics = lda_model.print_topics(num_words=10)
# for topic in topics:
#     print(topic)


# BERTopic - more nuanced than NMF

In [16]:
# from bertopic import BERTopic

# # Train BERTopic model
# topic_model = BERTopic()
# topics, probs = topic_model.fit_transform(transcripts)

# # Assign topics to the DataFrame
# video_transcripts['topic'] = topics

# # Visualize topics without filtering manually
# topic_model.visualize_topics()


# LSA - ideal for longer documents so meh

In [17]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize transcripts with TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = vectorizer.fit_transform(video_transcripts['transcript'].fillna(''))

# Apply Truncated SVD
n_topics = 5
lsa_model = TruncatedSVD(n_components=n_topics, random_state=42)
lsa_matrix = lsa_model.fit_transform(tfidf_matrix)

# Print topics
terms = vectorizer.get_feature_names_out()
for i, comp in enumerate(lsa_model.components_):
    terms_in_topic = [terms[idx] for idx in comp.argsort()[-10:]]
    print(f"Topic {i}: {', '.join(terms_in_topic)}")


Topic 0: phones, really, new, phone, camera, pro, iphone, apple, just, like
Topic 1: streaming, hdr, remote, macbook, google, got, box, dolby, 4k, tv
Topic 2: little, oneplus, open, earbuds, cancellation, like, noise, sound, ear, airpods
Topic 3: ipad, like, black, m4, mac, 10, watches, apple, series, watch
Topic 4: huawei, thing, folding, base, gpu, thunderbolt, mini, like, m4, mac


In [18]:
video_transcripts.groupby('topic')['class'].value_counts()

topic  class  
0      Diamond    10
       Gold        8
       Silver      1
1      Silver      5
2      Gold        4
       Silver      2
3      Silver      4
       Diamond     1
4      Diamond     4
       Gold        3
       Silver      3
Name: count, dtype: int64

# K-means

In [19]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize transcripts
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = vectorizer.fit_transform(video_transcripts['transcript'].fillna(''))

# Apply K-Means clustering
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
video_transcripts['topic'] = kmeans.fit_predict(tfidf_matrix)

# Print top terms per cluster
terms = vectorizer.get_feature_names_out()
for i in range(n_clusters):
    cluster_terms = tfidf_matrix[kmeans.labels_ == i].mean(axis=0).A1
    top_terms = [terms[idx] for idx in cluster_terms.argsort()[-10:]]
    print(f"Cluster {i}: {', '.join(top_terms)}")


Cluster 0: thunderbolt, apple, just, ipad, pro, mini, macbook, mac, m4, like
Cluster 1: going, ear, camera, watch, airpods, pro, iphone, just, apple, like
Cluster 2: iphone, year, huawei, folding, camera, screen, just, phones, phone, like
Cluster 3: streaming, hdr, drive, remote, got, google, dolby, box, 4k, tv
Cluster 4: nova, magic, red, book, yoga, keyboard, hp, elite, processor, laptop


In [20]:
# Count the distribution of classes within each cluster
cluster_class_distribution = video_transcripts.groupby('topic')['class'].value_counts()

# Convert to DataFrame for better readability
cluster_class_distribution_df = cluster_class_distribution.reset_index(name='count')
cluster_class_distribution_df

Unnamed: 0,topic,class,count
0,0,Gold,4
1,0,Diamond,1
2,0,Silver,1
3,1,Diamond,9
4,1,Silver,8
5,1,Gold,7
6,2,Diamond,5
7,2,Gold,2
8,2,Silver,1
9,3,Silver,4
