In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from minisom import MiniSom
import torch
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import DBSCAN
from wordcloud import WordCloud
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from keras import layers
from tensorflow.keras.layers import Dense, Embedding, LSTM, Conv1D, MaxPool1D
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow import keras
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split as tt
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("sdmn_video_details.csv")

# Stemming / Lemmatizing
words = stopwords.words("english")
port_stem = PorterStemmer()
lem = WordNetLemmatizer()


def stemming(contents):
    contents = str(contents)
    stemmed_contents = re.sub(r'[^a-zA-Z]', ' ', contents)
    # stemmed_contents = stemmed_contents.lower()
    stemmed_contents = stemmed_contents.capitalize()
    stemmed_contents = stemmed_contents.split()
    # stemmed_contents = [port_stem.stem(word) for word in stemmed_contents if word not in words]
    # stemmed_contents = [lem.lemmatize(word) for word in stemmed_contents if word not in words]
    stemmed_contents = ' '.join(stemmed_contents)
    return stemmed_contents


df["video_title"] = df["title"].apply(stemming)
# df["video_title"] = df["title"]
# print(df['video_title'])

data = list(df['video_title'])

# Convert the textual data to a vector representation using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data)
df['vectorized_data'] = list(X)

In [3]:
# Define the size of the SOM and train the model
som = MiniSom(3, 3, X.shape[1], sigma=1.0, learning_rate=0.5)
som.random_weights_init(X.toarray())
som.train_random(X.toarray(), 100)

# Assign each document to a cluster based on the closest neuron
clusters = {}
for i, x in enumerate(X):
    winner = som.winner(x.toarray()[0])
    if winner not in clusters:
        clusters[winner] = [i]
    else:
        clusters[winner].append(i)

for cluster, docs in clusters.items():
    print(f"Cluster {cluster}:")
    for doc in docs:
        print(f"  {data[doc]}")

print(len(clusters))

Cluster (1, 0):
  Sidemen one word interview
  Sidemen noob vs pro rocket league
  Sidemen are you smarter than a year old
  Sidemen secret santa
  Sidemen vs lucky blocks in minecraft
  Sidemen cards against humanity
  Sidemen memory test
  Sidemen extreme fear pong
  Mr beast hijacks a sidemen video
  Sidemen controversial tweets
  Sidemen lose their minds playing mario
  Sidemen forfeit golf shot it or slap it
  Sidemen take an iq test
  Sidemen become superheroes
  Sidemen giant jenga painful
  Sidemen forfeit golf
  Sidemen mario kart rainbow road rage
  Sidemen mario kart race
  Sidemen rocket league but its golf
  Sidemen olympics
  Sidemen minigolf but it s a race
  Sidemen google feud
  Sidemen flag game
  Sidemen rap battle but we diss ourselves
  The rematch sidemen geoguessr duel
  Sidemen slap game
  Sidemen water game
  Sidemen trippy olympics
  Sidemen geoguessr duel
  Brand new sidemen trivial pursuit
  Brand new sidemen shellshock
  Sidemen minigolf
  The most outrageo

In [4]:
# Cluster the data using DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=4)
dbscan.fit(X)

clusters = {}
for i, label in enumerate(dbscan.labels_):
    if label not in clusters:
        clusters[label] = [i]
    else:
        clusters[label].append(i)

print(len(clusters))

for cluster, docs in clusters.items():
    print(f"Cluster {cluster}:")
    for doc in docs:
        print(f"  {data[doc]}")

12
Cluster -1:
  Sidemen one word interview
  Sidemen among us chaos mode
  Sidemen association challenge
  Sidemen noob vs pro rocket league
  Sidemen among us sheriff role the dumbest lobby ever
  Sidemen are you smarter than a year old
  This is the greatest sidemen video ever
  Sidemen among us jester role
  Sidemen extreme cop chase challenge on gta
  Sidemen christmas mukbang
  Sidemen reverse hide seek on ksi s mega yacht
  Sidemen gta meme olympics
  Buy christmas drillings today
  Sidemen christmas drillings edm remix official music video
  Sidemen box of lies
  Harry potter s doppelg nger
  Sidemen christmas drillings house remix official lyric video
  Sidemen vs lucky blocks in minecraft
  Sidemen grenade game on gta
  Sidemen cards against humanity
  Sidemen memory test
  Sidemen get cancelled challenge
  Sidemen tinder among us edition
  Can ksi guess the real prime
  The sidemen christmas quiz
  Sidemen blind eating challenge
  The moment that cancelled vikkstar
  Sidemen

In [5]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data)

# Cluster the data using LDA
lda = LatentDirichletAllocation(n_components=5)
lda.fit(X)

# Assign each document to a cluster based on the topic with the highest probability
clusters = {}
for i, x in enumerate(X):
    topic = lda.transform(x)[0].argmax()
    if topic not in clusters:
        clusters[topic] = [i]
    else:
        clusters[topic].append(i)

print(len(clusters))

for cluster, docs in clusters.items():
    print(f"Cluster {cluster}:")
    for doc in docs:
        print(f"  {data[doc]}")

5
Cluster 1:
  Sidemen one word interview
  Sidemen among us chaos mode
  Sidemen are you smarter than a year old
  Sidemen tinder among us edition
  Ksi quits sidemen among us forever
  Sidemen is it real or chocolate
  Sidemen forfeit golf shot it or slap it
  Sidemen play among us but the imposter gets hacked
  Sidemen forfeit golf
  Sidemen mario kart rainbow road rage
  Sidemen mario kart race
  Sidemen rocket league but its golf
  Sidemen minigolf but it s a race
  Sidemen shapeshifter among us gone right
  Sidemen among us but everyone is third imposter
  Sidemen geoguessr bullseye mode
  Sidemen disstrack season
  Sidemen play golf it but the map is impossible
  New roles in sidemen among us
  Sidemen among us but it s pure chaos
  Sidemen among us is back
  Trivial pursuit but ksi s knowledge is massive
  Sidemen play competitive tetris
  Gartic phone but ksi is years old
  The nigerian takeover sidemen olympics
  Sidemen describe celebrities in one word
  Women rate the smell

In [6]:
# Load the BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

inputs = tokenizer(data, padding=True, truncation=True, return_tensors="pt")

with torch.no_grad():
    outputs = model(inputs["input_ids"], attention_mask=inputs["attention_mask"])
embeddings = outputs[0][:, 0, :].numpy()

kmeans = KMeans(n_clusters=20)
kmeans.fit(embeddings)

clusters = {}
for i, label in enumerate(kmeans.labels_):
    if label not in clusters:
        clusters[label] = [i]
    else:
        clusters[label].append(i)

print(len(clusters))
# Print the clusters
# for cluster, docs in clusters.items():
#     print(f"Cluster {cluster}:")
#     for doc in docs:
#         print(f"  {data[doc]}")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


20


In [7]:
j = 0
for i in range(len(clusters)):
    print(f"Cluster {i}:")
    for doc in clusters[i]:
        if doc>j:
            j = doc
        print(f" {data[doc]}")
# print(j)
# print(clusters.keys())

Cluster 0:
 Sidemen among us but harry cheats to win
 Sidemen gta but loser gets slapped
 Sidemen among us but the imposters try to vote each other out
 Sidemen minigolf but it s a race
 Sidemen among us but everyone is third imposter
 Sidemen quiplash but we forgot how to be funny
 Sidemen rap battle but we diss ourselves
 Sidemen play golf it but the map is impossible
 Sidemen among us but it s pure chaos
 Mikel arteta rates the sidemens football ability
 Trivial pursuit but ksi s knowledge is massive
 Gib annoys the sidemen for minutes straight
 Gartic phone but ksi is years old
 Biggest sidemen lies exposed
 Ksi has left moresidemen
 Quirky quiplash but tobi hates simon
 Sidemen but we re set
 Mad verse city but ksi can t rap
 Sidemen golf but there s on the line
 Ksi proves he s the best at fifa
 Sidemen codenames but it s customised
 Sidemen chaotic trolling in among us proximity
 Sidemen vs faze clan among us but ksi has iq
 Sidemen play among us but there s a rd impostor sideme

In [8]:
# Recommendation

In [12]:
combined_features = pd.concat([df["title"], df["description"]])
combined_features = combined_features.apply(stemming)

# Compute similarity matrix
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(combined_features)
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Define function to recommend similar videos
def recommend_similar_videos(video_id, cosine_sim):
    video_index = df[df["video_id"] == video_id].index[0]
    sim_scores = list(enumerate(cosine_sim[video_index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    video_indices = [i[0] for i in sim_scores]
    return df.iloc[video_indices]

# Test recommendation function
f = df[df.video_id=='AtXe3DAxPZg'].title
print(f)
recommend_similar_videos("AtXe3DAxPZg", cosine_sim)

6    SIDEMEN ARE YOU SMARTER THAN A 10 YEAR OLD 2
Name: title, dtype: object


Unnamed: 0,channel,video_id,title,description,categoryId,publishedAt,duration,tags,likes,views,comments,video_title,vectorized_data
889,Sidemen,6ZCtuKvBYcw,SIDEMEN ARE YOU SMARTER THAN A 10 YEAR OLD,👉🏻: Subscribe to our Reacts Channel: https://w...,24,2021-04-18T17:00:23Z,PT1H17M3S,"['sidemen', 'sidemen sunday', '#sidemensunday'...",532280.0,14997485.0,24053,Sidemen are you smarter than a year old,"(0, 632)\t0.3766466862199205\n (0, 1086)\t0..."
786,MoreSidemen,FO5rw8fYUtY,BEST OLD SIDEMEN MOMENTS!,BEST MOMENTS FROM OLD SIDEMEN VIDEOS - PLEASE ...,22,2018-02-21T20:45:35Z,PT13M23S,"['sidemen', 'more sidemen', 'sidemen moments',...",186710.0,7091627.0,3264,Best old sidemen moments,"(0, 595)\t0.5515053222300713\n (0, 69)\t0.4..."
692,MoreSidemen,Dr6qecvZeew,SIDEMEN REACT TO OLD VIDEOS 3!,SIDEMEN REACT TO OLD VIDEOS 3 - PLEASE LEAVE A...,22,2018-12-14T21:00:01Z,PT10M29S,,24708.0,826885.0,455,Sidemen react to old videos,"(0, 749)\t0.5157314858774761\n (0, 1017)\t0..."
718,MoreSidemen,hh2-4FPtOxk,SIDEMEN REACT TO OLD VIDEOS 2!,SIDEMEN REACT TO OLD VIDEOS 2 - PLEASE LEAVE A...,22,2018-10-08T19:00:03Z,PT13M47S,,32633.0,1128161.0,352,Sidemen react to old videos,"(0, 749)\t0.5157314858774761\n (0, 1017)\t0..."
1005,Sidemen,2ubwlOsUXGs,SIDEMEN REACT TO OLD VIDEOS 2,The Sidemen react to a bunch of their old vide...,24,2019-01-27T18:01:07Z,PT14M29S,"['sidemen', 'sidemen sunday', 'sidemen react t...",210875.0,8671641.0,4357,Sidemen react to old videos,"(0, 749)\t0.5157314858774761\n (0, 1017)\t0..."
1063,Sidemen,e_Ql2ThXzAM,SIDEMEN REACT TO OLD VIDEOS,SIDEMEN REACTING TO OLD SIDEMEN VIDEOS! #Sidem...,24,2018-02-04T18:00:19Z,PT15M43S,"['sidemen', 'sidemen sunday', 'sidemen sundays...",348345.0,19078502.0,4423,Sidemen react to old videos,"(0, 749)\t0.5157314858774761\n (0, 1017)\t0..."
744,MoreSidemen,N3MpGyVrwnE,SIDEMEN REACTING TO OLD VIDEOS!,SIDEMEN REACTING TO OLD VIDEOS - PLEASE LEAVE ...,22,2018-08-10T19:00:02Z,PT14M16S,,52639.0,1631526.0,695,Sidemen reacting to old videos,"(0, 750)\t0.5831085995755863\n (0, 1017)\t0..."
197,MoreSidemen,S_IOdiORI2o,SIDEMEN 8 YEAR ANNIVERSARY ON GTA 5,🎥: Access exclusive content at: https://www.si...,22,2021-11-02T19:00:15Z,PT27M42S,"['sidemen', 'moresidemen', 'miniminter', 'ksi'...",245273.0,6202423.0,4365,Sidemen year anniversary on gta,"(0, 22)\t0.636986903402816\n (0, 375)\t0.38..."
808,Sidemen,Pnpc0pySVTo,SIDEMEN OLD vs YOUNG FOR 24 HOURS CHALLENGE,SIDEMEN OLD vs YOUNG CHALLENGE HOW EXCITING\n🍗...,24,2022-10-30T19:31:52Z,PT42M25S,"['sidemen', 'sidemen sunday', '#sidemensunday']",351463.0,8020290.0,9357,Sidemen old vs young for hours challenge,"(0, 1091)\t0.5827298948003234\n (0, 423)\t0..."
801,Sidemen,u9-FIgI_V4s,"SIDEMEN $20,000 BIG FAT QUIZ OF THE YEAR",🎄 BUY CHRISTMAS DRILLINGS PLEASE: https://www....,24,2022-12-18T18:48:24Z,PT49M24S,"['sidemen', 'sidemen sunday', '#sidemensunday']",240637.0,5765737.0,4332,Sidemen big fat quiz of the year,"(0, 292)\t0.5046113473368071\n (0, 71)\t0.5..."
