In [12]:
import requests
from bs4 import BeautifulSoup

def scrape_vid_data(vid_url, timeout=10):
    """Download a video page and extract its metadata from ``<meta>`` tags.

    Args:
        vid_url: URL of the video page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        dict with the keys ``'title'``, ``'description'`` and ``'tags'``
        (strings) and ``'views'`` (int). ``'description'`` and ``'tags'`` are
        ``""`` when the page carries no corresponding meta tag.

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
            error status (raised by ``raise_for_status``).
        ValueError: if the page has no title or view-count meta tag, or the
            view count is not an integer.
    """
    response = requests.get(vid_url, timeout=timeout)
    # Fail on 4xx/5xx instead of parsing an error page as if it were a video.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    def meta_content(attrs):
        """Return the ``content`` of the first matching <meta> tag, or None."""
        tag = soup.find('meta', attrs)
        return tag.get('content') if tag is not None else None

    # Title and view count are required: without them the record is useless
    # downstream, so report the offending URL instead of failing with an
    # opaque "'NoneType' object is not subscriptable".
    title = meta_content({'name': 'title'})
    views = meta_content({'itemprop': 'interactionCount'})
    if title is None or views is None:
        raise ValueError(f"Missing title or view-count metadata at {vid_url}")

    return {
        'title': title,
        # Optional text fields fall back to "" so they can be concatenated.
        'description': meta_content({'name': 'description'}) or "",
        'tags': meta_content({'name': 'keywords'}) or "",
        'views': int(views),
    }

# Watch-page URLs of the videos to scrape. Only two are listed, so any later
# "most similar" lookup can return at most one other video.
vid_urls = [
    "https://www.youtube.com/watch?v=B4Q_dlfQFyg",
    "https://www.youtube.com/watch?v=rrAbtDOXkfk",
]

# One metadata record per URL, in the same order as vid_urls.
data = list(map(scrape_vid_data, vid_urls))


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# One row per scraped video record.
df = pd.DataFrame(data)

# Merge the free-text fields into a single space-separated document per video.
df['text'] = df['title'].str.cat([df['description'], df['tags']], sep=" ")

# TF-IDF representation of the merged text, with English stop words removed.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['text'])

# Rescale the view counts with MinMaxScaler (default settings).
# NOTE: this overwrites the raw counts in 'views' with the scaled values.
scaler = MinMaxScaler()
df['views'] = scaler.fit_transform(df[['views']])[:, 0]

In [17]:
from sklearn.metrics import pairwise_distances

# Pairwise similarities between every pair of videos.
text_similarity = cosine_similarity(tfidf_matrix)
# Turn the Euclidean distance d between 'views' values into a similarity
# 1 / (1 + d): identical values give 1, larger gaps shrink toward 0.
num_similarity = 1 / (1 + pairwise_distances(df[['views']], metric='euclidean'))

# Composite similarity score: weighted blend of the text and numeric parts.
TEXT_WEIGHT = 0.7
NUM_WEIGHT = 0.3
comp_similarity = TEXT_WEIGHT * text_similarity + NUM_WEIGHT * num_similarity

# Report up to TOP_K most similar videos for each query row.
# The result can never hold more than len(df) - 1 rows, so with only two
# scraped videos a single neighbour is shown.
TOP_K = 10
query_indices = [1]
for idx in query_indices:
    # Rank all videos from most to least similar, then drop the query itself
    # explicitly. Slicing off the top-ranked entry instead would assume the
    # query always ranks first, which fails on ties or when its TF-IDF row is
    # all zeros (its cosine self-similarity is then 0).
    ranked = comp_similarity[idx].argsort()[::-1]
    similar_indices = ranked[ranked != idx][:TOP_K]
    print(f"Top {len(similar_indices)} similar videos for '{df.iloc[idx]['title']}':")
    print(df.iloc[similar_indices][['title', 'views']])

Top 10 similar videos for 'NEW BMW M5 vs Audi RS7 Performance vs AMG E63 S // DRAG & ROLL RACE':
                                   title  views
0  2025 BMW M5 Review // We Need To Talk    0.0
