In [20]:
!pip install nltk



In [40]:
import json
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.probability import FreqDist
# Fetch the NLTK resources used below: the tokenizer models, the POS tagger
# and the stop-word lists. Already-present packages are reported as up to date.
for _nltk_resource in ('punkt', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(_nltk_resource)

_ENGLISH_STOP_WORDS = None  # lazily filled cache, shared by every call below


def _english_stop_words():
    """Return NLTK's English stop words as a set, building it only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def extract_themes_from_review(review_text, top_n=5):
    """Pick the most frequent nouns/adjectives of a review as its "themes".

    The review is tokenized, stop words and non-alphabetic tokens are dropped,
    the remaining lower-cased tokens are POS-tagged, and only tokens whose tag
    starts with ``NN`` or ``JJ`` are counted.

    Args:
        review_text: The review as a string. ``None`` or a blank string is
            treated as "no review" and yields an empty list.
        top_n: Maximum number of themes to return (defaults to 5, the value
            that was previously hard-coded).

    Returns:
        A list of at most ``top_n`` lower-cased words, most frequent first.
    """
    # Missing or blank reviews have no themes; skip the NLTK pipeline entirely.
    if review_text is None or (isinstance(review_text, str) and not review_text.strip()):
        return []

    # Reuse one cached set instead of rebuilding it for every single review.
    stop_words = _english_stop_words()

    words = word_tokenize(review_text)
    filtered_words = [
        word.lower()
        for word in words
        if word.isalpha() and word.lower() not in stop_words
    ]

    # Tagging happens after filtering, on the lower-cased tokens.
    tagged_words = pos_tag(filtered_words)
    relevant_words = [word for word, tag in tagged_words if tag.startswith(('NN', 'JJ'))]

    freq_dist = FreqDist(relevant_words)
    return [word for word, _count in freq_dist.most_common(top_n)]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
from tqdm import tqdm
# Load the parallel lists of album titles and review texts. The encoding is
# given explicitly so non-ASCII titles decode the same way on every platform.
with open('short_dataset.json', encoding='utf-8') as file:
    data = json.load(file)
titles = data['titles']
reviews = data['reviews']

# zip() would silently drop the unmatched tail, so fail loudly instead.
if len(titles) != len(reviews):
    raise ValueError(
        f"Dataset mismatch: {len(titles)} titles but {len(reviews)} reviews."
    )

# Map each album title to the themes extracted from its review.
# NOTE: if a title occurs more than once, the last review wins.
album_themes_map = {}
for title, review in tqdm(zip(titles, reviews), total=len(titles), desc="Processing"):
    themes = extract_themes_from_review(review)
    album_themes_map[title] = themes


Processing: 100%|██████████| 47720/47720 [01:14<00:00, 636.33it/s]


In [52]:
# Show the extracted themes of the first five albums (dict insertion order).
preview_items = list(album_themes_map.items())
for album, themes in preview_items[:5]:
    print(f"Album: {album}, Themes: {themes}")

Album: michael spyres in the shadows, Themes: ['unexpected', 'got', 'fact', 'michael', 'albums']
Album: js bach st matthew passion arr mendelssohn, Themes: ['story', 'mendelssohn', 'unknown', 'st', 'matthew']
Album: mendelssohn lieder ohne worte michael barenboim, Themes: ['hackles', 'idea', 'mendelssohn', 'songs', 'words']
Album: strauss ein heldenleben mahler ruckert lieder payare, Themes: ['unlikely', 'programme', 'kicks', 'orchestral', 'work']
Album: karchin keyboards winds, Themes: ['gt', 'earthlings', 'pandemic', 'louis', 'karchin']


In [45]:
def jaccard_similarity(set1, set2):
    """Compute the Jaccard similarity |A ∩ B| / |A ∪ B| of two collections.

    Args:
        set1: Any iterable of hashable items (duplicates are ignored).
        set2: Any iterable of hashable items (duplicates are ignored).

    Returns:
        A float in [0.0, 1.0]; 0.0 when both inputs are empty.
    """
    # Materialise each input exactly once. Iterating an argument twice gives
    # wrong results for one-shot iterables (generators, iterators), because
    # the second pass sees them already exhausted.
    first, second = set(set1), set(set2)
    union_size = len(first | second)
    if union_size == 0:
        return 0.0
    return len(first & second) / union_size

def find_most_relevant_albums(input_album, album_themes_map, top_n=10):
    """Rank the other albums by theme overlap with ``input_album``.

    Args:
        input_album: Title of the reference album (a key of the map).
        album_themes_map: Mapping of album title -> list of theme words.
        top_n: Number of best matches to return.

    Returns:
        A list of ``(album, score)`` tuples sorted by descending Jaccard
        score. If ``input_album`` is not in the map, a message is printed
        and an empty list is returned.
    """
    if input_album not in album_themes_map:
        print(f"Album '{input_album}' not found.")
        return []

    reference_themes = album_themes_map[input_album]

    # Score every album except the reference one itself.
    scored = [
        (candidate, jaccard_similarity(reference_themes, candidate_themes))
        for candidate, candidate_themes in album_themes_map.items()
        if candidate != input_album
    ]

    # Stable sort: albums with equal scores keep their map order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:top_n]


In [49]:
# Look up the ten albums whose themes overlap most with the chosen album.
input_album = "michael spyres in the shadows"
relevant_albums = find_most_relevant_albums(
    input_album,
    album_themes_map,
    top_n=10,
)
for album, score in relevant_albums:
    print(f"Album: {album}, Similarity Score: {score}")

Album: organ recital 2, Similarity Score: 0.25
Album: bach keyboard concertos vol 2, Similarity Score: 0.2
Album: v kingur lafsson from afar, Similarity Score: 0.16666666666666666
Album: brahms mendelssohn string quartets, Similarity Score: 0.14285714285714285
Album: bach bartók boulez works for solo violin, Similarity Score: 0.14285714285714285
Album: michael haydn string quintets, Similarity Score: 0.14285714285714285
Album: bartók duke bluebeards castle 8, Similarity Score: 0.14285714285714285
Album: dixit dominus, Similarity Score: 0.14285714285714285
Album: rachmaninov nocturne vespers byzantine hymns, Similarity Score: 0.14285714285714285
Album: enescu violin sonatas 0, Similarity Score: 0.14285714285714285
