In [22]:
import pandas as pd
import re
import codecs
from sklearn.feature_extraction.text import TfidfVectorizer
from datasketch import MinHashLSHForest, MinHash
from itertools import combinations
from collections import defaultdict
import csv
import math
import hashlib

In [23]:
songs = pd.read_csv('./songs.csv')
lyrics = pd.read_csv('./lyrics.csv')

### 1. TF-IDF

In [24]:
def splitwords(row):
    lyric = codecs.decode(row['lyrics'], "unicode_escape")
    lyric = re.sub(r'[ \u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]', ' ', lyric)
    lyric = re.sub(r"(?<!\w)'(?!s\s|t\s|re\s|ve\s|m\s|d\s|ll\s|nt\s)", "", lyric)
    lyric = re.sub(r"\', ", " ", lyric)
    lyric = re.sub(r',', '', lyric)
    lyric = re.sub(r'\]', '', re.sub(r'\[', '', lyric))
    lyric = re.sub(r'\"', '', lyric)
    return lyric

In [25]:
lyrics['formatted_lyrics'] = lyrics.apply(splitwords, axis=1)
all_lyrics = lyrics['formatted_lyrics'].tolist()
all_songs = lyrics['track_name'].tolist()

# initialize a TfidfVectorizer object
vectorizer = TfidfVectorizer(stop_words='english')

# # fit_transform the lyrics to obtain matrix representation of TF-IDF scores
tfidf_matrix = vectorizer.fit_transform(all_lyrics)

# feature names are the 
feature_names = vectorizer.get_feature_names_out()

all_sig_words = []
all_scores = []

for i in range(len(all_lyrics)):
    tfidf_scores = tfidf_matrix[i].toarray()[0]
    max_ind = tfidf_scores.argmax()
    sig_word = feature_names[max_ind]
    max_score = max(tfidf_scores)

    all_sig_words.append(sig_word)
    all_scores.append(max_score)

tfidf = pd.DataFrame({'track_name': all_songs, 'significant_word': all_sig_words, 'score': all_scores})
tfidf

Unnamed: 0,track_name,significant_word,score
0,happier,happy,0.447475
1,Solo,let,0.432551
2,Search & Rescue,mami,0.436402
3,WAIT FOR U (feat. Drake & Tems),vocãª,0.568177
4,Time Flies,outside,0.441883
5,ball w/o you,ball,0.652065
6,a lot,lot,0.701501
7,Runnin,runnin,0.885889
8,Glock In My Lap,21,0.589367
9,Maria I'm Drunk (feat. Justin Bieber & Young T...,bãªbados,0.427328


### 2. Map-Reduce

Verify this result with Hadoop MapReduce computation.

In [26]:
df = songs[['playlist_uri', 'track_name', 'id', 'popularity']]
df.groupby('playlist_uri').mean(numeric_only=True)

Unnamed: 0_level_0,popularity
playlist_uri,Unnamed: 1_level_1
spotify:playlist:0Blc7H0vclyoZYpkfN5oFn,77.416667
spotify:playlist:0cNYHRGCv0cyQjeLblv2Oa,75.666667
spotify:playlist:0pYKCDEitaEtWRzYJX0Hwy,62.127273
spotify:playlist:0uRHOYgXiR3l9BwssW7IMH,78.727273
spotify:playlist:0wO7kqupC0YZBLlJsQwceS,52.200000
...,...
spotify:playlist:6MoR16aokakIrchsWepo2x,76.920000
spotify:playlist:6ySrUrNk0prJL8szfuiuIA,68.535714
spotify:playlist:72lmW37G35cATUdAiDPKdj,79.428571
spotify:playlist:7J8BhUluUUOFKMv9GkmfbI,72.888889


### 3. LSH

In [27]:
feature_vectors = songs[['id','danceability','speechiness','acousticness','instrumentalness','liveness']]
feature_vectors = feature_vectors.drop_duplicates()
feature_vectors.drop('id', inplace=True, axis=1)

In [28]:
lsh = MinHashLSHForest(num_perm=128)

for i in range(len(feature_vectors)):
    song_features = feature_vectors.iloc[i]
    minhash = MinHash(num_perm=128)  # Adjust the number of permutations as needed

    # Hash the song features
    for feature in song_features:
        minhash.update(str(feature).encode('utf-8'))

    # Index the song in LSH model
    lsh.add(str(i), minhash)

lsh.index()

query_song_index = 0
query_song_features = feature_vectors.iloc[query_song_index]

# Create MinHash for the query song
query_minhash = MinHash(num_perm=128)  # Adjust the number of permutations as needed

# Hash the query song features
for feature in query_song_features:
    query_minhash.update(str(feature).encode('utf-8'))

# Query the LSH model
result = lsh.query(query_minhash, k=10)  # Retrieve the top 10 similar songs

# Retrieve the actual similar songs from the dataset

songs[['track_name','danceability','speechiness','acousticness','instrumentalness','liveness']].iloc[map(int, result)]
# similar_songs

Unnamed: 0,track_name,danceability,speechiness,acousticness,instrumentalness,liveness
1248,Control,0.599,0.0314,0.643,2e-06,0.149
801,You Right,0.828,0.0565,0.0164,0.00233,0.0845
198,Lost in the Fire (feat. The Weeknd),0.658,0.0363,0.0933,0.000927,0.115
1448,Ho Hey,0.685,0.0304,0.794,2e-06,0.0915
2312,Can't Help Falling in Love,0.396,0.0275,0.941,0.000196,0.105
1803,The Frozen Planet,0.0607,0.0432,0.645,0.493,0.0616
2054,Teenage Dirtbag - Sped Up,0.663,0.0804,0.514,0.0583,0.119
2041,No Sleep Till Brooklyn,0.712,0.128,0.122,0.0,0.0538
1869,Too Good At Goodbyes,0.681,0.0432,0.64,0.0,0.169
0,happier,0.395,0.133,0.765,1e-05,0.0839


### 4. Bloom Filter

In [None]:
class BloomFilter:
    def _init_(self, num_items, false_positive_rate):
        self.num_items = num_items
        self.false_positive_rate = false_positive_rate
        self.num_bits = self.calculate_num_bits(num_items, false_positive_rate)
        self.num_hashes = self.calculate_num_hashes(num_items, self.num_bits)
        self.bit_array = [0] * self.num_bits
        print("Number of bits:", self.num_bits)
        self.hash_functions = self.create_hash_functions(self.num_hashes)

    def calculate_num_bits(self, num_items, false_positive_rate):
        numerator = num_items * abs(math.log(false_positive_rate))
        denominator = math.log(2) ** 2
        return int(numerator / denominator)

    def calculate_num_hashes(self, num_items, num_bits):
        numerator = num_bits * math.log(2)
        return max(1, int(numerator / num_items))

    def create_hash_functions(self, num_hashes):
        hash_functions = []
        for seed in range(num_hashes):
            hash_functions.append(self.hash_function(seed))
        return hash_functions

    def hash_function(self, seed):
        def fnv1a_32(data):
            hash_value = 2166136261
            for byte in data:
                hash_value ^= byte
                hash_value *= 16777619
            return hash_value

        def sha256(data):
            return int(hashlib.sha256(data).hexdigest(), 16)

        if seed % 2 == 0:
            return lambda data: fnv1a_32(data.encode())
        else:
            return lambda data: sha256(data.encode())

    def add(self, item):
        for hash_func in self.hash_functions:
            index = hash_func(str(item)) % self.num_bits
            self.bit_array[index] = 1

    def contains(self, item):
        for hash_func in self.hash_functions:
            index = hash_func(str(item)) % self.num_bits
            if self.bit_array[index] == 0:
                return False
        return True


def load_dataset(file_path):
    dataset = []
    with open(file_path, 'r', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader)  # Skip the header row
        for row in reader:
            dataset.append(row)
    return dataset


def get_track_info(dataset, song_id):
    for row in dataset:
        if row[2] == song_id:
            track_name = row[0]
            artist = row[1]
            return track_name, artist
    return None, None


dataset_path = 'songs.csv'
dataset = load_dataset(dataset_path)
bloom_filter = BloomFilter(len(dataset), 0.05)


for row in dataset:
    song_id = row[2]
    bloom_filter.add(song_id)

# Search for a song in the bloom filter
# 4lH6nENd1y81jp7Yt9lTBX

search_song_id = '2Hh3ETdQKrmSI3QS0hme7g'
if bloom_filter.contains(search_song_id):
    track_name, artist = get_track_info(dataset, search_song_id)
    if track_name and artist:
        print(
            f"The song with ID '{search_song_id}' is likely in the playlist.")
        print(f"Track Name: {track_name}")
        print(f"Artist: {artist}")
    else:
        print(
            f"Failed to retrieve track name and artist for song ID '{search_song_id}'.")
else:
    print(f"The song with ID '{search_song_id}' is not in the playlist.")

### 5. PCY Algorithm

In [30]:
# 1. Representing songs as a set of transactions
playlistset = songs['playlist_uri'].drop_duplicates()
transactions = defaultdict(set)
for playlist in playlistset:
    transactions[playlist] = set(songs.loc[songs['playlist_uri'] == playlist, 'id'].tolist())

In [31]:
# 2. count of each item and generate candidate itemset
single_item_frequency = defaultdict(int)
for transaction in transactions:
    for item in transactions[transaction]:
        single_item_frequency[item] += 1
sc = 3
candidate_item_set = {k: v for k, v in single_item_frequency.items() if single_item_frequency[k] >= sc}
len(candidate_item_set)

215

In [32]:
num_buckets = 223

# Filtering step: Identify frequent individual songs
frequent_songs = candidate_item_set

# Pair counting step: Count frequent pairs
frequent_pairs = defaultdict(int)
for transaction in transactions.values():
    # Generate candidate pairs of frequent songs
    frequent_candidate_pairs = combinations([song for song in transaction if song in frequent_songs], 2)

    # Hash pairs and increment counts for frequent pairs
    for pair in frequent_candidate_pairs:
        pair_hash = hash(pair) % num_buckets  # Use a hash function to map the pair to a specific bucket
        frequent_pairs[pair] += 1

# Filter frequent pairs based on support threshold
frequent_pairs = {pair: count for pair, count in frequent_pairs.items() if count >= sc}

frequent_pairs


{('7MXVkk9YMctZqd1Srtv4MB', '76FZM38RC8XaAjJ77CVTNe'): 3,
 ('7MXVkk9YMctZqd1Srtv4MB', '09mEdoA6zrmBPgTEN5qXmN'): 4,
 ('6ocbgoVGwYJhOv1GgI9NsF', '09mEdoA6zrmBPgTEN5qXmN'): 3,
 ('6ocbgoVGwYJhOv1GgI9NsF', '2nMeu6UenVvwUktBCpLMK9'): 3,
 ('09mEdoA6zrmBPgTEN5qXmN', '2nMeu6UenVvwUktBCpLMK9'): 3,
 ('68Dni7IE4VyPkTOH9mRWHr', '6PGoSes0D9eUDeeAafB2As'): 3,
 ('68Dni7IE4VyPkTOH9mRWHr', '7xoUc6faLbCqZO6fQEYprd'): 3,
 ('1Y3LN4zO1Edc2EluIoSPJN', '6GGtHZgBycCgGBUhZo81xe'): 3,
 ('4nVBt6MZDDP6tRVdQTgxJg', '15JINEqzVMv3SvJTAXAKED'): 3,
 ('4nVBt6MZDDP6tRVdQTgxJg', '3QGsuHI8jO1Rx4JWLUh9jd'): 3,
 ('4nVBt6MZDDP6tRVdQTgxJg', '3U4isOIWM3VvDubwSI3y7a'): 3,
 ('4nVBt6MZDDP6tRVdQTgxJg', '0u2P5u6lvoDfwTYjAADbn4'): 3,
 ('4nVBt6MZDDP6tRVdQTgxJg', '0pqnGHJpmpxLKifKRmU6WP'): 3,
 ('4nVBt6MZDDP6tRVdQTgxJg', '21jGcNKet2qwijlDFuPiPb'): 3,
 ('4nVBt6MZDDP6tRVdQTgxJg', '6nek1Nin9q48AVZcWs9e9D'): 3,
 ('4nVBt6MZDDP6tRVdQTgxJg', '7qEHsqek33rTcFNT9PFqLf'): 3,
 ('4nVBt6MZDDP6tRVdQTgxJg', '1zB4vmk8tFRmM9UULNzbLB'): 3,
 ('5Ohxk2dO5CO