This module utilizes the following dataset:
https://www.kaggle.com/datasets/undefinenull/million-song-dataset-spotify-lastfm

It can either be imported using the code block below or downloaded manually as a .zip file. If you use the code below, make sure to run `pip install kagglehub` first.

All uses of AI-generated code will be clearly labeled.

In [5]:
import kagglehub

# Fetch the latest published version of the dataset via kagglehub
dataset_handle = "undefinenull/million-song-dataset-spotify-lastfm"
path = kagglehub.dataset_download(dataset_handle)

# Report the returned location so the files can be found afterwards
print("Path to dataset files:", path)

Path to dataset files: C:\Users\yalts\.cache\kagglehub\datasets\undefinenull\million-song-dataset-spotify-lastfm\versions\1


Go to the path printed above and copy the data files from that folder into the same directory as this notebook. The code below expects them to be named `listens.csv` and `music.csv`.

In [None]:
#Import modules
import pandas as pd
import networkx as nx
from collections import Counter
from io import StringIO

In [43]:
# Read the listens table. If listens.csv has no content at all, substitute a
# frame with the columns 'track_id', 'user_id', 'playcount' and no rows.
try:
    listens_df = pd.read_csv("listens.csv")
except pd.errors.EmptyDataError:
    listens_columns = ['track_id', 'user_id', 'playcount']
    listens_df = pd.DataFrame(columns=listens_columns)

# Read the music table
music_df = pd.read_csv("music.csv")

# Inner-join the two tables on their shared 'track_id' column
merged_df = listens_df.merge(music_df, on='track_id', how='inner')

# Preprocess the tag data.
# Missing tag values are filled with '' first: a bare astype(str) would turn
# NaN into the literal string "nan", which would then be treated as a tag.
merged_df['tags'] = merged_df['tags'].fillna('').astype(str).str.lower().str.strip()
tag_lists = merged_df['tags'].str.split(', ')

# The merge repeats a track's tag string on every matching listen row, so
# looping over every row redoes the same pair expansion again and again (the
# row-by-row version had to be interrupted). Tally each distinct tag string
# once and weight its pairs by the number of rows that carry it; the resulting
# counts are the same as a row-by-row pass.
tag_string_counts = merged_df['tags'].value_counts()

# Parse every distinct tag string once, dropping empty tags (rows with no tags)
tags_by_string = {
    tag_string: [tag for tag in tag_string.split(', ') if tag]
    for tag_string in tag_string_counts.index
}

# Sorted so that the matrix layout is the same on every run
unique_tags = sorted({tag for tags in tags_by_string.values() for tag in tags})

# Build the tag co-occurrence matrix
tag_index = {tag: i for i, tag in enumerate(unique_tags)}
tag_cooccurrence_matrix = [[0] * len(unique_tags) for _ in range(len(unique_tags))]

for tag_string, row_count in tag_string_counts.items():
    row_count = int(row_count)  # plain int keeps the matrix free of numpy scalars
    positions = [tag_index[tag] for tag in tags_by_string[tag_string]]
    # Pair every tag position with every other position, as in a per-row pass
    for i, row in enumerate(positions):
        for j, col in enumerate(positions):
            if i != j:
                tag_cooccurrence_matrix[row][col] += row_count

# Wrap the matrix in a DataFrame labelled by tag on both axes, then move the
# row labels into a regular column named 'tag1'
cooccurrence_df = (
    pd.DataFrame(tag_cooccurrence_matrix, index=unique_tags, columns=unique_tags)
    .reset_index()
    .rename(columns={'index': 'tag1'})
)

# Reshape to one row per (tag1, tag2, weight) triple for easier use with NetworkX
cooccurrence_df_long = pd.melt(
    cooccurrence_df,
    id_vars='tag1',
    var_name='tag2',
    value_name='weight',
)

# Keep only edges between two different tags that co-occurred at least once
is_distinct_pair = cooccurrence_df_long['tag1'].ne(cooccurrence_df_long['tag2'])
has_weight = cooccurrence_df_long['weight'].gt(0)
cooccurrence_df_filtered = cooccurrence_df_long[is_distinct_pair & has_weight]

# Show the first 5 remaining edges as a markdown table
preview = cooccurrence_df_filtered.head()
print(preview.to_markdown(index=False, numalign="left", stralign="left"))


KeyboardInterrupt: 