In [1]:
import pandas as pd
import sys # To exit if files don't load

# --- Configuration ---
# IMPORTANT: Make sure these paths point to your actual processed files.
# The three Parquet files share one directory, so it is written once (as a
# raw string, so the Windows backslashes need no escaping) and each file
# name is appended to it.
_PROCESSED_DIR = r'C:\Narasimha\KLETU Related\6th Semester Related\GenAI and NLP\GenAI\Course Project\GitHub Repo\Multi-Hop-RAG-for-Personalized-Music-Recommendation\data\processed'

PLAYLISTS_FILE = _PROCESSED_DIR + r'\mpd_playlists.parquet'
TRACKS_FILE = _PROCESSED_DIR + r'\mpd_unique_tracks.parquet'
PLAYLIST_TRACKS_FILE = _PROCESSED_DIR + r'\mpd_playlist_tracks.parquet'

print("--- Starting MPD Data Verification ---")

# --- 1. Load the Parquet Files ---
# All later steps need the three tables, so any failure here reports the
# error and terminates the run with exit status 1.
print("\nStep 1: Loading Parquet files...")
try:
    playlists_df = pd.read_parquet(PLAYLISTS_FILE)
    tracks_df = pd.read_parquet(TRACKS_FILE)
    playlist_tracks_df = pd.read_parquet(PLAYLIST_TRACKS_FILE)

    # Echo each source path so the log shows exactly which files were read.
    print("  Successfully loaded:")
    for source_label, source_path in (
        ("Playlists", PLAYLISTS_FILE),
        ("Tracks", TRACKS_FILE),
        ("Playlist-Track Map", PLAYLIST_TRACKS_FILE),
    ):
        print(f"    - {source_label}: {source_path}")
except Exception as load_error:
    print(f"Error loading Parquet files: {load_error}")
    print("Cannot continue verification. Please check file paths and integrity.")
    sys.exit(1)  # Nothing below can run without the data

# --- 2. Basic Checks (Individual DataFrames) ---
def _print_frame_overview(frame):
    """Print the shape, the ``info()`` report and per-column null counts of *frame*."""
    print("Shape:", frame.shape)
    print("Info:")
    frame.info()  # info() writes its report itself instead of returning it
    print("Missing Values Sum:\n", frame.isnull().sum())


print("\nStep 2: Performing Basic Checks...")

print("\n--- Verifying playlists_df ---")
_print_frame_overview(playlists_df)
print("Number of unique PIDs:", playlists_df['pid'].nunique())
print("Head:\n", playlists_df.head())


print("\n--- Verifying tracks_df ---")
_print_frame_overview(tracks_df)
# A table of unique tracks should contain every URI once (ideally 0 duplicates)
track_uri_column = tracks_df['track_uri']
duplicate_tracks = track_uri_column.duplicated().sum()
print(f"Duplicate track URIs found: {duplicate_tracks}")
if duplicate_tracks > 0:
    print("  WARNING: Duplicate track URIs found in the unique tracks table!")
print("Number of unique track URIs:", track_uri_column.nunique())
print("Head:\n", tracks_df.head())


print("\n--- Verifying playlist_tracks_df ---")
_print_frame_overview(playlist_tracks_df)
print("Number of unique PIDs in map:", playlist_tracks_df['pid'].nunique())
print("Number of unique track URIs in map:", playlist_tracks_df['track_uri'].nunique())
print("Head:\n", playlist_tracks_df.head())


# --- 3. Consistency Checks (Across DataFrames) ---
print("\nStep 3: Performing Consistency Checks...")

# Check 1: the playlist table and the playlist-track map should reference
# exactly the same set of PIDs.
unique_pids_playlists = set(playlists_df['pid'])
unique_pids_map = set(playlist_tracks_df['pid'])
print(f"\nCheck 1: Comparing PIDs in playlists_df ({len(unique_pids_playlists)}) vs playlist_tracks_df ({len(unique_pids_map)})...")
if unique_pids_playlists != unique_pids_map:
    print("  Result: Failed/Warning. PID sets differ.")
    # Report each direction of the mismatch separately, and only when non-empty.
    pids_only_in_playlists = unique_pids_playlists.difference(unique_pids_map)
    if pids_only_in_playlists:
        print(f"    - {len(pids_only_in_playlists)} PIDs found in playlists_df but not in playlist_tracks_df.")
    pids_only_in_map = unique_pids_map.difference(unique_pids_playlists)
    if pids_only_in_map:
        print(f"    - {len(pids_only_in_map)} PIDs found in playlist_tracks_df but not in playlists_df.")
else:
    print("  Result: Passed. Unique PIDs match.")

# Check 2: every track URI referenced by the playlist-track map should exist
# in the tracks table, and vice versa.
unique_tracks_map = set(playlist_tracks_df['track_uri'])
unique_tracks_table = set(tracks_df['track_uri'])
print(f"\nCheck 2: Comparing Track URIs in tracks_df ({len(unique_tracks_table)}) vs playlist_tracks_df ({len(unique_tracks_map)})...")
if unique_tracks_map != unique_tracks_table:
    print("  Result: Warning/Info. Track URI sets differ.")
    # URIs used by the map but absent from the tracks table are reported as a warning.
    missing_from_tracks_table = unique_tracks_map.difference(unique_tracks_table)
    if missing_from_tracks_table:
        print(f"    - WARNING: {len(missing_from_tracks_table)} track URIs are in playlist_tracks_df but NOT in tracks_df!")
    # Tracks that no playlist uses are reported as informational only.
    missing_from_map = unique_tracks_table.difference(unique_tracks_map)
    if missing_from_map:
        print(f"    - Info: {len(missing_from_map)} track URIs are in tracks_df but NOT used in playlist_tracks_df.")
else:
    print("  Result: Passed. Unique track URIs match.")


# Check 3: for a random sample of playlists, compare the recorded num_tracks
# with the number of rows that playlist has in playlist_tracks_df.
print("\nCheck 3: Verifying track counts for a sample of 5 playlists...")
if len(playlists_df) == 0:
    print("  Skipping check 3 because playlists_df is empty.")
else:
    # Never ask for more playlists than the table holds
    sample_size = min(5, len(playlists_df))
    sample_pids_df = playlists_df.sample(sample_size)

    mismatch_count = 0
    for _, playlist_row in sample_pids_df.iterrows():
        pid_to_check = playlist_row['pid']
        expected_count = playlist_row['num_tracks']
        # Actual count = number of map rows that belong to this playlist
        rows_for_pid = playlist_tracks_df['pid'] == pid_to_check
        actual_count = len(playlist_tracks_df[rows_for_pid])

        counts_agree = expected_count == actual_count
        status_label = "OK" if counts_agree else "MISMATCH!"
        print(f"  - PID {pid_to_check}: {status_label} (Expected={expected_count}, Found={actual_count})")
        if not counts_agree:
            mismatch_count += 1

    if mismatch_count:
        print(f"  Result: Found {mismatch_count} mismatches in the sample.")
    else:
        print("  Result: Sample counts match expected values.")


# Check 4: pick one random playlist-track row and confirm that both of its
# keys resolve against the playlist and track tables.
print("\nCheck 4: Verifying relationships for one random playlist-track entry...")
if playlist_tracks_df.empty:
    print("  Skipping check 4 because playlist_tracks_df is empty.")
else:
    sample_entry = playlist_tracks_df.sample(1).iloc[0]
    sample_pid = sample_entry['pid']
    sample_track_uri = sample_entry['track_uri']
    print(f"  Sample Entry: PID={sample_pid}, TrackURI={sample_track_uri}")

    # Membership tests reuse the sets built for Checks 1 and 2.
    playlist_exists = sample_pid in unique_pids_playlists
    print(f"  Playlist (PID {sample_pid}) exists in playlists_df: {playlist_exists}")

    track_exists = sample_track_uri in unique_tracks_table
    print(f"  Track (URI {sample_track_uri}) exists in tracks_df: {track_exists}")

    if not playlist_exists or not track_exists:
        print("  Result: Relationship inconsistency detected for the sample!")
    else:
        print("  Result: Relationships seem consistent for the sample.")


print("\n--- Verification Complete ---")

--- Starting MPD Data Verification ---

Step 1: Loading Parquet files...
  Successfully loaded:
    - Playlists: C:\Narasimha\KLETU Related\6th Semester Related\GenAI and NLP\GenAI\Course Project\GitHub Repo\Multi-Hop-RAG-for-Personalized-Music-Recommendation\data\processed\mpd_playlists.parquet
    - Tracks: C:\Narasimha\KLETU Related\6th Semester Related\GenAI and NLP\GenAI\Course Project\GitHub Repo\Multi-Hop-RAG-for-Personalized-Music-Recommendation\data\processed\mpd_unique_tracks.parquet
    - Playlist-Track Map: C:\Narasimha\KLETU Related\6th Semester Related\GenAI and NLP\GenAI\Course Project\GitHub Repo\Multi-Hop-RAG-for-Personalized-Music-Recommendation\data\processed\mpd_playlist_tracks.parquet

Step 2: Performing Basic Checks...

--- Verifying playlists_df ---
Shape: (5000, 10)
Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
