In [None]:
from gurobipy import Model, GRB, quicksum
import pandas as pd

# Load the song dataset: one row per track, one `user_*` column of predicted
# ratings per user.
# NOTE(review): ratings are used exactly as stored in the CSV; no normalization
# happens in this cell -- confirm the file already holds the intended scale.
df = pd.read_csv("songs_with_predictions_small (1).csv")

user_cols = [col for col in df.columns if col.startswith('user_')]
track_names = df['track_name'].tolist()
artists = df['artist_name'].tolist()
unique_artists = list(set(artists))
num_tracks = len(track_names)
playlist_sizes = list(range(10, 41, 5))  # test 10, 15, 20, 25, 30, 35, 40
max_per_artist = 2  # keep artist cap fixed

# Artist -> positions of that artist's tracks. This depends only on the
# dataset, so it is built once instead of once per (playlist size, user) model.
artist_to_songs = {artist: [] for artist in unique_artists}
for i, artist in enumerate(artists):
    artist_to_songs[artist].append(i)

results = []

for playlist_size in playlist_sizes:
    for user_col in user_cols:
        # Missing predictions count as a rating of 0.
        ratings = df[user_col].fillna(0).tolist()

        model = Model(f"Playlist_{user_col}_{playlist_size}")
        model.setParam('OutputFlag', 0)

        # X[i] == 1 when track i is placed in the playlist.
        X = model.addVars(num_tracks, vtype=GRB.BINARY, name="X")
        model.addConstr(quicksum(X[i] for i in range(num_tracks)) == playlist_size)

        # At most `max_per_artist` tracks from any single artist.
        for artist in unique_artists:
            model.addConstr(quicksum(X[i] for i in artist_to_songs[artist]) <= max_per_artist)

        model.setObjective(quicksum(X[i] * ratings[i] for i in range(num_tracks)), GRB.MAXIMIZE)
        model.optimize()

        if model.status != GRB.OPTIMAL:
            # Report the skipped combination instead of dropping it silently
            # (e.g. infeasible when the artist cap leaves fewer than
            # `playlist_size` selectable tracks).
            print(f"Skipping {user_col} at playlist_size={playlist_size}: solver status {model.status}")
            continue

        selected = [i for i in range(num_tracks) if X[i].X > 0.5]
        total_rating = sum(ratings[i] for i in selected)
        diversity = len(set(artists[i] for i in selected))  # distinct artists chosen
        avg_rating = total_rating / playlist_size

        results.append({
            "user": user_col,
            "playlist_size": playlist_size,
            "total_rating": total_rating,
            "avg_rating_per_song": avg_rating,
            "diversity": diversity
        })

# Convert to DataFrame
size_results_df = pd.DataFrame(results)

# Save to CSV
size_results_df.to_csv("size_results.csv", index=False)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the diversity-by-size table (placeholder file name: replace with the
# actual file), order it by playlist size and add the step-to-step change in
# the number of unique artists.
df = (
    pd.read_csv("diversity_vs_playlist_size.csv")
    .sort_values("playlist_size")
    .assign(delta_diversity=lambda frame: frame["unique_artists"].diff())
)

# Line chart of the diversity change against playlist size, with a dashed
# zero reference line.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df["playlist_size"], df["delta_diversity"], marker='s', color='green')
ax.axhline(0, linestyle='--', color='gray')
ax.set_title("Shadow Price: Diversity vs Playlist Size")
ax.set_xlabel("Playlist Size")
ax.set_ylabel("Δ Unique Artists")
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Row-to-row change in the average rating per song. Relies on `df` (and `plt`)
# left behind by the previous cell.
avg_rating_step = df["avg_rating_per_song"].diff()
df["delta_avg_rating"] = avg_rating_step

# Line chart of that change against playlist size, with a dashed zero line.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df["playlist_size"], df["delta_avg_rating"], marker='o', color='orange')
ax.axhline(0, linestyle='--', color='gray')
ax.set_title("Shadow Price: Avg Rating per Song vs Playlist Size")
ax.set_xlabel("Playlist Size")
ax.set_ylabel("Δ Avg Rating per Song")
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
import pandas as pd

# Load the original (un-normalized) dataset
df = pd.read_csv("songs_with_predictions_small (1).csv")

# Identify user rating columns
user_cols = [col for col in df.columns if col.startswith('user_')]


def _min_max_normalize(col):
    """Rescale one rating column to [0, 1]; a constant column becomes all zeros.

    Always returns a Series. Returning a bare scalar for constant columns makes
    `DataFrame.apply` treat the call as a reduction whenever the first column
    is constant, which yields a Series instead of a DataFrame and breaks the
    assignment back into `df[user_cols]`.
    """
    lo, hi = col.min(), col.max()
    if hi != lo:
        return (col - lo) / (hi - lo)
    return pd.Series(0.0, index=col.index)


# Min-max normalize ratings per user
df[user_cols] = df[user_cols].apply(_min_max_normalize, axis=0)

from gurobipy import Model, GRB, quicksum

track_names = df['track_name'].tolist()
artists = df['artist_name'].tolist()
unique_artists = list(set(artists))
num_tracks = len(track_names)
playlist_size = 30  # fixed playlist length for this experiment

# Artist -> positions of that artist's tracks. This depends only on the
# dataset, so it is built once instead of once per (user, cap) model.
artist_to_songs = {artist: [] for artist in unique_artists}
for i, artist in enumerate(artists):
    artist_to_songs[artist].append(i)

results = []

for user_col in user_cols:
    # Missing predictions count as a rating of 0.
    ratings = df[user_col].fillna(0).tolist()

    # Sweep the per-artist cap from 1 to 5 songs.
    for max_per_artist in range(1, 6):
        model = Model(f"Playlist_{user_col}_{max_per_artist}")
        model.setParam('OutputFlag', 0)

        # Binary selection variables: X[i] == 1 when track i is chosen
        X = model.addVars(num_tracks, vtype=GRB.BINARY, name="X")
        model.addConstr(quicksum(X[i] for i in range(num_tracks)) == playlist_size)

        # Artist constraints
        for artist in unique_artists:
            model.addConstr(quicksum(X[i] for i in artist_to_songs[artist]) <= max_per_artist)

        # Objective: maximize the summed (normalized) rating of chosen tracks
        model.setObjective(quicksum(X[i] * ratings[i] for i in range(num_tracks)), GRB.MAXIMIZE)
        model.optimize()

        if model.status != GRB.OPTIMAL:
            # Report the skipped combination instead of dropping it silently.
            print(f"Skipping {user_col} at max_per_artist={max_per_artist}: solver status {model.status}")
            continue

        selected = [i for i in range(num_tracks) if X[i].X > 0.5]
        total_rating = sum(ratings[i] for i in selected)
        diversity = len(set(artists[i] for i in selected))  # distinct artists chosen
        avg_rating = total_rating / playlist_size

        results.append({
            "user": user_col,
            "max_per_artist": max_per_artist,
            "total_rating": total_rating,
            "diversity": diversity,
            "avg_rating_per_song": avg_rating
        })

# Save results to CSV
results_df = pd.DataFrame(results)
results_df.to_csv("normalized_diversity_vs_rating_all_users.csv", index=False)



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

df_plot = pd.read_csv("normalized_diversity_vs_rating_all_users.csv")

# Distinct artist-cap values present in the results, in ascending order.
artist_caps = sorted(df_plot['max_per_artist'].unique())

plt.figure(figsize=(10, 6))
# Size the palette from the data so `palette[i]` cannot run out of colours
# when the CSV holds more cap values than a hard-coded count.
palette = sns.color_palette("Set1", n_colors=len(artist_caps))

# Fixed seed so the horizontal jitter (and therefore the figure) is
# reproducible across kernel restarts.
rng = np.random.default_rng(0)

# Scatter by max songs per artist
for i, val in enumerate(artist_caps):
    subset = df_plot[df_plot['max_per_artist'] == val]
    # Jitter only the x coordinate: diversity is an integer count, so points
    # would otherwise overlap in vertical stripes.
    x = subset['diversity'] + rng.normal(0, 0.2, size=len(subset))
    y = subset['avg_rating_per_song']
    plt.scatter(x, y, label=f"{val} per artist", alpha=0.6, color=palette[i])

# Trendline (global): degree-1 least-squares fit over all un-jittered points
z = np.polyfit(df_plot['diversity'], df_plot['avg_rating_per_song'], 1)
p = np.poly1d(z)
x_range = np.linspace(df_plot['diversity'].min(), df_plot['diversity'].max(), 100)
plt.plot(x_range, p(x_range), linestyle='--', color='black', label='Trendline')

# Labels
plt.xlabel("Unique Artists in Playlist (Diversity)")
plt.ylabel("Normalized Avg Rating per Song")
plt.title("Diversity vs Avg Rating per Song (All Users, All Songs)")
plt.grid(True)
plt.legend(title="Max Songs per Artist")
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Per-user results written by the artist-cap sweep.
df = pd.read_csv("normalized_diversity_vs_rating_all_users.csv")

# Mean rating per cap value, then the change in that mean per unit change of
# the cap. The first row has no predecessor (diff is NaN) and is dropped.
grouped = (
    df.groupby("max_per_artist")["avg_rating_per_song"]
    .mean()
    .reset_index()
    .assign(
        delta_rating=lambda g: g["avg_rating_per_song"].diff() / g["max_per_artist"].diff()
    )
    .dropna()
)

# Line chart of the marginal change, with a dashed zero reference line.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(grouped["max_per_artist"], grouped["delta_rating"], marker="o", color="orange")
ax.set_title("Marginal Change in Playlist Rating (ΔRating / ΔArtist Limit)")
ax.set_xlabel("Max Songs per Artist")
ax.set_ylabel("Avg ΔRating per Step")
ax.axhline(0, linestyle="--", color="gray", linewidth=1)
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import numpy as np
import gurobipy as gp
from gurobipy import GRB
from tqdm import tqdm

# Load the dataset
df = pd.read_csv("Final_Cleaned_Song_Predictions.csv")
user_id = "user_b4beed9bf653604a876fdfd9df59e19c"  # Update with your desired user ID
playlist_size = 20
noise_levels = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3]
num_trials = 10
max_per_artist = 2

# Fixed seed so the noise draws (and the saved CSV) are reproducible.
rng = np.random.default_rng(42)

# Mood filter (party-like)
df_mood = df[
    (df["danceability"] >= 0.8) &
    (df["energy"] >= 0.7) &
    (df["valence"] >= 0.6) &
    (df["tempo"] >= 115)
].copy().reset_index(drop=True)
num_songs = len(df_mood)

# Get user ratings. Missing predictions count as 0 so that no NaN objective
# coefficient reaches the solver.
base_ratings = df_mood[user_id].fillna(0).values

# Row positions of each artist's songs. These depend only on the filtered
# frame, so they are computed once instead of once per trial. The index was
# reset above, so labels equal positions. groupby skips rows with a missing
# artist, which therefore stay uncapped.
artist_groups = [
    group.index.tolist()
    for _, group in df_mood.groupby("artist_name_clean_x", sort=False)
]

results = []

for noise in noise_levels:
    for trial in range(num_trials):
        # Perturb the ratings with zero-mean Gaussian noise of std `noise`,
        # then clamp to [0, 5].
        # NOTE(review): the 0-5 clamp assumes that is the rating scale -- confirm.
        noisy_ratings = base_ratings + rng.normal(0, noise, size=len(base_ratings))
        noisy_ratings = np.clip(noisy_ratings, 0, 5)

        model = gp.Model("robust_playlist_flow")
        model.setParam("OutputFlag", 0)
        x = model.addVars(num_songs, vtype=GRB.BINARY, name="x")

        model.setObjective(gp.quicksum(noisy_ratings[i] * x[i] for i in range(num_songs)), GRB.MAXIMIZE)
        model.addConstr(gp.quicksum(x[i] for i in range(num_songs)) == playlist_size)

        # At most `max_per_artist` songs from any single artist.
        for idxs in artist_groups:
            model.addConstr(gp.quicksum(x[i] for i in idxs) <= max_per_artist)

        model.optimize()

        if model.status != GRB.OPTIMAL:
            # Report the skipped trial instead of dropping it silently.
            print(f"Skipping noise={noise}, trial={trial}: solver status {model.status}")
            continue

        selected_indices = [i for i in range(num_songs) if x[i].X > 0.5]
        playlist = df_mood.iloc[selected_indices].copy()

        # Order the playlist with the flow arranger.
        # NOTE(review): AdvancedPlaylistArranger is not defined in this cell;
        # it must already exist in the kernel.
        arranger = AdvancedPlaylistArranger(playlist)
        arranged_playlist = arranger.arrange_by_advanced_party_flow()

        # Compute average tempo difference between consecutive songs
        tempo_diffs = arranged_playlist["tempo"].diff().abs().dropna()
        avg_tempo_change = tempo_diffs.mean()

        # Score the playlist with the un-perturbed ratings, not the noisy ones.
        total_rating = base_ratings[selected_indices].sum()

        results.append({
            "noise_level": noise,
            "trial": trial,
            "avg_tempo_change": avg_tempo_change,
            "total_rating": total_rating,
            "avg_rating_per_song": total_rating / playlist_size
        })

# Save results
robust_df = pd.DataFrame(results)
robust_df.to_csv("robustness_playlist_flow_noise.csv", index=False)
print("Saved to robustness_playlist_flow_noise.csv")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the per-trial flow results.
df = pd.read_csv("robustness_playlist_flow_noise.csv")

# Coerce both columns to numbers (unparseable cells become NaN) and drop the
# rows where either one is missing.
for column in ("noise_level", "avg_tempo_change"):
    df[column] = pd.to_numeric(df[column], errors="coerce")
df = df.dropna(subset=["noise_level", "avg_tempo_change"])

# Mean and standard deviation of the tempo change at each noise level.
summary = (
    df.groupby("noise_level")["avg_tempo_change"]
    .agg(avg_jump="mean", std_jump="std")
    .reset_index()
)

# Mean line with a shaded band of one standard deviation either side.
noise_axis = summary["noise_level"]
band_low = summary["avg_jump"] - summary["std_jump"]
band_high = summary["avg_jump"] + summary["std_jump"]

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(noise_axis, summary["avg_jump"], marker="o", label="Avg Tempo Change")
ax.fill_between(noise_axis, band_low, band_high, color="skyblue", alpha=0.3, label="±1 Std Dev")
ax.set_title("Robustness of Playlist Flow to Noisy Ratings")
ax.set_xlabel("Noise Level (std dev)")
ax.set_ylabel("Average Tempo Change Between Songs (BPM)")
ax.set_xticks(noise_axis)
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()

# Report the standard deviation averaged over all noise levels.
avg_std_dev = summary["std_jump"].mean()
print(f"Average Standard Deviation of Avg Tempo Change: {avg_std_dev:.2f} BPM")


In [None]:
import pandas as pd
import numpy as np
import gurobipy as gp
from gurobipy import GRB
from tqdm import tqdm

# Load dataset
df = pd.read_csv("Final_Cleaned_Song_Predictions.csv")

# Use one specific user (update as needed)
user_id = "user_b4beed9bf653604a876fdfd9df59e19c"

# Parameters
playlist_size = 20
num_trials = 10
noise_levels = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3]
max_per_artist = 2

# Fixed seed so the noise draws (and the saved CSV) are reproducible.
rng = np.random.default_rng(42)

# Mood filter (e.g., party)
df_mood = df[
    (df["danceability"] >= 0.8) &
    (df["energy"] >= 0.7) &
    (df["valence"] >= 0.6) &
    (df["tempo"] >= 115)
].copy().reset_index(drop=True)
num_songs = len(df_mood)

# Ratings to perturb. Missing predictions count as 0 so that no NaN objective
# coefficient reaches the solver.
base_ratings = df_mood[user_id].fillna(0).values

# Row positions of each artist's songs. These depend only on the filtered
# frame, so they are computed once instead of once per trial. The index was
# reset above, so labels equal positions. groupby skips rows with a missing
# artist, which therefore stay uncapped.
artist_groups = [
    group.index.tolist()
    for _, group in df_mood.groupby("artist_name_clean_x", sort=False)
]

# Collect results
records = []

for noise in noise_levels:
    for trial in range(num_trials):
        # Perturb the ratings with zero-mean Gaussian noise of std `noise`,
        # then clamp to [0, 5].
        # NOTE(review): the 0-5 clamp assumes that is the rating scale -- confirm.
        noisy_ratings = base_ratings + rng.normal(0, noise, size=len(base_ratings))
        noisy_ratings = np.clip(noisy_ratings, 0, 5)

        model = gp.Model("robust_energy_valence_dance")
        model.setParam("OutputFlag", 0)
        x = model.addVars(num_songs, vtype=GRB.BINARY, name="x")
        model.setObjective(gp.quicksum(noisy_ratings[i] * x[i] for i in range(num_songs)), GRB.MAXIMIZE)
        model.addConstr(gp.quicksum(x[i] for i in range(num_songs)) == playlist_size)

        # At most `max_per_artist` songs from any single artist.
        for idxs in artist_groups:
            model.addConstr(gp.quicksum(x[i] for i in idxs) <= max_per_artist)

        model.optimize()

        if model.status != GRB.OPTIMAL:
            # Report the skipped trial instead of dropping it silently.
            print(f"Skipping noise={noise}, trial={trial}: solver status {model.status}")
            continue

        # Record the mean audio features of the songs chosen under noise.
        selected_idxs = [i for i in range(num_songs) if x[i].X > 0.5]
        selected_songs = df_mood.iloc[selected_idxs]
        records.append({
            "noise_std": noise,
            "trial": trial,
            "avg_energy": selected_songs["energy"].mean(),
            "avg_valence": selected_songs["valence"].mean(),
            "avg_danceability": selected_songs["danceability"].mean()
        })

# Save the new dataset
df_result = pd.DataFrame(records)
df_result.to_csv("robustness_energy_valence_danceability.csv", index=False)
print("Saved to robustness_energy_valence_danceability.csv")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the per-trial audio-feature results.
df = pd.read_csv("robustness_energy_valence_danceability.csv")

# Mean and standard deviation of each feature at every noise level.
feature_stats = {
    "avg_energy": ["mean", "std"],
    "avg_valence": ["mean", "std"],
    "avg_danceability": ["mean", "std"],
}
summary = df.groupby("noise_std").agg(feature_stats).reset_index()

# Replace the two-level column index with flat names.
summary.columns = [
    "noise_std",
    "energy_mean", "energy_std",
    "valence_mean", "valence_std",
    "dance_mean", "dance_std",
]

# Create the figure first, then switch the seaborn style, as before: the style
# applies to the axes that the first plot call creates.
plt.figure(figsize=(10, 6))
sns.set(style="whitegrid")

# One mean line per feature with a shaded band of one standard deviation.
for prefix, marker, label in (
    ("energy", 'o', "Energy"),
    ("valence", 's', "Valence"),
    ("dance", '^', "Danceability"),
):
    center = summary[f"{prefix}_mean"]
    spread = summary[f"{prefix}_std"]
    plt.plot(summary["noise_std"], center, marker=marker, label=label)
    plt.fill_between(summary["noise_std"], center - spread, center + spread, alpha=0.2)

# Styling
plt.title("Robustness of Audio Features to Noisy User Ratings")
plt.xlabel("Noise Std Dev")
plt.ylabel("Average Value")
plt.ylim(0, 1)
plt.legend()
plt.tight_layout()
plt.show()

# Report each feature's standard deviation averaged over all noise levels.
avg_energy_std, avg_valence_std, avg_dance_std = (
    summary[column].mean() for column in ("energy_std", "valence_std", "dance_std")
)
print(f"Average Std Dev of Avg Energy: {avg_energy_std:.2f}")
print(f"Average Std Dev of Avg Valence: {avg_valence_std:.2f}")
print(f"Average Std Dev of Avg Danceability: {avg_dance_std:.2f}")
import pandas as pd