# AI-Powered Game Soundtrack Generator (Text-to-Music AI) 🤖🎵🎮

In [72]:
import pandas as pd
import numpy as np
import mido
from transformers import AutoProcessor, MusicgenForConditionalGeneration, pipeline
import torch
import torchaudio
import librosa
import numpy as np
import scipy
from sklearn.ensemble import RandomForestClassifier

## 🔹 1: Extract Features from MIDI Files
### 🎯 Goal: Convert MIDI files into structured features for machine learning.

✅ Steps:
- 1️⃣ Read the MIDI files listed in df_labelled using mido.
    - df_unlabelled is skipped in this step because it has a very high degree of missing data.
- 2️⃣ Extract relevant musical features, such as:

    - Tempo (BPM)   
    - Note density (number of notes per second)
    - Chord structure & key signature
    - Pitch variation & range

- 3️⃣ Store the extracted features in a new column.

In [73]:
# Read the two VGMIDI index tables.
df_labelled = pd.read_csv('vgmidi_labelled.csv')
df_unlabelled = pd.read_csv('vgmidi_unlabelled.csv')

# Rewrite the 'midi' column of each table in two steps:
#   1. swap the directory prefix stored in the CSV for the local folder layout,
#   2. remove every "_0" occurrence from the resulting path.
df_labelled['midi'] = (
    df_labelled['midi']
    .str.replace('labelled/phrases/', 'labelled/midi/')
    .str.replace('_0', '')
)
df_unlabelled['midi'] = (
    df_unlabelled['midi']
    .str.replace('data_clean/midi/', 'unlabelled/midi/')
    .str.replace('_0', '')
)

In [75]:
def extract_midi_features(midi_path):
    """Summarise one MIDI file as a small dict of numeric features.

    Parameters
    ----------
    midi_path : str or path-like
        Location of the MIDI file; handed directly to ``mido.MidiFile``.

    Returns
    -------
    dict or float
        On success a dict with three keys:

        * ``"avg_bpm"`` - mean of all ``set_tempo`` events converted to BPM,
          or 120 when the file contains no tempo event.
        * ``"note_density"`` - sounding notes divided by the playback length
          in seconds; falls back to the raw note count when the length is
          zero or cannot be computed.
        * ``"pitch_variance"`` - variance of the MIDI note numbers, or 0 when
          there are no notes.

        ``np.nan`` is returned when the file does not exist.
    """
    # Only the file open is guarded: a missing file is the one expected failure.
    try:
        mid = mido.MidiFile(midi_path)
    except FileNotFoundError:
        return np.nan

    tempos, notes = [], []
    for track in mid.tracks:
        for msg in track:
            if msg.type == "set_tempo":
                tempos.append(mido.tempo2bpm(msg.tempo))
            # A note_on with velocity 0 is a note-off by MIDI convention, so it
            # must not be counted as a sounding note.
            elif msg.type == "note_on" and msg.velocity > 0:
                notes.append(msg.note)

    # mid.length walks every message on each access, so read it once. mido
    # raises ValueError for type 2 (asynchronous) files; treat that like a
    # zero-length file instead of aborting the whole feature pass.
    try:
        duration = mid.length
    except ValueError:
        duration = 0

    return {
        "avg_bpm": np.mean(tempos) if tempos else 120,
        "note_density": len(notes) / duration if duration > 0 else len(notes),
        "pitch_variance": np.var(notes) if notes else 0,
    }

In [74]:
# Build the "features" column: one extract_midi_features result per path in
# the "midi" column. Only the labelled table is processed; the unlabelled one
# is left out because of its high degree of missing data.
df_labelled["features"] = [
    extract_midi_features(midi_path) for midi_path in df_labelled["midi"]
]

## 🚀 2: Label df_unlabelled Using Zero-Shot Classification
### 🎯 Goal: Since df_unlabelled lacks extracted MIDI features, we use BART zero-shot classification to predict valence/arousal labels from metadata.

✅ Steps:

- Use facebook/bart-large-mnli to classify each track's mood from its available text metadata (the code below classifies the `piece` column).
- Assign valence & arousal values by mapping the predicted mood through a fixed lookup table.

In [76]:
# Zero-shot text classifier backed by the facebook/bart-large-mnli checkpoint.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

Device set to use mps:0


In [77]:
def classify_midi_metadata(title):
    """Turn a piece title into a ``(valence, arousal)`` pair.

    The title is scored against four candidate moods by the module-level
    zero-shot ``classifier``; the first label in the returned ``"labels"``
    list is looked up in a fixed mood-to-quadrant table.

    Parameters
    ----------
    title : str
        Text to classify (the notebook passes values of the ``piece`` column).

    Returns
    -------
    tuple of int
        ``(valence, arousal)``, each either ``-1`` or ``1``.
    """
    # NOTE(review): "Intense" is mapped to low arousal and "Mysterious" to
    # high arousal - confirm this table is intended before relying on it.
    quadrant_by_mood = {
        "Calm": (-1, -1),
        "Intense": (1, -1),
        "Mysterious": (-1, 1),
        "Action": (1, 1),
    }

    # The candidate labels are exactly the table keys, in insertion order.
    candidate_moods = list(quadrant_by_mood)
    prediction = classifier(title, candidate_labels=candidate_moods)

    top_mood = prediction["labels"][0]
    return quadrant_by_mood[top_mood]

In [78]:
# Rows without a "piece" value cannot be classified, so discard them first.
df_unlabelled = df_unlabelled.dropna(subset=["piece"])

# Classify every remaining piece, then expand each returned (valence, arousal)
# tuple into two columns that are written back onto the frame.
df_unlabelled[["valence", "arousal"]] = (
    df_unlabelled["piece"]
    .apply(classify_midi_metadata)
    .apply(pd.Series)
)

In [79]:
# Tally the rows per (valence, arousal) combination, keeping NaN combinations.
df_unlabelled.value_counts(subset=['valence', 'arousal'], dropna=False)

valence  arousal
 1       -1         1476
          1         1080
-1        1          811
         -1          482
Name: count, dtype: int64

## 🚀 3: Train a Hybrid Model to Predict Valence/Arousal
### 🎯 Goal: Train a model using MIDI features (from df_labelled) + text-based classification (from df_unlabelled).

✅ Steps:

- Train an ML model only on df_labelled MIDI features.
- Use a text-based classifier to label df_unlabelled.
- Combine both predictions for music generation.
- Note: the models in this section are fit on df_labelled only.

In [80]:
# extract_midi_features returns NaN instead of a feature dict when a MIDI file
# is missing. Keep only the rows that actually have features so that no empty
# feature row reaches the models and features/targets stay aligned.
_labelled_with_features = df_labelled[df_labelled["features"].notna()]

# Expand the feature dicts into one numeric column per feature and carry over
# the original row labels so X lines up with the targets below.
X = pd.json_normalize(_labelled_with_features["features"].tolist())
X.index = _labelled_with_features.index

y_valence = _labelled_with_features["valence"]
y_arousal = _labelled_with_features["arousal"]

In [81]:
# Fit one 100-tree random forest per target on the same feature matrix:
# first valence, then arousal.
model_valence = RandomForestClassifier(n_estimators=100)
model_valence.fit(X, y_valence)

model_arousal = RandomForestClassifier(n_estimators=100)
model_arousal.fit(X, y_arousal)

## 🚀 4: Generate AI Music Based on Hybrid Model Predictions
### 🎯 Goal: Use hybrid model (ML + zero-shot classification) to generate game soundtracks using facebook/musicgen.

✅ Steps:

- Generate mood-based prompts using ML model (df_labelled) + zero-shot classification (df_unlabelled).
- Feed the prompt into MusicGen for soundtrack generation.
- MusicGen is driven by the predicted mood (the code below currently uses only the zero-shot prediction).

In [85]:
# Load the pretrained facebook/musicgen-small checkpoint: the processor that
# prepares the text prompt, and the generation model itself.
processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")

Config of the text_encoder: <class 'transformers.models.t5.modeling_t5.T5EncoderModel'> is overwritten by shared text_encoder config: T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summ

In [92]:
def generate_music(mood):
    """Generate a soundtrack clip for *mood* with MusicGen and save it as WAV.

    Parameters
    ----------
    mood : str
        Mood word inserted into the prompt ``"<mood> video game soundtrack"``
        and into the output file name.

    Returns
    -------
    str
        Path of the written file, ``"generated_<mood>.wav"`` in the current
        working directory.
    """
    prompt = [f"{mood} video game soundtrack"]
    # Put the encoded prompt on whatever device the model lives on.
    inputs = processor(text=prompt, return_tensors="pt").to(model.device)

    # Generate music
    audio_values = model.generate(**inputs)

    # Per the Transformers MusicGen docs, generate() returns a tensor shaped
    # (batch, channels, samples). torchaudio.save needs a 2-D
    # (channels, samples) tensor on the CPU, so take the single batch item.
    waveform = audio_values[0].detach().cpu()

    # Use the rate the audio codec was trained at rather than a literal.
    sample_rate = model.config.audio_encoder.sampling_rate

    # Save the generated audio
    output_path = f"generated_{mood}.wav"
    torchaudio.save(output_path, waveform, sample_rate=sample_rate)

    return output_path

def generate_music_from_metadata(title):
    """Generate a soundtrack whose mood is inferred from a text title.

    The title is classified into a ``(valence, arousal)`` pair by
    ``classify_midi_metadata``; the pair is translated back into a mood word,
    which is passed to ``generate_music``.

    Parameters
    ----------
    title : str
        Free-text description or title to classify.

    Returns
    -------
    str
        Whatever ``generate_music`` returns for the chosen mood.
    """
    valence, arousal = classify_midi_metadata(title)  # Zero-shot classification

    # Translate the quadrant back to its mood word; any pair not listed
    # explicitly falls through to "Action".
    if valence == -1 and arousal == -1:
        mood = "Calm"
    elif valence == 1 and arousal == -1:
        mood = "Intense"
    elif valence == -1 and arousal == 1:
        mood = "Mysterious"
    else:
        mood = "Action"

    return generate_music(mood)

In [1]:
# Demo: classify a sample title and generate the matching soundtrack.
generate_music_from_metadata(title="Dark haunted castle theme")