# Jazz Impressions: AI-Driven Visual Echoes of Musical Influences
<p style="text-align:center;">by Lois Kelly</p>

----

<p style="text-align:center;">This code is submitted in partial fulfilment of the requirements for the degree of </p>

<p style="text-align:center;">MASTER OF SCIENCE in Data Science and AI for the Creative Industries.  </p>

<p style="text-align:center;">Creative Computing Institute,  </p>

<p style="text-align:center;">University of the Arts London </p>

<p style="text-align:center;">2024. </p>

----

- To run this notebook please ensure the full github repo has been downloaded: https://git.arts.ac.uk/23044972/Jazz_Impressions/tree/main

- This code requires access to a GPU set up using CUDA. 

- *LLM DISCLAIMER: This code was developed with assistance from ChatGPT. Other sources are cited at the bottom of the relevant cell.*

----
## Contents

1. [Imports and Configuration](#imports-and-configuration)
2. [Chunk Processing](#chunk-processing)
3. [Main Function](#main-function)
4. [Adding the Audio File](#adding-the-audio-file)

### Imports and configuration

To begin, load the necessary imports.

In [None]:
import numpy as np
import torch
from datetime import datetime
from os import mkdir
from os.path import join
import os
import cv2
import csv
import torchaudio
from skimage import img_as_ubyte
from visualiserfunc_ import generate_interpolated_images
import sdtransitions_
from sdtransitions_ import generate_frames
from tqdm import tqdm  # For progress bar
from datetime import datetime
import librosa
import soundfile as sf
from jazzclassification import predict
import subprocess


Set the `audio_path` variable to the solo file that is being used. The rest of the configuration can be adjusted for experimentation.

In [12]:
# Configuration -- edit these values to experiment with a different solo.
audio_path = "exampleaudio.wav"  # Path of the solo recording to visualise
instruments = "instruments.csv"  # CSV file read by find_instrument
chunk_duration = 5  # Seconds of audio in each chunk the solo is split into
fps = 20  # Frames per second of the final output video

This cell defines the functions that prepare the audio for classification, along with the instrument lookup function.

In [13]:
# Processing functions
def split_audio(audio_path, chunk_duration=5):
    """Split an audio file into consecutive chunks saved as WAV files.

    The audio is loaded with librosa at its native sample rate (``sr=None``)
    and cut into pieces of ``chunk_duration`` seconds. The final piece holds
    whatever samples remain, so it may be shorter than the others. Each piece
    is written to the ``chunks`` directory as ``chunk_<n>.wav``, numbered
    from 1.

    Args:
        audio_path: Path of the audio file to split.
        chunk_duration: Length of each chunk in seconds. Must be positive.

    Returns:
        The chunk file paths in playback order, or an empty list if the
        audio file could not be loaded.

    Raises:
        ValueError: If ``chunk_duration`` is not positive, or is so small
            that a chunk would contain no samples.
    """
    # Reject bad durations up front: zero would otherwise surface as a
    # cryptic range() error and a negative value would silently yield [].
    if chunk_duration <= 0:
        raise ValueError(
            f"chunk_duration must be positive, got {chunk_duration!r}"
        )

    try:
        waveform, sample_rate = librosa.load(audio_path, sr=None)
    except Exception as e:
        # Best-effort: report the problem and let the caller see "no chunks".
        print(f"Error loading audio file: {e}")
        return []

    # Calculate the number of samples per chunk
    chunk_size = int(sample_rate * chunk_duration)
    if chunk_size < 1:
        raise ValueError(
            f"chunk_duration {chunk_duration!r} is shorter than one sample "
            f"at {sample_rate} Hz"
        )

    # Create output folder
    output_dir = "chunks"
    os.makedirs(output_dir, exist_ok=True)

    filenames = []

    # Split the audio into chunk_duration-second chunks
    for i, start in enumerate(range(0, len(waveform), chunk_size)):
        # Slicing clamps at the end of the waveform, so the last chunk is
        # simply shorter when the length is not a multiple of chunk_size.
        chunk = waveform[start:start + chunk_size]
        chunk_file = os.path.join(output_dir, f"chunk_{i+1}.wav")

        # Save chunk
        sf.write(chunk_file, chunk, sample_rate)
        filenames.append(chunk_file)
        print(f"Saved chunk {i+1} to {chunk_file}")

    return filenames

# Working from the instruments.csv file. Ensure this is within the same folder.
def find_instrument(artist_name, csv_path=None):
    """Look up the instrument listed for an artist in the instruments CSV.

    Each row is read as ``artist, instrument``: the first column is compared
    with ``artist_name`` exactly, and the second column of the first matching
    row is returned.

    Args:
        artist_name: Artist name to search for in the first column.
        csv_path: Optional path of the CSV file. When omitted, the
            module-level ``instruments`` path is used.

    Returns:
        The instrument from the first matching row, or ``None`` when no row
        matches.
    """
    path = instruments if csv_path is None else csv_path
    # newline='' is what the csv module asks for so that it can handle line
    # endings (including ones embedded in quoted fields) itself.
    with open(path, mode='r', newline='') as file:
        for row in csv.reader(file):
            # Blank lines parse as [] and malformed rows may have a single
            # column; skip them instead of failing with IndexError.
            if len(row) >= 2 and row[0] == artist_name:
                return row[1]
    return None

### Chunk Processing 

Here the function that processes each chunk of audio is set up. It makes the prediction and then sets up the Stable Diffusion prompts with the relevant predictions and instruments. Then the correct LoRA weights are loaded in and the frames are generated for both the blended images and the transition images. 

In [None]:
def _build_prompt(artist, instrument):
    """Return the image-generation prompt for an artist and their instrument.

    When no instrument is known (``None`` or empty) the prompt says
    "their instrument" instead of embedding the text "None".
    """
    playing = f'the {instrument}' if instrument else 'their instrument'
    return (
        f'{artist} playing {playing} in a jazzy  painting style with vivid '
        'colors and distinctive brush strokes. The images are bright and '
        'lively, with a focus on the artist and their instrument.'
    )


def process_chunk(filename):
    """Classify one audio chunk and generate the video frames for it.

    The chunk is loaded with torchaudio, resampled to 8 kHz and passed to
    ``predict`` (imported from ``jazzclassification``). The labels of the top
    two predictions select the instruments, the prompts and the LoRA weight
    files under ``weights/`` that are handed to the image and frame
    generators.

    Args:
        filename: Path of the audio chunk to process.

    Returns:
        The frames returned by ``generate_frames`` for this chunk.
    """
    waveform, sample_rate = torchaudio.load(filename)
    new_sample_rate = 8000
    transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=new_sample_rate)
    waveform = transform(waveform)

    # NOTE(review): assumes predict returns at least two (label, weight)
    # pairs, best first -- confirm against jazzclassification.
    predictions = predict(waveform)

    # Unpack top 2 predictions; only the labels are used below.
    prediction_1_label, _ = predictions[0]
    print(prediction_1_label)
    prediction_2_label, _ = predictions[1]

    # Find instruments for each prediction (None when the label is not in
    # the instruments CSV).
    instrument_1 = find_instrument(prediction_1_label)
    instrument_2 = find_instrument(prediction_2_label)

    # Create prompts for image generation
    prompt_1 = _build_prompt(prediction_1_label, instrument_1)
    print(prompt_1)
    prompt_2 = _build_prompt(prediction_2_label, instrument_2)

    # Generate images and frames
    model_id = "stabilityai/stable-diffusion-xl-base-1.0"
    lora1_path = f'weights/{prediction_1_label}_lora_weights.safetensors'
    lora2_path = f'weights/{prediction_2_label}_lora_weights.safetensors'
    adapter_names = ["adapter1", "adapter2"]
    prompts = [prompt_1, prompt_2]

    # Only the output directory is needed afterwards; the returned image
    # paths are not used here.
    output_dir, _image_paths = generate_interpolated_images(
        model_id, adapter_names, prompts, lora1_path, lora2_path)
    frames = generate_frames(output_dir, lora1_path, lora2_path, model_id, steps_per_image=20)

    return frames

This function takes the frames from every processed chunk and puts them together into a video. 

In [10]:

def frames_to_video(frames, output_path, fps=30):
    """Write a sequence of RGB frames to an MP4 video file.

    The frame size is taken from the first frame. Every frame is converted
    to a NumPy array and from RGB to BGR channel order before being written
    with OpenCV using the ``mp4v`` codec.

    Args:
        frames: Non-empty sequence of frames convertible with ``np.array``
            to height x width x channels arrays.
        output_path: Path of the video file to create.
        fps: Frames per second of the output video.

    Raises:
        ValueError: If ``frames`` is empty.
        IOError: If OpenCV could not open ``output_path`` for writing.
    """
    if not frames:
        raise ValueError("No frames to write to video.")

    height, width, _ = np.array(frames[0]).shape
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    # VideoWriter does not raise when it cannot open the file or codec; it
    # would just drop every frame, so fail loudly instead.
    if not video.isOpened():
        video.release()
        raise IOError(f"Could not open video writer for {output_path}")

    try:
        for frame in frames:
            video.write(cv2.cvtColor(np.array(frame), cv2.COLOR_RGB2BGR))
    finally:
        # Always finalise the file, even if a frame fails to convert.
        video.release()


### Main Function

Here all of the functions are brought together to create the visualiser video. An output directory is set up using a current time stamp. Then a for loop processes each 5 second chunk in order and the frames are appended to the ```all_frames``` list. The frames are then put together to create the video. 

In [None]:

def main():
    """Run the full pipeline: split the solo, process chunks, write the video.

    A timestamped directory is created under ``output_videos``. The audio at
    ``audio_path`` is split into chunks, each chunk is turned into frames by
    ``process_chunk``, and all frames are written in order to
    ``output_video.mp4`` inside that directory.

    Returns:
        The path of the video file that was written.
    """
    timestampStr = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = join('output_videos', timestampStr)

    # This creates an output directory that is labelled with the timestamp
    os.makedirs(output_dir, exist_ok=True)

    filenames = split_audio(audio_path, chunk_duration)

    all_frames = []
    # Iterate the list directly so tqdm can read its length and show real
    # progress; wrapping it in enumerate() hides the total.
    for chunk_file in tqdm(filenames, desc="Processing chunks", unit="chunk"):
        all_frames.extend(process_chunk(chunk_file))

    output_path = os.path.join(output_dir, 'output_video.mp4')
    frames_to_video(all_frames, output_path, fps=fps)
    print(f"Video saved at {output_path}")
    return output_path


if __name__ == "__main__":
    main()


### Adding the Audio File

Finally, the audio file is added to the visualisation video. 

In [None]:
# Locate the most recent visualisation video. main() writes each run to
# output_videos/<YYYYmmdd_HHMMSS>/output_video.mp4, so the timestamped
# directory names sort chronologically.
video_root = 'output_videos'
video_name = 'output_video.mp4'
run_dirs = sorted(
    d for d in os.listdir(video_root)
    if os.path.isfile(join(video_root, d, video_name))
)
if not run_dirs:
    raise FileNotFoundError(
        f"No {video_name} found under {video_root}; run main() first."
    )
video_path = join(video_root, run_dirs[-1], video_name)

# ffmpeg does not create missing directories, so make the timestamped
# output folder before writing into it.
timestampStr = datetime.now().strftime("%Y%m%d_%H%M%S")
output_path = join('fullvideo', timestampStr)
os.makedirs(output_path, exist_ok=True)
output_final = os.path.join(output_path, 'full.mp4')

# Copy the video stream unchanged and encode the solo as AAC audio.
# check=True raises CalledProcessError if ffmpeg fails instead of
# continuing silently without an output file.
subprocess.run([
    "ffmpeg", "-i", video_path, "-i", audio_path,
    "-c:v", "copy", "-c:a", "aac", output_final
], check=True)
