**A simple demo of the full pipeline can be found here:**

https://docs.google.com/document/d/1GgqUubmcImbhCanVGzHgqYh_MAVg9arwg04Koi2OLjM/edit?usp=sharing

In [None]:
# !pip install salesforce-lavis openai transformers chardet charset-normalizer

In [None]:
import openai

from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image

In [None]:
# The captioning model, its image processor and its tokenizer are all loaded
# from one checkpoint; naming it once keeps the three in sync.
CAPTION_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"

model = VisionEncoderDecoderModel.from_pretrained(CAPTION_CHECKPOINT)
feature_extractor = ViTImageProcessor.from_pretrained(CAPTION_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(CAPTION_CHECKPOINT)

In [None]:
# Generation settings forwarded to model.generate().
max_length = 16
num_beams = 4
gen_kwargs = dict(max_length=max_length, num_beams=num_beams)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

def predict_step(image_paths):
    """Caption every image in ``image_paths``.

    Args:
        image_paths: Iterable of image file paths (anything ``Image.open``
            accepts).

    Returns:
        A list of caption strings, one per input image and in the same
        order, each stripped of surrounding whitespace. An empty input
        yields an empty list without touching the model.
    """
    images = []
    for image_path in image_paths:
        # Image.open is lazy and keeps the file handle open; convert() forces
        # the pixel data to load and returns an independent copy, so the
        # handle can be released as soon as the ``with`` block exits.
        with Image.open(image_path) as source_image:
            images.append(source_image.convert(mode="RGB"))

    # Nothing to caption: skip preprocessing and generation entirely.
    if not images:
        return []

    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    output_ids = model.generate(pixel_values, **gen_kwargs)

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return [pred.strip() for pred in preds]

In [None]:
# Demo inputs: the style hint typed by the user and the picture to caption.
user_prompt = "traditional japanese music"
image_location = "image_Caption.png"

# predict_step takes a list of paths and returns a list of captions.
image_caption = predict_step(image_paths=[image_location])
print(image_caption)

In [None]:
def generate_prompt(image_caption, user_prompt=None):
    """Build the instruction sent to the chat model.

    Args:
        image_caption: Caption text describing the image.
        user_prompt: Optional extra music style to blend in. When ``None``,
            only the caption is used.

    Returns:
        The instruction string asking for a music style within 10 words.
    """
    # Identity test: ``== None`` would defer to a custom ``__eq__`` and is not
    # the idiomatic way to detect a missing argument.
    if user_prompt is None:
        return f'Give the music style for "{image_caption}" within 10 words'
    return f'Combine the music styles for "{image_caption}" and "{user_prompt}" within 10 words.'
    

In [None]:
# Build the chat-model instruction from the first caption and the user's hint.
prompt = generate_prompt(
    image_caption=image_caption[0],
    user_prompt=user_prompt,
)
print(prompt)

In [None]:
!curl https://api.openai.com/v1/models \
  -H "Authorization: Bearer PUT YOUR API KEY" \
  -H "OpenAI-Organization: PUT YOUR ORGANIZATION ID"
    
OPENAI_API_KEY = "PUT YOUR API KEY"

openai.organization = "PUT THE ORGANIZATION ID"
openai.api_key = OPENAI_API_KEY

def generate_prompt_riffusion(prompt):
    """Send ``prompt`` to the chat-completion endpoint and return the reply text.

    Args:
        prompt: Text sent as the single user message.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    # NOTE(review): openai.ChatCompletion looks like the pre-1.0 interface of
    # the openai package - confirm the installed version supports it.
    chat_messages = [{"role": "user", "content": prompt}]
    request_options = {"model": "gpt-4", "max_tokens": 1000, "temperature": 0}

    response = openai.ChatCompletion.create(messages=chat_messages, **request_options)

    first_choice = response.choices[0]
    return first_choice.message["content"]

In [None]:
# Ask the chat model for the music-style text; it is later used as the
# riffusion prompt.
rp = generate_prompt_riffusion(prompt=prompt)
print(rp)

In [None]:
# To run the remaining steps, first set up the environment as described in
# https://github.com/riffusion/riffusion

import json

# Request template for riffusion. The "start" and "end" prompts below are
# placeholders; both are overwritten with the generated prompt further down.
default_template = {
    "alpha": 0.75,
    "num_inference_steps": 50,
    "seed_image_id": "og_beat",

    "start": {
        "prompt": "church bells on sunday",
        "seed": 42,
        "denoising": 0.75,
        "guidance": 7.0,
    },

    "end": {
        "prompt": "jazz with piano",
        "seed": 123,
        "denoising": 0.75,
        "guidance": 7.0,
    },
}

default_template["start"]["prompt"] = rp
# Copy rather than alias: "end" carries the same values as "start" but is an
# independent dict, so later edits to one no longer silently change the other.
default_template["end"] = dict(default_template["start"])

# Emit real JSON (double-quoted keys and strings); print() of a dict gives the
# Python repr, which a JSON parser rejects.
print(json.dumps(default_template, indent=2))
      
# You can then feed this json file into the riffusion server to get the required audio output.