In [17]:
from PIL import Image 
import requests 
from transformers import AutoModelForCausalLM, AutoProcessor, BarkModel, BitsAndBytesConfig
import subprocess
import scipy
import os
import shutil
import os.path
from nltk.tokenize import sent_tokenize
import time
from IPython.display import display, Audio
from kokoro import KPipeline
import soundfile as sf
import torch
import numpy as np
import librosa
import math


In [18]:
model_id = "microsoft/Phi-3.5-vision-instruct" 
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# Note: set _attn_implementation='eager' if you don't have flash_attn installed
model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    device_map="cuda", 
    trust_remote_code=True, 
    quantization_config=quantization_config,
    torch_dtype="auto", 
    _attn_implementation='flash_attention_2'    
)

# for best performance, use num_crops=4 for multi-frame, num_crops=16 for single-frame.
processor = AutoProcessor.from_pretrained(model_id, 
  trust_remote_code=True, 
  num_crops=4
) 

generation_args = { 
    "max_new_tokens": 500, 
    "temperature": 0.0, 
    "do_sample": False, 
} 

system_prompt = ("You are a friendly chatty photo commentator who likes to casually describe work done by a photographer " 
         "in various details, even by pondering the implications on where and in what kind of setting the photo was taken, etc. Write your " 
         "response in a very personal way using personal pronouns and explaining what you see, perhaps also adding how it makes you feel. " 
         "Do your best to not be repetative in your choice of words and keep the response length down to a few sentences. "
         "To adjust intonation, please add dedicated punctuation like ; : , . ! ? … ( ) “ ” or stress ˈ and ˌ . "
         "For more dramatic effects use symbols such as — or … for hesitations, and word capitalization for more emphasis.")

pipeline = KPipeline(lang_code='a')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]





  WeightNorm.apply(module, name, dim)


In [19]:
def generate_text_one(model, processor, system_prompt, file): #"/mnt/c/Users/matis/OneDrive/Desktop/script-it/shot2.png"
    lindir = "/mnt/c/Users/matis/OneDrive/Desktop/script-it/"

    images = []
    placeholder = ""
    
    local = open(lindir+file, 'rb')
    openLocalImage = Image.open(local)
    
    # Setting the points for cropped image
    left = 25
    top = 170
    right = 2090
    bottom = 1450
     
    # Cropped image of above dimension
    croppedImage = openLocalImage.crop((left, top, right, bottom))
    
    images = []
    placeholder = ""
    images.append(croppedImage)
    placeholder += f"<|image_1|>\n"
    
    messages = [
        {"role": "system", "content": system_prompt,},
        {"role": "user", "content": placeholder+"Summarize what is visible in this photo. " + 
         "To adjust intonation, please add dedicated punctuation like ; : , . ! ? … ( ) “ ” or stress ˈ and ˌ . " + 
         "For more dramatic effects use symbols such as — or … for hesitations, and word capitalization for more emphasis."},
    ]
    
    prompt = processor.tokenizer.apply_chat_template(
      messages, 
      tokenize=False, 
      add_generation_prompt=True
    )
    
    inputs = processor(prompt, images, return_tensors="pt").to("cuda:0") 
    
    generate_ids = model.generate(**inputs, 
      eos_token_id=processor.tokenizer.eos_token_id, 
      **generation_args
    )
    
    # remove input tokens 
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(generate_ids, 
      skip_special_tokens=True, 
      clean_up_tokenization_spaces=False)[0] 

    return response

def generate_text_two(model, processor, system_prompt, file1, file2): #"/mnt/c/Users/matis/OneDrive/Desktop/script-it/shot2.png"
    lindir = "/mnt/c/Users/matis/OneDrive/Desktop/script-it/"

    images = []
    placeholder = ""
    
    local1 = open(lindir+file1, 'rb')
    local2 = open(lindir+file2, 'rb')
    openLocalImage1 = Image.open(local1)
    openLocalImage2 = Image.open(local2)

    
    # Compare two images pixel by pixel
    img1 = np.array(openLocalImage1)
    img2 = np.array(openLocalImage2)
    
    # Check if the shapes are the same and pixels are identical
    if img1.shape == img2.shape and np.all(img1 == img2):
        identical = True
    else:
        identical = False
    
    # Setting the points for cropped image
    left = 25
    top = 170
    right = 2090
    bottom = 1450
     
    # Cropped image of above dimension
    croppedImage1 = openLocalImage1.crop((left, top, right, bottom))
    croppedImage2 = openLocalImage2.crop((left, top, right, bottom))
    
    images = []
    placeholder = ""
    images.append(croppedImage1)
    images.append(croppedImage2)
    placeholder += f"<|image_1|>\n"
    placeholder += f"<|image_2|>\n"

    if not identical:
        messages = [
            {"role": "system", "content": system_prompt,},
            {"role": "user", "content": placeholder+"Summarize what is visible in the current photo (the first one). " + 
             "How is it different from the previous photo (the second one)? There may be some subtle differences as well. " + 
             "To adjust intonation, please add dedicated punctuation like ; : , . ! ? … ( ) “ ” or stress ˈ and ˌ . " + 
             "For more dramatic effects use symbols such as — or … for hesitations, and word capitalization for more emphasis."},
        ]
    else:
        messages = [
            {"role": "system", "content": system_prompt,},
            {"role": "user", "content": placeholder+"Summarize what is visible in the current photo (the first one). " + 
             "It seems like he is still working on the same photo or took a break from editing. " + 
             "To adjust intonation, please add dedicated punctuation like ; : , . ! ? … ( ) “ ” or stress ˈ and ˌ . " + 
             "For more dramatic effects use symbols such as — or … for hesitations, and word capitalization for more emphasis."},
        ]
    
    prompt = processor.tokenizer.apply_chat_template(
      messages, 
      tokenize=False, 
      add_generation_prompt=True
    )
    
    inputs = processor(prompt, images, return_tensors="pt").to("cuda:0") 
    
    generate_ids = model.generate(**inputs, 
      eos_token_id=processor.tokenizer.eos_token_id, 
      **generation_args
    )
    
    # remove input tokens 
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(generate_ids, 
      skip_special_tokens=True, 
      clean_up_tokenization_spaces=False)[0] 

    return response

In [20]:
def take_screenshot():
    windir = "C:\\Users\\matis\\OneDrive\\Desktop\\script-it\\"
    lindir = "/mnt/c/Users/matis/OneDrive/Desktop/script-it/"

    name = "shot.png"
    if os.path.isfile(lindir+name):
        shutil.copyfile(lindir+name, lindir+name.replace("shot","shot_prev"))
    
    subprocess.call(['/mnt/c/Users/matis/OneDrive/Desktop/script-it/nircmd.exe', 'cmdwait', '2000', 'savescreenshot', 
                     windir+'shot.png'])


In [40]:
def generate_audio(pipeline, text):

    text = text.replace("first photo", "current photo")
    text = text.replace("second photo", "previous photo")

    generator = pipeline(text, voice='af_nicole')
    for i, (gs, ps, audio) in enumerate(generator):
        print(i, gs)
        # sf.write(f'{str(i)+letter}.wav', audio, 24000)
        duration = math.ceil(librosa.get_duration(y=audio, sr=24000))
        audio_data = Audio(data=audio, rate=24000, autoplay=True)
        display(audio_data)
        time.sleep(duration)
        


In [18]:
while True:
    filename1 = "shot.png"
    filename2 = "shot_prev.png"
    lindir = "/mnt/c/Users/matis/OneDrive/Desktop/script-it/"
    
    take_screenshot()
    if os.path.isfile(lindir+filename2):
        text = generate_text_two(model, processor, system_prompt, filename1, filename2)
    else:
        text = generate_text_one(model, processor, system_prompt, filename1)
    
    generate_audio(pipeline, text)

    print("Waiting...")
    time.sleep(35) 

0 The current photo captures a serene scene of a sleek, modern yacht gently gliding on the water's surface. The yacht is positioned centrally in the frame, with the calm water reflecting the clear blue sky and the towering skyscrapers in the background. The architecture of the buildings is contemporary, with clean lines and a minimalist design. The sky is a canvas of soft clouds, adding a dreamy quality to the image. The overall atmosphere is one of tranquility and luxury.
1 In contrast, the previous photo depicts a bustling scene on a yacht. The yacht is adorned with a white canopy and is surrounded by a group of people, some of whom are seated and others standing, all appearing to be enjoying the day. The water is choppy, indicating movement and activity. The background shows a construction site with scaffolding and a building under construction, suggesting a more urban and dynamic environment. The sky is overcast, giving the image a more dramatic and vibrant feel.
2 The current phot

KeyboardInterrupt: 

In [33]:
duration = 0

generator = pipeline("Ah yes, now this makes a lot more sense ... I feel like the duration is accurate now.", voice='af_nicole')
for i, (gs, ps, audio) in enumerate(generator):
    duration = math.ceil(librosa.get_duration(y=audio, sr=24000))
    print(duration)
    two = Audio(data=audio, rate=24000, autoplay=True)
    display(two)
time.sleep(duration)

generator = pipeline("Here we go again for round two now!!", voice='af_nicole')
for i, (gs, ps, audio) in enumerate(generator):
    duration = math.ceil(librosa.get_duration(y=audio, sr=24000))
    print(duration)
    two = Audio(data=audio, rate=24000, autoplay=True)
    display(two)

8


4


In [41]:
import gradio as gr
import threading

# Shared state
loop_flag = {"running": False}
countdown_state = {"time_left": 0}

def loop_task():
    while loop_flag["running"]:
        filename1 = "shot.png"
        filename2 = "shot_prev.png"
        lindir = "/mnt/c/Users/matis/OneDrive/Desktop/script-it/"
        
        take_screenshot()
        if os.path.isfile(lindir+filename2):
            text = generate_text_two(model, processor, system_prompt, filename1, filename2)
        else:
            text = generate_text_one(model, processor, system_prompt, filename1)
        
        generate_audio(pipeline, text)
        countdown_state["time_left"] = 40
        while countdown_state["time_left"] > 0:
            time.sleep(1)
            countdown_state["time_left"] -= 1
    loop_flag["running"] = False

def start_loop():
    if not loop_flag["running"]:
        loop_flag["running"] = True
        thread = threading.Thread(target=loop_task)
        thread.start()
        return "Loop started."
    return "Loop already running."

def stop_loop():
    loop_flag["running"] = False
    return "Loop stopped."

def get_countdown():
    return f"{countdown_state['time_left']} seconds remaining"

def poll_countdown():
    # Keeps polling until the countdown ends
    result = []
    while loop_flag["running"]:
        result.append(gr.Textbox.update(value=get_countdown()))
        time.sleep(1)
    result.append(gr.Textbox.update(value=get_countdown()))
    return result


def update_textbox():
    start_loop()
    while loop_flag["running"]:
        for i in range(41):
            yield gr.update(value=str(i))
            time.sleep(1)

with gr.Blocks() as demo:
    countdown_display = gr.Textbox(label="Countdown", interactive=False)

    with gr.Row():
        start_btn = gr.Button("Start Loop")
        stop_btn = gr.Button("Stop Loop")


    stop_btn.click(stop_loop, outputs=status)
    
    text = gr.Textbox(label="Remaining time in secs", lines=1, interactive=True)
    start_btn.click(fn=update_textbox, outputs=text)

demo.launch()

* Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.






0 The image captures a serene scene of a sleek yacht gently cutting through the water, with the cityscape of Miami Beach in the background. The yacht is positioned centrally, moving towards the right side of the frame, creating a sense of motion. The water's surface is dotted with ripples, reflecting the yacht's journey. The sky is a canvas of soft clouds, suggesting a calm day. The buildings in the background are tall and modern, with a distinctive architectural style that is characteristic of Miami's urban landscape.


1 The photo is taken from a distance, allowing for a clear view of the yacht and the cityscape. The lighting is natural, indicating that the photo was taken during the day. The perspective is from the water, looking towards the shore, giving a sense of depth and scale to the scene. The overall composition is balanced, with the yacht and cityscape occupying their own space within the frame. The image is a harmonious blend of nature and urban elements, creating a picturesque moment frozen in time.


0 The current photo captures a vibrant cityscape at night, illuminated by the glow of a Ferris wheel and other city lights. The Ferris wheel stands out with its bright green and white lights, creating a striking contrast against the dark sky. The city skyline is visible in the background, with tall buildings and skyscrapers reflecting off the water's surface. The water itself is calm, with gentle ripples visible on the surface.
