In [30]:
import base64
from datetime import datetime
import gradio as gr
from google import genai
from google.genai import types
import io
from ipywidgets import Output
import ipywidgets as widgets
from IPython.display import Image, display, Audio, clear_output, HTML, Javascript
from kokoro import KPipeline
import librosa
import math
import numpy as np
import os
import os.path
from PIL import Image as PILImage
import random
import requests 
import shutil
import subprocess
import soundfile as sf
import time
from transformers import AutoModelForCausalLM, AutoProcessor, BarkModel, BitsAndBytesConfig
import torch
import threading
# Take the Gemini API key from the GEMINI_API_KEY environment variable so the
# real secret never has to be saved inside the notebook. The original literal
# is kept as the fallback, so nothing changes when the variable is not set.
client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY", "GEMINI"))

In [31]:
# Backend switch for generate_text():
#   True  -> the local vision model loaded further down in this cell is used
#   False -> the screenshots are sent to Gemini through `client`
image_local = False
# Start of the wall-clock measurement reported by the print() at the end of the cell.
start_time = time.time()

# Shared instruction suffix: appended to `system_prompt` below and to every
# user prompt that generate_text() builds.
ending = ("Do not at all mention any specific photo editing elements or tools that may be visible on the screen, "
         "such as overlays, gridlines or sliders. To adjust intonation, please add dedicated punctuation like ; : , . ! ? … ( ) “ ” "
         "For example, to emphasize a word or a phrase, surround it with \"quotation marks\". ")

# Persona / behaviour instructions passed as the system message (local model)
# or as the system instruction (Gemini) in generate_text().
system_prompt = ("You are a friendly chatty photo commentator who likes to casually describe work done by a photographer " 
         "in various details, even by pondering the implications on where and in what kind of setting the photo was taken, etc. Write your " 
         "response in a very personal way using personal pronouns and explaining what you see, perhaps also adding how it makes you feel. " 
         "Do your best to not be repetative in your choice of words and keep the response length down to a few sentences. You MUST NOT mention "
         "any specific photo editing elements or tools that may be visible on the screen, such as gridlines or sliders. ")

system_prompt += ending

# Text-to-speech pipeline that runss_loop() hands to generate_audio().
# NOTE(review): lang_code='a' is presumably kokoro's American English setting - confirm.
pipeline = KPipeline(lang_code='a')

if image_local:
    model_id = "microsoft/Phi-3.5-vision-instruct" 
    # Load the weights 4-bit quantized through bitsandbytes.
    quantization_config = BitsAndBytesConfig(load_in_4bit=True)
    
    # Note: set _attn_implementation='eager' if you don't have flash_attn installed
    model = AutoModelForCausalLM.from_pretrained(
        model_id, 
        device_map="cuda", 
        trust_remote_code=True, 
        quantization_config=quantization_config,
        torch_dtype="auto", 
        _attn_implementation='flash_attention_2'    
    )
    
    # for best performance, use num_crops=4 for multi-frame, num_crops=16 for single-frame.
    processor = AutoProcessor.from_pretrained(model_id, 
      trust_remote_code=True, 
      num_crops=4
    ) 
    
    # Sampling settings; generate_text() reads this global and unpacks it into
    # model.generate() on the local path. It is only defined in this branch.
    generation_args = { 
        "max_new_tokens": 200, 
        "temperature": 0.2, 
        "do_sample": True, 
    }
else:
    # Gemini path: nothing is loaded locally; runss_loop() still passes these
    # two names to generate_text(), so they must exist.
    model = processor = None
    
end_time = time.time()
print("Loading finished in " + str(round(end_time - start_time, 2)) + " seconds")



  WeightNorm.apply(module, name, dim)


Loading finished in 2.34 seconds


In [32]:
def _load_cropped_png(path, box):
    """Open the image at `path`, crop it to `box` and encode the crop as PNG.

    Returns a ``(cropped_image, png_bytes)`` tuple: the cropped PIL image (fed
    to the local processor) and the same crop serialized as PNG bytes (sent to
    Gemini).
    """
    # Using the opened image as a context manager releases the file handle as
    # soon as the crop has been taken; the original code never closed it.
    with PILImage.open(path) as source_image:
        cropped = source_image.crop(box)
        # Make sure the pixel data is in memory before the source is closed.
        cropped.load()

    buffer = io.BytesIO()
    cropped.save(buffer, format='PNG')
    return cropped, buffer.getvalue()


def generate_text(local, system_prompt, file1, file2=None, model=None, processor=None):
    """Describe one screenshot, or compare it against a second one.

    Args:
        local: truthy -> run the local `model`/`processor` pair (uses the
            global `generation_args`); falsy -> call Gemini through the global
            `client`.
        system_prompt: system message / system instruction for the model.
        file1: path of the current screenshot.
        file2: optional path of the previous screenshot. When given, the user
            prompt asks for the differences between the two images.
        model: generation model, only used when `local` is truthy.
        processor: matching processor, only used when `local` is truthy.

    Returns:
        The generated text (for Gemini this is ``response.text``).

    Side effects:
        Appends the elapsed time to the global `logbox` widget.
    """
    start_time = time.time()

    # (left, top, right, bottom) crop applied to every screenshot.
    # NOTE(review): presumably the photo area of the editing window - confirm.
    crop_box = (25, 170, 2090, 1450)

    paths = [file1] if file2 is None else [file1, file2]

    images = []        # cropped PIL images for the local processor
    png_payloads = []  # the same crops as PNG bytes for Gemini
    placeholder = ""   # "<|image_N|>" tags expected by the local chat template
    for index, path in enumerate(paths, start=1):
        cropped, png_bytes = _load_cropped_png(path, crop_box)
        images.append(cropped)
        png_payloads.append(png_bytes)
        placeholder += f"<|image_{index}|>\n"

    if file2 is None:
        user_prompt = ("Summarize what is visible in this photo. " + ending)
    else:
        user_prompt = ("Summarize what is visible in the current photo (the first one). " +
             "How is it different from the previous photo (the second one)? There may be some subtle differences as well. " + ending)

    if local:
        messages = [
            {"role": "system", "content": system_prompt,},
            {"role": "user", "content": placeholder + user_prompt},
        ]

        prompt = processor.tokenizer.apply_chat_template(
          messages,
          tokenize=False,
          add_generation_prompt=True
        )

        inputs = processor(prompt, images, return_tensors="pt").to("cuda:0")

        generate_ids = model.generate(**inputs,
          eos_token_id=processor.tokenizer.eos_token_id,
          **generation_args
        )

        # remove input tokens
        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
        response = processor.batch_decode(generate_ids,
          skip_special_tokens=True,
          clean_up_tokenization_spaces=False)[0]

        end_time = time.time()
        logbox.append_stdout("Generating text finished in " + str(round(end_time - start_time, 2)) + " seconds")
        return response

    # Gemini: the text prompt followed by one image part per screenshot.
    # Building the parts from `png_payloads` means a missing second image no
    # longer raises (the original always referenced the second image's bytes).
    contents = [user_prompt]
    for png_bytes in png_payloads:
        contents.append(types.Part.from_bytes(data=png_bytes, mime_type='image/png'))

    response = client.models.generate_content(
        model="gemini-2.0-flash",
        config=types.GenerateContentConfig(system_instruction = system_prompt),
        contents=contents
    )

    end_time = time.time()
    logbox.append_stdout("Generating text finished in " + str(round(end_time - start_time, 2)) + " seconds")
    return response.text

In [33]:
def take_screenshot():
    """Rotate the last capture to previous_photo.png and take a new screenshot.

    If current_photo.png already exists it is copied to previous_photo.png.
    nircmd.exe is then invoked (addressed through the /mnt/c path) with a
    2000 'cmdwait' before 'savescreenshot', writing to the Windows-style path
    of current_photo.png.
    """
    windows_dir = "C:\\Users\\matis\\OneDrive\\Desktop\\script-it\\"
    linux_dir = "/mnt/c/Users/matis/OneDrive/Desktop/script-it/"
    current_name = "current_photo.png"

    current_path = linux_dir + current_name
    if os.path.isfile(current_path):
        previous_path = linux_dir + current_name.replace("current_photo", "previous_photo")
        shutil.copyfile(current_path, previous_path)

    # The executable is addressed via the Linux mount, but the output path is
    # given in Windows notation because nircmd.exe is the one that writes it.
    command = [
        linux_dir + 'nircmd.exe',
        'cmdwait',
        '2000',
        'savescreenshot',
        windows_dir + 'current_photo.png',
    ]
    subprocess.call(command)


In [34]:
def decide_gif(text):
    """Pick the reaction GIFs to show while `text` is being spoken.

    The text is lower-cased and scanned for keyword fragments (plain substring
    matches, so "shin" also matches "shine"/"shining"). Every keyword group
    that matches contributes its GIF, and one randomly chosen "talking" GIF is
    always appended.

    Args:
        text: the sentence that is about to be spoken.

    Returns:
        A list of unique GIF paths in a stable order: matched mood GIFs in the
        order of the groups below, followed by the talking GIF. (The original
        de-duplicated through ``set``, which made the display order depend on
        string hashing and therefore vary between kernels.)
    """
    lowered = text.lower()

    # (keyword fragments, GIF stem) - order here is the order of display.
    # NOTE(review): the "interest/think/wonder" group maps to the same
    # "lovely" GIF as the group before it, exactly as in the original; a
    # dedicated GIF may have been intended.
    keyword_gifs = (
        (("hello", "greet", "waving", "waves"), "waving"),
        (("scar", "creep", "fright", "spook"), "scary"),
        (("love", "cute", "nice", "like"), "lovely"),
        (("interest", "think", "wonder", "thought"), "lovely"),
        (("happy", "cheer", "inspir", "shin"), "happy"),
    )
    talking = ["gtalking_bg.gif","talking_bg.gif","talking2_bg.gif","talking3_bg.gif","ctalking_bg.gif"]

    outputs = [
        "gifs/" + stem + "_bg.gif"
        for fragments, stem in keyword_gifs
        if any(fragment in lowered for fragment in fragments)
    ]
    outputs.append("gifs/" + random.choice(talking))

    # dict.fromkeys() removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(outputs))

In [35]:
def img_to_data_uri(path):
    """Return the file at `path` as a ``data:image/<ext>;base64,...`` URI.

    `<ext>` is whatever follows the last "." in `path`; the file content is
    read as bytes and base64-encoded.
    """
    with open(path, "rb") as image_file:
        encoded = base64.b64encode(image_file.read()).decode("utf-8")

    extension = path.rsplit(".", 1)[-1]
    return "data:image/" + extension + ";base64," + encoded

        
def fade_to_local_image(path):
    """Swap the displayed <img> (element id `img_id`) to the local file `path`.

    The file is inlined as a data URI and a small script is pushed into the
    `javscr` output widget: it sets the element's opacity to 0, then 50 ms
    later sets the new source and restores the opacity to 1. The change is
    also recorded in `logbox`.
    """
    data_uri = img_to_data_uri(path)

    script = f"""
    var img = document.getElementById('{img_id}');
    if (img) {{
        img.style.opacity = 0;
        setTimeout(function() {{
            img.src = '{data_uri}';
            img.style.opacity = 1;
        }}, 50);
    }}
    """

    # Drop the previously injected script before appending the new one.
    get_rid(javscr)
    javscr.append_display_data(Javascript(script))

    logbox.append_display_data(f"Changing image to '{path}' for tag '{img_id}'")

In [36]:
def generate_audio(pipeline, text):
    """Speak `text` segment by segment while cycling reaction GIFs.

    Args:
        pipeline: callable invoked as ``pipeline(text, voice=..., speed=1,
            split_pattern=r'\\n+')`` that yields 3-item results; the first
            item is shown as text and the third is played as audio.
        text: the commentary to speak.

    Side effects:
        Updates the `output_audio`, `textbox` and `logbox` widgets, switches
        the displayed GIF, sleeps for the length of every segment plus two
        seconds, and adds each of those waits to the global `countdown_state`.
    """
    global countdown_state
    started = time.time()

    sample_rate = 24000
    dance_gifs = ["xdancing_bg.gif","dancing_bg.gif","singing_bg.gif"]

    fade_to_local_image("gifs/" + random.choice(dance_gifs))

    # The model is prompted with "first"/"second" image; the spoken text talks
    # about the "current"/"previous" one instead.
    wording = (
        ("first photo", "current photo"),
        ("second photo", "previous photo"),
        ("first one", "current one"),
        ("second one", "previous one"),
    )
    for old, new in wording:
        text = text.replace(old, new)

    # Blend the two stored voices: 70% af_nicole, 30% jf_alpha.
    nicole_voice = torch.load('af_nicole.pt', weights_only=True)
    alpha_voice = torch.load('jf_alpha.pt', weights_only=True)
    blend = 0.3
    interp_voice = (1 - blend) * nicole_voice + blend * alpha_voice

    segments = pipeline(text, voice=interp_voice, speed=1, split_pattern=r'\n+')

    logbox.append_stdout(f"Generating speech finished in {round(time.time() - started, 2)} seconds")

    for sentence, _ps, audio in segments:
        # Whole seconds of audio, plus a two second margin.
        remaining = int(math.ceil(librosa.get_duration(y=audio, sr=sample_rate))) + 2
        countdown_state += remaining
        player = Audio(data=audio, rate=sample_rate, autoplay=True)

        # Remove the previously displayed audio and text
        output_audio.clear_output()
        textbox.clear_output()
        textbox.outputs = []
        output_audio.outputs = []

        pending_gifs = decide_gif(sentence)

        output_audio.append_display_data(player)
        textbox.append_stdout(sentence)

        # Show the next pending GIF (if any) at the start of every slice of
        # at most ten seconds until the segment has finished playing.
        while remaining > 0:
            if pending_gifs:
                fade_to_local_image(pending_gifs.pop(0))

            nap = min(remaining, 10)
            time.sleep(nap)
            remaining -= nap

    # Revert back to the base image
    output_audio.clear_output()
    textbox.clear_output()
    textbox.outputs = []
    output_audio.outputs = []
    logbox.clear_output()

    fade_to_local_image("gifs/" + random.choice(dance_gifs))

def get_rid(widget):
    """Empty an output widget.

    Calls ``widget.clear_output()`` and then resets ``widget.outputs`` to an
    empty list, so neither the rendered content nor the stored outputs remain.

    Args:
        widget: any object with a ``clear_output()`` method and an ``outputs``
            attribute (the notebook passes ``widgets.Output`` instances).
    """
    widget.clear_output()
    widget.outputs = []

In [37]:
# Shared state between the button callbacks and the two background threads.
# loop_flag: set to True by starts_loop(), to False by stops_loop(); polled by
#            runss_loop() and run_timer() to know when to stop.
# countdown_state: the number shown in the `timer` output; run_timer()
#            decrements it once per iteration, the loop functions reset it.
loop_flag = False
countdown_state = 11

# Control buttons; their click handlers are attached at the end of this cell.
btn_start = widgets.Button(description="Loop")
btn_stopp = widgets.Button(description="Stop")
btn_waves = widgets.Button(description="Wave")
btn_looks = widgets.Button(description="Look")
btn_dance = widgets.Button(description="Dance")
btn_wait = widgets.Button(description="Wait")
# Output areas, in the order they are displayed below the button row:
# the character image, the audio player, the spoken text and the log.
output_image = widgets.Output(layout={'height': '550px'})
output_audio = widgets.Output(layout={'height': '40px'})
# Shows countdown_state next to the buttons.
timer = widgets.Output()
# Receives the Javascript snippets emitted by fade_to_local_image().
javscr = widgets.Output()
textbox = widgets.Output(layout={'height': '100px'})
logbox = widgets.Output(layout={'height': '100px'})

display(widgets.HBox((btn_start, btn_stopp, btn_waves, btn_looks, btn_dance, btn_wait, timer)), output_image, output_audio, textbox, logbox, javscr)

# Show the initial image
# img_id is the DOM id that fade_to_local_image() looks up when swapping GIFs.
img_id = "my_fading_img"
initial_uri = img_to_data_uri("gifs/waiting_bg.gif")

# The CSS opacity transition on the <img> is what turns the opacity changes
# made by fade_to_local_image() into a fade.
html = f"""
<div>
  <img id="{img_id}" src="{initial_uri}" style="transition: opacity 1s ease-in-out; opacity: 1; max-width: 100%;">
</div>
"""
output_image.append_display_data(HTML(html))

def runss_loop(time_string):
    """Worker loop: screenshot -> describe -> log -> speak, until stopped.

    Runs while the global `loop_flag` is true (it is cleared by stops_loop()).
    Every pass takes a screenshot, asks generate_text() for a description
    (comparing with the previous screenshot when that file exists), appends
    the text to ``logs/<time_string>log.txt`` and speaks it through
    generate_audio(). If speaking took less than 45 seconds, the remainder is
    waited out before the next pass.

    Args:
        time_string: prefix for the log file name (starts_loop() passes a
            timestamp).
    """
    global loop_flag
    global countdown_state

    fade_to_local_image("gifs/waving_bg.gif")

    filename1 = "./current_photo.png"
    filename2 = "./previous_photo.png"

    # open(..., "a") creates the file but not its directory; without this the
    # worker thread died with FileNotFoundError on a fresh checkout.
    os.makedirs("logs", exist_ok=True)
    log_path = "logs/" + time_string + "log.txt"

    while loop_flag:
        with open(log_path, "a") as logfile:
            countdown_state = 10
            get_rid(textbox)

            take_screenshot()
            # Only compare against the previous screenshot if there is one.
            previous = filename2 if os.path.isfile(filename2) else None
            text = generate_text(image_local, system_prompt, filename1, previous, model, processor)

            logfile.write(text.replace("\n", "") + "\n\n")

            start_time = time.time()
            generate_audio(pipeline, text)
            # How much time passed while the audio was playing?
            elapsed_time = time.time() - start_time

            if elapsed_time < 45.00:
                # Keep the wait in a local: run_timer() decrements the global
                # from another thread, so re-reading it could shorten the sleep.
                wait_seconds = int(45 - elapsed_time)
                countdown_state = wait_seconds
                logbox.append_stdout("Waiting " + str(wait_seconds) + " seconds...")
                time.sleep(wait_seconds)
        # The `while loop_flag` test above ends the loop once Stop was pressed.

                
def stops_loop(b):
    """Click handler for the Stop button.

    Clears the timer, text and audio outputs, lowers `loop_flag` so the
    background loops stop at their next check, resets the countdown to 0 and
    prints "Game Over" plus the countdown value.

    Args:
        b: the clicked button (unused).
    """
    global loop_flag
    global countdown_state

    for widget in (timer, textbox):
        get_rid(widget)

    loop_flag = False
    countdown_state = 0

    output_audio.clear_output()

    with textbox:
        print("Game Over")

    with timer:
        print(countdown_state)

def starts_loop():
    """Start the commentary loop in a background thread unless it is running.

    Returns:
        "Loop started." when a new runss_loop() thread was launched,
        "Loop already running." when `loop_flag` was already set.
    """
    global loop_flag

    if loop_flag:
        return "Loop already running."

    loop_flag = True

    # Timestamp prefix that runss_loop() uses for its log file name.
    time_string = datetime.now().strftime("%Y.%m.%d-%H.%M.")

    worker = threading.Thread(target=runss_loop, args=[time_string])
    worker.start()
    return "Loop started."

def run_timer(timer):
    """Show and decrement the countdown once per second while the loop runs.

    On every iteration the widget's outputs are replaced by the current
    `countdown_state`, which is then decreased by one. The function returns
    as soon as `loop_flag` is found to be false.

    Args:
        timer: output widget that displays the countdown value.
    """
    global loop_flag
    global countdown_state

    while loop_flag:
        timer.outputs = []
        timer.append_display_data(countdown_state)

        countdown_state = countdown_state - 1

        # Skip the final one second pause when the loop was stopped meanwhile.
        if loop_flag:
            time.sleep(1)
        else:
            return

def update_timer(b):
    """Click handler for the Loop button.

    Starts the commentary loop through starts_loop(), prints its status
    message into `textbox` and, only when the loop was actually started,
    launches the run_timer() countdown thread.

    The original launched a new countdown thread on every click, so clicking
    "Loop" while the loop was already running left several threads
    decrementing `countdown_state` at once.

    Args:
        b: the clicked button (unused).
    """
    status = starts_loop()
    with textbox:
        print(status)

    # starts_loop() returns exactly this message when it launched the worker.
    if status == "Loop started.":
        threading.Thread(target=run_timer, args=[timer]).start()

def _show_gif(gif_name):
    """Fade the displayed image to `gif_name` inside the gifs/ folder."""
    fade_to_local_image("gifs/" + gif_name)


def dance(e):
    """Click handler for the Dance button: show the dancing GIF."""
    _show_gif("xdancing_bg.gif")


def look(e):
    """Click handler for the Look button: show the looking GIF."""
    _show_gif("looking_bg.gif")


def wave(e):
    """Click handler for the Wave button: show the hello GIF."""
    _show_gif("hello_bg.gif")


def wait(e):
    """Click handler for the Wait button: show the waiting GIF."""
    _show_gif("waiting_bg.gif")
    

# Attach every button to its click handler (same registration order as before).
for _button, _handler in (
    (btn_start, update_timer),
    (btn_stopp, stops_loop),
    (btn_dance, dance),
    (btn_looks, look),
    (btn_waves, wave),
    (btn_wait, wait),
):
    _button.on_click(_handler)

# Show the initial countdown value next to the buttons.
with timer:
    print(countdown_state)

HBox(children=(Button(description='Loop', style=ButtonStyle()), Button(description='Stop', style=ButtonStyle()…

Output(layout=Layout(height='550px'))

Output(layout=Layout(height='40px'))

Output(layout=Layout(height='100px'))

Output(layout=Layout(height='100px'))

Output()