Xunkar's AI service rework updated (libretro#15640)

* AI service rework * File missing * Fixed C89 build * Fixed usage of inline for C89 build * Fixed an overlay unloading bug Made sure to unload the overlay on release and when the server returns empty values in automatic modes. * Fixed forward decl (c89) * Fixed OpenGL texture loading Moved image display to the main thread for now * Changed some formatting slightly * Fixed struct variable order and put brackets on newlines * Moved pointer, fixed retroarch.cfg, and replaced strlcat with strlcpy * Fixed catenation issue * Fixed a few other catenation issues * Fixed one more concatenation issue * Fixed concatenation issue * Fixed a few other concatenation issues * Fixed one more concatenation issue * potential fix for parsing issue --------- Co-authored-by: Xunkar <329857+xunkar@users.noreply.github.com>
JoeOsborn · Nov 2, 2023 · 8fe9583 · 8fe9583
1 parent 00df26d
commit 8fe9583
Show file tree

Hide file tree

Showing 15 changed files with 1,924 additions and 966 deletions.
diff --git a/accessibility.h b/accessibility.h
@@ -31,11 +31,36 @@
 #endif
 
 #include "configuration.h"
+#include "tasks/tasks_internal.h"
+
+#ifdef HAVE_THREADS
+#include "rthreads/rthreads.h"
+#endif
 
 typedef struct
 {
+   /* The last request task, used to prepare and send the translation */
+   retro_task_t *request_task;
+
+   /* The last response task, used to parse costly translation data */
+   retro_task_t *response_task;
+
+   /* Timestamp of the last translation request */
+   retro_time_t last_call;
+
+   #ifdef HAVE_THREADS
+   /* Necessary because last_image is manipulated by task handlers */
+   slock_t *image_lock;
+   #endif
+
+   /* Frame captured during the last call to the translation service */
+   uint8_t *last_image;
+   int last_image_size;
+
+   /* 1 if the automatic mode has been enabled, 0 otherwise */
    int ai_service_auto;
-   /* Is text-to-speech accessibility turned on? */
+
+   /* Text-to-speech narrator override flag */
    bool enabled;
 } access_state_t;
 
@@ -46,42 +71,73 @@ bool is_narrator_running(bool accessibility_enable);
 #endif
 
 /*
-   This function does all the stuff needed to translate the game screen,
-   using the URL given in the settings.  Once the image from the frame
-   buffer is sent to the server, the callback will write the translated
-   image to the screen.
-
-   Supported client/services (thus far)
-   -VGTranslate client ( www.gitlab.com/spherebeaker/vg_translate )
-   -Ztranslate client/service ( www.ztranslate.net/docs/service )
-
-   To use a client, download the relevant code/release, configure
-   them, and run them on your local machine, or network.  Set the
-   retroarch configuration to point to your local client (usually
-   listening on localhost:4404 ) and enable translation service.
-
-   If you don't want to run a client, you can also use a service,
-   which is basically like someone running a client for you.  The
-   downside here is that your retroarch device will have to have
-   an internet connection, and you may have to sign up for it.
-
-   To make your own server, it must listen for a POST request, which
-   will consist of a JSON body, with the "image" field as a base64
-   encoded string of a 24bit-BMP/PNG that the will be translated.
-   The server must output the translated image in the form of a
-   JSON body, with the "image" field also as a base64 encoded
-   24bit-BMP, or as an alpha channel png.
-
-  "paused" boolean is passed in to indicate if the current call
-   was made during a paused frame.  Due to how the menu widgets work,
-   if the ai service is called in "auto" mode, then this call will
-   be made while the menu widgets unpause the core for a frame to update
-   the on-screen widgets.  To tell the ai service what the pause
-   mode is honestly, we store the runloop_paused variable from before
-   the handle_translation_cb wipes the widgets, and pass that in here.
+   Invoke this method to send a request to the AI service. 
+   It makes the following POST request using URL params:
+      – source_lang (optional): language code of the content currently running.
+      – target_lang (optional): language of the content to return.
+      – output: comma-separated list of formats that must be provided by the
+         service. Also lists supported sub-formats.
+         
+   The currently supported formats are:
+      – sound: raw audio to playback. (wav)
+      – text: text to be read through internal text-to-speech capabilities.
+         'subs' can be specified on top of that to indicate that we are looking
+         for a short text response in the manner of subtitles.
+      – image: image to display on top of the video feed. Widgets will be used
+         first if possible, otherwise we'll try to draw it directly on the 
+         video buffer. (bmp, png, png-a) [All in 24-bit BGR formats]
+         
+   In addition, the request contains a JSON payload, formatted as such:
+      – image: captured frame from the currently running content (in base64).
+      – format: format of the captured frame ("png", or "bmp").
+      – coords: array describing the coordinates of the image within the 
+         viewport space (x, y, width, height).
+      – viewport: array describing the size of the viewport (width, height).
+      – label: a text string describing the content (<system id>__<content id>).
+      – state: a JSON object describing the state of the frontend, containing:
+         – paused: 1 if the content has been paused, 0 otherwise.
+         – <key>: the name of a retropad input, valued 1 if pressed.
+            (a, b, x, y, l, r, l2, r2, l3, r3)
+            (up, down, left, right, start, select)
+            
+   The translation component then expects a response from the AI service in the
+   form of a JSON payload, formatted as such:
+      – image: base64 representation of an image in a supported format.
+      – sound: base64 representation of a sound clip in a supported format.
+      – text: results from the service as a string.
+      – text_position: hint for the position of the text when the service is
+         running in text mode (i.e. subtitles). Position is a number,
+         1 for bottom or 2 for top (defaults to bottom).
+      – press: a list of retropad inputs to forcibly press. On top of the
+         expected keys (cf. 'state' above), values 'pause' and 'unpause' can be
+         specified to control the flow of the content.
+      – error: any error encountered with the request.
+      – auto: either 'auto' or 'continue' to control automatic requests.
+      
+   All fields are optional, but at least one of them must be present.
+   If 'error' is set, the error is shown to the user and everything else is
+   ignored, even 'auto' settings.
+   
+   With 'auto' set to 'auto', RetroArch will automatically send a new request
+   (with a minimum delay enforced by uints.ai_service_poll_delay). With a value
+   of 'continue', RetroArch will ignore the returned content and skip to the
+   next automatic request. This allows the service to specify that the returned
+   content is the same as the one previously sent, so RetroArch does not need to
+   update its display unless necessary. With 'continue' the service *must* 
+   still send the content, as we may need to display it if the user paused the 
+   AI service for instance.
+
+   {paused} boolean is passed in to indicate if the current call was made 
+   during a paused frame. Due to how the menu widgets work, if the AI service 
+   is called in 'auto' mode, then this call will be made while the menu widgets 
+   unpause the core for a frame to update the on-screen widgets. To tell the AI
+   service what the pause mode is honestly, we store the runloop_paused 
+   variable from before the service wipes the widgets, and pass that in here.
 */
 bool run_translation_service(settings_t *settings, bool paused);
 
+void translation_release(bool inform);
+
 bool accessibility_speak_priority(
       bool accessibility_enable,
       unsigned accessibility_narrator_speech_speed,

diff --git a/config.def.h b/config.def.h
@@ -1749,8 +1749,14 @@
 
 #define DEFAULT_AI_SERVICE_MODE 1
 
+#define DEFAULT_AI_SERVICE_TEXT_POSITION 0
+#define DEFAULT_AI_SERVICE_TEXT_PADDING 5
+
 #define DEFAULT_AI_SERVICE_URL "http://localhost:4404/"
 
+#define DEFAULT_AI_SERVICE_POLL_DELAY 0
+#define MAXIMUM_AI_SERVICE_POLL_DELAY 500
+
 #if defined(HAVE_FFMPEG) || defined(HAVE_MPV)
 #define DEFAULT_BUILTIN_MEDIAPLAYER_ENABLE true
 #else

diff --git a/configuration.c b/configuration.c
@@ -2477,11 +2477,13 @@ static struct config_uint_setting *populate_settings_uint(
    SETTING_UINT("cheevos_appearance_anchor",     &settings->uints.cheevos_appearance_anchor, true, DEFAULT_CHEEVOS_APPEARANCE_ANCHOR, false);
    SETTING_UINT("cheevos_visibility_summary",    &settings->uints.cheevos_visibility_summary, true, DEFAULT_CHEEVOS_VISIBILITY_SUMMARY, false);
 #endif
-
    SETTING_UINT("accessibility_narrator_speech_speed", &settings->uints.accessibility_narrator_speech_speed, true, DEFAULT_ACCESSIBILITY_NARRATOR_SPEECH_SPEED, false);
-   SETTING_UINT("ai_service_mode",               &settings->uints.ai_service_mode,        true, DEFAULT_AI_SERVICE_MODE, false);
-   SETTING_UINT("ai_service_target_lang",        &settings->uints.ai_service_target_lang, true, 0, false);
-   SETTING_UINT("ai_service_source_lang",        &settings->uints.ai_service_source_lang, true, 0, false);
+   SETTING_UINT("ai_service_mode",              &settings->uints.ai_service_mode,            true, DEFAULT_AI_SERVICE_MODE, false);
+   SETTING_UINT("ai_service_target_lang",       &settings->uints.ai_service_target_lang,     true, 0, false);
+   SETTING_UINT("ai_service_source_lang",       &settings->uints.ai_service_source_lang,     true, 0, false);
+   SETTING_UINT("ai_service_poll_delay",        &settings->uints.ai_service_poll_delay,      true, DEFAULT_AI_SERVICE_POLL_DELAY, false);
+   SETTING_UINT("ai_service_text_position",     &settings->uints.ai_service_text_position,   true, DEFAULT_AI_SERVICE_TEXT_POSITION, false);
+   SETTING_UINT("ai_service_text_padding",      &settings->uints.ai_service_text_padding,    true, DEFAULT_AI_SERVICE_TEXT_PADDING, false);
 
 #ifdef HAVE_LIBNX
    SETTING_UINT("libnx_overclock",               &settings->uints.libnx_overclock, true, SWITCH_DEFAULT_CPU_PROFILE, false);

diff --git a/configuration.h b/configuration.h
@@ -334,6 +334,9 @@ typedef struct settings
       unsigned ai_service_mode;
       unsigned ai_service_target_lang;
       unsigned ai_service_source_lang;
+      unsigned ai_service_poll_delay;
+      unsigned ai_service_text_position;
+      unsigned ai_service_text_padding;
 
       unsigned core_updater_auto_backup_history_size;
       unsigned video_black_frame_insertion;

diff --git a/frontend/drivers/platform_win32.c b/frontend/drivers/platform_win32.c
@@ -1064,9 +1064,12 @@ static bool accessibility_speak_windows(int speed,
       if (!wc || res != 0) 
       {
          RARCH_ERR("Error communicating with NVDA\n");
+         /* Fallback on powershell immediately and retry */
+         g_plat_win32_flags &= ~PLAT_WIN32_FLAG_USE_NVDA;
+         g_plat_win32_flags |= PLAT_WIN32_FLAG_USE_POWERSHELL;
          if (wc)
             free(wc);
-         return false;
+         return accessibility_speak_windows(speed, speak_text, priority);
       }
 
       nvdaController_cancelSpeech_func();

diff --git a/gfx/gfx_widgets.c b/gfx/gfx_widgets.c
@@ -1471,6 +1471,67 @@ static void INLINE gfx_widgets_font_unbind(gfx_widget_font_data_t *font_data)
    font_driver_bind_block(font_data->font, NULL);
 }
 
+#ifdef HAVE_TRANSLATE
+/**
+ * Draws a single line of AI service subtitle text: a backdrop quad sized to
+ * the measured text width plus horizontal padding, with the text centred
+ * horizontally on the screen on top of it.
+ *
+ * @param video       Frame info supplying the video dimensions and the
+ *                    display, widget and driver userdata pointers.
+ * @param line        NUL-terminated text to draw.
+ * @param line_idx    Zero-based index of this line within the subtitle.
+ * @param line_total  Total number of lines making up the subtitle.
+ *
+ * A non-zero ai_service_text_position setting takes precedence over the
+ * position stored in the widget state. Position 2 stacks lines downwards
+ * from the top padding; every other value (0, 1, and any unrecognised
+ * value) stacks them upwards from the bottom padding.
+ */
+static void gfx_widgets_ai_line(
+      video_frame_info_t *video, char *line, int line_idx, int line_total)
+{
+   settings_t *settings       = config_get_ptr();
+   gfx_display_t *p_disp      = (gfx_display_t*)video->disp_userdata;
+   dispgfx_widget_t *p_widget = (dispgfx_widget_t*)video->widgets_userdata;
+   void *userdata             = video->userdata;
+   unsigned video_width       = video->width;
+   unsigned video_height      = video->height;
+
+   int line_width             = font_driver_get_message_width(
+         p_widget->gfx_widget_fonts.regular.font,
+         line, strlen(line), 1.0f);
+
+   int hpadding               = p_widget->simple_widget_padding;
+   /* Vertical padding is applied as a percentage of the video height */
+   int vpadding               = settings->uints.ai_service_text_padding;
+   int half_vw                = video_width * 0.5f;
+   int block_width            = line_width + hpadding * 2;
+   int block_height           = p_widget->simple_widget_height;
+   int block_x                = half_vw - block_width * 0.5f;
+   int block_y                = 0;
+   int line_y                 = 0;
+
+   /* The user setting wins when set; otherwise use the widget's value */
+   int position               = (settings->uints.ai_service_text_position > 0)
+         ? settings->uints.ai_service_text_position
+         : p_widget->ai_service_text_position;
+
+   switch (position)
+   {
+      case 2: /* Top    */
+         block_y  = (video_height * (vpadding * 0.01f))
+                  + (line_idx * block_height);
+         break;
+      case 0: /* Undef. */
+      case 1: /* Bottom */
+      default:
+         /* Unknown positions fall back to bottom rather than leaving
+          * block_y at 0, which would stack every line on top of each
+          * other at the top edge of the screen. */
+         block_y  = (video_height * (100 - vpadding) * 0.01f)
+                  - ((line_total - line_idx) * block_height);
+         break;
+   }
+
+   /* Vertically centre the text inside its backdrop block */
+   line_y = block_y + block_height * 0.5f
+          + p_widget->gfx_widget_fonts.regular.line_centre_offset;
+
+   gfx_display_set_alpha(p_widget->backdrop_orig, DEFAULT_BACKDROP);
+
+   gfx_display_draw_quad(
+         p_disp, userdata, video_width, video_height,
+         block_x, block_y, block_width, block_height,
+         video_width, video_height,
+         p_widget->backdrop_orig,
+         NULL);
+
+   gfx_widgets_draw_text(
+         &p_widget->gfx_widget_fonts.regular,
+         line, half_vw, line_y,
+         video_width, video_height,
+         0xFFFFFFFF, TEXT_ALIGN_CENTER, true);
+}
+#endif
+
 void gfx_widgets_frame(void *data)
 {
    size_t i;
@@ -1520,12 +1581,8 @@ void gfx_widgets_frame(void *data)
    /* AI Service overlay */
    if (p_dispwidget->ai_service_overlay_state > 0)
    {
-      float outline_color[16] = {
-      0.00, 1.00, 0.00, 1.00,
-      0.00, 1.00, 0.00, 1.00,
-      0.00, 1.00, 0.00, 1.00,
-      0.00, 1.00, 0.00, 1.00,
-      };
+      int text_length = strlen(p_dispwidget->ai_service_text);
+
       gfx_display_set_alpha(p_dispwidget->pure_white, 1.0f);
 
       if (p_dispwidget->ai_service_overlay_texture)
@@ -1550,63 +1607,46 @@ void gfx_widgets_frame(void *data)
          if (dispctx->blend_end)
             dispctx->blend_end(userdata);
       }
-
-      /* top line */
-      gfx_display_draw_quad(
-            p_disp,
-            userdata,
-            video_width, video_height,
-            0, 0,
-            video_width,
-            p_dispwidget->divider_width_1px,
-            video_width,
-            video_height,
-            outline_color,
-            NULL
-            );
-      /* bottom line */
-      gfx_display_draw_quad(
-            p_disp,
-            userdata,
-            video_width, video_height,
-            0,
-            video_height - p_dispwidget->divider_width_1px,
-            video_width,
-            p_dispwidget->divider_width_1px,
-            video_width,
-            video_height,
-            outline_color,
-            NULL
-            );
-      /* left line */
-      gfx_display_draw_quad(
-            p_disp,
-            userdata,
-            video_width,
-            video_height,
-            0,
-            0,
-            p_dispwidget->divider_width_1px,
-            video_height,
-            video_width,
-            video_height,
-            outline_color,
-            NULL
-            );
-      /* right line */
-      gfx_display_draw_quad(
-            p_disp,
-            userdata,
-            video_width, video_height,
-            video_width - p_dispwidget->divider_width_1px,
-            0,
-            p_dispwidget->divider_width_1px,
-            video_height,
-            video_width,
-            video_height,
-            outline_color,
-            NULL
-            );
+
+      /* AI Service subtitle overlay widget */
+      if (text_length > 0)
+      {
+         int padding      = p_dispwidget->simple_widget_padding;
+         int text_width   = font_driver_get_message_width(
+               p_dispwidget->gfx_widget_fonts.regular.font,
+               p_dispwidget->ai_service_text,
+               text_length, 1.0f);
+
+         if (text_width > (video_width * 0.9f - padding * 2))
+         {
+            int text_half     = text_length / 2;
+            char *extra_line  = (char*)malloc(sizeof(char) * text_length);
+            for (; text_half > 0; text_half--)
+            {
+               if (p_dispwidget->ai_service_text[text_half] == ' ')
+               {
+                  p_dispwidget->ai_service_text[text_half] = '\0';
+                  gfx_widgets_ai_line(
+                        video_info, p_dispwidget->ai_service_text, 0, 2);
+                  strlcpy(
+                        extra_line, 
+                        p_dispwidget->ai_service_text + text_half + 1,
+                        text_length - text_half);
+                  gfx_widgets_ai_line(
+                        video_info, extra_line, 1, 2);
+
+                  p_dispwidget->ai_service_text[text_half] = ' ';
+                  free(extra_line);
+                  break;
+               }
+            }
+         } 
+         else 
+         {
+            gfx_widgets_ai_line(
+                  video_info, p_dispwidget->ai_service_text, 0, 1);
+         }
+      }
 
       if (p_dispwidget->ai_service_overlay_state == 2)
           p_dispwidget->ai_service_overlay_state = 3;
@@ -2149,6 +2189,7 @@ void gfx_widgets_ai_service_overlay_unload(void)
    if (p_dispwidget->ai_service_overlay_state == 1)
    {
       video_driver_texture_unload(&p_dispwidget->ai_service_overlay_texture);
+      p_dispwidget->ai_service_text[0]         = '\0';
       p_dispwidget->ai_service_overlay_texture = 0;
       p_dispwidget->ai_service_overlay_state   = 0;
    }