fix: prevent blocking of non-browser clients accessing ML4K API

Use the edge function to make requests to the relevant API endpoints look like they come from curl, by overriding the user-agent header regardless of whatever Python or App Inventor user-agents the clients provide. This seems to be accepted by Cloudflare.

Signed-off-by: Dale Lane <dale.lane@uk.ibm.com>
IBM · Sep 15, 2022 · 0b3fe1a · 0b3fe1a
1 parent 40b2ccd
commit 0b3fe1a
Show file tree

Hide file tree

Showing 2 changed files with 8 additions and 31 deletions.
diff --git a/cis/edge-functions.js b/cis/edge-functions.js
@@ -89,28 +89,18 @@ function forwardRequest(request, targetHost) {
   // newRequest.headers.set('X-Forwarded-Host', request.url.hostname);
   newRequest.headers.set('host', targetHost);
 
-  if (!includesUserAgent(request) && isUrlExemptFromBrowserIntegrityCheck(request)) {
-    // the request didn't include a user-agent but they are trying
-    //  to access a resource that shouldn't be blocked for this so
-    //  we'll inject a dummy user-agent so can get through the
-    //  Cloudflare in-front of Code Engine
-    newRequest.headers.set('user-agent', 'MachineLearningForKids');
+  if (isUrlExemptFromBrowserIntegrityCheck(request)) {
+    // URLs for some Scratch APIs are commonly accessed by non-browser
+    //  clients (Python/Tensorflow/AppInventor/etc) so for these we
+    //  add a user-agent to pretend to be curl, so that the Cloudflare
+    //  in-front of Code Engine is less likely to block them
+    newRequest.headers.set('user-agent', 'curl/7.79.1');
   }
 
   return fetch(newUrl, newRequest);
 }
 
 
-function includesUserAgent(request) {
-  try {
-    return request.headers.has('User-Agent');
-  }
-  catch (err) {
-    // we couldn't check - let's assume there is one
-    return true;
-  }
-}
-
 function isUrlExemptFromBrowserIntegrityCheck(request) {
   try {
     return BIC_EXEMPT_URLS.some((urlRegexTest) => urlRegexTest.test(request.url));

diff --git a/resources/mlforkids.py b/resources/mlforkids.py
@@ -33,7 +33,7 @@ def __init__(self, scratchkey: str):
         print("MLFORKIDS: Downloading information about your machine learning project")
         self.scratchkey = scratchkey
         try:
-            apiurl = self.__switchToTemporarySite("https://machinelearningforkids.co.uk/api/scratch/" + scratchkey + "/train")
+            apiurl = "https://machinelearningforkids.co.uk/api/scratch/" + scratchkey + "/train"
             with urllib.request.urlopen(apiurl) as url:
                 self.__downloaded_training_images_list = json.loads(url.read().decode())
         except urllib.error.HTTPError:
@@ -55,7 +55,7 @@ def __get_training_images_generator(self):
         projectcachedir = str(os.path.expanduser(os.path.join(cachedir, cachelocation)))
         for trainingitem in self.__downloaded_training_images_list:
             try:
-                tf.keras.utils.get_file(origin=self.__switchToTemporarySite(trainingitem["imageurl"]),
+                tf.keras.utils.get_file(origin=trainingitem["imageurl"],
                                         cache_dir=cachedir,
                                         cache_subdir=os.path.join(cachelocation, trainingitem["label"]),
                                         fname=self.__get_fname(trainingitem))
@@ -115,19 +115,6 @@ def __train_model(self, trainingimagesdata):
         print("MLFORKIDS: Model training complete")
 
 
-    # Cloudflare is currently blocking access to the Machine Learning for Kids API
-    #  from non-browser user agents
-    # While I raise this with them to get this unblocked, switching to this
-    #  temporary URL should avoid the problem
-    #
-    # TODO: remove this function as soon as Cloudflare have
-    #  stopped breaking Python apps
-    #
-    def __switchToTemporarySite(self, url):
-        return url.replace("https://machinelearningforkids.co.uk/api/scratch/",
-                           "https://mlforkids-api.j8clybxvjr0.us-south.codeengine.appdomain.cloud/api/scratch/")
-
-
     #
     # public methods
     #