### Purpose:
The purpose of this notebook is to clean the data gathered from the readability API.

### Dependencies:

In [79]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import os
import csv
import json
import time
from IPython.display import clear_output
from urllib3.exceptions import NewConnectionError, MaxRetryError
from socket import gaierror
import json
import string

In [80]:
def get_json_data(directory):
    """
    Load every JSON file in a directory and flatten each one into a single-level dictionary.

    Parameters:
    directory (str): Path of the directory to read. Only regular files directly
        inside it are considered; sub-directories are skipped.

    Returns:
    list: One flattened dictionary (as returned by flatten_dict) per file that
        held a valid JSON object, in sorted filename order.
    """
    flattened_list = []
    # os.listdir returns entries in arbitrary order; sort so repeated runs agree
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        # open() raises on directories, so only regular files are read
        if not os.path.isfile(filepath):
            continue
        # Read as UTF-8 explicitly instead of relying on the platform default encoding
        with open(filepath, "r", encoding="utf-8") as f:
            try:
                d = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError):
                # Skip over any files that contain invalid JSON data
                print(f"Skipping file {filename}: invalid JSON data")
                continue
        # flatten_dict calls .items(), so a top-level list/number/string cannot be flattened
        if not isinstance(d, dict):
            print(f"Skipping file {filename}: top-level JSON value is not an object")
            continue
        flattened_list.append(flatten_dict(d))
    return flattened_list

In [81]:
def load_csv(filepath: str) -> dict:
    """
    Read a CSV file and map each row's link to its cleaned text.

    Parameters:
    filepath (str): The path to the CSV file; it is read as UTF-8 and must
        provide "Link" and "Text" columns.

    Returns:
    dict: Maps every "Link" value to the result of clean_text applied to the
        same row's "Text" value. If a link appears more than once, the last
        row wins.
    """
    # Raise the csv module's per-field size cap so very long text cells can be parsed
    csv.field_size_limit(2**31-1)

    with open(filepath, encoding="utf-8") as csv_file:
        return {
            row["Link"]: clean_text(row["Text"])
            for row in csv.DictReader(csv_file)
        }

In [82]:
def clean_text(text: str) -> str:
    """
    Normalise whitespace in a piece of text.

    Tabs and newlines become spaces, runs of two or more spaces shrink to a
    single space, and leading/trailing whitespace is removed.

    Parameters:
    text (str): The text to clean up.

    Returns:
    str: The cleaned up text.
    """
    # Order matters: tabs and newlines are turned into spaces first so the
    # final pattern can collapse the resulting runs of spaces.
    for pattern in ("\t", "\n", "  +"):
        text = re.sub(pattern, " ", text)
    return text.strip()

In [83]:
def flatten_dict(d):
    """
    Recursively flatten a dictionary with nested dictionaries into a single level.

    Each nested dictionary is replaced by its own (flattened) entries and the
    key that held it is dropped. Keys are not prefixed with their parent key,
    so when the same key occurs more than once the value visited last wins.

    Parameters:
    d (dict): The (possibly nested) dictionary to flatten.

    Returns:
    dict: A new dictionary whose values are the non-dict values found at any
        nesting depth. Lists are kept as-is, even if they contain dictionaries.
    """
    items = {}
    for key, value in d.items():
        # isinstance also recognises dict subclasses (e.g. OrderedDict),
        # which an exact type comparison would leave nested.
        if isinstance(value, dict):
            items.update(flatten_dict(value))
        else:
            items[key] = value
    return items

In [84]:
def evaluate_readability(text: str, url: str = "https://staging-originalityai-originstg.kinsta.cloud/tool-readability/src/API/TextHighlighter-api.php", timeout: float = 60.0) -> requests.Response:
    """
    Evaluate the readability of a given text using an API endpoint.

    The text is sent as the body of a POST request, serialised as the JSON
    object {"text": <text>}. Progress and the HTTP status code are printed.
    Non-200 responses are reported but still returned, not raised.

    Parameters:
    text (str): The text to evaluate.
    url (str): The URL of the API endpoint to use. Defaults to "https://staging-originalityai-originstg.kinsta.cloud/tool-readability/src/API/TextHighlighter-api.php".
    timeout (float): Seconds to wait for the server before giving up. Defaults to 60.

    Returns:
    requests.Response: The response from the API endpoint.

    Raises:
    requests.exceptions.Timeout: If the server does not answer within `timeout` seconds.
    """
    payload = json.dumps({"text": text})
    # Without a timeout, requests waits indefinitely on an unresponsive server
    response = requests.request("POST", url=url, data=payload, timeout=timeout)

    print(f"Request sent to {url}")
    if response.status_code == 200:
        print("Response successfully received")
    else:
        print("Error response received")
    print(f"Response code: {response.status_code}")
    return response

### Functions:

### Code Execution

In [85]:
# Parse and flatten every JSON file stored under data/json
x = get_json_data(directory="data/json")

In [86]:
# Number of flattened dictionaries returned by get_json_data (one per file it parsed)
len(x)

13588

In [87]:
# data/responses/03944.csv to data/responses/03949.csv had invalid characters and needed repair

In [88]:
# Load one of the repaired response files as a {link: cleaned text} mapping and display it.
# NOTE(review): binding the result to `dict` shadows the built-in type for the rest of
# the session; the following cell reads it via dict.items(), so rename both together.
dict = load_csv('data/responses/03949.csv')
dict

{'https://www.arducam.com/product-category/uvc-usb-camera-module/usb-uvc-cameras-low-light/': "Low Light USB Cameras Archives - Arducam Toggle Navigation Solutions TinyML UDOO KEYPico4MLRaspberry Pi PicoUltra Low-Power Cameras for Nano 33 BLE SenseMachine Learning on MCUs (TinyML) with VisionFor Your Platform Raspberry PiRaspberry Pi PicoJetson Nano/Xavier NXArduino and other MicrocontrollersFor Your Application Multispectral/Hyperspectral ImagingSync Multiple MIPI Cameras with CamarrayUSB 3.0 Camera Dev Kit for CMOS SensorsEmbedded Spatial AI and 3D LocalizationLens Selection in Optical TestingFor Image Sensor IMX477OV2640HM01B0/HM0360Products FPD-Link III CamerasArducam MegaRaspberry Pi Camera ToF Camera for RPi64MP Camera for Pi16MP Camera for PiIMX708 Camera Module 3Native Pi CamerasPivariety CamerasIndustrial Board CamerasGlobal Shutter CameraPTZ CameraMultiple CamerasUltra Low Light CameraAutofocusHDRNVIDIA Jetson Cameras NVIDIA Jetson Orin CamerasOpenCV AI KitMicrocontroller Cam

In [89]:
# Walk the loaded mapping; after the loop `v` holds the text of the last entry.
for k, v in dict.items():
    v = v.strip()
# Collapse every run of whitespace to a single space
v = re.sub(r'\s+', ' ', v)

# Drop every character outside string.printable (printable ASCII). The text is
# plain prose, not JSON, so it is inspected directly instead of via json.loads,
# and the cleaned result is written back to `v` so that it is what gets printed
# here and sent to the API afterwards.
printable_chars = set(string.printable)
cleaned_str = ''.join(ch for ch in v if ch in printable_chars)
if cleaned_str != v:
    print("Invalid JSON characters found, removing...")
    print("Cleaned string:")
    print(cleaned_str)
    v = cleaned_str
print(v)

Invalid JSON characters found, removing...
Cleaned string:
Low Light USB Cameras Archives - Arducam Toggle Navigation Solutions TinyML UDOO KEYPico4MLRaspberry Pi PicoUltra Low-Power Cameras for Nano 33 BLE SenseMachine Learning on MCUs (TinyML) with VisionFor Your Platform Raspberry PiRaspberry Pi PicoJetson Nano/Xavier NXArduino and other MicrocontrollersFor Your Application Multispectral/Hyperspectral ImagingSync Multiple MIPI Cameras with CamarrayUSB 3.0 Camera Dev Kit for CMOS SensorsEmbedded Spatial AI and 3D LocalizationLens Selection in Optical TestingFor Image Sensor IMX477OV2640HM01B0/HM0360Products FPD-Link III CamerasArducam MegaRaspberry Pi Camera ToF Camera for RPi64MP Camera for Pi16MP Camera for PiIMX708 Camera Module 3Native Pi CamerasPivariety CamerasIndustrial Board CamerasGlobal Shutter CameraPTZ CameraMultiple CamerasUltra Low Light CameraAutofocusHDRNVIDIA Jetson Cameras NVIDIA Jetson Orin CamerasOpenCV AI KitMicrocontroller Camera Arduino CamerasSTM32 Camera Modu

In [90]:
# Send the text held in `v` to the readability API.
# NOTE(review): this rebinds `v` from the text (str) to the returned requests.Response,
# so re-running this cell, or any later cell that treats `v` as text, will fail.
v = evaluate_readability(v)

Request sent to https://staging-originalityai-originstg.kinsta.cloud/tool-readability/src/API/TextHighlighter-api.php
Response successfully received
Response code: 200


In [91]:
# Decode and display the JSON body of the API response held in `v`
v.json()

{'text': "Low Light USB Cameras Archives - Arducam Toggle Navigation Solutions TinyML UDOO KEYPico4MLRaspberry Pi PicoUltra Low-Power Cameras for Nano 33 BLE SenseMachine Learning on MCUs (TinyML) with VisionFor Your Platform Raspberry PiRaspberry Pi PicoJetson Nano/Xavier NXArduino and other MicrocontrollersFor Your Application Multispectral/Hyperspectral ImagingSync Multiple MIPI Cameras with CamarrayUSB 3.0 Camera Dev Kit for CMOS SensorsEmbedded Spatial AI and 3D LocalizationLens Selection in Optical TestingFor Image Sensor IMX477OV2640HM01B0/HM0360Products FPD-Link III CamerasArducam MegaRaspberry Pi Camera ToF Camera for RPi64MP Camera for Pi16MP Camera for PiIMX708 Camera Module 3Native Pi CamerasPivariety CamerasIndustrial Board CamerasGlobal Shutter CameraPTZ CameraMultiple CamerasUltra Low Light CameraAutofocusHDRNVIDIA Jetson Cameras NVIDIA Jetson Orin CamerasOpenCV AI KitMicrocontroller Camera Arduino CamerasSTM32 Camera ModulesESP32/ESP8266 CameraRaspberry Pi Pico CameraBB

In [92]:
# Replace right single quotation marks (’) with a double quote character.
# NOTE(review): `v` was rebound to the requests.Response by the evaluate_readability
# cell, so re.sub receives a non-string here; that is the TypeError recorded below.
v = re.sub('’', '"', v)

TypeError: expected string or bytes-like object

In [None]:
# Replace curly double quotes (“ and ”) with the two-character string /".
# NOTE(review): '/"' looks like it was meant to be an escaped quote (\") — confirm.
# re.sub needs `v` to be a str; it fails while `v` is still the requests.Response.
v = re.sub('[“”]', '/"', v)

In [None]:
# Display the current value of `v`
v