### Purpose:
The purpose of this notebook is to clean and re-integrate the reacquired content and then scan it with the readability API.

### Dependencies:

In [116]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import os
import csv
import json
import time

### Functions:

In [117]:
def clean_text(text: str) -> str:
    """
    Normalise the whitespace in a piece of text.

    Every tab and newline becomes a space, runs of two or more spaces collapse to a
    single space, and leading/trailing whitespace is stripped.

    Parameters:
    text (str): The raw text to normalise.

    Returns:
    str: The normalised text.
    """
    # Turn each tab and each newline into a single space.
    for whitespace_char in ("\t", "\n"):
        text = re.sub(whitespace_char, " ", text)

    # Squash every run of two or more spaces down to one, then trim both ends.
    return re.sub("  +", " ", text).strip()

In [118]:
def load_csv(filepath: str) -> dict:
    """
    Read one CSV file into a mapping of link -> cleaned text.

    Each row's "Link" column becomes a key and its "Text" column, passed through
    clean_text, becomes the value. If a link occurs more than once, the last row wins.

    Parameters:
    filepath (str): The path to the CSV file to read.

    Returns:
    dict: A dictionary with the link as the key and the cleaned text as the value.
    """
    # Raise the csv module's per-field size limit to 2**31-1 so very long cells can be parsed.
    csv.field_size_limit(2**31-1)

    with open(filepath, encoding="utf-8") as csv_file:
        return {
            row["Link"]: clean_text(row["Text"])
            for row in csv.DictReader(csv_file)
        }

In [119]:
def load_all_csvs(directory: str) -> dict:
    """
    Load all CSV files in a directory and return a dictionary with the link as the key and the cleaned text as the value.

    Files are read in sorted filename order so the returned dictionary has the same
    insertion order on every run. If the same link appears in several files, the value
    from the file read last is kept.

    Parameters:
    directory (str): The path to the directory containing the CSV files to load.

    Returns:
    dict: A dictionary with the link as the key and the cleaned text as the value for all CSV files in the directory.
    """
    data = {}
    # os.listdir returns entries in arbitrary order. evaluate_all numbers its output
    # files by each record's position in this dict, so the order must be reproducible
    # for its "already saved" check to line up between runs.
    # NOTE(review): if earlier runs saved results under a different listing order,
    # confirm the existing numbered files still match before resuming.
    for filename in sorted(os.listdir(directory)):
        if filename.endswith(".csv"):
            data.update(load_csv(os.path.join(directory, filename)))
    return data

In [120]:
def evaluate_readability(text: str, url: str = "https://staging-originalityai-originstg.kinsta.cloud/tool-readability/src/API/TextHighlighter-api.php", timeout=None) -> requests.Response:
    """
    Evaluate the readability of a given text using an API endpoint.

    The text is sent as a JSON-encoded body ({"text": ...}) in a POST request. The
    outcome (target URL, success/error, status code) is printed; the response is
    returned as-is regardless of its status code, so callers must check it themselves.

    Parameters:
    text (str): The text to evaluate.
    url (str): The URL of the API endpoint to use. Defaults to "https://staging-originalityai-originstg.kinsta.cloud/tool-readability/src/API/TextHighlighter-api.php".
    timeout (float | tuple | None): Passed straight to requests as the request timeout in
        seconds. Defaults to None (wait indefinitely), which matches the previous behaviour;
        pass a number to stop a stalled connection from blocking a long batch run.

    Returns:
    requests.Response: The response from the API endpoint.

    Raises:
    requests.exceptions.RequestException: If the request itself fails (including a
        timeout when `timeout` is set).
    """
    payload = json.dumps({"text": text})
    response = requests.request("POST", url=url, data=payload, timeout=timeout)

    print(f"Request sent to {url}")
    if response.status_code == 200:
        print("Response successfully received")
    else:
        print("Error response received")
    print(f"Response code: {response.status_code}")
    return response


In [121]:
# NOTE: Superseded draft, kept for reference only. The live evaluate_all defined later in
# this notebook adds a skip for records whose JSON file already exists and waits 5 seconds
# instead of 30 between requests.
# def evaluate_all(data: dict, directory: str):
#     """
#     Evaluate the readability of all texts in a dictionary and save the results to JSON files.

#     Parameters:
#     data (dict): The dictionary containing the text data to evaluate.
#     directory (str): The directory to save the JSON files to.

#     Returns:
#     None
#     """
#     # Create the directory if it doesn't exist
#     if not os.path.exists(directory):
#         os.makedirs(directory)

#     for i, (link, text) in enumerate(data.items()):
#         # Evaluate the readability of the text
#         response = evaluate_readability(text)

#         # Wait for 30 seconds to avoid overloading the server
#         time.sleep(30)

#         # Get the JSON data from the response
#         json_data = json.loads(response.text)

#         # Create the filename for the JSON file
#         filename = str(i).zfill(5) + ".json"
#         filepath = os.path.join(directory, filename)

#         # Save the JSON data to a file
#         with open(filepath, "w") as f:
#             json.dump({"link": link, "json_data": json_data}, f)

#         # Print a message indicating that the file has been saved and the batch number
#         print(f"Saved {filepath} (batch {i+1}/{len(data)})")

In [122]:
def evaluate_all(data: dict, directory: str):
    """
    Evaluate the readability of all texts in a dictionary and save the results to JSON files.

    Each record is written to "<directory>/<index zero-padded to 5 digits>.json", where the
    index is the record's position in `data`. Records whose file already exists are skipped,
    so the function can be re-run to resume an interrupted batch. A record is only saved
    when the API answers with status 200 and a JSON body; failed records are reported and
    left unsaved so that a later run retries them.

    Parameters:
    data (dict): The dictionary containing the text data to evaluate (link -> text).
    directory (str): The directory to save the JSON files to.

    Returns:
    None
    """
    # Create the directory if it doesn't exist
    os.makedirs(directory, exist_ok=True)

    total = len(data)
    for i, (link, text) in enumerate(data.items()):
        # Check if the record has already been saved
        filename = str(i).zfill(5) + ".json"
        filepath = os.path.join(directory, filename)
        if os.path.exists(filepath):
            print(f"Record {i}/{total} already saved, skipping")
            continue

        # Evaluate the readability of the text
        response = evaluate_readability(text)

        # Wait for 5 seconds to avoid overloading the server (also after a failed request)
        time.sleep(5)

        # Never persist an error response: a saved file marks the record as done,
        # so writing it here would hide the failure from every later run.
        if response.status_code != 200:
            print(f"Record {i}/{total} failed with status {response.status_code}, not saved")
            continue

        # Get the JSON data from the response; a non-JSON body is treated as a failure
        # instead of aborting the whole batch.
        try:
            json_data = json.loads(response.text)
        except ValueError:
            print(f"Record {i}/{total} returned a non-JSON body, not saved")
            continue

        # Write to a temporary file and rename it into place, so an interrupted write
        # cannot leave a truncated file that the skip check above would accept.
        temp_path = filepath + ".tmp"
        with open(temp_path, "w") as f:
            json.dump({"link": link, "json_data": json_data}, f)
        os.replace(temp_path, filepath)

        # Print a message indicating that the file has been saved and the batch number
        print(f"Saved {filepath} (batch {i+1}/{total})")


### Code Execution:

In [123]:
# x = load_all_csvs('data/responses')

In [124]:
# len(x)

In [125]:
# len(x)/14637

In [126]:
# filepath = "data/responses/00001.csv"
# data = load_csv(filepath)
# print(data)

In [127]:
#x

In [128]:
# Smoke test: send one short sample string to the readability endpoint and keep the response for inspection.
response = evaluate_readability('Spider Bites: Signs, Symptoms, Diagnosis, Treatment & Prevention Important Updates + Notice of Vendor Data Event Coming to a Cleveland Clinic locatio')

Request sent to https://staging-originalityai-originstg.kinsta.cloud/tool-readability/src/API/TextHighlighter-api.php
Response successfully received
Response code: 200


In [129]:
# Show the raw response body (bytes) returned by the smoke-test request.
response.content

b'{"text":"Spider Bites: Signs, Symptoms, Diagnosis, Treatment & Prevention Important Updates + Notice of Vendor Data Event Coming to a Cleveland Clinic locatio","smogScore":12,"colemanScore":12,"automatedReadabilityScore":12,"daleScore":9,"powersSumnerKearlScore":1.3999999999999999,"forcastScore":19.5,"spacheScore":4.4000000000000004,"gunningFogScore":10.6,"fleschGrade":12,"fleschScore":23}'

In [130]:
# Load every CSV under data/responses into a link -> cleaned-text dictionary.
data = load_all_csvs("data/responses")
# Output folder for the per-record JSON results.
directory = "data/json"
# Score each text via the API. Records whose JSON file already exists are skipped,
# so re-running this cell resumes an interrupted batch.
evaluate_all(data, directory)

Record 0/13588 already saved, skipping
Record 1/13588 already saved, skipping
Record 2/13588 already saved, skipping
Record 3/13588 already saved, skipping
Record 4/13588 already saved, skipping
Record 5/13588 already saved, skipping
Record 6/13588 already saved, skipping
Record 7/13588 already saved, skipping
Record 8/13588 already saved, skipping
Record 9/13588 already saved, skipping
Record 10/13588 already saved, skipping
Record 11/13588 already saved, skipping
Record 12/13588 already saved, skipping
Record 13/13588 already saved, skipping
Record 14/13588 already saved, skipping
Record 15/13588 already saved, skipping
Record 16/13588 already saved, skipping
Record 17/13588 already saved, skipping
Record 18/13588 already saved, skipping
Record 19/13588 already saved, skipping
Record 20/13588 already saved, skipping
Record 21/13588 already saved, skipping
Record 22/13588 already saved, skipping
Record 23/13588 already saved, skipping
Record 24/13588 already saved, skipping
Record 25/