### Purpose:
The purpose of this notebook is to clean and re-integrate the reacquired content and then scan it with the readability API.

### Dependencies:

In [208]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import os
import csv
import json
import time
from IPython.display import clear_output
from urllib3.exceptions import NewConnectionError, MaxRetryError
from socket import gaierror
import json
import string

### Functions:

In [209]:
def clean_text(text: str) -> str:
    """
    Normalise the whitespace in a piece of text.

    Every run of tabs, newlines and/or spaces is collapsed into one space, and
    whitespace at both ends of the result is trimmed.

    Parameters:
    text (str): The raw text to normalise.

    Returns:
    str: The text with whitespace runs collapsed and the ends trimmed.
    """
    # A single tab/newline becomes one space and a longer mixed run also becomes
    # one space, so one pass over the character class covers every case.
    collapsed = re.sub(r"[\t\n ]+", " ", text)
    return collapsed.strip()

In [210]:
def load_csv(filepath: str) -> dict:
    """
    Read one CSV file into a link -> cleaned-text mapping.

    The file must have "Link" and "Text" columns. Each "Text" value is passed
    through clean_text before being stored. If a link occurs more than once,
    the last row wins.

    Parameters:
    filepath (str): The path to the CSV file to load.

    Returns:
    dict: A dictionary with the link as the key and the cleaned text as the value.
    """
    # Raise the csv module's per-field size cap so very long "Text" cells parse.
    # Note that this setting is global to the csv module.
    csv.field_size_limit(2**31-1)

    with open(filepath, encoding="utf-8") as handle:
        return {
            record["Link"]: clean_text(record["Text"])
            for record in csv.DictReader(handle)
        }

In [211]:
def load_all_csvs(directory: str) -> dict:
    """
    Load every CSV file in a directory into one link -> cleaned-text mapping.

    Only entries whose name ends in ".csv" are read. Files are processed in
    sorted filename order, so the result (including which file wins when the
    same link appears in several files, and the dictionary's iteration order)
    is the same on every run.

    Parameters:
    directory (str): The path to the directory containing the CSV files to load.

    Returns:
    dict: A dictionary with the link as the key and the cleaned text as the
    value, merged across all CSV files in the directory.
    """
    merged = {}
    # os.listdir returns names in arbitrary order; sort to keep the merge stable.
    for filename in sorted(os.listdir(directory)):
        if filename.endswith(".csv"):
            merged.update(load_csv(os.path.join(directory, filename)))
    return merged

In [212]:
def evaluate_readability(text: str, url: str = "https://staging-originalityai-originstg.kinsta.cloud/tool-readability/src/API/TextHighlighter-api.php", timeout: "float | None" = 120.0) -> requests.Response:
    """
    Evaluate the readability of a given text using an API endpoint.

    The text is sent as a JSON document of the form {"text": ...} in the body
    of a POST request, and a short status report is printed.

    Parameters:
    text (str): The text to evaluate.
    url (str): The URL of the API endpoint to use. Defaults to "https://staging-originalityai-originstg.kinsta.cloud/tool-readability/src/API/TextHighlighter-api.php".
    timeout (float | None): Seconds to wait for the server before giving up.
        Pass None to wait indefinitely.

    Returns:
    requests.Response: The response from the API endpoint (returned for any
    status code; the caller decides what to do with non-200 responses).

    Raises:
    requests.exceptions.RequestException: If the request cannot be completed,
    e.g. on a connection failure or when the timeout expires.
    """
    payload = json.dumps({"text": text})
    # Without a timeout a stalled server would block this call forever.
    response = requests.post(url, data=payload, timeout=timeout)

    print(f"Request sent to {url}")
    if response.status_code == 200:
        print("Response successfully received")
    else:
        print("Error response received")
    print(f"Response code: {response.status_code}")
    return response

In [213]:
def evaluate_all(
    data: dict,
    directory: str,
    max_attempts: int = 3,
    retry_delay: float = 60.0,
    request_delay: float = 4.2,
) -> None:
    """
    Evaluate the readability of all texts in a dictionary and save the results to JSON files.

    Record number i (in the iteration order of data) is written to
    "<directory>/<i zero-padded to 5 digits>.json" as
    {"link": <link>, "json_data": <parsed API response>}. Records whose file
    already exists are skipped, so an interrupted run can be resumed as long
    as data iterates in the same order.

    Parameters:
    data (dict): The dictionary containing the text data to evaluate (link -> text).
    directory (str): The directory to save the JSON files to.
    max_attempts (int): How many times to try each record before skipping it.
    retry_delay (float): Seconds to wait between attempts after a connection error.
    request_delay (float): Seconds to pause after each handled response, to
        avoid overloading the server.

    Returns:
    None
    """
    os.makedirs(directory, exist_ok=True)

    # requests re-raises urllib3/socket failures as its own exception types, so
    # those must be listed for the retry to trigger; the lower-level types are
    # kept in case they ever surface unwrapped.
    connection_errors = (
        requests.exceptions.ConnectionError,
        requests.exceptions.Timeout,
        NewConnectionError,
        MaxRetryError,
        gaierror,
    )
    total = len(data)

    for i, (link, text) in enumerate(data.items()):

        clear_output(wait=True)
        print(f"Evaluating record {i}/{total}")

        # check if record has already been saved
        filename = str(i).zfill(5) + ".json"
        filepath = os.path.join(directory, filename)
        if os.path.exists(filepath):
            print(f"Record {i}/{total} already saved, skipping")
            time.sleep(0.020)
            continue

        # evaluate readability of text, retrying if there is a connection error
        response = None
        for attempt in range(1, max_attempts + 1):
            try:
                response = evaluate_readability(text)
                break
            except connection_errors as e:
                print(f"Connection error: {e}.")
                if attempt >= max_attempts:
                    # no point in waiting when there is no further attempt
                    print(f"Max retries exceeded, skipping record {i}/{total}")
                    break
                print(f"Retrying in {retry_delay:g} seconds...")
                time.sleep(retry_delay)

        if response is None:
            continue

        # convert response to json; a non-JSON body (e.g. an HTML error page)
        # skips this record instead of aborting the whole run
        try:
            json_data = json.loads(response.text)
        except json.JSONDecodeError:
            print(f"Response for record {i}/{total} is not valid JSON, skipping")
            time.sleep(request_delay)
            continue

        # save record to file
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump({"link": link, "json_data": json_data}, f)

        # print message indicating file has been saved
        print(f"Saved {filepath}")

        # delay to avoid overloading server
        time.sleep(request_delay)

In [214]:
def flatten_dict(d: dict) -> dict:
    """
    Recursively flatten a dictionary with nested dictionaries into one level.

    The key of a nested dictionary is dropped and its leaf entries are lifted
    into the result under their own keys. If the same leaf key occurs more
    than once, the entry visited last overwrites the earlier one. An empty
    nested dictionary contributes nothing.

    Parameters:
    d (dict): The (possibly nested) dictionary to flatten.

    Returns:
    dict: A new single-level dictionary; the input is not modified.
    """
    flat = {}
    for key, value in d.items():
        # isinstance also covers dict subclasses such as OrderedDict
        if isinstance(value, dict):
            flat.update(flatten_dict(value))
        else:
            flat[key] = value
    return flat

In [215]:
# def get_json_data(directory):
#     """
#     Recursively flattens all dictionaries in a given directory.
#     """
#     flattened_list = []
#     for filename in os.listdir(directory):
#         filepath = os.path.join(directory, filename)
#         with open(filepath, "r") as f:
#             d = json.load(f)
#             flattened = flatten_dict(d)
#             flattened_list.append(flattened)
#     return flattened_list

In [216]:
def get_json_data(directory: str) -> list:
    """
    Load every ".json" file in a directory and return the flattened records.

    Files are read in sorted filename order. Each file's JSON object is passed
    through flatten_dict. Entries that do not end in ".json" are ignored, and
    files that cannot be decoded or do not hold a JSON object are reported and
    skipped.

    Parameters:
    directory (str): The path to the directory containing the JSON files.

    Returns:
    list: One flattened dictionary per successfully loaded file.
    """
    flattened_list = []
    # os.listdir returns names in arbitrary order; sort for a stable result.
    for filename in sorted(os.listdir(directory)):
        # ignore sub-directories and unrelated files
        if not filename.endswith(".json"):
            continue
        filepath = os.path.join(directory, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            try:
                d = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError):
                # Skip over any files that contain invalid JSON data
                print(f"Skipping file {filename}: invalid JSON data")
                continue
        if not isinstance(d, dict):
            # flatten_dict needs a mapping; a top-level list or scalar cannot be flattened
            print(f"Skipping file {filename}: top-level JSON value is not an object")
            continue
        flattened_list.append(flatten_dict(d))
    return flattened_list

### Code Execution:

In [217]:
# Author's status note: "done" - presumably the evaluation step below has already been run.

# Load and clean every CSV under data/responses into one {link: text} dict.
data = load_all_csvs("data/responses")
# Directory that holds the per-record JSON result files.
directory = "data/json"
# Left commented out deliberately: evaluate_all pauses 4.2 s after every saved record.
#evaluate_all(data, directory) # WARNING THIS WILL TAKE HOURS TO RUN

In [218]:
# Read back the saved JSON results as a list of flattened dicts.
x = get_json_data(directory)

In [219]:
# Number of result records that were loaded.
len(x)

13588

In [220]:
# Load the first saved record on its own, without flattening, to inspect its raw structure.
with open("data/json/00000.json", "r") as f:
    d = json.load(f)

In [221]:
# Display the raw record loaded above.
d

{'link': 'https://www.tooldiscounter.com/brand/sk-hand-tool',
  'smogScore': 12,
  'colemanScore': 12,
  'automatedReadabilityScore': 10.9,
  'daleScore': 6.5,
  'powersSumnerKearlScore': 14.5,
  'forcastScore': 14.2,
  'spacheScore': 5,
  'gunningFogScore': 9.3,
  'fleschGrade': 10.3,
  'fleschScore': 56.2}}

In [222]:
# Open the first saved record and load its contents as a dictionary.
# NOTE: this rebinds `data`, replacing the {link: text} dict built by load_all_csvs.
with open("data/json/00000.json", "r") as f:
    data = json.load(f)

# Print the contents of the dictionary
print(data)



In [223]:
# Display the record currently bound to `data`.
data

{'link': 'https://www.tooldiscounter.com/brand/sk-hand-tool',
  'smogScore': 12,
  'colemanScore': 12,
  'automatedReadabilityScore': 10.9,
  'daleScore': 6.5,
  'powersSumnerKearlScore': 14.5,
  'forcastScore': 14.2,
  'spacheScore': 5,
  'gunningFogScore': 9.3,
  'fleschGrade': 10.3,
  'fleschScore': 56.2}}

In [224]:
# Hard-coded sample record in the {"link": ..., "json_data": {...}} layout that evaluate_all writes; rebinds `data` again.
data = {"link": "https://www.livescience.com/what-are-muscle-knots", "json_data": {"text": "Muscle", "smogScore": 12, "colemanScore": 11.8, "automatedReadabilityScore": 11.8, "daleScore": 6.3, "powersSumnerKearlScore": 8.8, "forcastScore": 10.7, "spacheScore": 5, "gunningFogScore": 13, "fleschGrade": 11.1, "fleschScore": 55}}

In [225]:
# The link key plus the ten score keys of a record (the "text" key is left out) - presumably the columns of interest; confirm intended use.
"link", "smogScore", "colemanScore", "automatedReadabilityScore", "daleScore", "powersSumnerKearlScore", "forcastScore", "spacheScore", "gunningFogScore", "fleschGrade", "fleschScore"

('link',
 'smogScore',
 'colemanScore',
 'automatedReadabilityScore',
 'daleScore',
 'powersSumnerKearlScore',
 'forcastScore',
 'spacheScore',
 'gunningFogScore',
 'fleschGrade',
 'fleschScore')