### This notebook used to scan each of the records in the dataset

In [3]:
import requests
import pandas as pd
import time
import os
import urllib.parse

### Functions

In [25]:
def load_api_key(key_name: str = "oai_key.txt", path: str = "secret/") -> None:
    """
    Load the OpenAI API key from a text file.

    Args:
        key_name (str): The name of the file containing the API key. Default is "oai_key.txt".
        path (str): The path to the directory containing the API key file. Default is "secret/".

    Returns:
        None.

    """
    global api_key
    api_key_path = path + key_name
    with open(api_key_path, "r") as file:
        api_key = file.read()
    print(f'API key loaded as global variable `api_key`: (is type {type(api_key).__name__} and {len(api_key)} characters long)')

In [5]:
def get_balance(api_key: str) -> requests.Response:
    """
    Get the account balance (in credits) for the OpenAI API.

    Args:
        api_key (str): The API key to use for the request. Default is the global variable `api_key`.

    Returns:
        A `requests.Response` object representing the HTTP response from the API.

    Raises:
        requests.exceptions.RequestException: If there was an error making the HTTP request.

    """
    headers = {'X-OAI-API-KEY': api_key}
    response = requests.get('https://api.originality.ai/api/v1/account/credits/balance', headers=headers)
    response.raise_for_status()
    return response

In [21]:
# # CLASSIFY 1 WEB PAGE AND RETURN RESPONSE

# def scan_webpage(url_in: str, api_key: str)-> requests.Response: 
#     scan_url="https://api.originality.ai/api/v1/scan/url"
#     headers={"X-OAI-API-KEY": api_key}
#     params={"url": url_in}
#     response = requests.post(url=scan_url, headers=headers, params=params)
#     return(response)

In [30]:
def scan_webpage(url_in: str, api_key: str) -> requests.Response:
    """
    Submit a URL to the OpenAI API for webpage scanning.

    Args:
        url_in (str): The URL of the webpage to scan.
        api_key (str): The API key to use for the request. Default is the global variable `api_key`.

    Returns:
        A `requests.Response` object representing the HTTP response from the API.

    Raises:
        requests.exceptions.RequestException: If there was an error making the HTTP request.

    """
    scan_url = 'https://api.originality.ai/api/v1/scan/url'
    headers = {'X-OAI-API-KEY': api_key}
    params = {'url': url_in}
    response = requests.post(url=scan_url, headers=headers, params=params)
    response.raise_for_status()
    return response

In [8]:
# EXTRACT AI/HUMAN SCORE FROM 1 RESPONSE AND RETURN DF

def extract_score(response):
    r_json = response.json()
    if r_json['success']:
        data = {
                'success': [r_json['success']],
                'word_count': [r_json['word_count']],
                'percent_human': [r_json['score']['original'] * 100],
                'percent_ai': [r_json['score']['ai'] * 100]
                # probably shoudn't round yet
                #'Percent_Human': [round(r_json['score']['original'] * 100, 1)],
                #'Percent_AI': [round(r_json['score']['ai'] * 100, 1)]
        }
        if len(r_json['score_breakdown']) > 0:
            data['text'] = [r_json['score_breakdown'][0]['text']]
            
        else:
            data['text'] = ['']

    else:
        data = {
            'success': False,
            'word_count': None,
            'percent_human': None,
            'percent_ai': None,
            'text': ['']
        }
    df = pd.DataFrame(data)
    return df

In [9]:
# DEPRICATED

# FAILS IF score_breakdown list is empty

# def extract_score(response):
#     r_json = response.json()
#     if r_json['success']:
#         data = {
#         'Success': [r_json['success']],
#         'word_count': [r_json['word_count']],
#         'Percent_Human': [round(r_json['score']['original'] * 100, 1)],
#         'Percent_AI': [round(r_json['score']['ai'] * 100, 1)],
#         'text': [r_json['score_breakdown'][0]['text']]
#         }
#     else:
#         data = {
#             'Success': [False],
#             'word_count': [None],
#             'Percent_Human': [None],
#             'Percent_AI': [None],
#             'text': ['']
#         }
#     df = pd.DataFrame(data)
#     return df

In [10]:
# DEPRICATED

# def extract_score(response):
#     r_json = response.json()
#     data = {
#         'Success': [r_json['success']],
#         'Percent_Human': [round(r_json['score']['original'] * 100, 1)],
#         'Percent_AI': [round(r_json['score']['ai'] * 100, 1)]
#     }
#     df = pd.DataFrame(data)
#     return df

In [11]:
# TAKES A DF OF KW AND BREAKS IT UP INTO BATCHES OF 100 CSV FILES
# ASSUMES THERE ARE NO MORE THAN 20 OF EACH KW

def make_csv_batches(df, batch_size=5, output_dir='data/batches'):
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Get unique values in the 'kw' column
    unique_kw = df['kw'].unique()

    # Break up unique values into batches of batch_size
    kw_batches = [unique_kw[i:i+batch_size] for i in range(0, len(unique_kw), batch_size)]

    # Iterate through kw_batches and create csv files for each batch
    filenames = []
    for i, batch in enumerate(kw_batches):
        # Select rows from df where 'kw' is in the current batch
        batch_df = df[df['kw'].isin(batch)]

        # Write subset_df to csv
        filename = f'kw_batch_{i:03d}.csv'
        filepath = os.path.join(output_dir, filename)
        batch_df.to_csv(filepath, index=False)
        filenames.append(filename)

    #Write list of filenames to batch_list.csv
    with open(os.path.join(output_dir, 'batch_list.csv'), 'w') as f:
        f.write('\n'.join(filenames))

    print(f'{len(kw_batches)} csv files created in {output_dir}')

In [12]:
# READS THE FILE CONTAINING BATCH FILENAMES AND RETURNS LIST

def read_batch_list(read_path='data/batches'):
    batch_list_path = os.path.join(read_path, 'batch_list.csv')
    with open(batch_list_path, 'r') as file:
        batch_filenames = [line.strip() for line in file]
    return batch_filenames

In [13]:
# TAKES DF AND APPENDS EACH ROW WITH CLASSIFCATION INFO EXTRACTED FROM RESPONSE

def classify_batch(df):
    results = []
    for url in df['link']:
        response = scan_webpage(url)
        if response.status_code == 200:
            score_df = extract_score(response)
            success = score_df['success'][0]
            word_count = score_df['word_count'][0]
            human = score_df['percent_human'][0]
            ai = score_df['percent_ai'][0]
            # text = score_df['text'][0]
            #results.append({'link': url, 'success': success, 'word_count': word_count, 'percent_human': human, 'percent_ai': ai, 'text': text})
            results.append({'link': url, 'success': success, 'word_count': word_count, 'percent_human': human, 'percent_ai': ai})
            delay = 10 # WHEN PROCESSING ACTUALLY NEEDS TO BE DONE DELAY IS HIGHER
        else:
            print(f'RC error code: {response.status_code}, URL: {url}')
            #results.append({'link': url, 'success': False, 'percent_human': None, 'percent_ai': None, 'text': 'none'})
            results.append({'link': url, 'success': False, 'percent_human': None, 'percent_ai': None})
            delay = 2 # WHEN NO PROCESSING IS REQUIRED DELAY IS 2
        time.sleep(delay)  # wait for 1 second between requests
    return pd.DataFrame(results)

In [14]:
# NOT DOING THIS FOR NOW - TOO MANY FILES TO HANDLE

# SAVE RESPONSE FILES

#def save_response(response):
    
#    print('response saved')

In [15]:
# CLASSIFIES ALL BATCHES

def classify_all_batches():
    # LOAD CSV PATHS INTO LIST
    read_dir = 'data/batches/'
    save_dir = 'data/batches_classified'
    batches = read_batch_list()
    
    for batch_name in batches:

        # check for sufficent credits
        credits = get_balance().json()['balance']
        #if balance is not None and json.loads(balance) > 1600:
        if credits > 1600:

            # START TIMER 
            # start_time = time.time() # NOT CURRENTLY NEEDED BECAUSE EACH RECORD TAKES MORE THAT A MINUTE TO PROCESS
            print(f'Credits = {credits}, Classifying: {batch_name}')

            # LOAD CSV BATCH INTO DF
            df_in = pd.read_csv(f'{read_dir}{batch_name}', header=0)
            
            # CLASSIFY BATCH
            df_out = classify_batch(df_in)

            # CONCAT DFS
            df = pd.concat([df_in, df_out], axis=1, join='outer')

            # SAVE DF OF CLASSIFIED BATCH
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            save_path = os.path.join(save_dir, batch_name)
            df.to_csv(save_path, index=False)

            # WAIT FOR TIMER TO REACH 1 MINUTE
            # elapsed_time = time.time() - start_time
            # if elapsed_time < 60:
            #     time.sleep(60 - elapsed_time)
        else:
            print(f"*** INSUFFICENT CREDITS *** YOU HAVE: {credits} CREDITS. GO BEG CONNOR FOR MORE")
            break

In [16]:
def scheduled_batches():
    # LOAD CSV PATHS INTO LIST
    read_dir = 'data/batches/'
    save_dir = 'data/batches_classified'
    batches = read_batch_list()
    first = batches[0]
    last = batches[-1]
    count = len(batches)
    print(f'There are {count} batches scheduled for processing')
    print(f'First scheduled batch: {first}\nLast scheduled batch: {last}')

### Execution Examples

In [26]:
load_api_key()

API key loaded as global variable `api_key`: (is type str and 32 characters long)


In [27]:
get_balance(api_key=api_key).json()['balance']

36859

In [20]:
# #url_in = "https://www.thespruceeats.com/best-teas-4771438"
# url_in = "https://originality.ai/grammarly-plagiarism-checker-review"
# scan_url="https://api.originality.ai/api/v1/scan/url"
# headers={"X-OAI-API-KEY": api_key}
# params={"url": url_in}
# #response = requests.post(url=scan_url, headers=headers, params=params)
# response = requests.post(url="https://api.originality.ai/api/v1/scan/url", headers=headers, params=params)
# #response = requests.post(url= "https://api.originality.ai/api/v1/scan/url", headers={"X-OAI-API-KEY": api_key}, params={"url": url_in})
# response.json()

{'success': True,
 'url': 'https://originality.ai/grammarly-plagiarism-checker-review',
 'url_code': 200,
 'credits_used': 30,
 'credits': 36859,
 'word_count': 2948,
 'score': {'original': 0.870025118192037, 'ai': 0.1299748692351083},
 'score_breakdown': [{'original': 0.9625914096832275,
   'ai': 0.03740851581096649,
   'text': '  that is, to see if it was written by an AI versus a human. Many AI-based services use content from multiple sources to deliver their    answers   , so Originality.AI checks your content against what can reasonably be produced from an AI. Both platforms are well-known for their high levels of accuracy. Originality.AI: How it Works Originality.AI has been built from the ground up to do one thing and do it exceedingly well: check for plagiarism among AI-based writing tools. To do this, it leverages AI as a plagiarism checker rather than a writing source and scans academic journals, databases, and other repositories for content that could sound machine-written. 

In [31]:
# EXAMPLE TO CLASSIFY ONE WEB PAGE AND EXTRACT RESPONSE

url_in = "https://originality.ai/grammarly-plagiarism-checker-review/"
response  = scan_webpage(url_in=url_in, api_key=api_key)
df = extract_score(response)
df

Unnamed: 0,success,word_count,percent_human,percent_ai,text
0,True,2948,87.002512,12.997487,"billions of web pages sounds like a lot, it ..."


In [35]:
response.json()

{'success': True,
 'url': 'https://originality.ai/grammarly-plagiarism-checker-review/',
 'url_code': 200,
 'credits_used': 30,
 'credits': 36829,
 'word_count': 2948,
 'score': {'original': 0.870025118192037, 'ai': 0.1299748692351083},
 'score_breakdown': [{'original': 0.9901124835014343,
   'ai': 0.009887555614113808,
   'text': 'billions of web pages sounds like a lot, it   s actually quite limited when compared to the vast repository of information out there. If you   re worried about plagiarism, you may want to use an additional tool to scan your work beyond what Grammarly Plagiarism Checker offers.   Premium account needed: As the free version of Grammarly doesn   t offer a plagiarism checker, you   ll need to upgrade to a Premium account or higher to take advantage of this feature. Advantages of using Grammarly Plagiarism Checker Using Grammarly   s Plagiarism Checker offers you a number of advantages to help you sidestep instances of plagiarism. The system itself is built on ad

In [34]:
len(df['text'][0].split())

478

In [172]:
response.json()

{'url': 'https://originality.ai/grammarly-plagiarism-checker-review/',
 'success': False,
 'error': 'unable to process this url. Please check that the url is valid and try again. If the problem persists, please contact Originality.ai support.'}

In [29]:
def scan_webpage(url_in, api_key):
    encoded_url = urllib.parse.quote(url_in, safe='')
    scan_url = "https://api.originality.ai/api/v1/scan/url"
    headers = {"X-OAI-API-KEY": api_key}
    params = {"url": encoded_url}
    try:
        response = requests.post(url=scan_url, headers=headers, params=params)
        response.raise_for_status()  # raises an HTTPError for 4xx or 5xx status codes
        return response.json()  # returns the response body as a JSON object
    except requests.exceptions.RequestException as e:
        print(f"Error scanning webpage at {url_in}: {e}")
        return response.json()

In [199]:
url_in =  'https://www.thespruceeats.com/best-teas-4771438/'
scan_webpage(url_in, api_key)

Error scanning webpage at https://www.thespruceeats.com/best-teas-4771438/: 422 Client Error: Unprocessable Entity for url: https://api.originality.ai/api/v1/scan/url?url=https%253A%252F%252Fwww.thespruceeats.com%252Fbest-teas-4771438%252F


{'url': 'https%3A%2F%2Fwww.thespruceeats.com%2Fbest-teas-4771438%2F',
 'success': False,
 'error': 'unable to process this url. Please check that the url is valid and try again. If the problem persists, please contact Originality.ai support.'}

In [173]:
# EXAMPLE TO CLASSIFY ONE WEB PAGE AND ADD RESUTLS TO KW AND LINK

dfx = pd.DataFrame({'kw': ['blah'], 'link': ['https://www.target.com/c/table-lamps-lighting-home-decor/-/N-56d7t']})
url_in = "https://www.target.com/c/table-lamps-lighting-home-decor/-/N-56d7t"
response  = scan_webpage(url_in)
df = extract_score(response)
result = pd.concat([dfx, df], axis=1)
result

Unnamed: 0,kw,link,success,word_count,percent_human,percent_ai,text
0,blah,https://www.target.com/c/table-lamps-lighting-...,False,,,,


In [174]:
# EXAMPLE TO CLASSIFY ONE WEB PAGE AND ADD RESUTLS

url_in = "https://www.target.com/c/table-lamps-lighting-home-decor/-/N-56d7t"
response  = scan_webpage(url_in)
df = extract_score(response)
result = pd.concat([dfx, df], axis=1)
result

Unnamed: 0,kw,link,success,word_count,percent_human,percent_ai,text
0,blah,https://www.target.com/c/table-lamps-lighting-...,False,,,,


In [175]:
path = 'data/batches/kw_batch_089.csv'
dfxxx = pd.read_csv(path, header=0)
dfxxx

Unnamed: 0,kw,rank,link
0,how to attract bees,1,https://www.nrdc.org/stories/8-ways-attract-be...
1,how to attract bees,2,https://www.masterclass.com/articles/how-to-at...
2,how to attract bees,3,https://www.thespruce.com/bee-plants-1401948
3,how to attract bees,4,https://www.gardeners.com/how-to/attracting-be...
4,how to attract bees,5,https://kellogggarden.com/blog/honeybees-and-b...
...,...,...,...
95,best indoor ant killer,16,https://www.terro.com/indoor-baiting
96,best indoor ant killer,17,https://www.insider.com/guides/home/how-to-get...
97,best indoor ant killer,18,https://nationaltoday.com/review/best-ant-killer/
98,best indoor ant killer,19,https://www.medicinenet.com/what_is_the_best_h...


In [176]:
pd.concat([dfxxx, extract_score(scan_webpage('https://www.terro.com/indoor-baiting'))], axis=1)

Unnamed: 0,kw,rank,link,success,word_count,percent_human,percent_ai,text
0,how to attract bees,1,https://www.nrdc.org/stories/8-ways-attract-be...,False,,,,
1,how to attract bees,2,https://www.masterclass.com/articles/how-to-at...,,,,,
2,how to attract bees,3,https://www.thespruce.com/bee-plants-1401948,,,,,
3,how to attract bees,4,https://www.gardeners.com/how-to/attracting-be...,,,,,
4,how to attract bees,5,https://kellogggarden.com/blog/honeybees-and-b...,,,,,
...,...,...,...,...,...,...,...,...
95,best indoor ant killer,16,https://www.terro.com/indoor-baiting,,,,,
96,best indoor ant killer,17,https://www.insider.com/guides/home/how-to-get...,,,,,
97,best indoor ant killer,18,https://nationaltoday.com/review/best-ant-killer/,,,,,
98,best indoor ant killer,19,https://www.medicinenet.com/what_is_the_best_h...,,,,,


In [177]:
extract_score(scan_webpage('https://www.terro.com/indoor-baiting'))

Unnamed: 0,success,word_count,percent_human,percent_ai,text
0,False,,,,


In [180]:
# EXAMPLE TO READ IN ONE BATCH AND RUN IT

path = 'data/batches/kw_batch_089.csv'
dfxxx = pd.read_csv(path, header=0)

df_list = []
for link in dfxxx['link']:
    print(f"scanning {link}")
    df_link = extract_score(scan_webpage(link))
    df_list.append(df_link)

result_df = pd.concat([dfxxx] + df_list, axis=1)
result_df

scanning https://www.nrdc.org/stories/8-ways-attract-bees-and-butterflies
scanning https://www.masterclass.com/articles/how-to-attract-bees-to-your-garden
scanning https://www.thespruce.com/bee-plants-1401948
scanning https://www.gardeners.com/how-to/attracting-beneficial-bees/5024.html
scanning https://kellogggarden.com/blog/honeybees-and-butterflies/how-to-attract-bees-with-sugar-water/
scanning https://www.wikihow.com/Attract-Honey-Bees
scanning https://thehomespunhydrangea.com/12-ways-attract-bumble-bees-garden/
scanning https://www.gardeningknowhow.com/garden-how-to/beneficial/attracting-bees.htm
scanning https://www.pinterest.com/pin/how-to-attract-bees-with-sugar-water--615937686519099070/
scanning https://youtube.com/watch?v=YAvo6PTE6jI
scanning https://lawnlove.com/blog/how-to-attract-bees-to-garden/
scanning https://www.almanac.com/video/plant-pollination-encouraging-bees-your-garden
scanning https://minnetonkaorchards.com/how-to-attract-bees/
scanning https://www.gardenerswo

KeyError: 'success'

In [None]:
dfxxx

Unnamed: 0,kw,rank,link
0,how to attract bees,1,https://www.nrdc.org/stories/8-ways-attract-be...
1,how to attract bees,2,https://www.masterclass.com/articles/how-to-at...
2,how to attract bees,3,https://www.thespruce.com/bee-plants-1401948
3,how to attract bees,4,https://www.gardeners.com/how-to/attracting-be...
4,how to attract bees,5,https://kellogggarden.com/blog/honeybees-and-b...
...,...,...,...
95,best indoor ant killer,16,https://www.terro.com/indoor-baiting
96,best indoor ant killer,17,https://www.insider.com/guides/home/how-to-get...
97,best indoor ant killer,18,https://nationaltoday.com/review/best-ant-killer/
98,best indoor ant killer,19,https://www.medicinenet.com/what_is_the_best_h...


In [181]:
# TEST RUN ONE BATCH AND ADD ALL THE OTHER DEETS

df_in = pd.read_csv('data/batches/' + 'kw_batch_000.csv')
df_out = classify_batch(dfx)

df_out = pd.concat([df_in, df_out], axis=1, join='outer')
df_out

RC error code: 422, URL: https://www.target.com/c/table-lamps-lighting-home-decor/-/N-56d7t


Unnamed: 0,kw,rank,link,link.1,success,percent_human,percent_ai
0,side table lamps,1,https://www.target.com/c/table-lamps-lighting-...,https://www.target.com/c/table-lamps-lighting-...,False,,
1,side table lamps,2,https://www.amazon.com/Side-Table-Lamps/s?k=Si...,,,,
2,side table lamps,3,https://www.wayfair.com/lighting/sb1/bedside-t...,,,,
3,side table lamps,4,https://www.ikea.com/us/en/cat/table-lamps-10732/,,,,
4,side table lamps,5,https://www.crateandbarrel.com/lighting/table-...,,,,
...,...,...,...,...,...,...,...
95,blushing philodendron,16,https://garden.org/plants/view/112715/Blushing...,,,,
96,blushing philodendron,17,https://en.wikipedia.org/wiki/Philodendron_eru...,,,,
97,blushing philodendron,18,https://www.ecuagenera.com/Philodendron-squami...,,,,
98,blushing philodendron,19,https://floraconservancy.org/product/philodend...,,,,


In [None]:
# *** CODE SECTION *** REALLY THIS IS ALL YOU NEED

In [None]:
# CHECK TO SEE IF ACCOUNT HAS CREDIT AVAILABLE

credits = get_balance()
credits.json()['balance']

In [None]:
# LOAD KW AND LINK DATA CREATED IN GSEARCH.IPYNB 

#df = pd.read_csv('data/linked_kw_final.csv', header=0)

In [None]:
# DIVIDE DATA INTO BATCHES OF 100 RECORDS SO THAT WHEN IT FAILS I DON'T NEED TO START FROM BEGINING

# make_csv_batches(df_linked_kw_final)

In [None]:
scheduled_batches()


In [None]:
# Start saving the response files

In [None]:
# PROCESSES ALL RECORDS IN BATCHES OF 100

#classify_all_batches()

In [None]:
# classify_batch(df) should handle connection reset?