In [1]:
import requests
import json
import pandas as pd
import time
import os

In [2]:
# fetch api key from file

with open('secret/oai_key.txt', 'r') as file:
    api_key = file.read()
print(f'api_key is type {type(api_key).__name__} and is {len(api_key)} characters long')

api_key is type str and is 32 characters long


In [3]:
url = 'https://api.originality.ai/api/v1'

In [4]:
# classify web page and return 1 response

def fetch_url_class(url_in, api_key=api_key): 
    scan_url='https://api.originality.ai/api/v1/scan/url'
    headers={'X-OAI-API-KEY': api_key}
    params={"url": url_in}
    response = requests.post(url=scan_url, headers=headers, params=params)
    return(response)

In [5]:
def extract_score(response):
    r_json = response.json()
    if r_json['success']:
        data = {
                'Success': [r_json['success']],
                'word_count': [r_json['word_count']],
                'Percent_Human': [round(r_json['score']['original'] * 100, 1)],
                'Percent_AI': [round(r_json['score']['ai'] * 100, 1)]
        }
        if len(r_json['score_breakdown']) > 0:
            data['text'] = [r_json['score_breakdown'][0]['text']]
            
        else:
            data['text'] = ['']

    else:
        data = {
            'Success': False,
            'word_count': None,
            'Percent_Human': None,
            'Percent_AI': None,
            'text': ['']
        }
    df = pd.DataFrame(data)
    return df

In [None]:
# FAILS IF score_breakdown list is empty

# def extract_score(response):
#     r_json = response.json()
#     if r_json['success']:
#         data = {
#         'Success': [r_json['success']],
#         'word_count': [r_json['word_count']],
#         'Percent_Human': [round(r_json['score']['original'] * 100, 1)],
#         'Percent_AI': [round(r_json['score']['ai'] * 100, 1)],
#         'text': [r_json['score_breakdown'][0]['text']]
#         }
#     else:
#         data = {
#             'Success': [False],
#             'word_count': [None],
#             'Percent_Human': [None],
#             'Percent_AI': [None],
#             'text': ['']
#         }
#     df = pd.DataFrame(data)
#     return df

In [None]:
# def extract_score(response):
#     r_json = response.json()
#     data = {
#         'Success': [r_json['success']],
#         'Percent_Human': [round(r_json['score']['original'] * 100, 1)],
#         'Percent_AI': [round(r_json['score']['ai'] * 100, 1)]
#     }
#     df = pd.DataFrame(data)
#     return df

In [6]:
# CLASSIFY ONE WEB PAGE

url_in = "https://www.target.com/c/table-lamps-lighting-home-decor/-/N-56d7t"
response  = fetch_url_class(url_in)
df = extract_score(response)
df

Unnamed: 0,Success,word_count,Percent_Human,Percent_AI,text
0,True,303,99.9,0.1,A bedroom table lamp sets the mood for your b...


In [8]:
dfx = pd.DataFrame({'kw': ['blah'], 'link': ['https://www.target.com/c/table-lamps-lighting-home-decor/-/N-56d7t']})
dfx

Unnamed: 0,kw,link
0,blah,https://www.target.com/c/table-lamps-lighting-...


In [9]:
# CLASSIFY ONE WEB PAGE AND ADD RESUTLS TO KW AND LINK

url_in = "https://www.target.com/c/table-lamps-lighting-home-decor/-/N-56d7t"
response  = fetch_url_class(url_in)
df = extract_score(response)
result = pd.concat([dfx, df], axis=1)
result

Unnamed: 0,kw,link,Success,word_count,Percent_Human,Percent_AI,text
0,blah,https://www.target.com/c/table-lamps-lighting-...,True,303,99.9,0.1,A bedroom table lamp sets the mood for your b...


In [None]:
# Reunite results with original data


In [10]:
# NOW RUN A HUNDRED A MINUTE THROUGH A LOOP
# load 3 csv files
# relaed from csv
df_linked_kw_final = pd.read_csv('data/linked_kw_final.csv', header=0)
# combine csvs
# break 1000 list in batches of 100 and save to csv
# re-add other seo info



In [None]:
# for link in df_linked_kw_final:
#     df = extract_score(fetch_url_class(url_in))

In [None]:
# def make_csv_batches(df, batch_size=5, output_dir='data/batches'):
#     # Create output directory if it doesn't exist
#     os.makedirs(output_dir, exist_ok=True)

#     # Get unique values in the 'kw' column
#     unique_kw = df['kw'].unique()

#     # Break up unique values into batches of batch_size
#     kw_batches = [unique_kw[i:i+batch_size] for i in range(0, len(unique_kw), batch_size)]

#     # Iterate through kw_batches and create csv files for each batch
#     filenames = []
#     for i, batch in enumerate(kw_batches):
#         # Select rows from df where 'kw' is in the current batch
#         batch_df = df[df['kw'].isin(batch)]

#         # Write subset_df to csv
#         filename = os.path.join(output_dir, f'kw_batch_{i:03d}.csv')
#         batch_df.to_csv(filename, index=False)
#         filenames.append(filename)

#     #Write list of filenames to batch_list.csv
#     with open(os.path.join(output_dir, 'batch_list.csv'), 'w') as f:
#         f.write('\n'.join(filenames))

#     print(f'{len(kw_batches)} csv files created in {output_dir}')

In [11]:
def make_csv_batches(df, batch_size=5, output_dir='data/batches'):
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Get unique values in the 'kw' column
    unique_kw = df['kw'].unique()

    # Break up unique values into batches of batch_size
    kw_batches = [unique_kw[i:i+batch_size] for i in range(0, len(unique_kw), batch_size)]

    # Iterate through kw_batches and create csv files for each batch
    filenames = []
    for i, batch in enumerate(kw_batches):
        # Select rows from df where 'kw' is in the current batch
        batch_df = df[df['kw'].isin(batch)]

        # Write subset_df to csv
        filename = f'kw_batch_{i:03d}.csv'
        filepath = os.path.join(output_dir, filename)
        batch_df.to_csv(filepath, index=False)
        filenames.append(filename)

    #Write list of filenames to batch_list.csv
    with open(os.path.join(output_dir, 'batch_list.csv'), 'w') as f:
        f.write('\n'.join(filenames))

    print(f'{len(kw_batches)} csv files created in {output_dir}')

In [12]:
make_csv_batches(df_linked_kw_final)

200 csv files created in data/batches


In [13]:
# read file with batch filenames into a list

def read_batch_list(read_path='data/batches'):
    batch_list_path = os.path.join(read_path, 'batch_list.csv')
    with open(batch_list_path, 'r') as file:
        batch_filenames = [line.strip() for line in file]
    return batch_filenames

In [14]:
# CLASSIFY ONE WEB PAGE AND ADD RESUTLS

url_in = "https://www.target.com/c/table-lamps-lighting-home-decor/-/N-56d7t"
response  = fetch_url_class(url_in)
df = extract_score(response)
result = pd.concat([dfx, df], axis=1)
result

Unnamed: 0,kw,link,Success,word_count,Percent_Human,Percent_AI,text
0,blah,https://www.target.com/c/table-lamps-lighting-...,True,303,99.9,0.1,A bedroom table lamp sets the mood for your b...


In [15]:
# read in one batch csv and run it
path = 'data/batches/kw_batch_089.csv'
dfxxx = pd.read_csv(path, header=0)
for link in dfxxx['link']:
    df_link = extract_score(fetch_url_class(url))
    pd.concat([dfxxx, df_link], axis=1)
dfxxx

Unnamed: 0,kw,rank,link
0,how to attract bees,1,https://www.nrdc.org/stories/8-ways-attract-be...
1,how to attract bees,2,https://www.masterclass.com/articles/how-to-at...
2,how to attract bees,3,https://www.thespruce.com/bee-plants-1401948
3,how to attract bees,4,https://www.gardeners.com/how-to/attracting-be...
4,how to attract bees,5,https://kellogggarden.com/blog/honeybees-and-b...
...,...,...,...
95,best indoor ant killer,16,https://www.terro.com/indoor-baiting
96,best indoor ant killer,17,https://www.insider.com/guides/home/how-to-get...
97,best indoor ant killer,18,https://nationaltoday.com/review/best-ant-killer/
98,best indoor ant killer,19,https://www.medicinenet.com/what_is_the_best_h...


In [25]:
# I THINK THIS IS THE PROBLEM

# THIS USED TO TAKE CSV AS ARG
def classify_batch(df):
    results = []
    for url in df['link']:
        response = fetch_url_class(url)
        if response.status_code == 200:
            score_df = extract_score(response)
            success = score_df['Success'][0]
            word_count = score_df['word_count'][0]
            human = score_df['Percent_Human'][0]
            ai = score_df['Percent_AI'][0]
            text = score_df['text'][0]
            #results.append({'link': url, 'success': success, 'word_count': word_count, 'percent_human': human, 'percent_ai': ai, 'text': text})
            results.append({'link': url, 'success': success, 'word_count': word_count, 'percent_human': human, 'percent_ai': ai})
        else:
            #results.append({'link': url, 'success': False, 'percent_human': None, 'percent_ai': None, 'text': 'none'})
            results.append({'link': url, 'success': False, 'percent_human': None, 'percent_ai': None})
    return pd.DataFrame(results)

In [28]:
# RUN ONE BATCH AND ADD ALL THE OTHER DEETS
df_in = pd.read_csv('data/batches/' + 'kw_batch_000.csv')
df_out = classify_batch(dfx)

df_out = pd.concat([df_in, df_out], axis=1, join='outer')
df_out

Unnamed: 0,kw,rank,link,link.1,success,word_count,percent_human,percent_ai
0,side table lamps,1,https://www.target.com/c/table-lamps-lighting-...,https://www.target.com/c/table-lamps-lighting-...,True,303.0,99.9,0.1
1,side table lamps,2,https://www.amazon.com/Side-Table-Lamps/s?k=Si...,https://www.amazon.com/Side-Table-Lamps/s?k=Si...,True,110.0,73.9,26.1
2,side table lamps,3,https://www.wayfair.com/lighting/sb1/bedside-t...,https://www.wayfair.com/lighting/sb1/bedside-t...,True,1931.0,99.2,0.8
3,side table lamps,4,https://www.ikea.com/us/en/cat/table-lamps-10732/,https://www.ikea.com/us/en/cat/table-lamps-10732/,True,562.0,100.0,0.0
4,side table lamps,5,https://www.crateandbarrel.com/lighting/table-...,https://www.crateandbarrel.com/lighting/table-...,False,,,
...,...,...,...,...,...,...,...,...
95,blushing philodendron,16,https://garden.org/plants/view/112715/Blushing...,https://garden.org/plants/view/112715/Blushing...,False,,,
96,blushing philodendron,17,https://en.wikipedia.org/wiki/Philodendron_eru...,https://en.wikipedia.org/wiki/Philodendron_eru...,True,417.0,99.9,0.1
97,blushing philodendron,18,https://www.ecuagenera.com/Philodendron-squami...,https://www.ecuagenera.com/Philodendron-squami...,False,,,
98,blushing philodendron,19,https://floraconservancy.org/product/philodend...,https://floraconservancy.org/product/philodend...,True,85.0,0.0,0.0


In [29]:
def run_all_batches():
    # LOAD CSV PATHS INTO LIST
    #read_path = 'data/batches/'
    #save_path = 'data/batches_classified/'
    batches = read_batch_list()
    
    # INTERATE OVER LIST
    for batch_name in batches:

        # START TIMER
        # start_time = time.time()
        print(f'running {batch_name}')

        # LOAD CSV BATCH INTO DF
        #df_in = pd.read_csv(read_path + batch_name, header=0
        df_in = pd.read_csv(f'data/batches/{batch_name}', header=0)
        
        # CLASSIFY BATCH
        df_out = classify_batch(df_in)

        # CONCAT DF_OUT TO DF_IN
        df = pd.concat([df_in, df_out], axis=1, join='outer')

        # SAVE DF OF CLASSIFIED BATCH
        save_dir = 'data/batches_classified'
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        save_path = os.path.join(save_dir, batch_name)
        df.to_csv(save_path, index=False)
    

        #df_out.to_csv(save_path + batch_name, index=False)

        # WAIT FOR TIMER TO REACH 1 MINUTE
        # elapsed_time = time.time() - start_time
        # if elapsed_time < 60:
        #     time.sleep(60 - elapsed_time)


In [34]:
# EXAMPLE GET BALANCE
def get_balance(api_key=api_key):
    headers={'X-OAI-API-KEY': api_key}
    response_1 = requests.get(url + '/account/credits/balance',headers=headers)
    print(response_1.json())

In [35]:
get_balance()

{'balance': 0}


In [33]:
run_all_batches()

running kw_batch_037.csv
running kw_batch_038.csv
running kw_batch_039.csv
running kw_batch_040.csv
running kw_batch_041.csv
running kw_batch_042.csv


KeyboardInterrupt: 