In [1]:
import requests
import pandas as pd
import time
import os

In [2]:
# *** FUNCTION SECTION ***

In [3]:
# FETCH API_KEY FROM FILE

def load_api_key(key_name="oai_key.txt", path="secret/"):
    """Read the API key from a text file into the global ``api_key``.

    Args:
        key_name: Name of the text file that holds the key.
        path: Directory containing the key file (trailing separator optional).

    Raises:
        OSError: If the key file cannot be opened.

    Side effects:
        Sets the module-level ``api_key`` and prints its type and length
        (the key itself is never printed).
    """
    global api_key
    # os.path.join works whether or not `path` ends with a separator.
    api_key_path = os.path.join(path, key_name)
    with open(api_key_path, "r") as file:
        # Drop surrounding whitespace (e.g. the newline editors append) so it
        # does not end up inside the X-OAI-API-KEY header value.
        api_key = file.read().strip()
    print(f'api_key loaded as global varible (is type {type(api_key).__name__} and {len(api_key)} characters long)')

In [4]:
# MUST RUN BEFORE THE API HELPER CELLS BELOW: THEY RELY ON THE GLOBAL api_key SET HERE

load_api_key()

api_key loaded as global varible (is type str and 32 characters long)


In [5]:
# GET O.AI BALANCE

def get_balance(api_key=None):
    """Request the account credit balance from the Originality.ai API.

    Args:
        api_key: Key sent in the ``X-OAI-API-KEY`` header. When omitted, the
            global ``api_key`` is looked up at call time, so this function can
            be defined before the key is loaded and always sees the latest key.

    Returns:
        The unparsed ``requests.Response`` from the balance endpoint.

    Raises:
        RuntimeError: If no key is passed and the global ``api_key`` is unset.
    """
    if api_key is None:
        # The parameter shadows the global, so read the global explicitly.
        api_key = globals().get('api_key')
        if api_key is None:
            raise RuntimeError('api_key is not set; run load_api_key() first')
    headers = {'X-OAI-API-KEY': api_key}
    response_1 = requests.get('https://api.originality.ai/api/v1/account/credits/balance', headers=headers)
    return response_1

In [6]:
# CLASSIFY 1 WEB PAGE AND RETURN RESPONSE

def fetch_url_class(url_in, api_key=None):
    """Submit one web page URL to the Originality.ai URL-scan endpoint.

    Args:
        url_in: Address of the page to classify.
        api_key: Key sent in the ``X-OAI-API-KEY`` header. When omitted, the
            global ``api_key`` is looked up at call time, so this function can
            be defined before the key is loaded and always sees the latest key.

    Returns:
        The unparsed ``requests.Response`` from the scan endpoint.

    Raises:
        RuntimeError: If no key is passed and the global ``api_key`` is unset.
    """
    if api_key is None:
        # The parameter shadows the global, so read the global explicitly.
        api_key = globals().get('api_key')
        if api_key is None:
            raise RuntimeError('api_key is not set; run load_api_key() first')
    scan_url = 'https://api.originality.ai/api/v1/scan/url'
    headers = {'X-OAI-API-KEY': api_key}
    # The page address is passed as a query-string parameter of the POST.
    params = {"url": url_in}
    response = requests.post(url=scan_url, headers=headers, params=params)
    return response

In [7]:
# EXTRACT AI/HUMAN SCORE FROM 1 RESPONSE AND RETURN DF

def extract_score(response):
    """Turn one scan response into a single-row DataFrame of scores.

    Args:
        response: Object exposing ``.json()`` that returns the scan payload.

    Returns:
        A one-row DataFrame with columns ``Success``, ``word_count``,
        ``Percent_Human``, ``Percent_AI`` and ``text``. Percentages are the
        payload's ``score`` fractions multiplied by 100 (left unrounded).
        ``text`` is the first ``score_breakdown`` entry's text, or ``''`` when
        the breakdown is missing or empty. A payload that is not valid JSON, is
        not a dict, or has no truthy ``success`` yields a failure row
        (``Success`` False, numeric fields None, empty text).
    """
    try:
        r_json = response.json()
    except ValueError:
        # JSON decoding errors are ValueError subclasses; report as a failure.
        r_json = None

    if isinstance(r_json, dict) and r_json.get('success'):
        score = r_json['score']
        # `or []` also covers an explicit null breakdown.
        breakdown = r_json.get('score_breakdown') or []
        data = {
            'Success': [r_json['success']],
            'word_count': [r_json['word_count']],
            'Percent_Human': [score['original'] * 100],
            'Percent_AI': [score['ai'] * 100],
            'text': [breakdown[0]['text'] if breakdown else ''],
        }
    else:
        data = {
            'Success': [False],
            'word_count': [None],
            'Percent_Human': [None],
            'Percent_AI': [None],
            'text': [''],
        }
    return pd.DataFrame(data)

In [None]:
# DEPRECATED

# FAILS IF score_breakdown list is empty

# def extract_score(response):
#     r_json = response.json()
#     if r_json['success']:
#         data = {
#         'Success': [r_json['success']],
#         'word_count': [r_json['word_count']],
#         'Percent_Human': [round(r_json['score']['original'] * 100, 1)],
#         'Percent_AI': [round(r_json['score']['ai'] * 100, 1)],
#         'text': [r_json['score_breakdown'][0]['text']]
#         }
#     else:
#         data = {
#             'Success': [False],
#             'word_count': [None],
#             'Percent_Human': [None],
#             'Percent_AI': [None],
#             'text': ['']
#         }
#     df = pd.DataFrame(data)
#     return df

In [None]:
# DEPRECATED

# def extract_score(response):
#     r_json = response.json()
#     data = {
#         'Success': [r_json['success']],
#         'Percent_Human': [round(r_json['score']['original'] * 100, 1)],
#         'Percent_AI': [round(r_json['score']['ai'] * 100, 1)]
#     }
#     df = pd.DataFrame(data)
#     return df

In [8]:
# TAKES A DF OF KW AND SPLITS IT INTO CSV FILES OF batch_size KEYWORDS EACH
# ASSUMES THERE ARE NO MORE THAN 20 ROWS OF EACH KW, SO THE DEFAULT OF 5 KW PER FILE GIVES AT MOST 100 ROWS PER FILE

def make_csv_batches(df, batch_size=5, output_dir='data/batches'):
    """Split ``df`` into CSV files holding ``batch_size`` distinct keywords each.

    Args:
        df: DataFrame with a ``kw`` column.
        batch_size: Number of distinct ``kw`` values per output file.
        output_dir: Directory that receives the files (created if missing).

    Side effects:
        Writes ``kw_batch_NNN.csv`` files plus ``batch_list.csv`` (one file
        name per line) into ``output_dir`` and prints how many were written.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Distinct keywords in order of first appearance.
    keywords = df['kw'].unique()

    written = []
    for start in range(0, len(keywords), batch_size):
        chunk = keywords[start:start + batch_size]
        batch_no = len(written)
        name = f'kw_batch_{batch_no:03d}.csv'

        # Keep every row whose keyword belongs to this chunk.
        rows = df[df['kw'].isin(chunk)]
        rows.to_csv(os.path.join(output_dir, name), index=False)
        written.append(name)

    # Record the file names so later steps know what to process.
    listing_path = os.path.join(output_dir, 'batch_list.csv')
    with open(listing_path, 'w') as listing:
        listing.write('\n'.join(written))

    print(f'{len(written)} csv files created in {output_dir}')

In [9]:
# READS THE FILE CONTAINING BATCH FILENAMES AND RETURNS LIST

def read_batch_list(read_path='data/batches'):
    """Return the entries of ``batch_list.csv`` found in ``read_path``.

    Args:
        read_path: Directory containing ``batch_list.csv``.

    Returns:
        One string per line of the file, with surrounding whitespace removed.
    """
    listing = os.path.join(read_path, 'batch_list.csv')
    with open(listing, 'r') as handle:
        return list(map(str.strip, handle))

In [None]:
# CLASSIFIES EACH LINK IN DF AND RETURNS A NEW DF WITH ONE ROW OF CLASSIFICATION INFO PER LINK

def classify_batch(df, success_delay=10, error_delay=0):
    """Classify every URL in ``df['link']`` and return one result row per URL.

    Args:
        df: DataFrame with a ``link`` column of URLs to scan.
        success_delay: Seconds to sleep after an HTTP 200 response.
        error_delay: Seconds to sleep after a non-200 response or a request
            that raised.

    Returns:
        DataFrame with the fixed columns ``link``, ``success``,
        ``word_count``, ``percent_human`` and ``percent_ai``, one row per
        input link in input order. Failed links have ``success`` False and
        None in the score columns.
    """
    # Fixed column list keeps the schema identical for every batch, including
    # batches where the first (or every) link fails, and for empty input.
    columns = ['link', 'success', 'word_count', 'percent_human', 'percent_ai']
    results = []
    for url in df['link']:
        failed_row = {'link': url, 'success': False, 'word_count': None,
                      'percent_human': None, 'percent_ai': None}
        try:
            response = fetch_url_class(url)
        except requests.exceptions.RequestException as exc:
            # A dropped or reset connection should cost one row, not the batch.
            print(f'Request failed: {exc!r}, URL: {url}')
            results.append(failed_row)
            time.sleep(error_delay)
            continue

        if response.status_code == 200:
            score_df = extract_score(response)
            results.append({
                'link': url,
                'success': score_df['Success'][0],
                'word_count': score_df['word_count'][0],
                'percent_human': score_df['Percent_Human'][0],
                'percent_ai': score_df['Percent_AI'][0],
            })
            delay = success_delay
        else:
            print(f'RC error code: {response.status_code}, URL: {url}')
            results.append(failed_row)
            delay = error_delay
        # Pause between requests; length depends on the outcome above.
        time.sleep(delay)
    return pd.DataFrame(results, columns=columns)

In [None]:
# NOT DOING THIS FOR NOW - TOO MANY FILES TO HANDLE

# SAVE RESPONSE FILES

#def save_response(response):
    
#    print('response saved')

In [11]:
# CLASSIFIES ALL BATCHES

def classify_all_batches(read_dir='data/batches/', save_dir='data/batches_classified', min_credits=1600):
    """Classify every batch listed in ``batch_list.csv`` and save the results.

    For each listed batch file the credit balance is checked first. While it
    is above ``min_credits`` the batch CSV is loaded, classified with
    ``classify_batch``, joined column-wise with the input rows and written
    under the same file name into ``save_dir``. Processing stops at the first
    batch for which the balance is missing or not above ``min_credits``.

    Args:
        read_dir: Directory holding ``batch_list.csv`` and the batch CSVs.
        save_dir: Directory that receives the classified CSVs (created on
            first save).
        min_credits: Balance that must be exceeded before a batch is started.
    """
    batches = read_batch_list(read_dir)

    for batch_name in batches:
        if not batch_name:
            # Blank lines in batch_list.csv are not file names.
            continue

        # .get() so a response without a balance stops the run with the
        # message below instead of a KeyError.
        balance = get_balance().json().get('balance')
        if balance is None or balance <= min_credits:
            print(f"*** INSUFFICENT CREDITS *** YOU HAVE: {balance} CREDITS. GO BEG CONNOR FOR MORE")
            break

        print(f'Credits = {balance}, Classifying: {batch_name}')

        # LOAD CSV BATCH INTO DF
        df_in = pd.read_csv(os.path.join(read_dir, batch_name), header=0)

        # CLASSIFY BATCH
        df_out = classify_batch(df_in)

        # CONCAT DFS
        # NOTE(review): both frames carry a `link` column, so the saved CSV
        # contains it twice; kept as-is in case downstream code expects it.
        df = pd.concat([df_in, df_out], axis=1, join='outer')

        # SAVE DF OF CLASSIFIED BATCH
        os.makedirs(save_dir, exist_ok=True)
        df.to_csv(os.path.join(save_dir, batch_name), index=False)

In [None]:
def scheduled_batches():
    """Print how many batch files are listed and the first and last names.

    Reads the list via ``read_batch_list()``. When the list is empty only the
    count line is printed.
    """
    batches = read_batch_list()
    count = len(batches)
    print(f'There are {count} batches scheduled for processing')
    if not batches:
        # Nothing to index into; avoid IndexError on an empty list.
        return
    first = batches[0]
    last = batches[-1]
    print(f'First scheduled batch: {first}\nLast scheduled batch: {last}')

In [None]:
# *** EXAMPLES SECTION ***

In [13]:
# EXAMPLE TO CLASSIFY ONE WEB PAGE AND EXTRACT RESPONSE
# fetch_url_class SENDS A LIVE REQUEST TO THE SCAN ENDPOINT; extract_score TURNS THE RESPONSE INTO A ONE-ROW DF

url_in = "https://www.target.com/c/table-lamps-lighting-home-decor/-/N-56d7t"
response  = fetch_url_class(url_in)
df = extract_score(response)
df

Unnamed: 0,Success,word_count,Percent_Human,Percent_AI,text
0,True,303,99.899775,0.100225,A bedroom table lamp sets the mood for your b...


In [14]:
# EXAMPLE TO CLASSIFY ONE WEB PAGE AND ADD RESULTS TO KW AND LINK

# Define the page address once and reuse it for both the input row and the scan.
url_in = "https://www.target.com/c/table-lamps-lighting-home-decor/-/N-56d7t"
dfx = pd.DataFrame({'kw': ['blah'], 'link': [url_in]})
response = fetch_url_class(url_in)
df = extract_score(response)
# Place the score columns next to the kw/link columns.
result = pd.concat([dfx, df], axis=1)
result

Unnamed: 0,kw,link,Success,word_count,Percent_Human,Percent_AI,text
0,blah,https://www.target.com/c/table-lamps-lighting-...,True,303,99.899775,0.100225,A bedroom table lamp sets the mood for your b...


In [15]:
# EXAMPLE TO CLASSIFY ONE WEB PAGE AND ADD RESULTS
# RELIES ON dfx (THE ONE-ROW KW/LINK FRAME) DEFINED IN THE PREVIOUS EXAMPLE CELL

url_in = "https://www.target.com/c/table-lamps-lighting-home-decor/-/N-56d7t"
response  = fetch_url_class(url_in)
df = extract_score(response)
result = pd.concat([dfx, df], axis=1)
result

Unnamed: 0,kw,link,Success,word_count,Percent_Human,Percent_AI,text
0,blah,https://www.target.com/c/table-lamps-lighting-...,True,303,99.899775,0.100225,A bedroom table lamp sets the mood for your b...


In [None]:
# EXAMPLE TO READ IN ONE BATCH AND RUN IT

path = 'data/batches/kw_batch_089.csv'
dfxxx = pd.read_csv(path, header=0)
# Score each row's own link and collect the one-row frames; a single concat
# then stacks them in the same order as the batch rows.
score_frames = [extract_score(fetch_url_class(link)) for link in dfxxx['link']]
df_link = pd.concat(score_frames, ignore_index=True)
# Keep the result: pd.concat returns a new frame rather than modifying dfxxx.
dfxxx = pd.concat([dfxxx, df_link], axis=1)
dfxxx

In [None]:
# TEST RUN ONE BATCH AND ADD ALL THE OTHER DEETS

df_in = pd.read_csv('data/batches/' + 'kw_batch_000.csv')
# Classify the batch that was just loaded, so the rows line up in the concat below.
df_out = classify_batch(df_in)

df_out = pd.concat([df_in, df_out], axis=1, join='outer')
df_out

In [None]:
# *** CODE SECTION *** REALLY THIS IS ALL YOU NEED

In [16]:
# CHECK TO SEE IF ACCOUNT HAS CREDIT AVAILABLE
# get_balance() RETURNS THE RAW RESPONSE; THE BALANCE IS READ FROM ITS JSON BODY

credits = get_balance()
credits.json()['balance']

240940

In [None]:
# LOAD KW AND LINK DATA CREATED IN GSEARCH.IPYNB 

#df = pd.read_csv('data/linked_kw_final.csv', header=0)

In [None]:
# DIVIDE DATA INTO BATCHES OF 100 RECORDS SO THAT WHEN IT FAILS I DON'T NEED TO START FROM THE BEGINNING

# make_csv_batches(df_linked_kw_final)

In [27]:
# PRINT HOW MANY BATCHES ARE LISTED IN batch_list.csv AND THE FIRST/LAST FILE NAMES
scheduled_batches()

There are 127 batches scheduled for processing
First schedueld batch: kw_batch_073.csv
Last scheduled batch: kw_batch_199.csv


In [28]:
# Start saving the response files

In [30]:
# PROCESSES ALL SCHEDULED BATCHES, SAVING ONE CLASSIFIED CSV PER BATCH
# STOPS EARLY IF THE CREDIT BALANCE IS NOT ABOVE 1600

classify_all_batches()

Credits = 196261, Classifying: kw_batch_078.csv
RC error code: 422, URL: https://www.youtube.com/watch?v=1FBSs7lUB7I&vl=en
RC error code: 422, URL: https://www.homedepot.com/b/Kitchen-Kitchen-Cabinets-Kitchen-Wall-Shelves/N-5yc1vZ2fkp91m
RC error code: 422, URL: https://www.target.com/s/wall+mounted+kitchen+shelves
RC error code: 422, URL: https://visualhunt.com/wall-mounted-kitchen-shelves
RC error code: 422, URL: https://www.lowes.com/n/how-to/built-in-kitchen-wall-shelf
RC error code: 422, URL: https://www.pinterest.com/krishow83/open-kitchen-shelving/
RC error code: 422, URL: https://www.amazon.com/knuckle-rings/s?k=knuckle+rings
RC error code: 422, URL: https://www.macys.com/shop/featured/knuckle-ring
RC error code: 422, URL: https://www.pinterest.com/ideas/knuckle-rings/902560036628/
RC error code: 422, URL: https://www.target.com/s/knuckle+ring+gold
RC error code: 422, URL: https://youtube.com/watch?v=3D3HR2kFCpk
RC error code: 422, URL: https://www.target.com/s/exfoliating+skin

KeyboardInterrupt: 

In [None]:
# TODO: classify_batch(df) should probably handle connection resets (e.g. catch requests.exceptions.ConnectionError and record the link as failed)