In [1]:
# function: get the authorization token
# output: token
def get_token():
    """Read the API bearer token from ``token.txt`` in the working directory.

    Returns:
        str: the file contents with leading/trailing whitespace removed, so
        that a trailing newline in the file does not end up inside the
        ``Authorization`` header built by the request helpers.

    Raises:
        FileNotFoundError: if ``token.txt`` does not exist.
    """
    # `with` guarantees the handle is closed; the previous version left it open.
    with open("token.txt", "r") as f:
        return f.read().strip()

# function: create a corpus on the WeSearch API
# input: corpus (project) name, authentication token
# output: True if the corpus is successfully created, False otherwise
def create_corpus(project_name, token):
    """Create a corpus named ``project_name`` using the "lawbert" model.

    Sends a POST to ``wesearch_url + project_name + '/create'`` (``wesearch_url``
    is a module-level global).

    Args:
        project_name: name of the corpus to create; appended to the base URL.
        token: bearer token placed in the ``Authorization`` header.

    Returns:
        bool: True when the JSON response has ``status == 'created'``; False
        for any other status or when the response has no ``status`` field.
    """
    headers = {
        'Authorization': 'Bearer ' + token,
        'Content-Type': 'application/json',
    }

    # Request body is sent as a raw JSON string.
    data = '{ "model": "lawbert" }'

    response = requests.post(wesearch_url + project_name + '/create', headers=headers, data=data)

    # .get() keeps the documented "False otherwise" contract when the payload
    # carries no 'status' key, instead of raising KeyError.
    return response.json().get('status') == 'created'

In [None]:
# function: upload random files of a specific number in a directory to the corpus
# input: folder name, authentication token, number of files
# output: reference_id_list for upload jobs
def upload_docs(folder, token, num=1000):
    """Upload ``num`` randomly chosen files from ``folder`` to the corpus.

    Each file is POSTed to ``wesearch_url + project_name`` (both module-level
    globals) under its local filename with ".txt" appended.

    Args:
        folder: directory whose entries are sampled.
        token: bearer token placed in the ``Authorization`` header.
        num: how many distinct files to upload.

    Returns:
        list: the ``'reference'`` value from each upload response, in upload
        order, for later status checks.

    Raises:
        ValueError: from ``random.sample`` if ``folder`` holds fewer than
            ``num`` entries.
        KeyError: if an upload response has no ``'reference'`` field (the
            response body is printed first).
    """
    headers = {'Authorization': 'Bearer ' + token}

    reference_id_list = []

    # randomly select num files from the folder
    filenames = random.sample(os.listdir(folder), num)
    for file in filenames:
        # Open each document in a `with` block so its handle is closed as soon
        # as the upload finishes; previously every handle stayed open.
        with open(os.path.join(folder, file), "r", encoding="utf-8") as doc:
            files = {'file': (file + ".txt", doc)}
            response = requests.post(wesearch_url + project_name, headers=headers, files=files)
        try:
            # store reference_id in a list for later status check
            reference_id_list.append(response.json()['reference'])
        except KeyError:
            # Show the unexpected payload before propagating the error.
            print(response.json())
            raise

    return reference_id_list

# function: print the status of the document upload process
# input: reference_id_list of the upload jobs, authentication token
def check_status(ref_list, token):
    """Print one "<reference_id> <status>" line per upload job.

    For every id in ``ref_list`` a GET is sent to
    ``wesearch_url + project_name + '/tasks/' + id`` and the ``'status'``
    field of the JSON response is printed next to the id.
    """
    auth_headers = {'Authorization': 'Bearer ' + token}

    for ref in ref_list:
        task_url = wesearch_url + project_name + '/tasks/' + ref
        payload = requests.get(task_url, headers=auth_headers).json()
        print(ref, payload['status'])

# function: delete the document corpus
# input: token
# output: response from requests.delete
def delete_corpus(token):
    """Send a DELETE to ``wesearch_url + project_name + '/delete'``.

    Returns the ``requests`` response object unchanged; the caller decides
    how to interpret it.
    """
    auth_headers = {'Authorization': 'Bearer ' + token}
    endpoint = wesearch_url + project_name + '/delete'
    return requests.delete(endpoint, headers=auth_headers)

In [51]:
import requests
import os
import random

# Base URL of the API; the helper functions append the project name and/or an
# endpoint path to it when building request URLs.
wesearch_url = "https://project-apollo-api.stg.gc.casetext.com/v0/"
# Corpus (project) name used by the corpus-level requests.
project_name = "wesearch-exercise-ivan-liao"
# Local directory whose files are sampled for upload.
folder = "docs"
# Number of files main() uploads; reupload() also uses it as the target corpus size.
file_num = 1000    

def main():
    """Create the corpus and upload ``file_num`` random files from ``folder``.

    Returns:
        list: reference ids of the upload jobs, suitable for ``check_status``;
        an empty list when the corpus could not be created.
    """
    token = get_token()

    if not create_corpus(project_name, token):
        # Previously the success message was printed on this path as well,
        # even though nothing had been uploaded.
        print("Corpus creation failed; no files uploaded")
        return []

    ref_list = upload_docs(folder, token, file_num)
    print("Finished uploading files")
    return ref_list

if __name__ == "__main__":
    main()

Finished uploading files


In [179]:
# function: delete ingestfailed files
# input: token
def delete_docs(token):
    """Delete every corpus document whose status is 'IngestFailed'.

    Lists the corpus via GET ``wesearch_url + project_name``. For each failed
    document, its file path (without the ".txt" suffix that upload appended)
    is written as one line to ``ingestfailed_files.txt`` (append mode), and a
    DELETE is sent to ``wesearch_url + "doc/" + doc_id``.
    Prints the number of delete requests issued.

    Args:
        token: bearer token placed in the ``Authorization`` header.
    """
    headers = {'Authorization': 'Bearer ' + token}

    response = requests.get(wesearch_url + project_name, headers=headers)
    files = response.json()['documents']

    delete_count = 0
    # `with` closes (and flushes) the log even if a request raises part-way.
    with open("ingestfailed_files.txt", "a") as f:
        for file in files:
            if file['status'] != 'IngestFailed':
                continue

            doc_id = file['doc_id']
            file_path = file['file_path']
            # Remove only the trailing ".txt"; splitting on ".txt" would cut
            # the name at the first occurrence anywhere in the path.
            if file_path.endswith(".txt"):
                file_path = file_path[:-len(".txt")]

            f.write(file_path + "\n")
            requests.delete(wesearch_url + "doc/" + doc_id, headers=headers)
            delete_count += 1

    print("Deleted %d files" % (delete_count))
    
# Run the cleanup: read the token, then delete all ingest-failed documents.
token = get_token()
delete_docs(token)

Deleted 0 files


In [173]:
# function: reupload remaining files
# input: token
def reupload(token):
    """Top the corpus back up towards ``file_num`` documents.

    Lists the corpus via GET ``wesearch_url + project_name``, then uploads
    randomly chosen files from ``folder`` that are neither already in the
    corpus nor listed in ``ingestfailed_files.txt``. No file is chosen twice.
    If fewer eligible files exist than are needed, all eligible files are
    uploaded. Prints "reupload complete" when done.

    Reads the module-level globals ``wesearch_url``, ``project_name``,
    ``folder`` and ``file_num``.

    Args:
        token: bearer token placed in the ``Authorization`` header.
    """
    # Build the headers here; the function used to rely on a `headers` global
    # that no cell defines, which fails with NameError on a fresh kernel.
    headers = {'Authorization': 'Bearer ' + token}

    # get current files in the corpus
    response = requests.get(wesearch_url + project_name, headers=headers)
    current_files = response.json()['documents']
    current_filenames = set()
    for doc in current_files:
        name = doc['file_path']
        # Strip only the ".txt" suffix that upload appended to the local name.
        if name.endswith(".txt"):
            name = name[:-len(".txt")]
        current_filenames.add(name)

    # how many files need to be uploaded (never negative)
    num = max(file_num - len(current_files), 0)

    # files that could not be ingested; a missing log means none were recorded
    ingestfailed_files = set()
    if os.path.exists("ingestfailed_files.txt"):
        with open("ingestfailed_files.txt", "r") as f:
            ingestfailed_files = {line.rstrip("\n") for line in f}

    # Eligible = not already in the corpus AND not known to fail ingestion.
    # (The old retry condition accepted ingest-failed files and could pick the
    # same file more than once, or loop forever when nothing was eligible.)
    excluded = current_filenames | ingestfailed_files
    candidates = [name for name in os.listdir(folder) if name not in excluded]

    # reupload files
    for filename in random.sample(candidates, min(num, len(candidates))):
        with open(os.path.join(folder, filename), "r", encoding="utf-8") as doc:
            files = {'file': (filename + ".txt", doc)}
            requests.post(wesearch_url + project_name, headers=headers, files=files)

    print("reupload complete")
    
# Run the re-upload with a freshly read token.
reupload(get_token())

reupload complete


In [180]:
# simple query function

def query(token, string):
    """Search the corpus for ``string`` and pretty-print the JSON result.

    Issues a GET to ``wesearch_url + project_name + '/search'`` with the
    search text passed as the ``q`` query parameter.
    """
    from pprint import pprint

    auth_headers = {'Authorization': 'Bearer ' + token}
    search_params = [('q', string)]

    result = requests.get(
        wesearch_url + project_name + '/search',
        headers=auth_headers,
        params=search_params,
    )

    pprint(result.json())

# Example search: print the hits for the term "religious".
query(get_token(), "religious")

[{'doc_id': 'd0608410-cf0e-4f15-920a-ef61b44a2f51',
  'file_path': '-us-judgment-us-1819-02-17us518-main.txt',
  'metadata': {},
  'page': 41,
  'post_context': 'Approved, June 27th, 1816.',
  'pre_context': 'And the governor is hereby authorized and requested to '
                 'summon the first meeting of the said trustees and overseers, '
                 'to be held at Hanover, on the 26th day of August next.',
  'score': 0.6013742,
  'text': 'Â§ 8. Be it further enacted, that perfect freedom of religious '
          'opinion shall be enjoyed by all the officers and students of the '
          'university; and no officer or student shall be deprived of any '
          'honors, privileges or benefits of the institution, on account of '
          'his religious creed or belief.'},
 {'doc_id': 'dd27061f-90d2-4c8c-b385-f888fff60144',
  'file_path': '-us-judgment-us-1987-02-25-85-993-main.txt',
  'metadata': {},
  'page': 6,
  'post_context': 'Establishment Clause.',
  'pre_context':