In [None]:
import json
import os

import certifi
import matplotlib.pyplot as plt
import requests
from matplotlib.ticker import FixedLocator
from matplotlib.ticker import FuncFormatter


# API key sent in the X-API-KEY header of every request in this notebook.
# Prefer supplying it through the INTELLIGENT_ARCHIVE_API_KEY environment
# variable so the secret does not have to live in the notebook itself.
# SECURITY NOTE(review): the literal fallback below is a credential committed to
# source. It is kept only so existing runs keep working; rotate the key and
# remove the fallback as soon as the environment variable is in place.
api_key = (
    os.environ.get("INTELLIGENT_ARCHIVE_API_KEY")
    or "118322912aa8543272e4c9f2401f3a084cf4b1c7f0ff0b2809955ed44a96050a"
)

# Punctuation tokens that the word-frequency cells drop before charting.
excludeWords = [
    "[", "\\", "]", "_", "`", "!", "\"", "#", "%", "'", "(", ")", "+",
    ",", "-", "–", ".", "/", ":", ";", "{", "|", "}", "=", "~", "?",
]

In [None]:
# List All Text Sets
url = "https://intelligentarchive.sydney.edu.au/api/v1/text-sets"

# Always pass a timeout: without one, requests waits indefinitely on an
# unresponsive server and the cell never finishes. 1200 s matches the timeout
# used by the other requests in this notebook.
response = requests.get(url, headers={"X-API-KEY": api_key}, timeout=1200)

if response.status_code == 200:
    # Parse JSON response into Python object
    text_sets = response.json()

    # Loop through each text set to print its ID and name
    print("List of Text Sets:")
    for text_set in text_sets:
        set_id = text_set.get('id', 'N/A')  # 'N/A' will be used if 'id' is not available
        set_name = text_set.get('name', 'N/A')  # 'N/A' will be used if 'name' is not available
        print(f"  - Name: {set_name}   ID: {set_id}")

else:
    # Report the HTTP status instead of raising so the notebook keeps running
    print(f"Failed to get data: {response.status_code} {response.reason}")

In [None]:
# Top 20 most frequent words

textset_id = 86  # Replace with your actual text set ID

# URL and Data for Word Frequencies
url = "https://intelligentarchive.sydney.edu.au/api/v1/word-frequencies"
# 30 entries are requested although only 20 are charted; presumably the margin
# covers the punctuation tokens that are filtered out below.
word_freq_request = {
    'textSet': textset_id,
    'option': {
        'outputSize': 30
    }
}

# Make the API Request
response = requests.post(url, json=word_freq_request, headers={"X-API-KEY": api_key}, timeout=1200)

# Create a dictionary to hold word frequencies (summed across all blocks)
word_frequency_map = {}

# Handle the Response
if response.status_code == 200:
    response_data = response.json()
    blocks = response_data.get("blocks", [])

    for block in blocks:
        freqs = block.get('frequencies', [])

        for freq in freqs:
            word = freq.get('word')
            # A missing or null value counts as 0 instead of raising TypeError
            # in the addition below.
            value = freq.get('value') or 0

            # Skip entries without a word (they would end up as a None bar
            # label) and the punctuation tokens listed in excludeWords.
            if word is None or word in excludeWords:
                continue

            word_frequency_map[word] = word_frequency_map.get(word, 0) + value

    # Sort by frequency and take the top 20 words
    sorted_items = sorted(word_frequency_map.items(), key=lambda x: x[1], reverse=True)[:20]
    sorted_words = [item[0] for item in sorted_items]
    sorted_frequencies = [item[1] for item in sorted_items]

    # Plotting
    plt.figure(figsize=(12, 6))
    plt.bar(sorted_words, sorted_frequencies, color='blue')
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.title('20 Most Frequent Words')
    plt.xticks(rotation=45)
    plt.show()


else:
    print(f"Failed: {response.status_code} {response.reason}")

In [None]:
# EX1: The top 10 6-grams in the 20 plays by frequency

# Text set = Shakespeare 20 for demos
# Segment by text
# Ngrams = 6
# Output Size = 10

textset_id = 86  # Replace with your actual text set ID

# URL and Data for Word Frequencies
url = "https://intelligentarchive.sydney.edu.au/api/v1/word-frequencies"
word_freq_request = {
    'textSet': textset_id,
    'option': {
        'numberOfNGrams': 6,
        'outputSize': 10
    }
}

# Make the API Request
response = requests.post(url, json=word_freq_request, headers={"X-API-KEY": api_key}, timeout=1200)

# Create a dictionary to hold n-gram frequencies (summed across all blocks)
word_frequency_map = {}

# Handle the Response
if response.status_code == 200:
    response_data = response.json()
    blocks = response_data.get("blocks", [])

    for block in blocks:
        freqs = block.get('frequencies', [])

        for freq in freqs:
            raw_word = freq.get('word')
            if raw_word is None:
                # Entry without a word: skip it rather than crash on .replace()
                continue

            # Dots are turned into spaces for display; presumably the API joins
            # the tokens of an n-gram with '.' - TODO confirm.
            word = raw_word.replace('.', ' ')
            # A missing or null value counts as 0 instead of raising TypeError.
            value = freq.get('value') or 0

            word_frequency_map[word] = word_frequency_map.get(word, 0) + value

    # Sort by frequency and keep the top 10 n-grams so the chart always matches
    # its title, even if summing across blocks produced more than 10 keys.
    sorted_items = sorted(word_frequency_map.items(), key=lambda x: x[1], reverse=True)[:10]
    sorted_words = [item[0] for item in sorted_items]
    sorted_frequencies = [item[1] for item in sorted_items]

    # Plotting
    plt.figure(figsize=(12, 6))
    plt.bar(sorted_words, sorted_frequencies, color='blue')
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.title('Top 10 6-grams in the 20 plays by frequency ')
    plt.xticks(rotation=70)
    plt.show()


    # Alternative rendering as a table (disabled):
    # # Create a table
    # fig, ax = plt.subplots(figsize=(12, 3))
    # ax.axis('tight')
    # ax.axis('off')
    # table_data = [["Word"] + sorted_words,
    #               ["Frequency"] + sorted_frequencies]
    # the_table = ax.table(cellText=table_data, colLabels=None, cellLoc = 'center', loc='center')

    # # Make cells larger to fit long words
    # the_table.auto_set_font_size(False)
    # the_table.set_fontsize(10)
    # the_table.scale(2.0, 2.0)


else:
    print(f"Failed: {response.status_code} {response.reason}")

In [None]:
# EX2: The size of character parts from largest to smallest in these 20 plays
#
# Recipe:
#   - Text set = Shakespeare 20 for demos
#   - Segment by text and by character
#   - Top 10 words
#   - Output - n=640
#   - Transform and chart - sort by Size, largest to smallest, and chart this
#     row as a scatterplot.
#
# Note From Hugh
#
# NOTE(review): 'segmentByCharacter' is commented out in the request below, so
# the keys collected here are the 'word' fields of the response - confirm that
# this is what the "Character Names" axis is meant to show.

textset_id = 86  # Replace with your actual text set ID

# Endpoint and request payload
url = "https://intelligentarchive.sydney.edu.au/api/v1/word-frequencies"
character_parts_request = dict(
    textSet=textset_id,
    option=dict(
        # segmentByCharacter=True,
        outputSize=660,
    ),
)

# Send the request
response = requests.post(
    url,
    json=character_parts_request,
    headers={"X-API-KEY": api_key},
    timeout=1200,
)

# Accumulates the summed 'value' per 'word' across every block
character_frequency_map = {}

if response.status_code != 200:
    print(f"Failed: {response.status_code} {response.reason}")
else:
    response_data = response.json()
    blocks = response_data.get("blocks", [])

    for block in blocks:
        freqs = block.get('frequencies', [])

        for freq in freqs:
            word = freq.get('word', "Unknown")
            word_count = freq.get('value', 0)

            # Tokens listed in excludeWords are left out of the totals
            if word in excludeWords:
                continue

            character_frequency_map.setdefault(word, 0)
            character_frequency_map[word] += word_count

    # Largest first, capped at 640 entries
    sorted_characters = list(character_frequency_map.items())
    sorted_characters.sort(key=lambda entry: entry[1], reverse=True)
    del sorted_characters[640:]

    names = [label for label, _ in sorted_characters]
    sizes = [size for _, size in sorted_characters]

    # Scatterplot of the sorted sizes
    plt.figure(figsize=(12, 6))
    plt.scatter(names, sizes, c='blue')
    plt.xticks(rotation=90)
    plt.title('Size of Character Parts in 20 Shakespeare Plays')
    plt.xlabel('Character Names')
    plt.ylabel('Size of Spoken Part')
    plt.show()

In [None]:
# 3. Frequencies of HAS and HATH in the plays

# Text set = Shakespeare 20 for demos
# Segment by text
# Include metadata in output
# Words unsorted – HAS HATH
# Output – choose proportions
# Transform and chart – sort by DATE, plot HAS and HATH as lines

textset_id = 86  # Replace with your actual text set ID


character_parts_request = {
    'textSet': textset_id,
    'option': {
        'segmentByCharacter': False,
        'outputSize': 1000,
        # 'outputSpecialWords' : ["has","hath"],
        'outputSpecialWordsOption': 0
    }
}

# URL
url = "https://intelligentarchive.sydney.edu.au/api/v1/word-frequencies"

# Make API request
response = requests.post(url, json=character_parts_request, headers={"X-API-KEY": api_key}, timeout=1200)

# Per-year sums of the values returned for "has" / "hath", and per-year sums of
# every returned value (used as the denominator of the percentages).
# NOTE(review): the denominator only covers the entries the API returned
# (outputSize 1000), which may not be the full word count - confirm.
special_word_count_by_year = {"has": {}, "hath": {}}
total_word_count_by_year = {}

# Handle the response
if response.status_code == 200:
    response_data = response.json()
    blocks = response_data.get("blocks", [])

    for block in blocks:
        name = block.get("name", "")
        freqs = block.get("frequencies", [])

        # Extract year from name: the text after the last "_" up to the first space
        year = name.split("_")[-1].split(" ")[0]

        for freq in freqs:
            word = freq.get("word", "")
            value = freq.get("value", 0)

            # Case-insensitive match against the two tracked words
            lowered = word.lower()
            if lowered in special_word_count_by_year:
                per_year = special_word_count_by_year[lowered]
                per_year[year] = per_year.get(year, 0) + value

            total_word_count_by_year[year] = total_word_count_by_year.get(year, 0) + value

    # Sort by Year (only years in which at least one of the two words occurs)
    sorted_years = sorted(set(special_word_count_by_year["has"].keys()) | set(special_word_count_by_year["hath"].keys()))

    has_counts = []
    hath_counts = []
    for yr in sorted_years:
        # `or 1` also covers a total of exactly 0, which a .get() default alone
        # would not, so the division below can never raise ZeroDivisionError.
        total = total_word_count_by_year.get(yr, 0) or 1
        has_counts.append((special_word_count_by_year["has"].get(yr, 0) / total) * 100)
        hath_counts.append((special_word_count_by_year["hath"].get(yr, 0) / total) * 100)

    plt.figure(figsize=(12, 6))
    # marker='o' already draws a dot at every data point, so no separate
    # scatter layer is needed on top of the lines.
    plt.plot(sorted_years, has_counts, label='HAS', marker='o')
    plt.plot(sorted_years, hath_counts, label='HATH', marker='o')
    plt.xlabel('Year')
    plt.ylabel('Percentage')
    plt.title('Percentage of "HAS" and "HATH" over years')

    # Format the tick labels as percentages through a formatter, so labels stay
    # in sync with whatever ticks the axis ends up using.
    plt.gca().yaxis.set_major_formatter(FuncFormatter(lambda y, _pos: '{:.2f}%'.format(y)))

    plt.legend()
    plt.show()
else:
    print(f"Failed: {response.status_code} {response.reason}")