In [1]:
# Install rocrate-tabular
# `%pip` installs into the environment of the running kernel; `!pip` runs whichever pip is
# first on the shell PATH, which may belong to a different environment.
# To reproduce the run recorded below exactly, append
# `@4029885ea4503ab62fb485378f0197ffa8c2333b` (the commit shown in the install log) to the URL.

%pip install git+https://github.com/Sydney-Informatics-Hub/rocrate-tabular.git

Collecting git+https://github.com/Sydney-Informatics-Hub/rocrate-tabular.git
  Cloning https://github.com/Sydney-Informatics-Hub/rocrate-tabular.git to /private/var/folders/40/q4r3c00d3tgd2_v26lscqw1c0000gq/T/pip-req-build-ha0k56ug
  Running command git clone --filter=blob:none --quiet https://github.com/Sydney-Informatics-Hub/rocrate-tabular.git /private/var/folders/40/q4r3c00d3tgd2_v26lscqw1c0000gq/T/pip-req-build-ha0k56ug
  Resolved https://github.com/Sydney-Informatics-Hub/rocrate-tabular.git to commit 4029885ea4503ab62fb485378f0197ffa8c2333b
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone


In [3]:
# Import libraries

import os                                                  # Functions for interacting with the operating system.
import zipfile                                             # Tools to create, read, write, append and list a ZIP file.
import requests                                            # Send HTTP requests.
from io import BytesIO                                     # Perform file operations on byte data.
from rocrate_tabular.tabulator import ROCrateTabulator     # Python library to turn an RO-Crate into tabular formats.

In [5]:
# Specify the names of the database, folder and configuration file to be created, or leave as the defaults.
# These three names are read by the download, conversion and query cells further down.

database = 'cooee.db'     # Edit the section in quotes to rename the database.
folder = 'cooee'          # Edit the section in quotes to rename the folder that is created for the database.
config = 'config.json'    # Edit the section in quotes to rename the configuration file to generate the database.

In [7]:
# Download the COOEE collection ZIP from the LDaCA data portal and extract it to a folder in the current working directory.

zip_url = "https://data.ldaca.edu.au/api/object/arcp%3A%2F%2Fname%2Chdl10.26180~23961609.zip"
cwd = os.getcwd()
extract_to = os.path.join(cwd, folder)
os.makedirs(extract_to, exist_ok=True)

# ZipFile needs the complete archive, so the body is read into memory in one go; a streamed
# request would gain nothing here. The (connect, read) timeout stops the cell from hanging
# forever if the portal stops responding.
response = requests.get(zip_url, timeout=(10, 120))
response.raise_for_status()  # Fail loudly on HTTP errors instead of trying to unzip an error page.
with zipfile.ZipFile(BytesIO(response.content)) as zip_ref:
    zip_ref.extractall(extract_to)

In [15]:
# Convert the RO-Crate to a database. Arguments specified are the RO-Crate directory and the output name of the database.
# The `tb` object created here is reused by the later cells that load/infer the config and build the entity tables.

tb = ROCrateTabulator()
tb.crate_to_db(folder, database)

<Database <sqlite3.Connection object at 0x1620bb2e0>>

In [17]:
# Create or update the config file, specifying that `indexableText` is the entity containing text data for COOEE.

# Infer a fresh configuration only when no config file is on disk yet; otherwise reuse the saved one.
if not os.path.exists(config):
    tb.infer_config()
    print("infer")
else:
    tb.load_config(config)
    print("load")

# Build one entity table per table listed in the active configuration.
configured_tables = tb.cf["tables"]
for table_name in configured_tables:
    print(f"Building entity table for {table_name}")
    tb.entity_table(table_name, 'indexableText')

# Persist the configuration so it can be edited by hand and reloaded.
tb.write_config(config)

infer


<div class="alert alert-block alert-success">

Once the `config.json` file is generated, right-click it in the File Browser and select 'Open With' > 'Editor'.

Replace the section `"tables": {},` with the following:
```
    "tables": {
        "RepositoryObject": {
            "all_props": [],
            "ignore_props": [],
            "expand_props": []
        }
    },
```
<br>

This indicates that we want to use the `RepositoryObject` class to generate the table in the database.

Then remove the following section from `potential_tables`:
```
        "RepositoryObject": {
            "all_props": [],
            "ignore_props": [],
            "expand_props": []
        },
```
<br>

Save `config.json` and close it.

In [22]:
# Now that the config file has been updated, re-generate the database output.

tb.crate_to_db(folder, database)

# Pick up the hand-edited config if it exists on disk; otherwise fall back to an inferred one.
config_on_disk = os.path.exists(config)
if config_on_disk:
    tb.load_config(config)
    print("load")
else:
    tb.infer_config()
    print("infer")

# Build an entity table for every table named in the configuration.
for table_name in tb.cf["tables"]:
    print(f"Building entity table for {table_name}")
    tb.entity_table(table_name, 'indexableText')

# Write the configuration back out to the config file.
tb.write_config(config)

load
Building entity table for RepositoryObject
looking for target: data/1-001-plain.txt
looking for target: data/1-002-plain.txt
looking for target: data/1-003-plain.txt
looking for target: data/1-004-plain.txt
looking for target: data/1-005-plain.txt
looking for target: data/1-006-plain.txt
looking for target: data/1-007-plain.txt
looking for target: data/1-008-plain.txt
looking for target: data/1-009-plain.txt
looking for target: data/1-010-plain.txt
looking for target: data/1-011-plain.txt
looking for target: data/1-012-plain.txt
looking for target: data/1-013-plain.txt
looking for target: data/1-014-plain.txt
looking for target: data/1-015-plain.txt
looking for target: data/1-016-plain.txt
looking for target: data/1-017-plain.txt
looking for target: data/1-018-plain.txt
looking for target: data/1-019-plain.txt
looking for target: data/1-020-plain.txt
looking for target: data/1-021-plain.txt
looking for target: data/1-022-plain.txt
looking for target: data/1-023-plain.txt
looking f

<div class="alert alert-block alert-success">

Open `config.json` again. The `all_props` section should now be populated. If a property points to another entity by its ID and you want it expanded into separate columns (for example, `author` expanded into `author_name` and `author_id`), copy that property into the `expand_props` section. For example:

```
            "expand_props": [
                "author",
                "register",
                "recipient"
            ]
```
<br>

This indicates that we want the `author`, `register` and `recipient` properties to be expanded in the database.

Save `config.json` and close it.

In [None]:
# Re-generate the database output to include the expanded properties in the database.

tb.crate_to_db(folder, database)

# Reload the edited config file when present; infer a configuration if it has gone missing.
if not os.path.exists(config):
    tb.infer_config()
    print("infer")
else:
    tb.load_config(config)
    print("load")

# Rebuild the entity table of each configured table.
for current_table in tb.cf["tables"]:
    print(f"Building entity table for {current_table}")
    tb.entity_table(current_table, 'indexableText')

# Save the configuration back to disk.
tb.write_config(config)

In [24]:
#tb.export_csv()

In [None]:
## cooee.db version

import sqlite3
import pandas

# Write an SQL query to select data
query = "SELECT * FROM RepositoryObject"  # Replace with your table name if not using RepositoryObject

# Read the query result into a pandas DataFrame. try/finally guarantees the connection
# is closed even when the query fails (for example, if the table does not exist).
conn = sqlite3.connect(database)
try:
    df = pandas.read_sql_query(query, conn)
finally:
    conn.close()

# Remove the first row of the DataFrame.
# NOTE(review): why row 0 is dropped is not recorded in this notebook -- confirm.
# .copy() makes `cooee` an independent frame, so later assignments into it neither
# touch `df` nor raise SettingWithCopyWarning.
cooee = df.iloc[1:].copy()

# Remove rows where 'indexableText' column has NaN values
#cooee = df.dropna(subset=['indexableText'])

# Display the DataFrame
cooee.info()
cooee

In [None]:
# Keep only the rows whose 'indexableText' is NaN
text_is_missing = cooee['indexableText'].isna()
filtered_df = cooee[text_is_missing]

# Restrict the view to the identifying columns plus the (missing) text column
columns_to_show = ['entity_id', 'name', 'indexableText']
result = filtered_df.loc[:, columns_to_show]

# Display the result
print(result)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the NLTK data used by the tokenizing / lemmatizing cells.
# This notebook already relies on the new-style resource name 'averaged_perceptron_tagger_eng',
# i.e. an NLTK release in which word_tokenize loads its model from 'punkt_tab'.
# 'punkt' is still fetched so that older NLTK installs keep working.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))  # set => O(1) membership tests while filtering tokens
lemmatizer = WordNetLemmatizer()

In [None]:
# split cooee data frame into 16 subsets (4 registers x 4 time periods)

# Lookup table of registers: full name (as matched against the data) and short code (used in sub-corpus titles).
registers = pandas.DataFrame(
    [
        ("Government English", "ge"),
        ("Private Written", "prw"),
        ("Public Written", "puw"),
        ("Speech Based", "sb"),
    ],
    columns=["full", "short"],
)
print(registers)

# Lookup table of time periods: period number with its first and last year.
periods = {
    "period": [1, 2, 3, 4],
    "start": [1788, 1826, 1851, 1876],
    "end": [1825, 1850, 1875, 1900],
}
time_periods = pandas.DataFrame(periods)
print(time_periods)

In [None]:
## cooee.db version

# Make a single document for each subset (register x time period) by joining the
# `indexableText` of every matching row of `cooee`.
#
# Outputs:
#   sub_titles -- one label per subset, "<register short code>_period<period number>"
#   documents  -- one string per subset, in the same order as sub_titles

sub_titles = []
documents = []

# Column 19 of `cooee` holds the value compared against the period bounds
# (presumably the year of writing -- TODO confirm and replace the index with the column name).
# Convert it once, outside the loops, into a separate Series instead of writing back into
# `cooee` on every iteration; non-numeric values become NaN and never match a period.
YEAR_COLUMN_INDEX = 19
years = pandas.to_numeric(cooee.iloc[:, YEAR_COLUMN_INDEX], errors='coerce')

for i in range(len(registers)):
    in_register = cooee["register"] == registers.iloc[i, 0]

    for j in range(len(time_periods)):
        sub_title = registers.iloc[i, 1] + "_period" + str(time_periods.iloc[j, 0])
        sub_titles.append(sub_title)

        start_period = time_periods.iloc[j, 1]
        end_period = time_periods.iloc[j, 2]
        in_period = (years >= start_period) & (years <= end_period)

        # Join the raw texts directly. The previous `for row in temp` loop iterated over the
        # column names, so the whole text column was appended once per column, and
        # Series.to_string() mixed index labels (and "Series([], )" for empty subsets) into the text.
        subset_texts = cooee.loc[in_register & in_period, "indexableText"].dropna().astype(str)
        documents.append(" ".join(subset_texts))

In [None]:
# Preview the start of the first sub-corpus; displaying the whole string would flood the notebook output.
documents[0][:1000]

In [None]:
# tokenizing and lemmatizing (no part-of-speech information)
# Produces `clean_documents`: one list of lower-cased, lemmatized, alphabetic,
# non-stop-word tokens per entry of `documents`.

clean_documents = []

for document in documents:
    tokens = word_tokenize(str(document))
    # Lower-case BEFORE the stop-word test: the stop-word list holds lower-case forms,
    # so a capitalised "The" or "And" would otherwise slip through the filter.
    lowered_words = (token.lower() for token in tokens if token.isalpha())
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in lowered_words if word not in stop_words]
    clean_documents.append(cleaned_tokens)

In [None]:
# tokenizing and lemmatizing (part-of-speech aware)
# Produces `clean_documents`: one list of lemmatized, alphabetic, non-stop-word tokens
# per entry of `documents`, in the same order.

from nltk import pos_tag
nltk.download('averaged_perceptron_tagger_eng')


def _wordnet_pos(tag):
    """Map a tag returned by pos_tag to the POS code passed to the lemmatizer.

    Verbs ('VB*') map to 'v' and adjectives ('JJ*') to 'a'; everything else,
    including nouns and unknown tags, maps to 'n'.
    """
    if tag.startswith('VB'):
        return 'v'
    if tag.startswith('JJ'):
        return 'a'
    return 'n'


def get_pos(word):
    """Return the lemmatizer POS code for a single word tagged in isolation."""
    return _wordnet_pos(pos_tag([word])[0][1])


clean_documents = []

# Everything below must run once PER DOCUMENT. Previously only the tokenizing line was
# inside the loop, so just the last document was cleaned and `clean_documents` ended up
# with a single entry instead of one per sub-corpus.
for document in documents:
    tokens = word_tokenize(str(document))

    cleaned_tokens = []
    # Tag the whole token list in one call rather than calling pos_tag twice per token.
    for word, tag in pos_tag(tokens):
        if not word.isalpha():
            continue
        lemma = lemmatizer.lemmatize(word.lower(), _wordnet_pos(tag))
        # The stop-word test is applied to the lemma, as before.
        if lemma not in stop_words:
            cleaned_tokens.append(lemma)

    clean_documents.append(cleaned_tokens)

In [None]:
# Preview the first tokens of the first cleaned sub-corpus instead of dumping the full token list.
clean_documents[0][:50]

In [None]:
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

In [None]:
# Build the token dictionary and the bag-of-words corpus (one bag per cleaned document)
dictionary = Dictionary(clean_documents)
corpus = [dictionary.doc2bow(tokens) for tokens in clean_documents]

# Fit the LDA model. TODO: check parameters, especially `passes`
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=20,
    passes=100,
    random_state=100,  # fixed seed
)

# Print each topic with its 10 highest-weighted words
topics = lda_model.print_topics(num_words=10)
print(*topics, sep="\n")

In [None]:
# TO DO: revise this to give nice screen display like Sam's

# One (topic id, word, probability) record for each of the 10 top words of every topic.
top_words_per_topic = [
    (topic_id, word, probability)
    for topic_id in range(lda_model.num_topics)
    for word, probability in lda_model.show_topic(topic_id, topn=10)
]

# top_words_per_topic
top_words = pandas.DataFrame(top_words_per_topic, columns=['Topic', 'Word', 'P'])
top_words
# top_words_transpose = top_words.transpose()
# top_words_transpose

# pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word', 'P']).to_csv("top_words.csv")

In [None]:
# Display the long-format table of top words (columns: Topic, Word, P).
top_words

In [None]:
# tabular display: one column per topic ("Topic1", "Topic2", ...), listing that topic's
# words in the order they appear in `top_words`.

# Take the topic ids from the data instead of hard-coding 20, so the table stays correct
# when the number of topics given to the model changes.
topic_ids = sorted(top_words["Topic"].unique())

# Build all columns in one go rather than growing the DataFrame column by column.
topics_table = pandas.DataFrame({
    "Topic" + str(topic_id + 1): top_words.loc[top_words["Topic"] == topic_id, "Word"].to_list()
    for topic_id in topic_ids
})
topics_table

In [None]:
# get weightings for each document
# minimum_probability=0 is passed so that low-weight topics are not filtered out of the result.

doc_weights = [
    lda_model.get_document_topics(dictionary.doc2bow(tokens), minimum_probability=0)
    for tokens in clean_documents
]

doc_weights

In [None]:
# keep only the weight from each (topic id, weight) pair, giving one plain list of weights per document

weights = [[pair[1] for pair in doc_row] for doc_row in doc_weights]
weights

In [None]:
# Label each topic with its number followed by its three top-ranked words,
# e.g. "Topic1 <word1> <word2> <word3>".
topic_names = []

# Use the table's own width instead of a hard-coded 20 so this follows the model's topic count.
for i in range(topics_table.shape[1]):
    # Rows are 0-indexed: the top three words are rows 0-2. The earlier rows 1-3
    # skipped each topic's highest-ranked word.
    top_three = topics_table.iloc[0:3, i]
    topic_name = "Topic" + str(i + 1) + " " + " ".join(top_three)
    topic_names.append(topic_name)

topic_names

In [None]:
# Reshape the weights for the visualisation: rows become topics, columns become sub-corpora.

topic_df = pandas.DataFrame(weights, columns=topic_names)

topics_transpose = topic_df.T
topics_transpose.columns = sub_titles  # label each column with its sub-corpus title

# Output the DataFrame
print(topics_transpose)

In [None]:
# not needed
# Optional: `pheatmap` is only imported and used by the alternative heatmap cells below,
# which are themselves marked as not needed / not to be used.

pip install pheatmap

In [None]:
# not needed

from pheatmap import pheatmap
import numpy as np

In [None]:
# don't use this one! 
# Alternative rendering of `topics_transpose` with pheatmap ("Greens" colour map, column
# names rotated by 90 degrees); requires the optional `pheatmap` import.

fig = pheatmap(topics_transpose,  cmap = "Greens", colnames_style={"rotation": 90})
fig

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Draw the topic weights as a heatmap (rows and columns of `topics_transpose`)
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 6))  # width, height
sns.heatmap(
    topics_transpose,
    ax=heatmap_ax,
    cmap='Blues',                        # blue colour scheme: darker cells mean larger weights
    cbar=True,                           # display the colour bar ...
    cbar_kws={'label': 'Topic Weight'},  # ... with this label
    linewidths=0,                        # no lines between cells
    xticklabels=True,                    # show every column label
    yticklabels=True,                    # show every row label
    square=False,                        # do not force square cells
)

# Rotate column labels
plt.xticks(rotation=90)

# Save the heatmap to a PDF file
# plt.savefig("results/convo_topic_heatmap.pdf", format="pdf")

# Show the plot
plt.show()