In [None]:
import requests
import zipfile
import io
import os
import shutil
import pandas as pd
import re

In [None]:
# Mount Google Drive inside the Colab runtime so that the cells below can
# read and write files under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Destination paths on the mounted Google Drive where the downloaded zip
# archives (corpus data + metadata) are saved.

# OPENITI CORPUS
data_path = "/content/drive/My Drive/OpenITI/data.zip"
metadata_path = "/content/drive/My Drive/OpenITI/metadata.zip"



In [None]:
# Download URLs of the data and metadata zip files on Zenodo
# (2023 version of the corpus, DOI: 10.5281/zenodo.7687795).
# OPENITI CORPUS
data_url = "https://zenodo.org/record/7687795/files/data.zip?download=1"
metadata_url = "https://zenodo.org/record/7687795/files/metadata.zip?download=1"


In [None]:
# Path to the OpenITI folder in Google Drive.
openiti_path = "/content/drive/My Drive/OpenITI"

# Create the folder (and any missing parents). exist_ok=True makes the cell
# safe to re-run and avoids the check-then-create race of testing
# os.path.exists() first.
os.makedirs(openiti_path, exist_ok=True)



In [None]:
# Download the data zip file and save it to Google Drive (OPENITI).
# The body is streamed and written chunk by chunk so the archive never has to
# fit in RAM; the `with` block releases the HTTP connection when done.
with requests.get(data_url, stream=True) as response:
    # Fail loudly on a 4xx/5xx answer instead of silently saving an error
    # page under the name data.zip.
    response.raise_for_status()
    with open(data_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)



In [None]:
# Download the metadata zip file and save it to Google Drive (OPENITI).
# The body is streamed and written chunk by chunk so the archive never has to
# fit in RAM; the `with` block releases the HTTP connection when done.
with requests.get(metadata_url, stream=True) as response:
    # Fail loudly on a 4xx/5xx answer instead of silently saving an error
    # page under the name metadata.zip.
    response.raise_for_status()
    with open(metadata_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)


In [None]:
# Unpack the metadata archive into the OpenITI folder on Google Drive.
metadata_zip_path = "/content/drive/My Drive/OpenITI/metadata.zip"  # archive to read
metadata_dir_path = "/content/drive/My Drive/OpenITI"  # extraction target

# ZipFile opens in read mode by default; every member is written below
# metadata_dir_path.
with zipfile.ZipFile(metadata_zip_path) as zip_ref:
    zip_ref.extractall(path=metadata_dir_path)


In [None]:
# Unpack the data archive into the OpenITI folder on Google Drive.
data_zip_path = "/content/drive/My Drive/OpenITI/data.zip"  # archive to read
data_dir_path = "/content/drive/My Drive/OpenITI"  # extraction target

# ZipFile opens in read mode by default; every member of the data archive is
# written below data_dir_path.
with zipfile.ZipFile(data_zip_path) as zip_ref:
    zip_ref.extractall(path=data_dir_path)


In [None]:
# Dividing the corpus into two time periods T1 and T2.
#
# Each directory directly inside the extracted data folder is assigned to a
# period according to the integer formed by the first four characters of its
# name (presumably the year encoded in the directory name - TODO confirm).
# Directories outside both ranges, or whose name does not start with a
# number, are left where they are.

data_dir_path = "/content/drive/My Drive/OpenITI/data"
t1_start = 575  # Start year for time period 1
t1_end = 900    # End year for time period 1
t2_start = 1100  # Start year for time period 2
t2_end = 1450    # End year for time period 2

# T1 and T2 are created next to the data folder (.../OpenITI/T1 and
# .../OpenITI/T2). That is the location the statistics cell further down reads
# from, and it keeps the period folders from being listed among the corpus
# directories that are being sorted.
corpus_root = os.path.dirname(data_dir_path)

# Create the T1 directory
t1_dir_path = os.path.join(corpus_root, "T1")
os.makedirs(t1_dir_path, exist_ok=True)

# Create the T2 directory
t2_dir_path = os.path.join(corpus_root, "T2")
os.makedirs(t2_dir_path, exist_ok=True)

# List all files and directories in the data directory
files = os.listdir(data_dir_path)

# Move the directories to their respective time period directories
for file in files:
    file_path = os.path.join(data_dir_path, file)
    if not os.path.isdir(file_path):
        continue

    # Only the number parsing is guarded: an error raised while moving a
    # directory must surface instead of being silently skipped.
    try:
        year = int(file[:4])
    except ValueError:
        continue  # name does not start with a number (e.g. a legacy "T1" folder)

    # int() already ignores leading zeros ("0255" -> 255), so a single
    # comparison per period is sufficient.
    if t1_start <= year <= t1_end:
        shutil.move(file_path, t1_dir_path)
    elif t2_start <= year <= t2_end:
        shutil.move(file_path, t2_dir_path)


In [None]:
import os

def count_files(directory):
    """
    Return the total number of files found under ``directory`` at any depth.

    Sub-directories themselves are not counted, only the files they contain.
    """
    # os.walk yields one (path, dirnames, filenames) triple per visited folder;
    # adding up the lengths of the filename lists gives the recursive total.
    return sum(len(filenames) for _path, _dirnames, filenames in os.walk(directory))

# Folders holding the two time-period subsets (T1 and T2) that the statistics
# below are computed from. They must point at the folders produced by the
# corpus-splitting step.
t1_dir_path = "/content/drive/My Drive/OpenITI/T1"
t2_dir_path = "/content/drive/My Drive/OpenITI/T2"

def extract_statistics(directory, text_suffix="ara1"):
    """
    Collect size statistics for a corpus folder laid out as ``author/book/...``.

    Every sub-directory directly inside ``directory`` is counted as an author
    and every sub-directory directly inside an author folder as a book. Each
    book folder is then walked recursively, and every file whose name ends
    with ``text_suffix`` is counted and its whitespace-separated tokens are
    added to the word total. Files with anything after the suffix (for example
    ``...ara1.yml``) are skipped, as are files that sit outside a book folder.

    Args:
        directory: Root folder containing one sub-directory per author.
        text_suffix: File-name ending that selects the text files to read.
            Defaults to ``"ara1"``.

    Returns:
        A tuple ``(num_files, num_words, num_books, num_authors)``.

    Raises:
        OSError: If ``directory`` (or a folder below it) cannot be listed or a
            selected file cannot be opened.
        UnicodeDecodeError: If a selected file is not valid UTF-8.
    """
    num_files = 0
    num_words = 0
    num_books = 0
    num_authors = 0

    for author_dir in os.listdir(directory):
        author_path = os.path.join(directory, author_dir)
        if not os.path.isdir(author_path):
            continue
        num_authors += 1

        for book_dir in os.listdir(author_path):
            book_path = os.path.join(author_path, book_dir)
            if not os.path.isdir(book_path):
                continue
            num_books += 1

            for root, _dirs, files in os.walk(book_path):
                for file in files:
                    if not file.endswith(text_suffix):
                        continue
                    file_path = os.path.join(root, file)
                    with open(file_path, 'r', encoding='utf-8') as f:
                        # Count line by line so a large text never has to be
                        # held in memory at once. Line breaks are whitespace,
                        # so the total equals len(f.read().split()).
                        num_words += sum(len(line.split()) for line in f)
                    num_files += 1

    return num_files, num_words, num_books, num_authors

# Gather the statistics for both periods first (T1, then T2), then report them.
t1_stats = extract_statistics(t1_dir_path)
t2_stats = extract_statistics(t2_dir_path)

# Keep the individual figures available under their own names.
t1_files, t1_words, t1_books, t1_authors = t1_stats
t2_files, t2_words, t2_books, t2_authors = t2_stats

# Labels listed in the same order as the tuple returned by extract_statistics.
stat_labels = ("Files:", "Words:", "Books:", "Authors:")

# Print the statistics
print("Time Period 1 (T1) Statistics:")
for label, value in zip(stat_labels, t1_stats):
    print(label, value)
print()
print("Time Period 2 (T2) Statistics:")
for label, value in zip(stat_labels, t2_stats):
    print(label, value)


Time Period 1 (T1) Statistics:
Files: 3103
Words: 777403570
Books: 2183
Authors: 801

Time Period 2 (T2) Statistics:
Files: 1454
Words: 517059619
Books: 1183
Authors: 647
