In [None]:
# import analysis libraries
import json
import cohere
import os
from dotenv import find_dotenv, load_dotenv
import pandas as pd
import time

In [None]:
# Locate the project's .env file, load its variables into the process
# environment, then read the Cohere API key (None when the variable is unset).
_dotenv_path = find_dotenv(".env")
load_dotenv(_dotenv_path)
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")

In [None]:
# Load the scraped pages from data.json.
# The encoding is given explicitly: JSON text is UTF-8, while open() otherwise
# falls back to the platform's locale encoding, which can mis-decode non-ASCII
# page content (e.g. on Windows).
with open("data.json", "r", encoding="utf-8") as data_file:
    data = json.load(data_file)

In [None]:
# Cohere v2 client; used in this notebook to tokenize documents.
cohere_client = cohere.ClientV2(api_key=COHERE_API_KEY)

In [None]:
# accumulator for the per-document records that later become DataFrame rows
rows = list()

In [None]:
# Count tokens per scraped document using Cohere's tokenizer.
# `data` is iterated as a mapping of label -> list of page records; each
# record is read through its 'content' and 'url' keys.
TOKENIZER_MODEL = "embed-v4.0"  # model whose tokenizer is used for counting

# Start from an empty list inside this cell so that re-running it does not
# append a second copy of every row to a list created in another cell.
rows = []
for label, item in data.items():
    for i, value in enumerate(item):
        # one tokenize call per document
        tokens = cohere_client.tokenize(
            text=value['content'], model=TOKENIZER_MODEL
        )

        rows.append({
            "label": label,
            "item_no": i,  # position of the document within its label's list
            "url": value['url'],
            "tokens": len(tokens.tokens),
        })

In [None]:
# Convert the collected rows to a pandas DataFrame.
# Columns are listed explicitly so the frame keeps the same schema even when
# `rows` is empty; otherwise later cells that index df['tokens'] would raise
# KeyError on a column-less frame.
df = pd.DataFrame(rows, columns=["label", "item_no", "url", "tokens"])

In [None]:
# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
summary_stats = df.describe()
summary_stats

Unnamed: 0,item_no,tokens
count,70.0,70.0
mean,7.071429,991.042857
std,7.126845,1491.699288
min,0.0,79.0
25%,2.0,399.5
50%,5.0,582.5
75%,9.75,946.25
max,26.0,9017.0


In [None]:
# median of the per-document token counts
df.loc[:, 'tokens'].median()

np.float64(582.5)

In [None]:
# most frequent token count(s); mode() returns a Series because ties are possible
df.loc[:, 'tokens'].mode()

0    392
Name: tokens, dtype: int64

In [None]:
# documents whose token count is strictly greater than 1024
df.loc[df['tokens'] > 1024]

Unnamed: 0,label,item_no,url,tokens
11,learning,0,https://www.sunmarke.com/learning/nursery/our-...,1499
14,learning,3,https://www.sunmarke.com/learning/eyfs/early-y...,1404
20,learning,9,https://www.sunmarke.com/learning/primary/enri...,2526
23,learning,12,https://www.sunmarke.com/learning/secondary/ou...,1026
24,learning,13,https://www.sunmarke.com/learning/secondary/ou...,6004
27,learning,16,https://www.sunmarke.com/learning/secondary/en...,1371
30,learning,19,https://www.sunmarke.com/learning/sixth-form/o...,1157
37,learning,26,https://www.sunmarke.com/learning/sixth-form/c...,1694
47,admissions,0,https://enquiry.sunmarke.com/enrolment-campaig...,1147
48,admissions,1,https://www.sunmarke.com/admissions/tuition-fees/,1772


In [None]:
# documents whose token count is strictly greater than 2048
df.loc[df['tokens'].gt(2048)]

Unnamed: 0,label,item_no,url,tokens
20,learning,9,https://www.sunmarke.com/learning/primary/enri...,2526
24,learning,13,https://www.sunmarke.com/learning/secondary/ou...,6004
52,admissions,5,https://www.sunmarke.com/admissions/faqs/,9017
63,activities,2,https://www.sunmarke.com/activities/third-part...,7587
