In [None]:
import pandas as pd

# Read the dataset CSV into the working DataFrame used by the rest of the notebook.
dataset_csv = 'data/echr_qa_dataset.csv'
df = pd.read_csv(dataset_csv)

# Report the dataset dimensions as a (rows, columns) tuple, then preview the first rows.
print(df.shape)
df.head()

In [None]:
import json
import tiktoken
import spacy


# spaCy pipeline shared by get_sentences(), which reads sentence boundaries from its output.
nlp = spacy.load("en_core_web_trf")


def get_sentences(response: str):
    """Split ``response`` into sentences with the module-level spaCy pipeline ``nlp``.

    Sentences whose text is empty or whitespace-only are dropped. The kept
    sentences are returned as a list of their raw (unstripped) text, in
    document order.
    """
    kept = []
    for span in nlp(response).sents:
        text = span.text
        # Skip segments that contain nothing but whitespace.
        if text.strip():
            kept.append(text)
    return kept

# Tokenizer used to measure question and answer lengths in tokens.
enc = tiktoken.get_encoding("cl100k_base")

# Token count of every question and every answer.
df['question_length'] = df['question'].apply(lambda x: len(enc.encode(x)))
df['answer_length'] = df['answer'].apply(lambda x: len(enc.encode(x)))
# Number of sentences in every answer, as segmented by get_sentences().
df['answer_sentences_length'] = df['answer'].apply(lambda x: len(get_sentences(x)))
# Decode the JSON-encoded citations column. Only raw JSON text is parsed: this
# statement overwrites the column in place, so on a re-run of the cell the
# values are already decoded and json.loads would raise a TypeError on them.
df['citations'] = df['citations'].apply(
    lambda x: json.loads(x) if isinstance(x, (str, bytes, bytearray)) else x
)
# Total number of cited paragraphs per row, summed over all of its citation entries.
df['citations_length'] = df['citations'].apply(
    lambda x: sum(len(c["paragraph_numbers"]) for c in x)
)


# Dataset-wide means, in order: question tokens, answer tokens,
# answer sentences, cited paragraphs.
print(df['question_length'].mean())
print(df['answer_length'].mean())
print(df['answer_sentences_length'].mean())
print(df['citations_length'].mean())

df.head()

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Wrap each length metric in its own single-column frame; the column name
# doubles as the x-axis label of the corresponding panel.
question_length_df = pd.DataFrame({'Question Tokens': df['question_length']})
answer_length_df = pd.DataFrame({'Answer Tokens': df['answer_length']})
answer_sentences_length_df = pd.DataFrame({'Answer Sentences': df['answer_sentences_length']})
citations_length_df = pd.DataFrame({'Citations Length': df['citations_length']})

# One entry per panel, listed in row-major order: (title, data frame, column to plot).
panels = [
    ('Question Tokens Distribution', question_length_df, "Question Tokens"),
    ('Answer Tokens Distribution', answer_length_df, "Answer Tokens"),
    ('Answer Sentences Distribution', answer_sentences_length_df, "Answer Sentences"),
    ('Citations Distribution', citations_length_df, "Citations Length"),
]

# 2x2 grid of kernel density estimates, one panel per metric.
fig, axs = plt.subplots(2, 2, figsize=(14, 12))

# axs.flat walks the grid row by row, matching the order of `panels`.
for ax, (title, frame, column) in zip(axs.flat, panels):
    ax.set_title(title)
    sns.kdeplot(data=frame, x=column, ax=ax)

# Keep titles and axis labels of neighbouring panels from overlapping.
plt.tight_layout()

# Render the figure.
plt.show()