In [1]:
import pandas as pd
import ir_datasets

In [2]:
# mapping dataset to variable names
dataset_vars = {
    "neuclir/1/fa": "fa",
    "neuclir/1/fa/hc4-filtered": "fa_hc4_filtered",
    "neuclir/1/fa/trec-2022": "fa_trec_2022",
    "neuclir/1/fa/trec-2023": "fa_trec_2023",
    "neuclir/1/multi": "multi",
    "neuclir/1/multi/trec-2023": "multi_trec_2023",
    "neuclir/1/ru": "ru",
    "neuclir/1/ru/hc4-filtered": "ru_hc4_filtered",
    "neuclir/1/ru/trec-2022": "ru_trec_2022",
    "neuclir/1/ru/trec-2023": "ru_trec_2023",
    "neuclir/1/zh": "zh",
    "neuclir/1/zh/hc4-filtered": "zh_hc4_filtered",
    "neuclir/1/zh/trec-2022": "zh_trec_2022",
    "neuclir/1/zh/trec-2023": "zh_trec_2023"
}

### DOC & QUERY COUNTS ###
# loading each dataset
loaded_datasets = {}
for ds_id, var_name in dataset_vars.items():
    try:
        loaded_datasets[var_name] = ir_datasets.load(ds_id)
        print(f"Dataset: {var_name}")
        # print each dataset's document and query counts
        if hasattr(loaded_datasets[var_name], 'docs_count'):
            print(f"  Number of documents: {loaded_datasets[var_name].docs_count()}")
        if hasattr(loaded_datasets[var_name], 'queries_count'):
            print(f"  Number of queries: {loaded_datasets[var_name].queries_count()}")
        if hasattr(loaded_datasets[var_name], 'qrels'):
            print(f"  Number of qrels: {loaded_datasets[var_name].qrels}")
    except Exception as e:
        print(f"Error loading dataset {var_name}: {e}")
    print("=" * 40)

Dataset: fa
  Number of documents: 2232016
Dataset: fa_hc4_filtered
  Number of documents: 391703
  Number of queries: 60
  Number of qrels: BetaPythonApiQrels(Dataset(id='neuclir/1/fa/hc4-filtered', provides=['docs', 'queries', 'qrels']))
Dataset: fa_trec_2022
  Number of documents: 2232016
  Number of queries: 46
  Number of qrels: BetaPythonApiQrels(Dataset(id='neuclir/1/fa/trec-2022', provides=['docs', 'queries', 'qrels']))
Dataset: fa_trec_2023
  Number of documents: 2232016
  Number of queries: 76
  Number of qrels: BetaPythonApiQrels(Dataset(id='neuclir/1/fa/trec-2023', provides=['docs', 'queries', 'qrels']))
Dataset: multi
  Number of documents: 10038768
Dataset: multi_trec_2023
  Number of documents: 10038768
  Number of queries: 76
  Number of qrels: BetaPythonApiQrels(Dataset(id='neuclir/1/multi/trec-2023', provides=['docs', 'queries', 'qrels']))
Dataset: ru
  Number of documents: 4627543
Dataset: ru_hc4_filtered
  Number of documents: 964719
  Number of queries: 54
  Number

In [9]:

### EXPLORE PERSIAN DOC ###
# load persian datasets
persian_datasets = {
    #"fa": loaded_datasets["fa"],
    "fa_hc4_filtered": loaded_datasets["fa_hc4_filtered"],
    #"fa_trec_2022": loaded_datasets["fa_trec_2022"],
    #"fa_trec_2023": loaded_datasets["fa_trec_2023"]
}

df = pd.DataFrame()
# display head of selected Persian dataset
for var_name, dataset in persian_datasets.items():
    print(f"First document from dataset: {var_name}")
    print(f"Docs in dataset: {dataset.docs_count()}")
    documents = [doc for doc in dataset.docs_iter()[:10]]
    df = pd.DataFrame(documents)

df.head()

First document from dataset: fa_hc4_filtered
Docs in dataset: 391703


Unnamed: 0,doc_id,title,text,url,time,cc_file
0,f5c30695-2fea-4a08-b87c-49a7e39c7945,کاهش نرخ رسمی ۲۵ ارز,امروز که همزمان با روز تعطیل رسمی در بازارهای ...,https://www.irna.ir/news/83444978/%DA%A9%D8%A7...,2019-08-21T05:56:07+00:00,crawl-data/CC-NEWS/2019/08/CC-NEWS-20190821054...
1,c6900c70-f1b8-4157-a5f1-d38dba547dd6,سالروز ربوده شدن سردار احمد متوسلیان و همراهان...,امروز ۱۴ تیرماه سالروز ربوده شدن چهار دیپلمات ...,https://www.isna.ir/news/98041306954/%D8%B3%D8...,2019-07-04T22:30:00+00:00,crawl-data/CC-NEWS/2019/07/CC-NEWS-20190704202...
2,8d9669f4-c2e2-495e-a425-503fafab2356,سالروز اجرای عملیات «ظفر ۲»,پنجم تیرماه، سالروز اجرایی شدن عملیات «ظفر ۲» ...,https://www.isna.ir/news/98040502688/%D8%B3%D8...,2019-06-26T04:23:28+00:00,crawl-data/CC-NEWS/2019/06/CC-NEWS-20190626053...
3,9bf618b0-c4a7-49a4-8516-45af79dc632a,خسارتی از زلزله «بمپور» گزارش نشده است,عبدالرحمن شهنوازی در گفت وگو با خبرنگار مهر، ا...,https://www.mehrnews.com/news/4691573/%D8%AE%D...,2019-08-13T10:34:08+00:00,crawl-data/CC-NEWS/2019/08/CC-NEWS-20190813100...
4,698bcdca-b879-4c61-b03a-eaa8f016032b,سکه طرح جدید امروز ۱۴ آبان‌ماه، ۴ میلیون و ۷۵۰...,به گزارش خبرنگار مهر، قیمت هر قطعه سکه تمام به...,https://www.mehrnews.com/news/4450707/%D8%B3%D...,2018-11-05T10:23:22+00:00,crawl-data/CC-NEWS/2018/11/CC-NEWS-20181105101...


In [5]:
from explore.funcs import calculate_average_document_length

avg_lengths_persian = calculate_average_document_length(persian_datasets)

Average document length in fa_hc4_filtered: 1973.53 characters
