In [18]:
import re
import html
import string
import emoji
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus.reader.wordnet import NOUN, VERB, ADJ, ADV
from nltk import pos_tag, word_tokenize


# تحميل كلمات التوقف
stop_words = set(stopwords.words('english'))

# أداة Lemmatizer
lemmatizer = WordNetLemmatizer()

# خريطة تحويل POS من شكل nltk إلى ما يفهمه lemmatizer
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return ADJ
    elif tag.startswith('V'):
        return VERB
    elif tag.startswith('N'):
        return NOUN
    elif tag.startswith('R'):
        return ADV
    else:
        return NOUN  # الافتراضي

# خريطة مرادفات مخصصة (يمكن توسيعها)
synonym_map = {
    'car': 'automobile',
    'automobile': 'automobile',
    'bike': 'bicycle',
    'bicycle': 'bicycle',
    'tv': 'television',
    'television': 'television',
}

def normalize_text_en(text):
    text = html.unescape(text)
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = emoji.replace_emoji(text, replace='')
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'(.)\1{2,}', r'\1', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def process_query_en(query):
    query = normalize_text_en(query)

    # Tokenization
    words = word_tokenize(query)

    # POS Tagging
    tagged = pos_tag(words)

    # Lemmatization + إزالة كلمات التوقف + توحيد المرادفات
    processed_tokens = []
    for word, tag in tagged:
        if word in stop_words or len(word) <= 2:
            continue

        wn_pos = get_wordnet_pos(tag)
        lemma = lemmatizer.lemmatize(word, pos=wn_pos)

        # استبدال الكلمة بالمرادف الموحد إن وجد
        lemma = synonym_map.get(lemma, lemma)

        processed_tokens.append(lemma)

    return ' '.join(processed_tokens)
# تمرير استعلام واختبار الدالة
query = "i watching tv"
output = process_query_en(query)
print("الاستعلام بعد المعالجة:", output)

الاستعلام بعد المعالجة: watch


In [21]:
!python -m textblob.download_corpora

Finished.


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [11]:
# استيراد المكتبات
import re
import html
import string
import emoji
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus.reader.wordnet import NOUN, VERB, ADJ, ADV
from nltk import pos_tag, word_tokenize
from textblob import TextBlob

# تحميل كلمات التوقف
stop_words = set(stopwords.words('english'))

# أداة Lemmatizer
lemmatizer = WordNetLemmatizer()

# خريطة تحويل POS من NLTK إلى WordNet
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return ADJ
    elif tag.startswith('V'):
        return VERB
    elif tag.startswith('N'):
        return NOUN
    elif tag.startswith('R'):
        return ADV
    else:
        return NOUN

# خريطة مرادفات مخصصة
synonym_map = {
    'car': 'automobile',
    'automobile': 'automobile',
    'bike': 'bicycle',
    'bicycle': 'bicycle',
    'tv': 'television',
    'television': 'television',
}

# تنظيف النص
def normalize_text_en(text):
    text = html.unescape(text)
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = emoji.replace_emoji(text, replace='')
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'(.)\1{2,}', r'\1', text)  # مثل: soooo → so
    # مثل: soooo → so
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# المعالجة الشاملة
def process_query_en(query):
    # تصحيح الإملاء باستخدام TextBlob
    corrected = str(TextBlob(query).correct())

    # تنظيف النص
    query = normalize_text_en(corrected)

    # Tokenization
    words = word_tokenize(query)

    # POS Tagging
    tagged = pos_tag(words)

    # Lemmatization + إزالة كلمات التوقف + توحيد المرادفات
    processed_tokens = []
    for word, tag in tagged:
        if word in stop_words or len(word) <= 2:
            continue

        wn_pos = get_wordnet_pos(tag)
        lemma = lemmatizer.lemmatize(word, pos=wn_pos)

        # توحيد المرادف إن وجد
        lemma = synonym_map.get(lemma, lemma)

        processed_tokens.append(lemma)

    return ' '.join(processed_tokens)

# تجربة الكود باستعلام يحتوي على خطأ إملائي
query = "i swie"
output = process_query_en(query)
print("الاستعلام بعد المعالجة:", output)


الاستعلام بعد المعالجة: swim


In [33]:
import re
import html
import string
import emoji
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus.reader.wordnet import NOUN, VERB, ADJ, ADV
from nltk import pos_tag, word_tokenize

# تحميل كلمات التوقف
stop_words = set(stopwords.words('english'))

# أداة Lemmatizer
lemmatizer = WordNetLemmatizer()

# خريطة تحويل POS من شكل nltk إلى ما يفهمه lemmatizer
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return ADJ
    elif tag.startswith('V'):
        return VERB
    elif tag.startswith('N'):
        return NOUN
    elif tag.startswith('R'):
        return ADV
    else:
        return NOUN  # الافتراضي

# خريطة مرادفات مخصصة (يمكن توسيعها)
synonym_map = {
    'car': 'automobile',
    'automobile': 'automobile',
    'bike': 'bicycle',
    'bicycle': 'bicycle',
    'tv': 'television',
    'television': 'television',
}

def normalize_text_en(text):
    text = html.unescape(text)                         # فك ترميز HTML
    text = ''.join(c for c in text if c.isprintable()) # إزالة الأحرف غير القابلة للطباعة
    text = text.lower()                                # تحويل لحروف صغيرة
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # إزالة الروابط
    text = re.sub(r'\S+@\S+', '', text)                # إزالة الإيميلات
    text = re.sub(r'<.*?>', '', text)                  # إزالة الوسوم
    text = emoji.replace_emoji(text, replace='')       # إزالة الإيموجي
    text = text.translate(str.maketrans('', '', string.punctuation)) # إزالة علامات الترقيم
    text = re.sub(r'\d+', '', text)                    # إزالة الأرقام
    text = re.sub(r'(.)\1{2,}', r'\1', text)           # إزالة تكرار الحروف
    text = re.sub(r'\s+', ' ', text).strip()           # إزالة المسافات الزائدة
    return text

def process_query_en(query):
    query = normalize_text_en(query)

    # Tokenization
    words = word_tokenize(query)

    # POS Tagging
    tagged = pos_tag(words)

    # Lemmatization + إزالة كلمات التوقف + توحيد المرادفات
    processed_tokens = []
    for word, tag in tagged:
        if word in stop_words or len(word) <= 2:
            continue

        wn_pos = get_wordnet_pos(tag)
        lemma = lemmatizer.lemmatize(word, pos=wn_pos)

        # استبدال الكلمة بالمرادف الموحد إن وجد
        lemma = synonym_map.get(lemma, lemma)

        processed_tokens.append(lemma)

    return ' '.join(processed_tokens)

# 🧪 اختبار
if __name__ == "__main__":
    query = "projectkxcs"
    output = process_query_en(query)
    print("الاستعلام بعد المعالجة:", output)


الاستعلام بعد المعالجة: projectkxcs


In [84]:
import re
import html
import string
import emoji
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus.reader.wordnet import NOUN, VERB, ADJ, ADV
from nltk import pos_tag, word_tokenize
from textblob import TextBlob  # ✅ تصحيح الأخطاء الإملائية

# تحميل كلمات التوقف
stop_words = set(stopwords.words('english'))

# أداة Lemmatizer
lemmatizer = WordNetLemmatizer()

# خريطة تحويل POS من شكل nltk إلى ما يفهمه lemmatizer
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return ADJ
    elif tag.startswith('V'):
        return VERB
    elif tag.startswith('N'):
        return NOUN
    elif tag.startswith('R'):
        return ADV
    else:
        return NOUN  # الافتراضي

# خريطة مرادفات مخصصة (يمكن توسيعها)
synonym_map = {
    'car': 'automobile',
    'automobile': 'automobile',
    'bike': 'bicycle',
    'bicycle': 'bicycle',
    'tv': 'television',
    'television': 'television',
}

# هل الكلمة موجودة في WordNet؟
def is_valid_word(word):
    return bool(wordnet.synsets(word))

# تصحيح الكلمات غير المفهومة
def correct_word(word):
    if is_valid_word(word):
        return word
    blob = TextBlob(word)
    corrected = str(blob.correct())
    # تأكد أن التصحيح له معنى
    return corrected if is_valid_word(corrected) else ''

def normalize_text_en(text):
    text = html.unescape(text)
    text = ''.join(c for c in text if c.isprintable())
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = emoji.replace_emoji(text, replace='')
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'(.)\1{2,}', r'\1', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def process_query_en(query):
    query = normalize_text_en(query)

    # Tokenization
    words = word_tokenize(query)

    # تصحيح الكلمات أولاً
    corrected_words = [correct_word(w) for w in words]
    corrected_words = [w for w in corrected_words if w]  # إزالة الفارغ

    # POS Tagging
    tagged = pos_tag(corrected_words)

    # Lemmatization + إزالة كلمات التوقف + توحيد المرادفات
    processed_tokens = []
    for word, tag in tagged:
        if word in stop_words or len(word) <= 2:
            continue

        wn_pos = get_wordnet_pos(tag)
        lemma = lemmatizer.lemmatize(word, pos=wn_pos)

        # استبدال الكلمة بالمرادف الموحد إن وجد
        lemma = synonym_map.get(lemma, lemma)

        processed_tokens.append(lemma)

    return ' '.join(processed_tokens)

# 🧪 اختبار
if __name__ == "__main__":
    query = "i am wachin the movi"
    output = process_query_en(query)
    print("الاستعلام بعد المعالجة:", output)


الاستعلام بعد المعالجة: within move


In [1]:
import re
import html
import string
import emoji
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus.reader.wordnet import NOUN, VERB, ADJ, ADV
from nltk import pos_tag, word_tokenize
from textblob import TextBlob
from wordsegment import load as ws_load, segment as ws_segment

# تفعيل وضع التصحيح التفصيلي (True = تشغيل، False = إخفاء)
DEBUG = False

# كلمات التوقف
stop_words = set(stopwords.words('english'))

# Lemmatizer
lemmatizer = WordNetLemmatizer()

# تحويل POS من NLTK إلى WordNet
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return ADJ
    elif tag.startswith('V'):
        return VERB
    elif tag.startswith('N'):
        return NOUN
    elif tag.startswith('R'):
        return ADV
    else:
        return NOUN

# التحقق من وجود الكلمة في WordNet
def is_valid_word(word):
    return bool(wordnet.synsets(word))

# تصحيح الكلمة
def correct_word(word):
    if is_valid_word(word):
        return word
    blob = TextBlob(word)
    corrected = str(blob.correct())
    if DEBUG and corrected != word:
        print(f"✏️ تصحيح '{word}' → '{corrected}'")
    return corrected if is_valid_word(corrected) else ''

# تنظيف النص
def normalize_text_en(text):
    text = html.unescape(text)
    text = ''.join(c for c in text if c.isprintable())
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = emoji.replace_emoji(text, replace='')
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'(.)\1{2,}', r'\1', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# المعالجة الكاملة
def process_query_en(query):
    query = normalize_text_en(query)
    words = word_tokenize(query)

    final_tokens = []
    for word in words:
        # تقسيم الكلمة إن لم تكن مفهومة
        segments = [word] if is_valid_word(word) else ws_segment(word)
        if DEBUG and segments != [word]:
            print(f"🔍 تقسيم '{word}' → {segments}")
        for segment in segments:
            corrected = correct_word(segment)
            if not corrected or corrected in stop_words or len(corrected) <= 2:
                continue
            final_tokens.append(corrected)

    # POS Tagging
    tagged = pos_tag(final_tokens)

    # Lemmatization + توحيد مرادفات
    processed_tokens = []
    for word, tag in tagged:
        wn_pos = get_wordnet_pos(tag)
        lemma = lemmatizer.lemmatize(word, pos=wn_pos)
        processed_tokens.append(lemma)

    return ' '.join(processed_tokens)

# 🧪 تجربة تفاعلية
if __name__ == "__main__":
    print("🔎 اكتب استعلامك (أو اكتب 'exit' للخروج):")
    while True:
        query = input("play")
        if query.strip().lower() == 'exit':
            break
        output = process_query_en(query)
        print("✅ بعد المعالجة:", output)
        print("—" * 40)


🔎 اكتب استعلامك (أو اكتب 'exit' للخروج):


play exit


In [2]:
import re
import html
import string
import emoji
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus.reader.wordnet import NOUN, VERB, ADJ, ADV
from nltk import pos_tag, word_tokenize
from textblob import TextBlob
from wordsegment import load as ws_load, segment as ws_segment

# تفعيل وضع التصحيح التفصيلي (True = تشغيل، False = إخفاء)
DEBUG = False

# كلمات التوقف
stop_words = set(stopwords.words('english'))

# Lemmatizer
lemmatizer = WordNetLemmatizer()

# تحويل POS من NLTK إلى WordNet
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return ADJ
    elif tag.startswith('V'):
        return VERB
    elif tag.startswith('N'):
        return NOUN
    elif tag.startswith('R'):
        return ADV
    else:
        return NOUN

# التحقق من وجود الكلمة في WordNet
def is_valid_word(word):
    return bool(wordnet.synsets(word))

# تصحيح الكلمة
def correct_word(word):
    if is_valid_word(word):
        return word
    blob = TextBlob(word)
    corrected = str(blob.correct())
    if DEBUG and corrected != word:
        print(f"✏️ تصحيح '{word}' → '{corrected}'")
    return corrected if is_valid_word(corrected) else ''

# تنظيف النص
def normalize_text_en(text):
    text = html.unescape(text)
    text = ''.join(c for c in text if c.isprintable())
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = emoji.replace_emoji(text, replace='')
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'(.)\1{2,}', r'\1', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# المعالجة الكاملة
def process_query_en(query):
    query = normalize_text_en(query)
    words = word_tokenize(query)

    final_tokens = []
    for word in words:
        # تقسيم الكلمة إن لم تكن مفهومة
        segments = [word] if is_valid_word(word) else ws_segment(word)
        if DEBUG and segments != [word]:
            print(f"🔍 تقسيم '{word}' → {segments}")
        for segment in segments:
            corrected = correct_word(segment)
            if not corrected or corrected in stop_words or len(corrected) <= 2:
                continue
            final_tokens.append(corrected)

    # POS Tagging
    tagged = pos_tag(final_tokens)

    # Lemmatization + توحيد مرادفات
    processed_tokens = []
    for word, tag in tagged:
        wn_pos = get_wordnet_pos(tag)
        lemma = lemmatizer.lemmatize(word, pos=wn_pos)
        processed_tokens.append(lemma)

    return ' '.join(processed_tokens)

# 🧪 تجربة تفاعلية
if __name__ == "__main__":
    print("🔎 اكتب استعلامك (أو اكتب 'exit' للخروج):")
    while True:
        query = input("play")
        if query.strip().lower() == 'exit':
            break
        output = process_query_en(query)
        print("✅ بعد المعالجة:", output)
        print("—" * 40)


🔎 اكتب استعلامك (أو اكتب 'exit' للخروج):


play bik


ValueError: max() iterable argument is empty

In [None]:
import re
import html
import string
import emoji
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus.reader.wordnet import NOUN, VERB, ADJ, ADV
from nltk import pos_tag, word_tokenize
from textblob import TextBlob
from wordsegment import load as ws_load, segment as ws_segment

# تفعيل تقسيم الكلمات
ws_load()

# تفعيل التصحيح التفصيلي (True = لعرض التفاصيل)
DEBUG = False

# كلمات التوقف
stop_words = set(stopwords.words('english'))

# Lemmatizer
lemmatizer = WordNetLemmatizer()

# تحويل POS من NLTK إلى WordNet
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return ADJ
    elif tag.startswith('V'):
        return VERB
    elif tag.startswith('N'):
        return NOUN
    elif tag.startswith('R'):
        return ADV
    else:
        return NOUN

# التحقق من وجود الكلمة في WordNet
def is_valid_word(word):
    return bool(wordnet.synsets(word))

# تصحيح الكلمة
def correct_word(word):
    if is_valid_word(word):
        return word
    blob = TextBlob(word)
    corrected = str(blob.correct())
    if DEBUG and corrected != word:
        print(f"✏️ تصحيح '{word}' → '{corrected}'")
    return corrected if is_valid_word(corrected) else ''

# تنظيف النص
def normalize_text_en(text):
    text = html.unescape(text)
    text = ''.join(c for c in text if c.isprintable())
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = emoji.replace_emoji(text, replace='')
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'(.)\1{2,}', r'\1', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# المعالجة الكاملة
def process_query_en(query):
    query = normalize_text_en(query)
    words = word_tokenize(query)

    final_tokens = []
    for word in words:
        segments = []
        if is_valid_word(word):
            segments = [word]
        else:
            try:
                segments = ws_segment(word)
                if not segments or all(seg.strip() == '' for seg in segments):
                    continue  # تجاهل الكلمات غير القابلة للتقسيم
                if DEBUG and segments != [word]:
                    print(f"🔍 تقسيم '{word}' → {segments}")
            except ValueError:
                continue  # حماية من الخطأ

        for segment in segments:
            corrected = correct_word(segment)
            if not corrected or corrected in stop_words or len(corrected) <= 2:
                continue
            final_tokens.append(corrected)

    # POS Tagging
    tagged = pos_tag(final_tokens)

    # Lemmatization فقط
    processed_tokens = []
    for word, tag in tagged:
        wn_pos = get_wordnet_pos(tag)
        lemma = lemmatizer.lemmatize(word, pos=wn_pos)
        processed_tokens.append(lemma)

    return ' '.join(processed_tokens)

# 🧪 تجربة تفاعلية
if __name__ == "__main__":
    print("🔎 اكتب استعلامك (أو 'exit' للخروج):")
    while True:
        query = input("💬 استعلام: ")
        if query.strip().lower() == 'exit':
            break
        if not query.strip():
            print("⚠️ أدخل نصًا صالحًا.")
            continue
        output = process_query_en(query)
        print("✅ بعد المعالجة:", output)
        print("—" * 40)


🔎 اكتب استعلامك (أو 'exit' للخروج):


💬 استعلام:  I pla Football


✅ بعد المعالجة: plan football
————————————————————————————————————————


💬 استعلام:  I pla basketball


✅ بعد المعالجة: plan basketball
————————————————————————————————————————


💬 استعلام:  I arry my wife


✅ بعد المعالجة: army wife
————————————————————————————————————————


💬 استعلام:  I mry my wife


✅ بعد المعالجة: may wife
————————————————————————————————————————


In [2]:
import ir_datasets
dataset = ir_datasets.load("msmarco-passage/train")
# for qrel in dataset.qrels_iter():
#     print(qrel) # namedtuple<query_id, doc_id, relevance, iteration

[INFO] Please confirm you agree to the MSMARCO data usage agreement found at <http://www.msmarco.org/dataset.aspx>
[INFO] If you have a local copy of https://msmarco.z22.web.core.windows.net/msmarcoranking/qrels.train.tsv, you can symlink it here to avoid downloading it again: C:\Users\HP\.ir_datasets\downloads\733fb9fe12d93e497f7289409316eccf
[INFO] [starting] https://msmarco.z22.web.core.windows.net/msmarcoranking/qrels.train.tsv
[INFO] [finished] https://msmarco.z22.web.core.windows.net/msmarcoranking/qrels.train.tsv: [00:44] [10.6MB] [240kB/s]
                                                                                                   

TrecQrel(query_id='1185869', doc_id='0', relevance=1, iteration='0')
TrecQrel(query_id='1185868', doc_id='16', relevance=1, iteration='0')
TrecQrel(query_id='597651', doc_id='49', relevance=1, iteration='0')
TrecQrel(query_id='403613', doc_id='60', relevance=1, iteration='0')
TrecQrel(query_id='1183785', doc_id='389', relevance=1, iteration='0')
TrecQrel(query_id='312651', doc_id='616', relevance=1, iteration='0')
TrecQrel(query_id='80385', doc_id='723', relevance=1, iteration='0')
TrecQrel(query_id='645590', doc_id='944', relevance=1, iteration='0')
TrecQrel(query_id='645337', doc_id='1054', relevance=1, iteration='0')
TrecQrel(query_id='186154', doc_id='1160', relevance=1, iteration='0')
TrecQrel(query_id='457407', doc_id='1172', relevance=1, iteration='0')
TrecQrel(query_id='441383', doc_id='1389', relevance=1, iteration='0')
TrecQrel(query_id='683408', doc_id='1605', relevance=1, iteration='0')
TrecQrel(query_id='1164799', doc_id='1713', relevance=1, iteration='0')
TrecQrel(query_i

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



TrecQrel(query_id='338576', doc_id='4284279', relevance=1, iteration='0')
TrecQrel(query_id='539863', doc_id='4284292', relevance=1, iteration='0')
TrecQrel(query_id='824886', doc_id='4284302', relevance=1, iteration='0')
TrecQrel(query_id='835387', doc_id='4284312', relevance=1, iteration='0')
TrecQrel(query_id='318962', doc_id='4284316', relevance=1, iteration='0')
TrecQrel(query_id='948322', doc_id='3336501', relevance=1, iteration='0')
TrecQrel(query_id='673952', doc_id='4284334', relevance=1, iteration='0')
TrecQrel(query_id='1035220', doc_id='4284343', relevance=1, iteration='0')
TrecQrel(query_id='978069', doc_id='4284385', relevance=1, iteration='0')
TrecQrel(query_id='1179754', doc_id='4284397', relevance=1, iteration='0')
TrecQrel(query_id='119116', doc_id='4284414', relevance=1, iteration='0')
TrecQrel(query_id='773637', doc_id='4284442', relevance=1, iteration='0')
TrecQrel(query_id='774047', doc_id='4284460', relevance=1, iteration='0')
TrecQrel(query_id='966398', doc_id='

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [17]:
import numpy as np
qrels = np.array(list(dataset.qrels_iter()))

In [27]:
qrels[:,2] = qrels[:,2].astype(np.int32)

In [31]:
(qrels[:,2]=='1').all()

True

In [16]:
list(dataset.qrels_iter())

[TrecQrel(query_id='1185869', doc_id='0', relevance=1, iteration='0'),
 TrecQrel(query_id='1185868', doc_id='16', relevance=1, iteration='0'),
 TrecQrel(query_id='597651', doc_id='49', relevance=1, iteration='0'),
 TrecQrel(query_id='403613', doc_id='60', relevance=1, iteration='0'),
 TrecQrel(query_id='1183785', doc_id='389', relevance=1, iteration='0'),
 TrecQrel(query_id='312651', doc_id='616', relevance=1, iteration='0'),
 TrecQrel(query_id='80385', doc_id='723', relevance=1, iteration='0'),
 TrecQrel(query_id='645590', doc_id='944', relevance=1, iteration='0'),
 TrecQrel(query_id='645337', doc_id='1054', relevance=1, iteration='0'),
 TrecQrel(query_id='186154', doc_id='1160', relevance=1, iteration='0'),
 TrecQrel(query_id='457407', doc_id='1172', relevance=1, iteration='0'),
 TrecQrel(query_id='441383', doc_id='1389', relevance=1, iteration='0'),
 TrecQrel(query_id='683408', doc_id='1605', relevance=1, iteration='0'),
 TrecQrel(query_id='1164799', doc_id='1713', relevance=1, iter

In [8]:
# import pandas as pd

# pd.read_csv(dataset.queries_iter())
for query in dataset.queries_iter():
    query

[INFO] If you have a local copy of https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz, you can symlink it here to avoid downloading it again: C:\Users\HP\.ir_datasets\downloads\c177b2795d5f2dcc524cf00fcd973be1
[INFO] [starting] https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz
[INFO] [finished] https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz: [01:54] [18.9MB] [164kB/s]
                                                                                                  

GenericQuery(query_id='121352', text='define extreme')
GenericQuery(query_id='634306', text='what does chattel mean on credit history')
GenericQuery(query_id='920825', text='what was the great leap forward brainly')
GenericQuery(query_id='510633', text='tattoo fixers how much does it cost')
GenericQuery(query_id='737889', text='what is decentralization process.')
GenericQuery(query_id='278900', text="how many cars enter the la jolla concours d' elegance?")
GenericQuery(query_id='674172', text='what is a bank transit number')
GenericQuery(query_id='303205', text='how much can i contribute to nondeductible ira')
GenericQuery(query_id='570009', text='what are the four major groups of elements')
GenericQuery(query_id='492875', text='sanitizer temperature')
GenericQuery(query_id='54528', text='blood clots in urine after menopause')
GenericQuery(query_id='203218', text='highmark address')
GenericQuery(query_id='473204', text='per sf cost in california for tenant build out')
GenericQuery(quer

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



GenericQuery(query_id='664557', text='what group of animals includes worms')
GenericQuery(query_id='1137754', text='who plays captain in the crown')
GenericQuery(query_id='168869', text="does medical marijuana saved people's lives")
GenericQuery(query_id='57072', text='bull zodiac meaning')
GenericQuery(query_id='232421', text='how far is the denver airport to colorado springs')
GenericQuery(query_id='443490', text='lughan name meaning')
GenericQuery(query_id='485523', text='rci points airfare cost')
GenericQuery(query_id='832303', text='what is the medical term for pumping heart chamber')
GenericQuery(query_id='557032', text='what are considered enteric precautions')
GenericQuery(query_id='1050394', text='who sang you are so beautiful')
GenericQuery(query_id='320248', text='how much hertz do fetal heart rate monitors use?')
GenericQuery(query_id='750754', text='what is georgia unemployment tax rate')
GenericQuery(query_id='20864', text='are animal warts contagious')
GenericQuery(query

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 7966: character maps to <undefined>