In [1]:
from getpass import getpass
from os import environ

# Prefer the key from the environment so the notebook can run unattended.
if "CHATNOIR_API_KEY_STAGING" in environ:
    api_key = environ["CHATNOIR_API_KEY_STAGING"]
else:
    # Ask interactively without echoing: a plain input() prompt would leave the
    # typed key visible in the cell output that is saved with the notebook.
    api_key = getpass("ChatNoir API key: ")

In [2]:
from sys import modules

if "google.colab" in modules:
    !pip install -q chatnoir-pyterrier python-terrier

In [2]:
from pyterrier import init, started

# Call init() only when started() reports that PyTerrier has not been started yet,
# so the cell does not try to initialise it a second time when re-run.
if not started():
    init()

PyTerrier 0.9.1 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7



In [3]:
from requests import get
from pandas import DataFrame, read_xml
from pathlib import Path


def download_read_topics(url: str, path: Path, timeout: float = 60.0) -> DataFrame:
    """Load a topics XML file as a DataFrame, downloading it on first use.

    If ``path`` does not exist yet, the file is fetched from ``url`` and stored
    at ``path``; otherwise the existing local copy is reused.

    Args:
        url: Location of the topics XML file.
        path: Local file used as the download cache.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The parsed topics with ``number`` renamed to ``qid`` and ``title``
        renamed to ``query``; the ``description`` and ``narrative`` columns
        are dropped.

    Raises:
        Whatever the HTTP client raises for a failed request or an error
        status. In that case nothing is written to ``path``.
        KeyError: If ``description`` or ``narrative`` is missing from the file.
    """
    if not path.exists():
        # Download and validate BEFORE touching the file system: otherwise a
        # failed request would leave an empty or error-page file behind that
        # every later run mistakes for a valid cached copy.
        response = get(url, timeout=timeout)
        response.raise_for_status()
        path.write_bytes(response.content)
    return (
        read_xml(path)
        .rename(columns={"number": "qid", "title": "query"})
        .drop(columns=["description", "narrative"])
    )

In [4]:
# Load the topic files for both tasks; each one is downloaded only if its
# local copy does not exist yet.
topics_task_1 = download_read_topics(
    url="https://touche.webis.de/clef23/touche23-data/topics-task1.xml",
    path=Path("topics_task_1.xml"),
)
topics_task_2 = download_read_topics(
    url="https://touche.webis.de/clef23/touche23-data/topics-task2.xml",
    path=Path("topics_task_2.xml"),
)

In [5]:
# Show the loaded Task 1 topics (columns: qid, query).
topics_task_1

Unnamed: 0,qid,query
0,1,Should teachers get tenure?
1,2,Is vaping with e-cigarettes safe?
2,3,Should insider trading be allowed?
3,4,Should corporal punishment be used in schools?
4,5,Should social security be privatized?
5,6,Is a college education worth it?
6,7,Should felons who have completed their sentenc...
7,8,Should abortion be legal?
8,9,Should students have to wear school uniforms?
9,10,Should any vaccines be required for children?


In [121]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
import string
import stanza

# English stop-word set taken from NLTK.
# NOTE(review): in this cell only the commented-out preprocessing variant uses
# stop_words; presumably the NLTK 'stopwords' corpus must already be downloaded
# for this line to work — confirm.
stop_words = set(stopwords.words('english'))

# Stanza pipeline for English with the tokenize, pos and lemma processors, run on
# the CPU (use_gpu=False).
# NOTE(review): presumably the English Stanza models must be available locally — confirm.
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,lemma', use_gpu=False)

def lemmatize_sentence(sentence):
    """Return ``sentence`` with every word replaced by its lemma.

    The text is run through the module-level ``nlp`` pipeline; the lemmas of
    all words of all detected sentences are joined with single spaces.
    """
    parsed = nlp(sentence)
    lemmas = []
    for parsed_sentence in parsed.sentences:
        lemmas.extend(token.lemma for token in parsed_sentence.words)
    return ' '.join(lemmas)

def lemmatize_sentence_remove_punct(sentence):
    """Lemmatize ``sentence`` and delete every question mark from the result.

    The lemmatization itself is delegated to ``lemmatize_sentence`` so both
    functions always share one implementation. Despite the name, only ``'?'``
    is removed; all other punctuation is kept, and whitespace around a removed
    question mark is left untouched (e.g. a trailing space may remain).
    """
    return lemmatize_sentence(sentence).replace('?', '')

# Hand-picked function words that are removed from the (lemmatized) queries.
custom_stopword_list = ['be', 'a', 'the', 'do', 'to', 'at', 'we', 'for', 'or', 'in', 'it']

def remove_custom_stop_words(sentence):
    """Lower-case ``sentence`` and drop every word found in ``custom_stopword_list``.

    Words are separated on whitespace; the remaining words are joined with
    single spaces.
    """
    kept = []
    for token in sentence.split():
        lowered = token.lower()
        if lowered in custom_stopword_list:
            continue
        kept.append(lowered)
    return ' '.join(kept)

# Argumentation cue words that can be appended to a query.
custom_arg_words = ['pro', 'con', 'benefit', 'because', 'opinion', 'believe', 'think', 'issue', 'justify', 'debate', 'argument', 'evidence', 'fact', 'example', 'reason']

def add_arg_cues(sentence):
    """Return ``sentence`` followed by a space and all ``custom_arg_words``."""
    cue_suffix = ' '.join(custom_arg_words)
    return ' '.join((sentence, cue_suffix))

# Disabled preprocessing variants, kept for reference:
#   (a) drop the NLTK stop words and strip all punctuation
#topics_task_1['query'] = topics_task_1['query'].apply(lambda x: ' '.join([word.lower() for word in x.split() if word.lower() not in stop_words])\
#                                                  .translate(str.maketrans('', '', string.punctuation)))
#   (b) lemmatize only
#topics_task_1['query'] = topics_task_1['query'].apply(lemmatize_sentence)

# Active variant: lemmatize and drop '?', then remove the custom stop words.
# NOTE: the 'query' column is overwritten in place, so running this cell again
# preprocesses the already preprocessed queries a second time.
topics_task_1['query'] = (
    topics_task_1['query']
    .apply(lemmatize_sentence_remove_punct)
    .apply(remove_custom_stop_words)
)
#   (c) optionally append the argumentation cue words
#topics_task_1['query'] = topics_task_1['query'].apply(add_arg_cues)

2023-04-27 10:37:22 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |
| lemma     | ewt     |

2023-04-27 10:37:22 INFO: Use device: cpu
2023-04-27 10:37:22 INFO: Loading: tokenize
2023-04-27 10:37:22 INFO: Loading: pos
2023-04-27 10:37:23 INFO: Loading: lemma
2023-04-27 10:37:23 INFO: Done loading processors!


In [126]:
# Show the Task 1 topics after query preprocessing.
# NOTE(review): the saved output of this cell shows queries with appended cue words
# ("pro con benefit ..."), although the add_arg_cues step is commented out in the
# preprocessing cell — the output looks stale; re-run after a kernel restart.
topics_task_1

Unnamed: 0,qid,query
0,1,should teacher get tenure pro con benefit beca...
1,2,vape with e-cigarette safe pro con benefit bec...
2,3,should insider trading allow pro con benefit b...
3,4,should corporal punishment use school pro con ...
4,5,should social security privatize pro con benef...
5,6,college education worth pro con benefit becaus...
6,7,should felon who have complete they sentence a...
7,8,should abortion legal pro con benefit because ...
8,9,should student have wear school uniform pro co...
9,10,should any vaccine require child pro con benef...


In [6]:
from chatnoir_pyterrier.retrieve import ChatNoirRetrieve, Feature
from chatnoir_api import Index
from chatnoir_pyterrier import ChatNoirRetrieve

from pathlib import Path

# Target file for the retrieved results (tab-separated).
output_path = Path('../../data/chatnoir_10_custom_stopw_lemmas.tsv')
# Create the target directory up front: the retrieval below is slow, and its
# result must not be lost to a missing-directory error when it is saved.
output_path.parent.mkdir(parents=True, exist_ok=True)

features = Feature.CONTENT_PLAIN | Feature.TITLE_TEXT  # plaintext and title
# NOTE(review): the leading "~" is presumably PyTerrier's caching operator — confirm.
chatnoir_all = ~ChatNoirRetrieve(api_key, index=Index.ClueWeb22, staging=True, features=features, verbose=True, num_results=10)

# Keep the results in a variable so they stay available in the kernel even if
# writing the file fails.
chatnoir_results = chatnoir_all.transform(topics_task_1)
chatnoir_results.to_csv(output_path, sep='\t', index=False)

Searching with ChatNoir: 100%|██████████| 50/50 [02:24<00:00,  2.89s/query]


Unnamed: 0,qid,query,docno,score,title_text,html_plain,rank
0,1,Should teachers get tenure?,clueweb22-en0031-49-02531,2800.194,Pro & Con Quotes: Should Teachers Get Tenure? ...,Last updated on: 1/13/2011 | Author: ProCon.or...,0
1,2,Is vaping with e-cigarettes safe?,clueweb22-en0040-72-06099,3060.89,Safe Vaping - Best-E-Cigarette-Guide,Category: Safe Vaping\n\nIf you're concerned a...,0
2,3,Should insider trading be allowed?,clueweb22-en0024-00-17639,1680.8656,INSIDER TRADING UNDER SEBI (PROHIBITION OF INS...,INSIDER TRADING UNDER SEBI (PROHIBITION OF INS...,0
3,4,Should corporal punishment be used in schools?,clueweb22-en0007-61-13734,3291.8857,Should corporal punishment be reintroduced in ...,Should corporal punishment be reintroduced in ...,0
4,5,Should social security be privatized?,clueweb22-en0020-45-14264,2150.8496,Controversial Essay: Should Public Sector Be P...,Controversial Essay: Should Public Sector Be P...,0
5,6,Is a college education worth it?,clueweb22-en0025-69-08332,2511.429,Pro & Con Quotes: Is a College Education Worth...,Last updated on: 1/27/2020 | Author: ProCon.or...,0
6,7,Should felons who have completed their sentenc...,clueweb22-en0005-72-03842,4479.922,Should People Who Have Completed Felony Senten...,Last updated on: 8/6/2021 | Author: ProCon.org...,0
7,8,Should abortion be legal?,clueweb22-en0028-26-19184,3196.5083,Should abortion be legal? | Debate.org,Should abortion be legal? | Debate.org\n • Ab...,0
8,9,Should students have to wear school uniforms?,clueweb22-en0034-17-10307,4018.7214,Should Students Have to Wear School Uniforms? ...,Should Students Have to Wear School Uniforms?\...,0
9,10,Should any vaccines be required for children?,clueweb22-en0034-75-08648,3538.7249,Pro & Con Quotes: Should Any Vaccines Be Requi...,Last updated on: 5/24/2019 | Author: ProCon.or...,0


In [128]:
#df_chat = pd.read_csv("../../data/chatnoir_50_custom_stopw_lemmas_argterms.tsv", sep='\t')

In [129]:
#df_chat.head()

Unnamed: 0,qid,query,docno,score,title_text,html_plain,rank
0,1,should teacher get tenure pro con benefit beca...,clueweb22-en0018-92-13182,1324.4674,[Pros & Cons] | White Coat Investor,This post was originally just a collection of ...,0
1,2,vape with e-cigarette safe pro con benefit bec...,clueweb22-en0026-12-06583,1443.133,Federal Register :: Treatment of E-Cigarettes ...,Federal Register :: Treatment of E-Cigarettes ...,0
2,3,should insider trading allow pro con benefit b...,clueweb22-en0027-68-01600,1396.5693,The Pros And Cons Of Bethesda's Creation Club,The Pros And Cons Of Bethesda's Creation Club\...,0
3,4,should corporal punishment use school pro con ...,clueweb22-en0037-17-09544,1926.3115,The History of Corporal Punishment - The Art a...,The History of Corporal Punishment - The Art a...,0
4,5,should social security privatize pro con benef...,clueweb22-en0038-21-01810,1007.98505,Why do some people think climate change is a h...,Why do some people think climate change is a h...,0
