# Week 2, Lesson 3, Activity 7: Term-weighting techniques

&copy;2021, Ekaterina Kochmar \
(revised: Nadejda Roubtsova, June 2022)

Your task in this activity is to:

- Implement TF-IDF weighting using the material presented in this lesson.
- Apply these techniques to the collection of documents provided.
- Return the TF-IDF scores for the provided set of words.

## Step 1: Read in the data

There are three components to this data:
- documents with their ids and content – there are $1460$ of those to be precise;
- questions / queries with their ids and content – there are $112$ of those;
- mapping between the queries and relevant documents.

First, let's read in documents from the `CISI.ALL` file and store the result in the `documents` data structure where document contents are stored under corresponding document ids:

In [None]:
def read_documents():
    """Parse ``cisi/CISI.ALL`` into a mapping of document id to document text.

    Every line that starts with ``.`` opens a new field; the lines that
    follow it (until the next dot-prefixed line) are its continuation.  A
    ``.I <id>`` field sets the current document id, an ``.X`` field closes
    the current document (its own content is discarded), and the text of
    every other field is appended to the document content.

    Returns:
        dict: document id (a string such as ``"1"``) mapped to the
        space-joined text of its fields.

    Raises:
        OSError: if ``cisi/CISI.ALL`` cannot be opened.
        IndexError: if an ``.I`` line carries no id after the tag.
    """
    # Pass 1: fold physical lines so that each field sits on one logical line.
    # The pieces are collected in a list and joined once, and the file is
    # closed by the ``with`` block even if reading fails.
    pieces = []
    with open("cisi/CISI.ALL") as f:
        for a_line in f:
            separator = "\n" if a_line.startswith(".") else " "
            pieces.append(separator + a_line.strip())
    merged = "".join(pieces)

    # Pass 2: walk the logical lines and assemble one entry per document.
    documents = {}
    content_parts = []
    doc_id = ""
    for a_line in merged.split("\n"):
        if a_line.startswith(".I"):
            doc_id = a_line.split(" ")[1].strip()
        elif a_line.startswith(".X"):
            documents[doc_id] = "".join(content_parts)
            content_parts = []
            doc_id = ""
        else:
            # Drop the 3-character field prefix (tag plus the joining space).
            # The empty segment before the first "\n" lands here as well and
            # contributes a single space, exactly as in the original parser.
            content_parts.append(a_line.strip()[3:] + " ")
    return documents

# Load the collection once, then report its size and the text stored for id "1".
documents = read_documents()
print(
    f"{len(documents)} documents in total",
    "Document with id 1:",
    documents.get("1"),
    sep="\n",
)

Second, let's read in queries from the `CISI.QRY` file and store the result in the `queries` data structure where query contents are stored under corresponding query ids:

In [None]:
def read_queries():
    """Parse ``cisi/CISI.QRY`` into a mapping of query id to query text.

    Every line that starts with ``.`` opens a new field; the lines that
    follow it are its continuation.  A ``.I <id>`` field starts a new query,
    and only the text of ``.W`` and ``.T`` fields is kept; all other fields
    are ignored.

    Returns:
        dict: query id (a string such as ``"1"``) mapped to the space-joined
        text of its ``.W``/``.T`` fields.  An empty file yields an empty dict.

    Raises:
        OSError: if ``cisi/CISI.QRY`` cannot be opened.
        IndexError: if an ``.I`` line carries no id after the tag.
    """
    # Pass 1: fold physical lines so that each field sits on one logical line.
    # The file is closed by the ``with`` block even if reading fails.
    pieces = []
    with open("cisi/CISI.QRY") as f:
        for a_line in f:
            separator = "\n" if a_line.startswith(".") else " "
            pieces.append(separator + a_line.strip())
    merged = "".join(pieces)

    # Pass 2: a query has no closing tag, so it is flushed when the next
    # ".I" line is seen (or at the end of the input).
    queries = {}
    content = ""
    qry_id = ""
    for a_line in merged.split("\n"):
        if a_line.startswith(".I"):
            if content:
                queries[qry_id] = content
                content = ""
            qry_id = a_line.split(" ")[1].strip()
        elif a_line.startswith((".W", ".T")):
            # Drop the 3-character field prefix (tag plus the joining space).
            content += a_line.strip()[3:] + " "

    # Flush the last query; skip the flush when nothing at all was parsed so
    # that an empty file does not produce a bogus {"": ""} entry.
    if qry_id or content:
        queries[qry_id] = content
    return queries

# Load all queries once, then report how many there are and show the one with id "1".
queries = read_queries()
print(
    f"{len(queries)} queries in total",
    "Query with id 1:",
    queries.get("1"),
    sep="\n",
)

Finally, let's read in the mapping between the queries and the documents from the `CISI.REL` file. We'll keep it in the `mappings` data structure, where each query id (key) maps to the list of ids of the documents relevant to that query (value).

In [None]:
def read_mappings():
    """Parse ``cisi/CISI.REL`` into a mapping of query id to document ids.

    Each non-blank line is split on whitespace; the first field is used as
    the query id and the second as a document id.  Any further fields on the
    line are ignored.  Lines with fewer than two fields (e.g. a trailing
    blank line) are skipped instead of raising ``IndexError``.

    Returns:
        dict: query id (string) mapped to the list of document ids (strings)
        in the order they appear in the file.

    Raises:
        OSError: if ``cisi/CISI.REL`` cannot be opened.
    """
    mappings = {}
    # The ``with`` block closes the file even if parsing fails part-way.
    with open("cisi/CISI.REL") as f:
        for a_line in f:
            fields = a_line.split()
            if len(fields) < 2:
                continue  # blank or malformed line: nothing to record
            key, current_value = fields[0], fields[1]
            mappings.setdefault(key, []).append(current_value)
    return mappings

# Load the relevance judgements once, then list the query ids that have
# judgements and show the documents recorded for query "1".
mappings = read_mappings()
print(
    f"{len(mappings)} mappings in total",
    mappings.keys(),
    "Mapping for query with id 1:",
    mappings.get("1"),
    sep="\n",
)

## Step 2: Preprocess the data

Practise applying the following preprocessing steps:
- tokenize the texts
- put all to lowercase
- remove stopwords
- apply stemming

Implement and apply these steps to a sample text:

In [None]:
import nltk
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer

# Preprocess one text and return the list of stems of its words.
# EXERCISE STUB: the iterable of the list comprehension below is intentionally
# left blank, so this cell does not run until you complete it.
def process(text): 
    stoplist = set(stopwords.words('english'))  # English stop words from NLTK, as a set
    st = LancasterStemmer()  # every word that is kept is passed through st.stem()
    word_list = [st.stem(word) for word in 
                 # TODO: a tokenized list of words, all converted to lowercase,
                 # if the word is not in the stoplist and not a punctuation mark (from string.punctuation)
                 ]
    return word_list
  
# Try the preprocessing on the content of the document with id "27" and show the result.
word_list = process(documents.get("27"))
print(word_list)

## Step 3: Term weighting

First, calculate the term frequencies for each document and each query:

In [None]:
# Count how often each preprocessed word occurs in one text.
# Returns a dict mapping each word of word_list to its number of occurrences.
# EXERCISE STUB: stoplist and word_list are intentionally left blank, so this
# cell does not run until you complete them.
def get_terms(text): 
    terms = {}
    st = LancasterStemmer()
    stoplist = # TODO: as in process() above
    word_list = # TODO: as in process() above
    for word in word_list:
        # First occurrence starts from the default 0; later ones increment the count.
        terms[word] = terms.get(word, 0) + 1
    return terms

# Build the term-frequency dictionaries: one entry per document id and one per query id.
# EXERCISE STUB: the arguments of both get_terms(...) calls are left blank.
doc_terms = {}
qry_terms = {}
for doc_id in documents.keys():
    doc_terms[doc_id] = get_terms(# TODO: apply to the content of the document with id doc_id
                                  )
for qry_id in queries.keys():
    qry_terms[qry_id] = get_terms(# TODO: apply to the content of the query with id qry_id
                                  )


print(f"{len(doc_terms)} documents in total") # Sanity check – one entry per key of documents, so this should equal len(documents)
d1_terms = doc_terms.get("1")
print("Terms and frequencies for document with id 1:")
print(d1_terms)
print(f"{len(d1_terms)} terms in this document")
print()
print(f"{len(qry_terms)} queries in total") # Sanity check – one entry per key of queries, so this should equal len(queries)
q1_terms = qry_terms.get("1")
print("Terms and frequencies for query with id 1:")
print(q1_terms)
print(f"{len(q1_terms)} terms in this query")

Second, collect the shared vocabulary from all documents and queries:

In [None]:
# Collect every term found in the global doc_terms (and, once completed, qry_terms)
# and return the distinct terms as a sorted list.
# EXERCISE STUB: the body of the query loop is intentionally left blank, so this
# cell does not run until you complete it.
def collect_vocabulary():
    all_terms = []
    for doc_id in doc_terms.keys():
        for term in doc_terms.get(doc_id).keys():            
            all_terms.append(term)
    for qry_id in qry_terms.keys():
        # TODO: apply the same procedure to the query terms
    # set() removes duplicates; sorted() gives the vocabulary a fixed order.
    return sorted(set(all_terms))

# Build the shared vocabulary once and peek at its first entries (in sorted order).
all_terms = collect_vocabulary()
print(f"{len(all_terms)} terms in the shared vocabulary")
print("First 10:")
print(all_terms[:10])

Represent each document and query as vectors containing word counts in the shared space:

In [None]:
def vectorize(input_terms, shared_vocabulary):
    """Turn per-item term counts into count vectors over a shared vocabulary.

    Args:
        input_terms: dict mapping an item id (e.g. a document or query id)
            to a dict of ``term -> count`` for that item.
        shared_vocabulary: sequence of terms; its order fixes the position
            of each term in every output vector.

    Returns:
        dict: item id mapped to a list with one integer per vocabulary term:
        the item's count for that term, or 0 when the item lacks the term.
    """
    return {
        item_id: [
            int(counts[word]) if word in counts else 0
            for word in shared_vocabulary
        ]
        for item_id, counts in input_terms.items()
    }

# Build the raw-count vectors for documents and queries in the shared vocabulary space.
# EXERCISE STUB: the arguments of both vectorize(...) calls are left blank.
doc_vectors = vectorize(# TODO: apply vectorize to the doc_terms and the shared vocabulary all_terms
                        )
qry_vectors = vectorize(# TODO: apply vectorize to the qry_terms and the shared vocabulary all_terms
                        )

print(f"{len(doc_vectors)} document vectors") # One vector per input item – should equal the number of documents
d1460_vector = doc_vectors.get("1460")
print(f"{len(d1460_vector)} terms in this document") # Vector length – should equal the size of the shared vocabulary
print(f"{len(qry_vectors)} query vectors") # One vector per input item – should equal the number of queries
q112_vector = qry_vectors.get("112")
print(f"{len(q112_vector)} terms in this query") # Vector length – should equal the size of the shared vocabulary

In [None]:
import math

def calculate_idfs(shared_vocabulary, d_terms):
    """Compute a smoothed inverse document frequency for every vocabulary term.

    The score of a term is ``log10(N / (1 + df))`` where ``N`` is the number
    of entries in ``d_terms`` and ``df`` is the number of those entries that
    contain the term.  Because of the ``1 +`` smoothing, a term that occurs
    in every document gets a slightly negative score.

    Args:
        shared_vocabulary: iterable of terms to score.
        d_terms: dict mapping a document id to a dict keyed by the terms
            that occur in that document.

    Returns:
        dict: term mapped to its idf score (float).

    Raises:
        ValueError: if ``d_terms`` is empty while the vocabulary is not
            (``log`` of zero).
    """
    # Document frequency of every term, gathered in a single pass over the
    # collection instead of re-scanning all documents for each vocabulary term.
    doc_counts = {}
    for terms in d_terms.values():
        for term in terms.keys():
            doc_counts[term] = doc_counts.get(term, 0) + 1

    total_docs = float(len(d_terms))
    return {
        term: math.log(total_docs / float(1 + doc_counts.get(term, 0)), 10)
        for term in shared_vocabulary
    }

# Compute one idf score per term of the shared vocabulary.
# EXERCISE STUB: the arguments of the calculate_idfs(...) call are left blank.
doc_idfs = calculate_idfs(# TODO: apply calculate_idfs to the shared vocabulary all_terms and to doc_terms
                        )
print(f"{len(doc_idfs)} terms with idf scores") # One score per vocabulary term – should equal the size of the shared vocabulary
print("Idf score for the word system:")
# NOTE(review): doc_idfs is keyed by vocabulary terms, which are presumably stems;
# this lookup returns None unless "system" itself is one of those keys – confirm.
print(doc_idfs.get("system"))

In [None]:
# Build TF-IDF vectors: for every item in input_terms, one float per term of
# shared_vocabulary, equal to idf(term) * count(term) when the item has the term
# and 0.0 otherwise. Returns a dict mapping item id to that vector.
# EXERCISE STUB: the assignment to terms is intentionally left blank, so this
# cell does not run until you complete it.
def vectorize_idf(input_terms, input_idfs, shared_vocabulary):
    output = {}
    for item_id in input_terms.keys():
        terms = # TODO: collect terms from the document
        output_vector = []
        for term in shared_vocabulary:
            if term in terms.keys():
                # Weight the raw count of the term by its idf score.
                output_vector.append(input_idfs.get(term)*float(terms.get(term)))
            else:
                output_vector.append(float(0))
        output[item_id] = output_vector
    return output

# Build the TF-IDF document vectors.
# NOTE: this rebinds doc_vectors, replacing the raw-count vectors computed earlier.
# EXERCISE STUB: the arguments of the vectorize_idf(...) call are left blank.
doc_vectors = vectorize_idf(# TODO: apply to the relevant data structures
                            )

print(f"{len(doc_vectors)} document vectors") # One vector per input item – should equal the number of documents
print("Number of idf-scored words in a particular document:")
print(len(doc_vectors.get("1460"))) # Vector length – should equal the size of the shared vocabulary