Name: Sam Pierre-Louis, Joseph Champeau

CSI4107 Assignment 1 - Information Retreival System

In [3]:
#Import all necessary libraries
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from collections import defaultdict
import json

import math
from sklearn.feature_extraction.text import TfidfVectorizer

Step 1: Preprocessing
The preprocessing step involves tokenization, stopword removal, and optionally stemming.

Tokenization: Split the text into individual words.

Stopword Removal: Remove common words that do not contribute much to the meaning of the text.

Stemming: Reduce words to their root form.

In [4]:
# Load stopwords
with open('StopWords.txt', 'r') as f:
    stop_words = set(f.read().splitlines())

# Initialize stemmer
stemmer = PorterStemmer()

def preprocess(text):
    # Tokenization and lowercasing
    tokens = re.findall(r'\b\w+\b', text.lower())
    
    # Stopword removal
    tokens = [token for token in tokens if token not in stop_words]
    
    # Optional stemming
    tokens = [stemmer.stem(token) for token in tokens]
    
    return tokens

Step 2: Indexing
Build an inverted index where each term points to a list of documents that contain it.

In [None]:
# Initialize inverted index
inverted_index = defaultdict(list)

# Read corpus and build index
with open('C:\dev\CSI4107\Assignment 1\scifact\corpus.jsonl', 'r') as f:
    for line in f:
        doc = json.loads(line)
        doc_id = doc['_id']
        text = doc['title'] + ' ' + doc['text']
        tokens = preprocess(text)
        
        # Update inverted index
        for token in set(tokens):  # Use set to avoid duplicate entries
            inverted_index[token].append(doc_id)

# Save inverted index to a file
with open('inverted_index.json', 'w') as f:
    json.dump(inverted_index, f)

  with open('C:\dev\CSI4107\Assignment 1\scifact\corpus.jsonl', 'r') as f:


Step 3: Retrieval and Ranking
For each query, compute the cosine similarity between the query and each document using TF-IDF weighting.

In [12]:
# Compute document frequencies
doc_freq = {term: len(docs) for term, docs in inverted_index.items()}
total_docs = len(inverted_index)

# Function to compute TF-IDF
def compute_tfidf(tokens, doc_freq, total_docs):
    tfidf = {}
    for token in set(tokens):
        tf = tokens.count(token)
        idf = math.log(total_docs / doc_freq.get(token, 1))
        tfidf[token] = tf * idf
    return tfidf

#helper function to retrieve the text of a document given its ID
def get_document_text(doc_id):
    with open('C:\dev\CSI4107\Assignment 1\scifact\corpus.jsonl', 'r') as f:
        for line in f:
            doc = json.loads(line)
            if doc['_id'] == doc_id:
                return doc['title'] + ' ' + doc['text']
    return ''


# Function to compute cosine similarity
def cosine_similarity(query_tfidf, doc_tfidf):
    dot_product = sum(query_tfidf.get(token, 0) * doc_tfidf.get(token, 0) for token in query_tfidf)
    query_norm = math.sqrt(sum(val ** 2 for val in query_tfidf.values()))
    doc_norm = math.sqrt(sum(val ** 2 for val in doc_tfidf.values()))
    return dot_product / (query_norm * doc_norm) if query_norm * doc_norm != 0 else 0

# Read queries and compute similarity
results = []
with open('C:\dev\CSI4107\Assignment 1\scifact\queries.jsonl', 'r') as f:
    for line in f:
        query = json.loads(line)
        query_id = query['_id']
        query_text = query['text']
        query_tokens = preprocess(query_text)
        query_tfidf = compute_tfidf(query_tokens, doc_freq, total_docs)
        
        # Compute similarity with each document
        doc_scores = {}
        for term in query_tokens:
            if term in inverted_index:
                for doc_id in inverted_index[term]:
                    if doc_id not in doc_scores:
                        doc_text = get_document_text(doc_id)  # Function to get document text
                        doc_tokens = preprocess(doc_text)
                        doc_tfidf = compute_tfidf(doc_tokens, doc_freq, total_docs)
                        doc_scores[doc_id] = cosine_similarity(query_tfidf, doc_tfidf)
        
        # Sort documents by score
        sorted_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)
        
        # Save top-100 results
        for rank, (doc_id, score) in enumerate(sorted_docs[:100], start=1):
            results.append(f"{query_id} Q0 {doc_id} {rank} {score} run_name")

# Save results to file
with open('Results.txt', 'w') as f:
    f.write('\n'.join(results))

  with open('C:\dev\CSI4107\Assignment 1\scifact\corpus.jsonl', 'r') as f:
  with open('C:\dev\CSI4107\Assignment 1\scifact\queries.jsonl', 'r') as f:
  with open('C:\dev\CSI4107\Assignment 1\scifact\corpus.jsonl', 'r') as f:
  with open('C:\dev\CSI4107\Assignment 1\scifact\queries.jsonl', 'r') as f:


KeyboardInterrupt: 