## Loading the files from Abstract folder

In [11]:
import os
import re

folder_path = "Abstracts"
files = [] 


# filtering only .txt files
for file in os.listdir(folder_path):
    if file.endswith(".txt"):
        files.append(file)
        
files.sort() # increasing order


print("Found Files: ", len(files))
print("Files:", files)


Found Files:  448
Files: ['1.txt', '10.txt', '100.txt', '101.txt', '102.txt', '103.txt', '104.txt', '105.txt', '106.txt', '107.txt', '108.txt', '109.txt', '11.txt', '110.txt', '111.txt', '112.txt', '113.txt', '114.txt', '115.txt', '116.txt', '117.txt', '118.txt', '119.txt', '12.txt', '120.txt', '121.txt', '122.txt', '123.txt', '124.txt', '125.txt', '126.txt', '127.txt', '128.txt', '129.txt', '13.txt', '130.txt', '131.txt', '132.txt', '133.txt', '134.txt', '135.txt', '136.txt', '137.txt', '138.txt', '139.txt', '14.txt', '140.txt', '141.txt', '142.txt', '143.txt', '144.txt', '145.txt', '146.txt', '147.txt', '148.txt', '149.txt', '15.txt', '150.txt', '151.txt', '152.txt', '153.txt', '154.txt', '155.txt', '156.txt', '157.txt', '158.txt', '159.txt', '16.txt', '160.txt', '161.txt', '162.txt', '163.txt', '164.txt', '165.txt', '166.txt', '167.txt', '168.txt', '169.txt', '17.txt', '170.txt', '171.txt', '172.txt', '173.txt', '174.txt', '175.txt', '176.txt', '177.txt', '178.txt', '179.txt', '18.t

## Loading the Stopwords file

In [15]:
stopwords_file = "Stopword-List.txt"

stopwords = set() 


with open(stopwords_file, 'r', encoding='utf-8') as f:

    for line in f:
        word = line.strip().lower() #remove extra spaces and convert to lower case
        
        if word != "":
            stopwords.add(word)

print("Loaded stopwords: ",len(stopwords))
print("Stopwords:", stopwords)


Loaded stopwords:  26
Stopwords: {'on', 'had', 'we', 'are', 'his', 'at', 'has', 'no', 'am', 'to', 'in', 'her', 'do', 'of', 'can', 'the', 'for', 'have', 'a', 'all', 'up', 'be', 'is', 'once', 'as', 'and'}


## Preprocessing each file

#### Importing Requried Libs

In [17]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer


In [30]:
raw_docs = {}  


for file in files:
    file_path = os.path.join(folder_path, file)

    # read each file and store its content ino content
    with open(file_path, 'r', encoding='latin1') as f:  # latin1 encoding to avoid decoding issues
        content = f.read()
    
    # converting string filename (like "1.txt") into a numeric document ID
    base_name = os.path.splitext(file)[0]
    docID = int(base_name)
    raw_docs[docID] = content

print("Raw documents have been read.")


Raw documents have been read.


In [31]:
raw_docs

{1: 'Ensemble Statistical and Heuristic Models for Unsupervised Word Alignment\n\nstatistical word alignment, ensemble learning, heuristic word alignment\n\nStatistical word alignment models need large amount of training data while they are weak in small-size corpora. This paper proposes a new approach of unsupervised hybrid word alignment technique using ensemble learning method. This algorithm uses three base alignment models in several rounds to generate alignments. The ensemble algorithm uses a weighed scheme for resampling training data and a voting score to consider aggregated alignments. The underlying alignment algorithms used in this study include IBM Model 1, 2 and a heuristic method based on Dice measurement. Our experimental results show that by this approach, the alignment error rate could be improved by at least %15 for the base alignment models.',
 10: 'Detection of abnormal human behavior using a matrix approximation-based approach\n\nabnormal event detection, low-rank 

In [32]:
raw_docs = dict(sorted(raw_docs.items()))

In [33]:
raw_docs

{1: 'Ensemble Statistical and Heuristic Models for Unsupervised Word Alignment\n\nstatistical word alignment, ensemble learning, heuristic word alignment\n\nStatistical word alignment models need large amount of training data while they are weak in small-size corpora. This paper proposes a new approach of unsupervised hybrid word alignment technique using ensemble learning method. This algorithm uses three base alignment models in several rounds to generate alignments. The ensemble algorithm uses a weighed scheme for resampling training data and a voting score to consider aggregated alignments. The underlying alignment algorithms used in this study include IBM Model 1, 2 and a heuristic method based on Dice measurement. Our experimental results show that by this approach, the alignment error rate could be improved by at least %15 for the base alignment models.',
 2: 'Improving Spectral Learning by Using Multiple Representations\n\nrepresentation, spectral learning, discrete fourier, ba

## converting text into lower case


In [34]:
lower_docs = {} 

for docID in raw_docs:
    # converting all text to lower case
    lower_docs[docID] = raw_docs[docID].lower()

print("All document's content converted to lower case.")


All document's content converted to lower case.


In [35]:
lower_docs

{1: 'ensemble statistical and heuristic models for unsupervised word alignment\n\nstatistical word alignment, ensemble learning, heuristic word alignment\n\nstatistical word alignment models need large amount of training data while they are weak in small-size corpora. this paper proposes a new approach of unsupervised hybrid word alignment technique using ensemble learning method. this algorithm uses three base alignment models in several rounds to generate alignments. the ensemble algorithm uses a weighed scheme for resampling training data and a voting score to consider aggregated alignments. the underlying alignment algorithms used in this study include ibm model 1, 2 and a heuristic method based on dice measurement. our experimental results show that by this approach, the alignment error rate could be improved by at least %15 for the base alignment models.',
 2: 'improving spectral learning by using multiple representations\n\nrepresentation, spectral learning, discrete fourier, ba

## removing punctuations

In [36]:
punct_docs = {}  # contnet after punctuation removal

for docID in lower_docs:
    # removing punctuation and non-alpha chars with a space
    punct_docs[docID] = re.sub(r"[^a-z\s]", " ", lower_docs[docID])

print("Punctuation and non-alpha characters removed.")


Punctuation and non-alpha characters removed.


In [37]:
punct_docs

{1: 'ensemble statistical and heuristic models for unsupervised word alignment\n\nstatistical word alignment  ensemble learning  heuristic word alignment\n\nstatistical word alignment models need large amount of training data while they are weak in small size corpora  this paper proposes a new approach of unsupervised hybrid word alignment technique using ensemble learning method  this algorithm uses three base alignment models in several rounds to generate alignments  the ensemble algorithm uses a weighed scheme for resampling training data and a voting score to consider aggregated alignments  the underlying alignment algorithms used in this study include ibm model      and a heuristic method based on dice measurement  our experimental results show that by this approach  the alignment error rate could be improved by at least     for the base alignment models ',
 2: 'improving spectral learning by using multiple representations\n\nrepresentation  spectral learning  discrete fourier  ba

## removing white spaces

In [38]:
norm_docs = {} # content after removal of white spaces

for docID in punct_docs:
    # removing extra spaces and newlines'\n'
    norm_docs[docID] = re.sub(r"\s+", " ", punct_docs[docID]).strip()

print("Extra spaces and newlines removed.")


Extra spaces and newlines removed.


In [39]:
norm_docs

{1: 'ensemble statistical and heuristic models for unsupervised word alignment statistical word alignment ensemble learning heuristic word alignment statistical word alignment models need large amount of training data while they are weak in small size corpora this paper proposes a new approach of unsupervised hybrid word alignment technique using ensemble learning method this algorithm uses three base alignment models in several rounds to generate alignments the ensemble algorithm uses a weighed scheme for resampling training data and a voting score to consider aggregated alignments the underlying alignment algorithms used in this study include ibm model and a heuristic method based on dice measurement our experimental results show that by this approach the alignment error rate could be improved by at least for the base alignment models',
 2: 'improving spectral learning by using multiple representations representation spectral learning discrete fourier basis selection ensemble spectra

## Tokenizing the sentences

In [40]:
token_docs = {}  # for tokenized words

for docID in norm_docs:
    
    token_docs[docID] = word_tokenize(norm_docs[docID])  # breaking content into tokens

print("Documents tokenized into words.")


Documents tokenized into words.


In [43]:
for key,value in token_docs.items():
    print(f"DocID #{key} : -> Tokens: {value}\n")

DocID #1 : -> Tokens: ['ensemble', 'statistical', 'and', 'heuristic', 'models', 'for', 'unsupervised', 'word', 'alignment', 'statistical', 'word', 'alignment', 'ensemble', 'learning', 'heuristic', 'word', 'alignment', 'statistical', 'word', 'alignment', 'models', 'need', 'large', 'amount', 'of', 'training', 'data', 'while', 'they', 'are', 'weak', 'in', 'small', 'size', 'corpora', 'this', 'paper', 'proposes', 'a', 'new', 'approach', 'of', 'unsupervised', 'hybrid', 'word', 'alignment', 'technique', 'using', 'ensemble', 'learning', 'method', 'this', 'algorithm', 'uses', 'three', 'base', 'alignment', 'models', 'in', 'several', 'rounds', 'to', 'generate', 'alignments', 'the', 'ensemble', 'algorithm', 'uses', 'a', 'weighed', 'scheme', 'for', 'resampling', 'training', 'data', 'and', 'a', 'voting', 'score', 'to', 'consider', 'aggregated', 'alignments', 'the', 'underlying', 'alignment', 'algorithms', 'used', 'in', 'this', 'study', 'include', 'ibm', 'model', 'and', 'a', 'heuristic', 'method', 'b

## Removal of StopWords and stemming

In [44]:
stemmer = PorterStemmer()

processed_docs = {}  #  final processed text

for docID in token_docs:
    processed_tokens = []  # list to hold processed tokens for curr document
    for token in token_docs[docID]:
        if token not in stopwords:
            # applying stemming on token
            processed_tokens.append(stemmer.stem(token))
    
    # Join the tokens back into a single string
    processed_text = " ".join(processed_tokens)
    processed_docs[docID] = processed_text

print("All documents have been preprocessed.")


All documents have been preprocessed.


In [45]:
processed_docs

{1: 'ensembl statist heurist model unsupervis word align statist word align ensembl learn heurist word align statist word align model need larg amount train data while they weak small size corpora thi paper propos new approach unsupervis hybrid word align techniqu use ensembl learn method thi algorithm use three base align model sever round gener align ensembl algorithm use weigh scheme resampl train data vote score consid aggreg align underli align algorithm use thi studi includ ibm model heurist method base dice measur our experiment result show that by thi approach align error rate could improv by least base align model',
 2: 'improv spectral learn by use multipl represent represent spectral learn discret fourier basi select ensembl spectral learn algorithm learn an unknown function by learn spectral e g fourier represent function howev there mani possibl spectral represent none which will best situat consequ it seem natur consid how spectral learner could make use multipl represent

# Building Indexes:-

### Inverted Index

In [46]:
from nltk.tokenize import word_tokenize

invertedIndex = {}


for docID, content in processed_docs.items():
    
    words = word_tokenize(content)   # breaking sentences into words
    #print(words)
    
    
    for position, word in enumerate(words):
        # INVERTED INDEX 
        
        if word not in invertedIndex:
            invertedIndex[word] = []
        if docID not in invertedIndex[word]:
            invertedIndex[word].append(docID)


In [51]:
for key,value in invertedIndex.items():
    print(f"Term: '{key}' -> DocIDs: {value}\n")

Term: 'ensembl' -> DocIDs: [1, 2, 3, 5, 32, 52, 89, 105, 120, 171, 198, 229, 256, 262, 268, 284, 310, 311, 327, 352, 378, 386, 425]

Term: 'statist' -> DocIDs: [1, 14, 15, 17, 41, 42, 43, 60, 71, 72, 92, 102, 112, 115, 116, 121, 128, 145, 147, 156, 158, 170, 190, 193, 194, 202, 204, 208, 228, 255, 269, 283, 319, 320, 336, 343, 355, 368, 370, 385, 393, 405, 429, 430, 438, 445, 447, 448]

Term: 'heurist' -> DocIDs: [1, 93, 136, 174, 213, 257, 306, 413, 429, 435]

Term: 'model' -> DocIDs: [1, 2, 3, 4, 5, 9, 10, 11, 13, 16, 18, 19, 20, 22, 29, 32, 34, 35, 36, 37, 39, 42, 43, 46, 51, 52, 54, 55, 59, 61, 62, 65, 66, 70, 71, 73, 77, 79, 86, 89, 92, 98, 99, 103, 104, 105, 110, 113, 114, 117, 119, 120, 121, 130, 131, 132, 133, 134, 136, 137, 139, 140, 142, 143, 145, 146, 147, 148, 149, 151, 153, 156, 157, 158, 159, 161, 163, 164, 165, 166, 167, 169, 178, 182, 188, 189, 190, 195, 196, 197, 198, 201, 202, 204, 205, 207, 209, 213, 216, 217, 221, 227, 228, 229, 230, 231, 232, 234, 236, 237, 239, 24

In [53]:
invertedIndex = dict(sorted(invertedIndex.items()))

In [55]:
for key,value in invertedIndex.items():
    print(f"'{key}' -> DocIDs: {value}\n")

'aal' -> DocIDs: [327]

'aapl' -> DocIDs: [114]

'abbrevi' -> DocIDs: [128]

'abc' -> DocIDs: [226]

'abdomin' -> DocIDs: [222]

'abduct' -> DocIDs: [284]

'abil' -> DocIDs: [5, 8, 22, 23, 41, 49, 52, 54, 55, 69, 84, 104, 106, 114, 141, 143, 149, 155, 157, 171, 191, 206, 239, 251, 257, 365, 372, 374, 384, 431]

'abl' -> DocIDs: [18, 50, 58, 66, 68, 93, 95, 135, 136, 154, 155, 158, 162, 168, 197, 205, 225, 239, 256, 259, 260, 267, 288, 291, 292, 297, 310, 313, 326, 328, 335, 358, 386, 408, 439, 447]

'abnorm' -> DocIDs: [10, 131, 197, 203, 276, 321, 329, 405]

'abound' -> DocIDs: [112]

'about' -> DocIDs: [4, 5, 19, 48, 54, 55, 81, 83, 84, 91, 93, 127, 129, 132, 133, 150, 151, 158, 178, 182, 186, 188, 206, 213, 217, 253, 281, 283, 291, 294, 308, 340, 344, 356, 365, 373, 392, 445]

'abov' -> DocIDs: [30, 49, 271, 276]

'abp' -> DocIDs: [138]

'absenc' -> DocIDs: [51, 98, 140, 435]

'absent' -> DocIDs: [258]

'absolut' -> DocIDs: [217, 388, 419, 431, 437]

'abstract' -> DocIDs: [17, 45, 1

### Positional Index

In [56]:
from nltk.tokenize import word_tokenize

positionalIndex = {}

for docID, content in processed_docs.items():
    
    words = word_tokenize(content)   # breaking sentences into words
    #print(words)
    
    for pos, word in enumerate(words):
        # POSITIONAL INDEX 
        
        if word not in positionalIndex:
            positionalIndex[word] = {}  #add a dict i word not pressent
        if docID not in positionalIndex[word]:
            positionalIndex[word][docID] = []  # add a list if docID not present
        positionalIndex[word][docID].append(pos)  # store the position in that list
        

In [57]:
positionalIndex

{'ensembl': {1: [0, 10, 41, 55],
  2: [14, 72, 90],
  3: [2, 18, 51, 62, 78],
  5: [101],
  32: [151, 170],
  52: [175],
  89: [7, 24, 45, 66, 87, 123],
  105: [4, 63],
  120: [53, 112, 152],
  171: [9, 139],
  198: [8, 51, 75],
  229: [11, 63, 74],
  256: [2, 9, 67, 86, 97, 129],
  262: [119],
  268: [195],
  284: [5, 10, 53, 60, 131],
  310: [161],
  311: [6, 9, 110],
  327: [11, 84],
  352: [79, 99],
  378: [6, 14, 59],
  386: [56],
  425: [105]},
 'statist': {1: [1, 7, 15],
  14: [48],
  15: [2, 4, 17, 87],
  17: [54],
  41: [14, 127],
  42: [35],
  43: [101],
  60: [7],
  71: [43],
  72: [62],
  92: [15, 51],
  102: [14, 87, 95],
  112: [116],
  115: [0],
  116: [0, 5],
  121: [109, 166],
  128: [99, 119],
  145: [72],
  147: [8],
  156: [0, 16, 51],
  158: [36],
  170: [127],
  190: [39],
  193: [157],
  194: [42, 60],
  202: [83],
  204: [0, 14, 78, 100],
  208: [102, 168],
  228: [43],
  255: [40],
  269: [133],
  283: [122],
  319: [94],
  320: [18, 62, 100, 108, 113],
  336: 

In [58]:
positionalIndex = dict(sorted(positionalIndex.items()))

In [59]:
positionalIndex

{'aal': {327: [20, 51, 100]},
 'aapl': {114: [32]},
 'abbrevi': {128: [88]},
 'abc': {226: [93]},
 'abdomin': {222: [4, 21, 61]},
 'abduct': {284: [42]},
 'abil': {5: [31, 36],
  8: [59],
  22: [41],
  23: [68],
  41: [30],
  49: [34],
  52: [114],
  54: [104],
  55: [96],
  69: [110],
  84: [82],
  104: [140],
  106: [150],
  114: [106],
  141: [65],
  143: [140],
  149: [39],
  155: [21],
  157: [40],
  171: [104],
  191: [81],
  206: [66, 112],
  239: [20],
  251: [86],
  257: [116],
  365: [96],
  372: [109],
  374: [18],
  384: [113],
  431: [62]},
 'abl': {18: [27],
  50: [104],
  58: [70],
  66: [22],
  68: [82],
  93: [182, 191],
  95: [88],
  135: [34],
  136: [28],
  154: [137],
  155: [33],
  158: [150],
  162: [30],
  168: [121],
  197: [169],
  205: [73],
  225: [100],
  239: [144],
  256: [133],
  259: [169],
  260: [91],
  267: [139],
  288: [27, 34, 98],
  291: [81],
  292: [179],
  297: [90],
  310: [92],
  313: [95],
  326: [94],
  328: [96],
  335: [121],
  358: [100

In [61]:
for key,value in positionalIndex.items():
    print(f"'{key}' -> : {value}\n")

'aal' -> : {327: [20, 51, 100]}

'aapl' -> : {114: [32]}

'abbrevi' -> : {128: [88]}

'abc' -> : {226: [93]}

'abdomin' -> : {222: [4, 21, 61]}

'abduct' -> : {284: [42]}

'abil' -> : {5: [31, 36], 8: [59], 22: [41], 23: [68], 41: [30], 49: [34], 52: [114], 54: [104], 55: [96], 69: [110], 84: [82], 104: [140], 106: [150], 114: [106], 141: [65], 143: [140], 149: [39], 155: [21], 157: [40], 171: [104], 191: [81], 206: [66, 112], 239: [20], 251: [86], 257: [116], 365: [96], 372: [109], 374: [18], 384: [113], 431: [62]}

'abl' -> : {18: [27], 50: [104], 58: [70], 66: [22], 68: [82], 93: [182, 191], 95: [88], 135: [34], 136: [28], 154: [137], 155: [33], 158: [150], 162: [30], 168: [121], 197: [169], 205: [73], 225: [100], 239: [144], 256: [133], 259: [169], 260: [91], 267: [139], 288: [27, 34, 98], 291: [81], 292: [179], 297: [90], 310: [92], 313: [95], 326: [94], 328: [96], 335: [121], 358: [100], 386: [92], 408: [108], 439: [108], 447: [113]}

'abnorm' -> : {10: [1, 9, 19, 34, 67, 87], 13

## Verifying the indices of Inverted & Positional Indexes of word 'abdomin'. i.e: [4,18,58]

In [62]:
invertedIndex['abdomin']

[222]

In [63]:
positionalIndex['abdomin']

{222: [4, 21, 61]}

In [64]:
processed_docs[222]

'artifici neural network base abdomin organ segment review organ segment neural network mr imag ct imag there mani neural network base abdomin organ segment approach from medic imag comput tomographi imag were mostli use these approach appli techniqu usual base prior inform regard posit shape size organ these method literatur there onli few neural network base techniqu that were implement segment abdomin organ from magnet reson base imag thi paper present these method their result'

In [65]:
text = processed_docs[222]
words = text.split()  # Tokenize text
indices = [i for i, word in enumerate(words) if word == "abdomin"]  # Find all occurrences
print(indices)

# hence, verified. Our Both Inverted and Positional Index is pointing to correct terms in all docs.


[4, 21, 61]


## Saving both Indexes in .json format

In [66]:
import json

positionalIndex = dict(sorted(positionalIndex.items())) # sorting the positionalIndec by keys(words)

# saving inverted index
# Writing JSON with compact formatting (each word in a new line)
with open("inverted_index_updt.json", "w") as f:
    for word, doc_ids in invertedIndex.items():
        json.dump({word: doc_ids}, f, separators=(',', ':'), ensure_ascii=False)
        f.write("\n")  # New line for each word

# ssaving positional index
with open("positional_index_updt.json", "w") as f:
    for word, doc_data in positionalIndex.items():
        json.dump({word: doc_data}, f, separators=(',', ':'), ensure_ascii=False)
        f.write("\n")  # Write each word in a new line
