In [27]:
import os
import nltk
from google.colab import drive
import pandas as pd
import re
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer

In [28]:
# Mount Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [29]:
nltk.download('words')
from nltk.corpus import words

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [30]:
# Helper function to remove punctuation and digits
def remove_punctuation(text):
    import string
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    text = ''.join([char for char in text if not char.isdigit()])
    return text

In [31]:
# Function to create inverted index
def invertor_index(folder_path):
    inverted_index = {}
    unique_tokens = set()

    # List all files in the specified folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r') as file:
                data = file.read()
                data = remove_punctuation(data)
                tokens = nltk.word_tokenize(data)

                for token in tokens:
                    if token not in inverted_index:
                        inverted_index[token] = set()
                    inverted_index[token].add(filename)
                    unique_tokens.add(token)
    inverted_index = dict(sorted(inverted_index.items()))
    return inverted_index, unique_tokens

In [32]:
folder_path = '/content/drive/MyDrive/CSE 419/Lab Assign-4/1000_documents'


In [33]:
# Generate inverted index and unique tokens
inverted_index, unique_tokens = invertor_index(folder_path)


In [35]:
inverted_index

{'A': {'business_10.txt',
  'business_14.txt',
  'business_15.txt',
  'business_16.txt',
  'business_17.txt',
  'business_18.txt',
  'business_21.txt',
  'business_23.txt',
  'business_25.txt',
  'business_27.txt',
  'business_28.txt',
  'business_29.txt',
  'business_33.txt',
  'business_35.txt',
  'business_36.txt',
  'business_37.txt',
  'business_43.txt',
  'business_46.txt',
  'business_49.txt',
  'business_53.txt',
  'business_56.txt',
  'business_57.txt',
  'business_58.txt',
  'business_61.txt',
  'business_62.txt',
  'business_63.txt',
  'business_66.txt',
  'business_77.txt',
  'business_78.txt',
  'business_82.txt',
  'business_83.txt',
  'business_85.txt',
  'business_86.txt',
  'business_87.txt',
  'business_93.txt',
  'business_94.txt',
  'business_97.txt',
  'entertainment_10.txt',
  'entertainment_100.txt',
  'entertainment_13.txt',
  'entertainment_17.txt',
  'entertainment_2.txt',
  'entertainment_26.txt',
  'entertainment_28.txt',
  'entertainment_29.txt',
  'enterta

In [34]:
print(f"Total Unique Tokens: {len(unique_tokens)}")
print(len(inverted_index))
print(f"Inverted Index Sample: {dict(list(inverted_index.items())[:5])}")

Total Unique Tokens: 34564
34564
Inverted Index Sample: {'A': {'politics_211.txt', 'entertainment_40.txt', 'technologie_71.txt', 'technologie_94.txt', 'politics_112.txt', 'politics_319.txt', 'technologie_96.txt', 'technologie_74.txt', 'business_66.txt', 'business_97.txt', 'medical_488.txt', 'medical_644.txt', 'medical_660.txt', 'politics_101.txt', 'space_38.txt', 'entertainment_43.txt', 'entertainment_70.txt', 'historical_39.txt', 'historical_36.txt', 'business_85.txt', 'graphics_57.txt', 'business_16.txt', 'technologie_37.txt', 'historical_81.txt', 'graphics_45.txt', 'medical_646.txt', 'medical_300.txt', 'politics_294.txt', 'historical_62.txt', 'politics_138.txt', 'technologie_32.txt', 'entertainment_50.txt', 'food_86.txt', 'medical_58.txt', 'technologie_14.txt', 'medical_692.txt', 'graphics_37.txt', 'space_84.txt', 'entertainment_98.txt', 'food_57.txt', 'entertainment_13.txt', 'business_82.txt', 'space_30.txt', 'politics_250.txt', 'sport_68.txt', 'medical_691.txt', 'food_100.txt', 's

In [36]:
# Filter tokens based on English words
english_words = set(words.words())
final_indexed = {}

In [37]:
for token, files in inverted_index.items():
    if token.lower() in english_words:
        final_indexed[token] = files

In [38]:
final_indexed.keys()



In [39]:
len(final_indexed)

15380

In [40]:
# Create DataFrame for the original inverted index
data = {
    'Freq.': [len(posting_list) for posting_list in final_indexed.values()],
    'Term': final_indexed.keys(),
    'Posting List': [list(posting_list) for posting_list in final_indexed.values()]
}

In [42]:
original_df = pd.DataFrame(data)
original_df

Unnamed: 0,Freq.,Term,Posting List
0,294,A,"[politics_211.txt, entertainment_40.txt, techn..."
1,3,AA,"[space_14.txt, historical_41.txt, graphics_26...."
2,1,ABOVE,[historical_4.txt]
3,1,ACE,[space_14.txt]
4,1,ACT,[graphics_74.txt]
...,...,...,...
15375,4,zinc,"[sport_88.txt, sport_49.txt, food_97.txt, food..."
15376,2,zip,"[technologie_13.txt, technologie_76.txt]"
15377,1,zo,[space_70.txt]
15378,6,zone,"[historical_18.txt, historical_35.txt, histori..."


In [43]:
# Create a concatenated string of unique terms
completestring = ""
unique_tokens = sorted(unique_tokens)
index_mapping = {}  # Dictionary to store the index of each term

In [44]:
current_index = 0  # Track the current position in the concatenated string

for s in final_indexed.keys():
    completestring += s
    index_mapping[s] = current_index  # Store the current index for this term
    current_index += len(s)  # Move the index forward by the length of the term

In [45]:
print(f"Length of complete string: {len(completestring)}")
print(f"Concatenated string: {completestring}")
print(f"Index mapping: {index_mapping}")

Length of complete string: 110439


In [46]:
# Create minimized DataFrame
min_df = original_df
min_df['Index'] = original_df['Term'].map(index_mapping)
min_df = min_df.drop(columns=['Term'])

In [47]:
min_df

Unnamed: 0,Freq.,Posting List,Index
0,294,"[politics_211.txt, entertainment_40.txt, techn...",0
1,3,"[space_14.txt, historical_41.txt, graphics_26....",1
2,1,[historical_4.txt],3
3,1,[space_14.txt],8
4,1,[graphics_74.txt],11
...,...,...,...
15375,4,"[sport_88.txt, sport_49.txt, food_97.txt, food...",110418
15376,2,"[technologie_13.txt, technologie_76.txt]",110422
15377,1,[space_70.txt],110425
15378,6,"[historical_18.txt, historical_35.txt, histori...",110427


In [48]:
original_df = original_df.drop(columns=['Index'])
original_df
min_df

Unnamed: 0,Freq.,Posting List,Index
0,294,"[politics_211.txt, entertainment_40.txt, techn...",0
1,3,"[space_14.txt, historical_41.txt, graphics_26....",1
2,1,[historical_4.txt],3
3,1,[space_14.txt],8
4,1,[graphics_74.txt],11
...,...,...,...
15375,4,"[sport_88.txt, sport_49.txt, food_97.txt, food...",110418
15376,2,"[technologie_13.txt, technologie_76.txt]",110422
15377,1,[space_70.txt],110425
15378,6,"[historical_18.txt, historical_35.txt, histori...",110427


In [49]:
df_memory_usage = original_df.memory_usage(deep=True).sum()
print(f"Original DataFrame size in bytes: {df_memory_usage}")


Original DataFrame size in bytes: 3396923


In [50]:
df_min_usage = min_df.memory_usage(deep=True).sum()
print(f"Minimised DataFrame size in bytes: {df_min_usage}")

Minimised DataFrame size in bytes: 2532864


In [51]:
print(f"Memory saved: {df_memory_usage - df_min_usage}")

Memory saved: 864059


In [52]:
original_memory = original_df.memory_usage(deep=True).sum()
minimized_memory = min_df.memory_usage(deep=True).sum()

In [53]:
memory_saved = original_memory - minimized_memory
percentage_saved = (memory_saved / original_memory) * 100

In [54]:
rows_original = original_df.shape[0]
rows_minimized = min_df.shape[0]

In [55]:
avg_memory_per_row_original = original_memory / rows_original
avg_memory_per_row_minimized = minimized_memory / rows_minimized

In [56]:
original_column_memory = original_df.memory_usage(deep=True)
minimized_column_memory = min_df.memory_usage(deep=True)

In [57]:
print(f"Original DataFrame size: {original_memory} bytes")
print(f"Minimized DataFrame size: {minimized_memory} bytes")
print(f"Memory saved: {memory_saved} bytes")
print(f"Percentage of memory saved: {percentage_saved:.2f}%")

print(f"Average memory usage per row (Original): {avg_memory_per_row_original:.2f} bytes")
print(f"Average memory usage per row (Minimized): {avg_memory_per_row_minimized:.2f} bytes")

Original DataFrame size: 3396923 bytes
Minimized DataFrame size: 2532864 bytes
Memory saved: 864059 bytes
Percentage of memory saved: 25.44%
Average memory usage per row (Original): 220.87 bytes
Average memory usage per row (Minimized): 164.69 bytes
