In [4]:
import re
import os
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
import math
import collections
import pickle

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/harshil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/harshil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Function to process text
def process_text():
    stop_words = set(stopwords.words('english'))
    stop_words.update(['redirect', 'category', 'total', 'films', 'list', 'awards', 'following', 'characteristics', 'consist', 'demonic', 'pages', 'alphabet', 'prefixes', 'suffixes', 'alphabetical', 'according', 'language', 'industry', 'birth'])
    punctuation = set(string.punctuation)
    fileNames = [filename for filename in os.listdir() if filename.startswith("page") and filename.endswith(".txt")]
    for filename in fileNames:
        with open(os.path.join(filename), 'r', encoding='utf-8') as f:
            text = f.read().lower()
            
            # Tokenize text
            processed_text = ''.join(sent_tokenize(text))
            tokens = word_tokenize(processed_text)
            
            # Remove stopwords and punctuation
            wnl = WordNetLemmatizer()
            processed_text = [wnl.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words and word not in punctuation]
            
            # Write processed text to a new file
            number = re.sub("\D", "", filename)
            new_filename = f"cleans{number}.txt"
            filepath = os.path.join(new_filename)
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(' '.join(processed_text))

In [6]:
# Clean up the files and store them
process_text()

In [7]:
# Function to read text from files and create TF dictionaries
def create_tf_dicts():
    tf_dicts = []
    vocab = set()

    # Iterate over each file in the directory
    for file_name in os.listdir():
        if file_name.startswith("cleans") and file_name.endswith(".txt"):
            file_path = os.path.join(file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                # Read the text from the file
                doc = file.read()
                
                # Create TF dictionary for the document
                tf_dict = {}
                tokens = word_tokenize(doc)
                wnl = WordNetLemmatizer()
                tokens = [wnl.lemmatize(w) for w in tokens if w.isalpha() and w not in stopwords.words('english')]
                
                # Calculate term frequencies
                for t in tokens:
                    if t in tf_dict:
                        tf_dict[t] += 1
                    else:
                        tf_dict[t] = 1
                
                # Normalize TF by the number of tokens
                num_tokens = len(tokens)
                for t in tf_dict:
                    tf_dict[t] /= num_tokens
                
                # Add TF dictionary to the list
                tf_dicts.append(tf_dict)
                
                # Add terms to the vocabulary
                vocab.update(tf_dict.keys())
    
    return tf_dicts, vocab

In [8]:
tf_dicts, vocab = create_tf_dicts()

In [9]:
def calculate_idf(tf_dicts):
    num_docs = len(tf_dicts)
    idf_dict = {}
    
    # Iterate over each term in the vocabulary
    for term in vocab:
        # Count the number of documents containing the term
        num_docs_with_term = sum(1 for tf_dict in tf_dicts if term in tf_dict)
        # Calculate IDF for the term
        idf_dict[term] = math.log((1 + num_docs) / (1 + num_docs_with_term))
    
    return idf_dict

In [10]:
idf_dict = calculate_idf(tf_dicts)

In [11]:
def create_tfidf(tf_dicts, idf_dict):
    tfidf_dicts = []
    
    # Iterate over each TF dictionary
    for tf_dict in tf_dicts:
        
        # Create TF-IDF dictionary for the document
        tfidf_dict = {}
        for term, tf in tf_dict.items():
            tfidf_dict[term] = tf * idf_dict[term]
        tfidf_dicts.append(tfidf_dict)
    
    return tfidf_dicts

In [12]:
tfidf_dicts = create_tfidf(tf_dicts, idf_dict)

In [13]:
# Function to get the top important terms based on TF-IDF values
def get_top_terms(tfidf_dicts, n=40):
    term_scores = collections.defaultdict(float)
    
    # Combine TF-IDF values across all documents
    for tfidf_dict in tfidf_dicts:
        for term, tfidf in tfidf_dict.items():
            term_scores[term] += tfidf
    
    # Sort terms based on TF-IDF scores
    sorted_terms = sorted(term_scores.items(), key=lambda x: x[1], reverse=True)
    
    # Extract the top n terms
    top_terms = [term for term, score in sorted_terms[:n]]
    return top_terms, sorted_terms

In [14]:
top_terms, sorted_terms = get_top_terms(tfidf_dicts, n=40)

In [15]:
top_terms

['subcategories',
 'reflect',
 'horror',
 'stem',
 'recent',
 'theme',
 'sanskrit',
 'root',
 'film',
 'persian',
 'change',
 'subgenre',
 'tend',
 'mimic',
 'may',
 'commonly',
 'force',
 'evil',
 'spirit',
 'haunted',
 'hindi',
 'explored',
 'dance',
 'used',
 'house',
 'cinema',
 'since',
 'around',
 'graph',
 'indian',
 'bollywood',
 'silver',
 'india',
 'among',
 'language',
 'award',
 'ethnologue',
 'best',
 'size',
 'endangerment']

In [16]:
# Hand Picked Important terms
ImpTerms = ['horror', 'film', 'hindi', 'bollywood', 'kamal', 'drama', 'khan', 'hindustani', 'bhangra', 'director', 'shows', 'devanāgarī', 'indian', 'sanskrit', 'kamal']

In [17]:
# Creating the Knowledge Base
def createKnowledgeBase():
    fileNames = [filename for filename in os.listdir() if filename.startswith("page") and filename.endswith(".txt")]
    KB = collections.defaultdict(list)
    for filename in fileNames:
        with open(os.path.join(filename), 'r', encoding='utf-8') as f:
            text = f.read().lower()
            
            # Tokenize text
            processed_text = sent_tokenize(text)
            
            # add Sentences to Knowledge Base
            for sent in processed_text:
                # Remove '\n' characters
                processed_sent = re.sub(r'\n', '', sent)
                
                # Remove '[number]' occurrences
                processed_sent = re.sub(r'\[\d+\]', '', processed_sent)
                for term in ImpTerms:
                    if term in processed_sent:
                        KB[term].append(processed_sent)
    return KB

In [18]:
KB = createKnowledgeBase()

In [19]:
# Storing the knowledge base using pickle
with open('knowledgeBase.pickle', 'wb') as handle:
    pickle.dump(KB, handle)

In [20]:
KB

defaultdict(list,
            {'horror': ['hindi-language horror films have been a subgenre of the hindi film industry in india since the birth of hindi films.',
              'these films tend to mimic the  characteristics and themes of horror films around the world.',
              'this is a list of indian horror films in hindi language.',
              'the film succeeds in making us all witnesses to its horrors – and complicit in them when, like so many of the characters, we say nothing.',
              'the big scenedevi and her gang attack a wedding party in search of recompense for the horrors she suffered in her youth.',
              'hindi-language horror films have been a subgenre of the hindi film industry in india since the birth of hindi films.',
              'these films tend to mimic the  characteristics and themes of horror films around the world.',
              'this is a list of indian horror films in hindi language.'],
             'film': ['hindi-language horror