In [600]:
import numpy as np
import pandas as pd
import json
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from gensim import corpora
from gensim.models.ldamodel import LdaModel

import spacy

from googletrans import Translator

from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt to /Users/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [601]:
file_path = "../scraper/scraped_data/data_selenium.json"
# Load JSON file
with open(file_path) as file:
    data = json.load(file)

In [602]:
# Convert to pandas dataframe for better visualisation
df = pd.DataFrame(data)
df.head()

Unnamed: 0,url,title,texts,images,pdf_links,pdf_extracted,image_extracted
0,https://www.svf.gov.lk/index.php?lang=en,Shrama Vasana Fund - Home,"[Menu, About Us, Contributions, Services, Down...","[https://www.svf.gov.lk/images/homeicon.png, h...",[],{},{'https://www.svf.gov.lk/images/homeicon.png':...
1,https://www.svf.gov.lk/index.php?option=com_co...,Shrama Vasana Fund - Overview,"[Menu, About Us, Contributions, Services, Down...","[https://www.svf.gov.lk/images/homeicon.png, h...",[],{},{'https://www.svf.gov.lk/images/homeicon.png':...
2,https://www.svf.gov.lk/index.php?option=com_co...,Shrama Vasana Fund - Contributions,"[Menu, About Us, Contributions, Services, Down...","[https://www.svf.gov.lk/images/homeicon.png, h...",[],{},{'https://www.svf.gov.lk/images/homeicon.png':...
3,https://www.svf.gov.lk/index.php?option=com_co...,Shrama Vasana Fund - Services,"[Menu, About Us, Contributions, Services, Down...","[https://www.svf.gov.lk/images/homeicon.png, h...",[],{},{'https://www.svf.gov.lk/images/homeicon.png':...
4,https://www.svf.gov.lk/index.php?option=com_co...,Shrama Vasana Fund - Downloads,"[Menu, About Us, Contributions, Services, Down...","[https://www.svf.gov.lk/images/homeicon.png, h...",[https://www.svf.gov.lk/images/pdfs/act_en.pdf...,{'https://www.svf.gov.lk/images/pdfs/act_en.pd...,{'https://www.svf.gov.lk/images/homeicon.png':...


# Define Functions for Data Preprocessing 

In [603]:
remove = ['menu', 'home', 'about', 'contributions', 'services', 'downloads', 'gallery', 'news', '&', 'events', 
          'donate', 'vacancy', 'faqs', 'contact', 'us', 'sitemap', 'shrama', 'vasana', 'fund'] # words from navigation bar or frequently used words

In [604]:
def remove_duplicates(text_list):
    """Remove duplicates and words in remove list from the text list.

    Args:
        text_list: list of phrases.

    Returns:
        A list of words that are unique and not in the remove list.
    """
    removed = []
    for phrase in text_list: 
        words = phrase.split()
        for word in words: 
            if word not in removed:
                if word.lower() not in remove:
                    removed.append(word)
    return removed

In [605]:
stop_words = set(stopwords.words('english'))

In [606]:
def process_texts(text_list): 
    """Cleans text data but does not include stemming and lemmatization, simply cleans the text data.
      Trailing spaces are removed, words are converted to lower case. 
      Special characters and punctuations are removed from words.
      Removal of stop words.
    
    Args:
        text_list: list of phrases.

    Returns:
        A cleaned list of words.
    """
    processed = []

    for word in text_list:
        word = word.strip().lower() # remove spaces and convert to lowercase 
        word = re.sub(r'[^A-Za-z0-9\s]', '', word) # remove special characters and punctuations from text (this also removes Sinhala language)
        word = re.sub(r'\n+', ' ', word) # replace \n with space
        if word and word not in stop_words: # remove stop words and empty strings
            processed.append(word) 

    return processed

In [607]:
# https://www.ibm.com/topics/stemming-lemmatization#:~:text=The%20practical%20distinction%20between%20stemming,be%20found%20in%20the%20dictionary.

def transform_texts(text_list): 
    """Cleans text data including stemming and lemmatising.
      Trailing spaces are removed, words are converted to lower case. 
      Special characters and punctuations are removed from words.
      Removal of stop words.
    
    Args:
        text_list: list of phrases.

    Returns:
        A cleaned list of words that are tokenised. 
    """
    processed = []

    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    for phrase in text_list:
        phrase = phrase.strip().lower() # remove spaces and convert to lowercase 
        phrase = re.sub(r'[^A-Za-z0-9\s]', '', phrase) # remove special characters and punctuations from text
        phrase = re.sub(r'\n+', ' ', phrase) # replace \n with space
        tokens = word_tokenize(phrase) # tokenise text
        tokens = [word for word in tokens if word not in stop_words] # remove stop words
        
        #stemming
        stemming = [stemmer.stem(token) for token in tokens]

        #lemmatization
        lem = [lemmatizer.lemmatize(token) for token in stemming]

        processed.append(lem) 

    return processed

In [608]:
def tokenise_text_list(text_list): 
    """Helper function for topic_modelling(tokenised_text_list), tokenises text to feed into lda_model.
    
    Args:
        text_list: list of phrases.

    Returns:
        A cleaned list of words that are tokenised. 
    """
    processed = []

    for phrase in text_list:
        phrase = phrase.strip().lower() # remove spaces and convert to lowercase 
        phrase = re.sub(r'[^A-Za-z0-9\s]', '', phrase) # remove special characters and punctuations from text
        phrase = re.sub(r'\n+', ' ', phrase) # replace \n with space
        tokens = word_tokenize(phrase) # tokenise text
        
        tokens = [word for word in tokens if word not in stop_words] # remove stop words
        processed.append(tokens) 

    return processed


In [609]:
# https://towardsdatascience.com/topic-modelling-in-python-with-spacy-and-gensim-dc8f7748bdbf

def topic_modelling(text_list):
    """Using the lda_model, generate a list of topics with weights from a given text list.
    
    Args:
        text_list: list of phrases.

    Returns:
        A list of tuples consisting topics and weights. 
    """
    topic_weights = [] # store topics and weights in a list [(topics, weights)]
    texts = tokenise_text_list(text_list)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]  
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, passes=20)
    topics = lda_model.print_topics(num_words=1)

    for index, topic in topics:
        weight_str, topic_str = topic.split("*")
        topic = topic_str.replace('"', '')
        weight = float(weight_str)
        topic_weights.append((topic, weight))
    return topic_weights


In [610]:
def count_words(phrases_list): 
    """Generates a word count from a given list of words.
    
    Args:
        phrases_list: list of phrases/words.

    Returns:
        An integer representing the word count.
    """
    word_count = 0 
    for phrase in phrases_list: 
        word_count += len(phrase.split())
    return word_count

In [611]:
# Possible text entities 
nlp = spacy.load("en_core_web_sm")
nlp.get_pipe("ner").labels

('CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART')

In [612]:
def generate_questions(text):
    """Generates a possible list of questions from a given text.
    
    Args:
        text: Any text string.

    Returns:
        A list of questions.
    """
    nlp = spacy.load('en_core_web_sm') # load spacy language model
    text = nlp(text) # process text
    questions = []
    
    for entity in text.ents: # look for potential nouns/subjects, generate possible questions with who/where/what
        if not any(char.isdigit() for char in entity.text):
            if entity.label_ == 'GPE':
                question = f"Where is {entity.text}?"
                questions.append(question) 
            # elif entity.label_ == 'DATE':
            #     question = f"When did {entity.text} happen?"
                # questions.append(question) 
            elif entity.label == 'PERSON': 
                question = f"Who is {entity.text}?"
                questions.append(question) 
            else: 
                question = f"What is {entity.text}?"
                questions.append(question) 

    return questions

In [613]:
# https://pypi.org/project/googletrans/

def translate_to_english(text_list):
    """Translates texts in a text_list from any langugage to english.
    
    Args:
        text_list: A list of texts.

    Returns:
        A list of translated texts.
    """
    lst = []
    translator = Translator()
    for text in text_list:
        try:
            if text:
                translated = translator.translate(text, dest='en')
                lst.append(translated.text)
            else:
                lst.append("")
        except Exception as e:
            lst.append("") 
    return lst

In [614]:
def tfidf_matrix(text_list):
    """Converts a text list to a tfidf matrix
    
    Args:
        text_list: A list of texts.

    Returns:
        A sparse tfidf matrix
    """
    vectorizer = TfidfVectorizer() # general tfidf vectorizer that converts a list of processed texts to a tfidf matrix
    tfidf_matrix = vectorizer.fit_transform(text_list)
    return tfidf_matrix

# Preprocessing Scraped Data (1)

In [615]:
titles = [item['title'] for item in data]
texts = [item['texts'] for item in data]
pdf_extracted = [item['pdf_extracted'] for item in data]
image_extracted = [item['image_extracted'] for item in data]

In [616]:
# convert titles to lower case
clean_titles = [t.lower() for t in titles]
print(clean_titles)

['shrama vasana fund - home', 'shrama vasana fund - overview', 'shrama vasana fund - contributions', 'shrama vasana fund - services', 'shrama vasana fund - downloads', 'shrama vasana fund - image gallery', 'shrama vasana fund - video gallery', 'shrama vasana fund - news & events', 'shrama vasana fund - donate us', 'shrama vasana fund - vacancy', 'shrama vasana fund - faqs', 'shrama vasana fund - inquiry', 'shrama vasana fund - contact details', 'shrama vasana fund - sitemap']


In [617]:
clean_texts = []
for text_list in texts: 
    text = translate_to_english(text_list) # translate texts to english
    text = remove_duplicates(text) # remove duplicates, words from remove list
    text = process_texts(text) # convert to lower case, removal of special characters, stop words and punctuations 
    clean_texts.append(text)


In [618]:
# generate word count for lists of texts
word_count = []

for text_list in clean_texts: 
    word_count.append(count_words(text_list))


In [619]:
image_texts = []  # extract only texts from image_extracted
for i in range(len(image_extracted)):
    text = []
    for url, image_text in image_extracted[i].items(): 
        if image_text: 
            text.append(image_text)
    image_texts.append(text)



In [620]:
clean_image_texts = []

for text_list in image_texts: 
    text = translate_to_english(text_list) # translate text to english
    text = remove_duplicates(text) # remove duplicates, remove words from remove list
    text = process_texts(text) # convert to lower case, remove stop words, special characters and punctuations 
    clean_image_texts.append(text)


In [621]:
pdf_texts = [] # extract only texts from pdf_extracted
for i in range(len(pdf_extracted)):
    text = []
    for url, pdf_text in pdf_extracted[i].items(): 
        text.append(pdf_text)

    pdf_texts.append(text)


In [622]:
clean_pdf_texts = [] 
for text_list in pdf_texts: 
    text = translate_to_english(text_list) # translate text to english
    text = remove_duplicates(text) # remove duplicates, remove words from remove list
    text = transform_texts(text) # convert to lower case, remove stop words, special characters and punctuations, stem and lemmatize words
    clean_pdf_texts.append(text)


In [623]:
# generate topics for lists of texts

topic_modelling_texts = []

for text_list in clean_texts: 
    topics = topic_modelling(text_list)
    topic_modelling_texts.append(topics)

In [624]:
# generate questions for lists of texts

question_texts = []

for text_list in clean_texts: 
    questions = generate_questions(" ".join(text_list))
    question_texts.append(questions)
    

In [625]:
# converts list of text to tfidf matrix

tfidf_texts = []

for text_list in clean_texts: 
    tfidf = tfidf_matrix(text_list)
    tfidf_texts.append(tfidf)


In [626]:
processed_data = {}
for i in range(len(titles)): 
    processed_data[clean_titles[i]] = {
        'texts': clean_texts[i], 
        'texts_word_count': word_count[i],
        'texts_topics': topic_modelling_texts[i],
        'texts_questions': question_texts[i],   
        # 'texts_tfidf': tfidf_texts[i], 
        'pdf': clean_pdf_texts[i], 
        'image': clean_image_texts[i]  
    }

# texts still contain random integers (not sure if it should be removed)

## processed_data
| **label**                               | method of processing                                                                                                           |
|-------------------------------------|-------------------------------------------------------------------------------------------------------------------------------|
| **titles**                               | titles of webpage converted to lower case |
| **texts**                               | texts of webpage that has been cleaned (translated texts are converted to lower case, stop words, special characters and words from remove list are removed) |
| **texts_word_count**                    | word count of texts (excluding image and pdf texts) on the webpage                                                                                                                  |
| **texts_topics**                        | possible topics obtained topic modelling of texts (excluding image and pdf texts) from the webpage                                                                                                                  |
| **texts_questions**                     | possible questions obtained from texts (excluding image and pdf texts) from the webpage                                                                                                                  |
| **texts_tfidf**                         | tfidf matrix obtained from list of texts (excluding image and pdf texts) from the webpage                                                                                                                  |
| **pdf**                                 | translated pdf texts that has been cleaned (converted to lower case, stop words, special characters and words from remove list are removed, words are also tokenised by stemming and lemmatization)                                                                                                                  |
| **image**                               | extracted image texts of webpage that has been cleaned (translated texts are converted to lower case, stop words, special characters and words from remove list are removed)                                                                                                                 |


Note: Stemming and lemmatization was not conducted on webpage's text data as scraped webpage data is noisy, unstructured and fragmented. Stemming does not consider the context and reduces words to its base form. Although lemmatisation considers the grammar, it struggles to find the correct lemma when texts are fragmented.

- PDF texts does not make sense when translated to english. Translation also seem to be inaccurate. 
- To consider storing image texts/pdf texts/texts without translating to english
- To consider advanced contextual NLP techniques: Named entity recognition (NER) (already used in topic modelling), part-of-speech tagging, or word embeddings (e.g., Word2Vec, BERT)


# Preprocessing Scraped Data (2)

In [627]:
titles = [item['title'] for item in data]
texts = [item['texts'] for item in data]
pdf_extracted = [item['pdf_extracted'] for item in data]
image_extracted = [item['image_extracted'] for item in data]

In [628]:
# convert titles to lower case

clean_titles2 = [t.lower() for t in titles]

In [629]:
 # convert texts to lower case, remove duplicates, stop words and words that are in remove list 

temp = [remove_duplicates(text_list) for text_list in texts]
clean_texts2 = []
for text_list in temp: 
    lst = []
    for text in text_list: 
        text = text.lower()
        lst.append(text)
    clean_texts2.append(lst)

In [630]:
# generate word count for lists of texts

word_count2 = []
for text_list in clean_texts2: 
    word_count2.append(count_words(text_list))

In [631]:
# generate possible topics for lists of texts

topic_modelling_texts2 = []

for text_list in clean_texts2: 
    topics = topic_modelling(text_list)
    topic_modelling_texts2.append(topics)

In [632]:
# generate possible questions for lists of texts
question_texts2 = []

for text_list in clean_texts2: 
    questions = generate_questions(" ".join(text_list))
    question_texts2.append(questions)

In [633]:
# convert text lists to tfidf matrix
tfidf_texts2 = []

for text_list in clean_texts2: 
    tfidf = tfidf_matrix(text_list)
    tfidf_texts2.append(tfidf)

In [634]:
 # convert pdf texts to lower case, remove duplicates, stop words and words that are in remove list 

temp = [remove_duplicates(text_list) for text_list in pdf_texts]
clean_pdf_texts2= []
for text_list in temp: 
    lst = []
    for text in text_list: 
        text = text.lower()
        lst.append(text)
    clean_pdf_texts2.append(lst)

In [635]:
 # convert image texts to lower case, remove duplicates, stop words and words that are in remove list 

temp = [remove_duplicates(text_list) for text_list in image_texts]
clean_image_texts2= []
for text_list in temp: 
    lst = []
    for text in text_list: 
        text = text.lower()
        lst.append(text)
    clean_image_texts2.append(lst)

In [636]:
processed_data2 = {}
for i in range(len(titles)): 
    processed_data2[clean_titles2[i]] = {
        'texts': clean_texts2[i], 
        'texts_word_count': word_count2[i],
        'texts_topics': topic_modelling_texts2[i],
        'texts_questions': question_texts2[i],   
        #'texts_tfidf': tfidf_texts2[i], 
        'pdf': clean_pdf_texts2[i], 
        'image': clean_image_texts2[i]  
    } 

# texts still contain random integers (not sure if it should be removed)

## processed_data2
| **label**                               | method of processing                                                                                                           |
|-------------------------------------|-------------------------------------------------------------------------------------------------------------------------------|
| **titles**                               | titles of webpage converted to lower case |
| **texts**                               | texts of webpage that has been cleaned (original texts are converted to lower case, stop words and words from remove list are removed) |
| **texts_word_count**                    | word count of texts (excluding image and pdf texts) on the webpage                                                                                                                  |
| **texts_topics**                        | possible topics obtained topic modelling of texts (excluding image and pdf texts) from the webpage                                                                                                                  |
| **texts_questions**                     | possible questions obtained from texts (excluding image and pdf texts) from the webpage                                                                                                                  |
| **texts_tfidf**                         | tfidf matrix obtained from list of texts (excluding image and pdf texts) from the webpage                                                                                                                  |
| **pdf**                                 | pdf texts that has been cleaned (orginal texts are converted to lower case, stop words and words from remove list are removed)                                                                                                                  |
| **image**                               | image texts of webpage that has been cleaned (original texts are converted to lower case, stop words and words from remove list are removed)                                                                                                                 |


Note: Key differences between processed_data and processed_data2
- Translation from Sinhala language to English is NOT conducted
- Words that contain special characters are NOT removed (Sinhala language remains)
- Stemming and lemmatization NOT conducted (it should only be carried out when special characters are removed)

# Save processed data to MongoDB 

In [1]:
from dotenv import load_dotenv
import os
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from mongo_utils import get_database, insert_many_documents, find_all_documents

ModuleNotFoundError: No module named 'mongo_utils'

In [638]:
load_dotenv() 

True

In [639]:
password = os.getenv("MONGO_DB_PASSWORD")
username = os.getenv("MONGO_DB_USERNAME")

uri = f"mongodb+srv://{username}:{password}@bt4103.cnngw.mongodb.net/?retryWrites=true&w=majority&appName=BT4103&tlsCAFile=isrgrootx1.pem"

In [640]:
client = MongoClient(uri, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [641]:
client = MongoClient(uri, server_api=ServerApi('1'))

db = client["shrama_vasana_fund_uat"]
collection = db["processed_data"]

# Insert the data into the collection
result = collection.insert_one(processed_data)
print(f"Inserted document with ID: {result.inserted_id}")

Inserted document with ID: 66f53ed5c5bd3090516d5d98


In [642]:
collection2 = db["processed_data2"]

# Insert the data into the collection
result = collection2.insert_one(processed_data2)
print(f"Inserted document with ID: {result.inserted_id}")

Inserted document with ID: 66f53ed6c5bd3090516d5d99


In [643]:
# Use find_all_documents to retrieve documents from MongoDB 
# print(find_all_documents(collection))
# print(find_all_documents(collection2))

### Note: Removed tfidf matrices from the processed datasets as MongoDB does not support sparse matrix, to think of another way to store tfidf matrices