In [1]:
import numpy as np
import pandas as pd
import json
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from gensim import corpora
from gensim.models.ldamodel import LdaModel

import spacy

from googletrans import Translator

from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt to /Users/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
file_path = "../scraper/scraped_data/data_selenium.json"
# Load JSON file
with open(file_path) as file:
    data = json.load(file)

In [4]:
# Convert to pandas dataframe
df = pd.DataFrame(data)
df.head()

Unnamed: 0,url,title,texts,images,pdf_links,pdf_extracted,image_extracted
0,https://www.svf.gov.lk/index.php?lang=en,Shrama Vasana Fund - Home,"[Menu, About Us, Contributions, Services, Down...","[https://www.svf.gov.lk/images/homeicon.png, h...",[],{},{'https://www.svf.gov.lk/images/homeicon.png':...
1,https://www.svf.gov.lk/index.php?option=com_co...,Shrama Vasana Fund - Overview,"[Menu, About Us, Contributions, Services, Down...","[https://www.svf.gov.lk/images/homeicon.png, h...",[],{},{'https://www.svf.gov.lk/images/homeicon.png':...
2,https://www.svf.gov.lk/index.php?option=com_co...,Shrama Vasana Fund - Contributions,"[Menu, About Us, Contributions, Services, Down...","[https://www.svf.gov.lk/images/homeicon.png, h...",[],{},{'https://www.svf.gov.lk/images/homeicon.png':...
3,https://www.svf.gov.lk/index.php?option=com_co...,Shrama Vasana Fund - Services,"[Menu, About Us, Contributions, Services, Down...","[https://www.svf.gov.lk/images/homeicon.png, h...",[],{},{'https://www.svf.gov.lk/images/homeicon.png':...
4,https://www.svf.gov.lk/index.php?option=com_co...,Shrama Vasana Fund - Downloads,"[Menu, About Us, Contributions, Services, Down...","[https://www.svf.gov.lk/images/homeicon.png, h...",[https://www.svf.gov.lk/images/pdfs/act_en.pdf...,{'https://www.svf.gov.lk/images/pdfs/act_en.pd...,{'https://www.svf.gov.lk/images/homeicon.png':...


In [5]:
text_list_sample = df.loc[0]['texts']
text_list_sample

['Menu',
 'About Us',
 'Contributions',
 'Services',
 'Downloads',
 'Gallery',
 'News & Events',
 'Donate Us',
 'Vacancy',
 'FAQs',
 'Contact Us',
 'Sitemap',
 'About Us',
 'Overview',
 'Our Team',
 'Organisation Structure',
 'Gallery',
 'Image Gallery',
 'Video Gallery',
 'Contact Us',
 'Inquiry',
 'Contact Details',
 'සිංහල',
 'தமிழ்',
 'About Us',
 'Overview',
 'Our Team',
 'Organisation Structure',
 'Contributions',
 'Services',
 'Downloads',
 'Gallery',
 'Image Gallery',
 'Video Gallery',
 'News & Events',
 'Donate Us',
 'Vacancy',
 'FAQs',
 'Contact Us',
 'Inquiry',
 'Contact Details',
 'Sitemap',
 'Health Clinics & Eye Clinics',
 'Eye Clinic',
 'Empowerment',
 'News & Events',
 'New',
 'Vacancies extended till 2024.04.26',
 'New',
 'Vacancies extended till 2024.04.26',
 'New',
 'Vacancies extended till 2024.04.26',
 'New',
 'Vacancies extended till 2024.04.26',
 'New',
 'Vacancies extended till 2024.04.26',
 'New',
 'Vacancies extended till 2024.04.26',
 'New',
 'Vacancies exten

# Define Functions for Data Preprocessing 

In [3]:
remove = ['menu', 'home', 'about us', 'contributions', 'services', 'downloads', 'gallery', 'news & events', 
          'donate us', 'vacancy', 'faqs', 'contact us', 'sitemap', 'shrama vasana fund']

In [6]:
def remove_duplicates(text_list):
    '''Remove duplicates and words in remove list from the text list'''
    removed = []
    for phrase in text_list: 
        if phrase not in removed:
            if phrase.lower() not in remove:
                removed.append(phrase)
    return removed

print(remove_duplicates(text_list_sample))

['Overview', 'Our Team', 'Organisation Structure', 'Image Gallery', 'Video Gallery', 'Inquiry', 'Contact Details', 'සිංහල', 'தமிழ்', 'Health Clinics & Eye Clinics', 'Eye Clinic', 'Empowerment', 'New', 'Vacancies extended till 2024.04.26', 'prev', 'next', 'Our Services', 'Promotion of the', 'Welfare of the Workers', 'Providing Financial Aid', '& Other Assistance', 'Providing Workers', 'with Medical Aid', 'Temporary Aid', 'to Workers', 'Financial Assistance', '& Other Benefits', 'Presentations in Recognition', 'of Excellent Services', 'Need Our', 'Support?', 'Your', 'Contribution', 'is Appreciated', 'Latest Lottery Results', 'Draw: 416', 'Date: Sunday,', 'September 08, 2024', 'More Results - Lucky 7', 'H', '6', '4', '0', '3', '5', '7', 'View all', 'VIDEO ARCHIVES', 'Click here', 'Related Links', 'National', 'Lotteries Board', 'Law Commission', 'of Sri Lanka', 'NILS - National', 'Institute of', 'Labour Studies', 'National Institute of', 'Occupational Safety', 'and Health', 'Ministry of La

In [7]:
stop_words = set(stopwords.words('english'))

In [8]:
def process_texts(text_list): 
    '''
    Does not include stemming and lemmatization, simply cleans the text data
    '''
    processed = []

    for phrase in text_list:
        phrase = phrase.strip().lower() # remove spaces and convert to lowercase 
        phrase = re.sub(r'[^A-Za-z0-9\s]', '', phrase) # remove special characters and punctuations from text
        phrase = re.sub(r'\n+', ' ', phrase) # replace \n with space

        processed.append(phrase) 

    return processed

In [9]:
print(process_texts(text_list_sample))

['menu', 'about us', 'contributions', 'services', 'downloads', 'gallery', 'news  events', 'donate us', 'vacancy', 'faqs', 'contact us', 'sitemap', 'about us', 'overview', 'our team', 'organisation structure', 'gallery', 'image gallery', 'video gallery', 'contact us', 'inquiry', 'contact details', '', '', 'about us', 'overview', 'our team', 'organisation structure', 'contributions', 'services', 'downloads', 'gallery', 'image gallery', 'video gallery', 'news  events', 'donate us', 'vacancy', 'faqs', 'contact us', 'inquiry', 'contact details', 'sitemap', 'health clinics  eye clinics', 'eye clinic', 'empowerment', 'news  events', 'new', 'vacancies extended till 20240426', 'new', 'vacancies extended till 20240426', 'new', 'vacancies extended till 20240426', 'new', 'vacancies extended till 20240426', 'new', 'vacancies extended till 20240426', 'new', 'vacancies extended till 20240426', 'new', 'vacancies extended till 20240426', 'prev', 'next', 'our services', 'promotion of the', 'welfare of t

In [10]:
# https://www.ibm.com/topics/stemming-lemmatization#:~:text=The%20practical%20distinction%20between%20stemming,be%20found%20in%20the%20dictionary.

def transform_texts(text_list): 
    '''
    Include stemming and lemmatization for pdf texts where sentence structures should make sense
    '''
    processed = []

    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    for phrase in text_list:
        phrase = phrase.strip().lower() # remove spaces and convert to lowercase 
        phrase = re.sub(r'[^A-Za-z0-9\s]', '', phrase) # remove special characters and punctuations from text
        phrase = re.sub(r'\n+', ' ', phrase) # replace \n with space
        tokens = word_tokenize(phrase) # tokenise text
        tokens = [word for word in tokens if word not in stop_words] # remove stop words
        
        #stemming
        stemming = [stemmer.stem(token) for token in tokens]

        #lemmatization
        lem = [lemmatizer.lemmatize(token) for token in stemming]

        processed.append(lem) 

    return processed

In [12]:
def tokenise_text_list(text_list): 
    '''Tokenise text to feed into lda_model'''
    processed = []

    for phrase in text_list:
        phrase = phrase.strip().lower() # remove spaces and convert to lowercase 
        phrase = re.sub(r'[^A-Za-z0-9\s]', '', phrase) # remove special characters and punctuations from text
        phrase = re.sub(r'\n+', ' ', phrase) # replace \n with space
        tokens = word_tokenize(phrase) # tokenise text
        
        tokens = [word for word in tokens if word not in stop_words] # remove stop words
        processed.append(tokens) 

    return processed


In [13]:
# https://towardsdatascience.com/topic-modelling-in-python-with-spacy-and-gensim-dc8f7748bdbf

def topic_modelling(tokenised_text_list):
    topic_weights = [] # store topics and weights in a list [(topics, weights)]
    texts = tokenise_text_list(tokenised_text_list)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]  
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, passes=20)
    topics = lda_model.print_topics(num_words=1)

    for index, topic in topics:
        weight_str, topic_str = topic.split("*")
        topic = topic_str.replace('"', '')
        weight = float(weight_str)
        topic_weights.append((topic, weight))
    return topic_weights


In [14]:
topic_modelling(text_list_sample)

[('national', 0.072),
 ('vacancies', 0.077),
 ('us', 0.104),
 ('gallery', 0.077),
 ('institute', 0.068)]

In [15]:
def count_words(phrases_list): 
    word_count = 0 
    for phrase in phrases_list: 
        word_count += len(phrase.split())
    return word_count

In [16]:
# Possible entities
nlp = spacy.load("en_core_web_sm")
nlp.get_pipe("ner").labels

('CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART')

In [17]:
def generate_questions(text):
    nlp = spacy.load('en_core_web_sm') # load spacy language model
    text = nlp(text) # process text
    questions = []
    
    for entity in text.ents: # look for potential nouns/subjects, generate possible questions with who/where/when/what
        if entity.label_ == 'GPE':
            question = f"Where is {entity.text}?"
            questions.append(question) 
        elif entity.label_ == 'DATE':
            question = f"When did {entity.text} happen?"
            questions.append(question) 
        elif entity.label == 'PERSON': 
            question = f"Who is {entity.text}?"
            questions.append(question) 
        else: 
            question = f"What is {entity.text}?"
            questions.append(question) 

    return questions

questions = generate_questions(" ".join(text_list_sample))
print(questions)

['What is Us Contributions Services Downloads Gallery News & Events Donate Us Vacancy?', 'Where is Us?', 'What is Contact Us Inquiry Contact Details Sitemap?', 'What is Health Clinics & Eye Clinics Eye Clinic Empowerment News & Events?', 'What is Vacancies?', 'What is Vacancies?', 'What is Vacancies?', 'What is Vacancies?', 'What is Vacancies?', 'What is Vacancies?', 'What is Vacancies?', 'What is 2024.04.26?', 'What is Our Services Promotion?', 'What is Workers Financial Assistance & Other Benefits Presentations?', 'What is Recognition of Excellent Services?', 'What is 416?', 'When did Sunday, September 08, 2024 happen?', 'What is 0?', 'What is Related Links National Lotteries Board Law Commission?', 'What is Sri Lanka NILS - National Institute of Labour Studies National Institute of Occupational Safety and Health Ministry of Labour and Trade Union Relations Department of Labour?', 'What is Sri Lanka National Lotteries Board Law Commission?', 'What is Sri Lanka NILS - National Institu

In [203]:
# https://pypi.org/project/googletrans/

def translate_to_english(text_list):
    lst = []
    translator = Translator()
    for text in text_list:
        try:
            if text:
                translated = translator.translate(text, dest='en')
                lst.append(translated.text)
            else:
                lst.append("")
        except Exception as e:
            lst.append("") 
    return lst

In [135]:
def translate_text_to_english(text):
    translator = Translator()
    translated  = translator.translate(text , dest='en')
    return translated.text

In [50]:
def tfidf_matrix(text_list):
    vectorizer = TfidfVectorizer() # general tfidf vectorizer that converts a list of processed texts to a tfidf matrix
    tfidf_matrix = vectorizer.fit_transform(text_list)
    return tfidf_matrix

# print(tfidf_matrix(text_list_sample))

# Preprocessing Scraped Data

In [111]:
titles = [item['title'] for item in data]
texts = [item['texts'] for item in data]
pdf_extracted = [item['pdf_extracted'] for item in data]
image_extracted = [item['image_extracted'] for item in data]

In [112]:
clean_texts = []
for text_list in texts: 
    text = translate_to_english(text_list)
    text = remove_duplicates(text)
    text = process_texts(text)
    clean_texts.append(text)

In [246]:
word_count = []

for text_list in clean_texts: 
    word_count.append(count_words(text_list))

In [247]:
image_texts = []
for i in range(len(image_extracted)):
    text = []
    for url, image_text in image_extracted[i].items(): 
        if image_text: 
            text.append(image_text)
    image_texts.append(text)


In [248]:
clean_image_texts = []

for text_list in image_texts: 
    text = translate_to_english(text_list)
    text = remove_duplicates(text)
    text = process_texts(text)
    clean_image_texts.append(text)

In [249]:
pdf_texts = []
for i in range(len(pdf_extracted)):
    text = []
    for url, pdf_text in pdf_extracted[i].items(): 
        text.append(pdf_text)

    pdf_texts.append(text)


In [250]:
clean_pdf_texts = []
for text_list in pdf_texts: 
    text = translate_to_english(text_list)
    text = remove_duplicates(text)
    text = transform_texts(text)
    clean_pdf_texts.append(text)

In [251]:
topic_modelling_texts = []

for text_list in clean_texts: 
    topics = topic_modelling(text_list)
    topic_modelling_texts.append(topics)

In [252]:
tfidf_texts = []

for text_list in clean_texts: 
    tfidf = tfidf_matrix(text_list)
    tfidf_texts.append(tfidf)

In [253]:
question_texts = []

for text_list in clean_texts: 
    questions = generate_questions(" ".join(text_list))
    question_texts.append(questions)
    
print(question_texts)

[['When did 20240426 happen?', 'What is 416?', 'When did 08 2024 happen?', 'What is 7?', 'When did 6 4 happen?', 'What is 0?', 'What is national lotteries board law commission?', 'Where is sri lanka nils?', 'What is health ministry of labour and trade union relations department?', 'What is no 97?', 'What is 05?', 'What is 112?', 'What is 588?', 'What is 936?', 'What is 94 112?', 'What is 588?', 'What is 2024?', 'When did 09 september 2024 happen?'], ['When did 1998 happen?', 'What is the national lotteries board?', 'What is 1?', 'What is 1?', 'What is 2?', 'What is 2?', 'What is shrama vasana fund?', 'When did 18th dec 2012 happen?', 'When did sunday happen?', 'What is 06?', 'When did march 2010 happen?', 'When did saturday happen?', 'What is jathika sampatha?', 'When did sunday happen?', 'When did 20240426 happen?', 'What is 05?', 'What is 112?', 'What is 588?', 'What is 936?', 'What is 94 112?', 'What is 588?', 'What is 2024?', 'When did 09 september 2024 happen?'], ['What is 60000?'

In [256]:
combined_data = []
for i in range(len(titles)): 
    combined_data.append({
        'title': titles[i],
        'texts': clean_texts[i],
        'texts_word_count': word_count[i],
        'texts_topics': topic_modelling_texts[i],
        'texts_questions': question_texts[i],
        'texts_tfidf': tfidf_texts[i],
        'pdf': clean_pdf_texts[i],
        'image': clean_image_texts[i]
    })