In [182]:
import numpy as np
import pandas as pd
import json
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from gensim import corpora
from gensim.models.ldamodel import LdaModel

import spacy

from googletrans import Translator

from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt to /Users/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [183]:
file_path = "../scraper/scraped_data/data_selenium.json"
# Load JSON file
with open(file_path) as file:
    data = json.load(file)

In [184]:
remove = ['menu', 'home', 'about us', 'contributions', 'services', 'downloads', 'gallery', 'news & events', 
          'donate us', 'vacancy', 'faqs', 'contact us', 'sitemap', 'shrama vasana fund']

In [185]:
# Convert to pandas dataframe
df = pd.DataFrame(data)
df.head()

Unnamed: 0,url,title,texts,images,pdf_links,pdf_extracted,image_extracted
0,https://www.svf.gov.lk/index.php?lang=en,Shrama Vasana Fund - Home,"[Menu, About Us, Contributions, Services, Down...","[https://www.svf.gov.lk/images/homeicon.png, h...",[],{},{'https://www.svf.gov.lk/images/homeicon.png':...
1,https://www.svf.gov.lk/index.php?option=com_co...,Shrama Vasana Fund - Overview,"[Menu, About Us, Contributions, Services, Down...","[https://www.svf.gov.lk/images/homeicon.png, h...",[],{},{'https://www.svf.gov.lk/images/homeicon.png':...
2,https://www.svf.gov.lk/index.php?option=com_co...,Shrama Vasana Fund - Contributions,"[Menu, About Us, Contributions, Services, Down...","[https://www.svf.gov.lk/images/homeicon.png, h...",[],{},{'https://www.svf.gov.lk/images/homeicon.png':...
3,https://www.svf.gov.lk/index.php?option=com_co...,Shrama Vasana Fund - Services,"[Menu, About Us, Contributions, Services, Down...","[https://www.svf.gov.lk/images/homeicon.png, h...",[],{},{'https://www.svf.gov.lk/images/homeicon.png':...
4,https://www.svf.gov.lk/index.php?option=com_co...,Shrama Vasana Fund - Downloads,"[Menu, About Us, Contributions, Services, Down...","[https://www.svf.gov.lk/images/homeicon.png, h...",[https://www.svf.gov.lk/images/pdfs/act_en.pdf...,{'https://www.svf.gov.lk/images/pdfs/act_en.pd...,{'https://www.svf.gov.lk/images/homeicon.png':...


In [186]:
text_list_sample = df.loc[0]['texts']
text_list_sample

['Menu',
 'About Us',
 'Contributions',
 'Services',
 'Downloads',
 'Gallery',
 'News & Events',
 'Donate Us',
 'Vacancy',
 'FAQs',
 'Contact Us',
 'Sitemap',
 'About Us',
 'Overview',
 'Our Team',
 'Organisation Structure',
 'Gallery',
 'Image Gallery',
 'Video Gallery',
 'Contact Us',
 'Inquiry',
 'Contact Details',
 'සිංහල',
 'தமிழ்',
 'About Us',
 'Overview',
 'Our Team',
 'Organisation Structure',
 'Contributions',
 'Services',
 'Downloads',
 'Gallery',
 'Image Gallery',
 'Video Gallery',
 'News & Events',
 'Donate Us',
 'Vacancy',
 'FAQs',
 'Contact Us',
 'Inquiry',
 'Contact Details',
 'Sitemap',
 'Health Clinics & Eye Clinics',
 'Eye Clinic',
 'Empowerment',
 'News & Events',
 'New',
 'Vacancies extended till 2024.04.26',
 'New',
 'Vacancies extended till 2024.04.26',
 'New',
 'Vacancies extended till 2024.04.26',
 'New',
 'Vacancies extended till 2024.04.26',
 'New',
 'Vacancies extended till 2024.04.26',
 'New',
 'Vacancies extended till 2024.04.26',
 'New',
 'Vacancies exten

In [187]:
def remove_duplicates(text_list):
    '''Remove duplicates and words in remove list from the text list'''
    removed = []
    for phrase in text_list: 
        if phrase not in removed:
            if phrase.lower() not in remove:
                removed.append(phrase)
    return removed

print(remove_duplicates(text_list_sample))

['Overview', 'Our Team', 'Organisation Structure', 'Image Gallery', 'Video Gallery', 'Inquiry', 'Contact Details', 'සිංහල', 'தமிழ்', 'Health Clinics & Eye Clinics', 'Eye Clinic', 'Empowerment', 'New', 'Vacancies extended till 2024.04.26', 'prev', 'next', 'Our Services', 'Promotion of the', 'Welfare of the Workers', 'Providing Financial Aid', '& Other Assistance', 'Providing Workers', 'with Medical Aid', 'Temporary Aid', 'to Workers', 'Financial Assistance', '& Other Benefits', 'Presentations in Recognition', 'of Excellent Services', 'Need Our', 'Support?', 'Your', 'Contribution', 'is Appreciated', 'Latest Lottery Results', 'Draw: 416', 'Date: Sunday,', 'September 08, 2024', 'More Results - Lucky 7', 'H', '6', '4', '0', '3', '5', '7', 'View all', 'VIDEO ARCHIVES', 'Click here', 'Related Links', 'National', 'Lotteries Board', 'Law Commission', 'of Sri Lanka', 'NILS - National', 'Institute of', 'Labour Studies', 'National Institute of', 'Occupational Safety', 'and Health', 'Ministry of La

In [188]:
stop_words = set(stopwords.words('english'))

In [189]:
def process_texts(text_list): 
    '''
    Does not include stemming and lemmatization, simply cleans the text data
    '''
    processed = []

    for phrase in text_list:
        phrase = phrase.strip().lower() # remove spaces and convert to lowercase 
        phrase = re.sub(r'[^A-Za-z0-9\s]', '', phrase) # remove special characters and punctuations from text
        phrase = re.sub(r'\n+', ' ', phrase) # replace \n with space

        processed.append(phrase) 

    return processed

In [190]:
print(process_texts(text_list_sample))

['menu', 'about us', 'contributions', 'services', 'downloads', 'gallery', 'news  events', 'donate us', 'vacancy', 'faqs', 'contact us', 'sitemap', 'about us', 'overview', 'our team', 'organisation structure', 'gallery', 'image gallery', 'video gallery', 'contact us', 'inquiry', 'contact details', '', '', 'about us', 'overview', 'our team', 'organisation structure', 'contributions', 'services', 'downloads', 'gallery', 'image gallery', 'video gallery', 'news  events', 'donate us', 'vacancy', 'faqs', 'contact us', 'inquiry', 'contact details', 'sitemap', 'health clinics  eye clinics', 'eye clinic', 'empowerment', 'news  events', 'new', 'vacancies extended till 20240426', 'new', 'vacancies extended till 20240426', 'new', 'vacancies extended till 20240426', 'new', 'vacancies extended till 20240426', 'new', 'vacancies extended till 20240426', 'new', 'vacancies extended till 20240426', 'new', 'vacancies extended till 20240426', 'prev', 'next', 'our services', 'promotion of the', 'welfare of t

In [191]:
# https://www.ibm.com/topics/stemming-lemmatization#:~:text=The%20practical%20distinction%20between%20stemming,be%20found%20in%20the%20dictionary.

def transform_texts(text_list): 
    '''
    Include stemming and lemmatization for pdf texts where sentence structures should make sense
    '''
    processed = []

    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    for phrase in text_list:
        phrase = phrase.strip().lower() # remove spaces and convert to lowercase 
        phrase = re.sub(r'[^A-Za-z0-9\s]', '', phrase) # remove special characters and punctuations from text
        phrase = re.sub(r'\n+', ' ', phrase) # replace \n with space
        tokens = word_tokenize(phrase) # tokenise text
        tokens = [word for word in tokens if word not in stop_words] # remove stop words
        
        #stemming
        stemming = [stemmer.stem(token) for token in tokens]

        #lemmatization
        lem = [lemmatizer.lemmatize(token) for token in stemming]

        processed.append(lem) 

    return processed

In [192]:
transform_texts(text_list_sample)

[['menu'],
 ['u'],
 ['contribut'],
 ['servic'],
 ['download'],
 ['galleri'],
 ['news', 'event'],
 ['donat', 'u'],
 ['vacanc'],
 ['faq'],
 ['contact', 'u'],
 ['sitemap'],
 ['u'],
 ['overview'],
 ['team'],
 ['organis', 'structur'],
 ['galleri'],
 ['imag', 'galleri'],
 ['video', 'galleri'],
 ['contact', 'u'],
 ['inquiri'],
 ['contact', 'detail'],
 [],
 [],
 ['u'],
 ['overview'],
 ['team'],
 ['organis', 'structur'],
 ['contribut'],
 ['servic'],
 ['download'],
 ['galleri'],
 ['imag', 'galleri'],
 ['video', 'galleri'],
 ['news', 'event'],
 ['donat', 'u'],
 ['vacanc'],
 ['faq'],
 ['contact', 'u'],
 ['inquiri'],
 ['contact', 'detail'],
 ['sitemap'],
 ['health', 'clinic', 'eye', 'clinic'],
 ['eye', 'clinic'],
 ['empower'],
 ['news', 'event'],
 ['new'],
 ['vacanc', 'extend', 'till', '20240426'],
 ['new'],
 ['vacanc', 'extend', 'till', '20240426'],
 ['new'],
 ['vacanc', 'extend', 'till', '20240426'],
 ['new'],
 ['vacanc', 'extend', 'till', '20240426'],
 ['new'],
 ['vacanc', 'extend', 'till', '202

In [193]:
def tokenise_text_list(text_list): 
    '''Tokenise text to feed into lda_model'''
    processed = []

    for phrase in text_list:
        phrase = phrase.strip().lower() # remove spaces and convert to lowercase 
        phrase = re.sub(r'[^A-Za-z0-9\s]', '', phrase) # remove special characters and punctuations from text
        phrase = re.sub(r'\n+', ' ', phrase) # replace \n with space
        tokens = word_tokenize(phrase) # tokenise text
        
        tokens = [word for word in tokens if word not in stop_words] # remove stop words
        processed.append(tokens) 

    return processed


In [194]:
# https://towardsdatascience.com/topic-modelling-in-python-with-spacy-and-gensim-dc8f7748bdbf

def topic_modelling(tokenised_text_list):
    topic_weights = [] # store topics and weights in a list [(topics, weights)]
    texts = tokenise_text_list(tokenised_text_list)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]  
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, passes=20)
    topics = lda_model.print_topics(num_words=1)

    for index, topic in topics:
        weight_str, topic_str = topic.split("*")
        topic = topic_str.replace('"', '')
        weight = float(weight_str)
        topic_weights.append((topic, weight))
    return topic_weights


In [195]:
topic_modelling(text_list_sample)

[('labour', 0.092),
 ('gallery', 0.094),
 ('us', 0.109),
 ('20240426', 0.075),
 ('2024', 0.038)]

In [196]:
def count_words(phrases_list): 
    word_count = 0 
    for phrase in phrases_list: 
        word_count += len(phrase.split())
    return word_count

In [197]:
# Possible entities
nlp = spacy.load("en_core_web_sm")
nlp.get_pipe("ner").labels

('CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART')

In [198]:
def generate_questions(text):
    nlp = spacy.load('en_core_web_sm') # load spacy language model
    text = nlp(text) # process text
    questions = []
    
    for entity in text.ents: # look for potential nouns/subjects, generate possible questions with who/where/when/what
        if entity.label_ == 'GPE':
            question = f"Where is {entity.text}?"
            questions.append(question) 
        elif entity.label_ == 'DATE':
            question = f"When did {entity.text} happen?"
            questions.append(question) 
        elif entity.label == 'PERSON': 
            question = f"Who is {entity.text}?"
            questions.append(question) 
        else: 
            question = f"What is {entity.text}?"
            questions.append(question) 

    return questions

questions = generate_questions(" ".join(text_list_sample))
print(questions)

['What is Us Contributions Services Downloads Gallery News & Events Donate Us Vacancy?', 'Where is Us?', 'What is Contact Us Inquiry Contact Details Sitemap?', 'What is Health Clinics & Eye Clinics Eye Clinic Empowerment News & Events?', 'What is Vacancies?', 'What is Vacancies?', 'What is Vacancies?', 'What is Vacancies?', 'What is Vacancies?', 'What is Vacancies?', 'What is Vacancies?', 'What is 2024.04.26?', 'What is Our Services Promotion?', 'What is Workers Financial Assistance & Other Benefits Presentations?', 'What is Recognition of Excellent Services?', 'What is 416?', 'When did Sunday, September 08, 2024 happen?', 'What is 0?', 'What is Related Links National Lotteries Board Law Commission?', 'What is Sri Lanka NILS - National Institute of Labour Studies National Institute of Occupational Safety and Health Ministry of Labour and Trade Union Relations Department of Labour?', 'What is Sri Lanka National Lotteries Board Law Commission?', 'What is Sri Lanka NILS - National Institu

In [199]:
# https://pypi.org/project/googletrans/

text  = "\u0dc0\u0dc6\u0db4\u0dba\u0dd4\u0db8\u0dca\u0d9a\u0dbb\u0dd4\u0dbd\u0d9a\u0dd4/\u0d9a\u0d9a\u0ddc\u0db1\u0dca\u0dad\u0dca\u200d\u0dbb\u0dc5\u0dad\u0dca\u0d9a\u0dbb\u0d9a\u0dbb\u0dd4\u0dbd\u0d9a\u0dd4\u0dd9\u0d9a\u0dbc\u0dc0\u0dd9\u0dba\u0dcf\u0dba\u0dc5\u0db4\u0d82\u0da0\u0dd2 \u0dc0\u0dd9\u0db8 \u0db8\u0dd9\u0dc0\u0dcf\u0dc1\u0dc5\u0dd9\u0daf\u0dba\u0db8\u0dca\u0db8\u0dca\u0db4\u0dad\u0dca\u200d\u0dbb\u0dba\n(\u0dbd\u0dba\u0dc5\u0db4\u0dc5\u0dbb\u0dd9\u0dad\u0dba\u0dba\u0dda\u0d9a\u0dbd\u0dd9\u0dba\u0dcf \u0dc1\u0dd9\u0dbb\u0dca\u0dc2\u0dba\u0dbf\u0dba\u0d9a\u0dd9\u0dd2\u0d82\u0db4\u0dad\u0db4\u0dad\u0dca\u0d9a\u0dbb\u0dd9\u0d9a\u0dbb\u0db1\u0dca\u0dda)\n1. \u0daf\u0dba\u0db8\u0dca\u0db8\u0dca\u0d9a\u0dbb\u0dd4\u0d9a\u0dda\u0dd9\u0dda\u0db8\u0dd9:-\u0dd9\n.......................................................................................................................\n2. \u0dbd\u0dba\u0dc5\u0db4\u0dc5\u0dbb\u0dd9\u0dad\u0dba\u0dba\u0dda\u0d9a\u0dbd/\u0dc0\u0db8\u0dc5\u0d9c\u0d9a\u0db8\u0dca/\u0dbd\u0dba\u0dc5\u0db4\u0dc5\u0dbb\u0d9a\u0dbd\u0dd9\u0dda\u0db8\u0dd9:-\u0dd9\n.............................................................................\n3. \u0daf\u0dba\u0db8\u0dca\u0db8\u0dca\u0d9a\u0dbb\u0dd4\u0d9a\u0dda\u0dd9\u0dbd\u0dba\u0dc5\u0db4\u0dc5\u0db4\u0dad\u0d9a\u0dd9\u0dba\u0dcf \u0dc1\u0dda\u0dba\u0dd9:-\u0dd9\n....................................................................................................\n4. \u0db8\u0dca\u0dbb\u0d9a\u0dae\u0dda\u0dd9\u0daf\u0da0\u0dd2\u0d9a\u0dba\u0dd9:-\u0dd9............................................\n5. \u0dc3\u0dc6\u0d9a\u0dca\u0dc0\u0dc4\u0dd9\u0daf\u0da0\u0dd2\u0d9a\u0dba\u0dd9:-\u0dd9................................................\n6. \u0dc0\u0dd2\u0daf\u0dca\u200d\u0dba\u0dd4\u0dad\u0dca\u0d9a\u0dbb\u0dd9\u0dba\u0dc6\u0db4\u0dc6 \u0dbd\u0dd9\u0dba\u0dcf \u0dc1\u0dda\u0dba\u0dd9:-\u0dd9.....................................................\n7. \u0dbd\u0dba\u0dc5\u0db4\u0dc5\u0dbb\u0dba\u0dd9\u0dba\u0dcf\u0dba\u0dc5\u0db4\u0d82\u0da0\u0dd2 \u0dc0\u0dd9\u0dbb\u0dd3\u0db8\u0dda\u0d9a\u0db8\u0dca\u0dd9\u0dc0\u0dc1\u0d9a\u0dba\u0d9a\u0d9a\u0dbd\u0dd9\u0daf\u0da0\u0dd2\u0d9a\u0dba\u0dd9\u0dc0\u0dc1\u0dd9\u0d82\u0dda\u0dba\u0dd9:-\u0dd9\n...............................................................\n(\u0d9a\u0dbb\u0dd4\u0dab\u0dc5\u0d9a\u0dc5\u0dbb\u0dd9\u0dc0\u0dc1\u0d9a\u0dba\u0d9a\u0d9a\u0dbd\u0dd9\u0da1\u0dc5\u0dba\u0dc5\u0dd9 \u0dc1\u0da7\u0db4\u0dba\u0d9a\u0dca\u0dd9\u0daf\u0db8\u0dd4\u0dab\u0dc5\u0dd9\u0d91\u0dbd\u0db1\u0dca\u0dda)\n8. \u0dba\u0dcf\u0dba\u0dc5\u0db4\u0d82\u0da0\u0dd2 \u0dc0\u0dba\u0dd9\u0dd2\u0d9a\u0dd4\u0dad\u0dca\u0d9a\u0dbb\u0dd9\u0dbd\u0dda\u0dd9\u0d82\u0dda\u0dba\u0dd9:-\u0dd9........\n9. \u0daf\u0daf\u0dca\u200d\u0dba\u0dc5\u0dc2\u0dd9\u0dbd\u0dd8\u0dad\u0dca\u0d9a\u0dbb\u0dad\u0dd3\u0d9a\u0dbd\u0dd9\u0db4\u0dc2\u0db4\u0dd4\u0dbb\u0dd4\u0daf\u0dca\u0daf\u0dca\u200d\u0dba\u0dd9\u0dbd\u0dc2\u0dba\u0dbf\u0dd9\u0dbd\u0dba\u0dcf\u0db1\u0dca\u0dd9:-\u0dd9.....................................................\n10. \u0db6\u0dc6\u0da0\u0dd2\u0d9a\u0dd4\u0d9a\u0dbb\u0dd4\u0dbd\u0db1\u0dca\u0d9a\u0dda\u0dd9\u0dda\u0db8\u0dd9:-\u0dd9................................................\n11. \u0dbd\u0dc6\u0da7\u0dca\u0dd9\u0daf\u0da0\u0dd2\u0d9a\u0dba\u0dd9:-\u0dd9............................\n12. \u0dad\u0daf\u0dca\u200d\u0dba\u0dc5\u0dba\u0db8\u0dca\u0dd9\u0db6\u0db8\u0dca\u0dd9\u0dba\u0dcf \u0dc1\u0dd9\u0d9a\u0d9c\u0ddc\u0db1\u0dd4\u0dd9\u0daf\u0da0\u0dd2\u0d9a\u0dba\u0dd9:-\u0dd9..............................\n(\u0dbb\u0dcf\u0dbd\u0dc5\u0dd9\u0d9c\u0dc6\u0db1\u0dd3\u0db8\u0dd9\u0dc0\u0db8\u0dca\u0db6\u0db1\u0dca\u0db0\u0d9a\u0dba\u0db1\u0dca\u0dd9\u0d9a\u0daf\u0dca\u0dbb\u0dca\u0dba\u0dd9\u0dad\u0daf\u0dca\u200d\u0dba\u0dc5\u0dba\u0db8\u0dca\u0dd9\u0d9a\u0d9a\u0ddc\u0db8\u0dc0\u0dc5\u0db4\u0dad\u0dc0\u0dc4\u0dd9\u0da2\u0dda\u0dbb\u0dc5 \u0dbd\u0dd9\u0dc0\u0dd2\u0dc3\u0dd2\u0db1\u0dca\u0dd9\u0db1\u0dd2\u0d9a\u0dd4\u0dad\u0dca\u0d9a\u0dbb\u0dd9\u0d9a\u0dbb\u0dd9\u0d87\u0d9a\u0dba\u0dd9\n\u0db1\u0dd2\u0d9a\u0dba\u0ddd\u0d9c\u0dd9\u0dbb\u0dd3\u0dc3\u0dd2\u0dbd\u0d9a\u0dca\u0dd9\u0d9a\u0dba\u0d9a\u0dda\u0dd9\u0dda\u0db8\u0dca\u0dd9\u0daf\u0dba\u0db8\u0dca\u0db8\u0dca\u0dd9\u0db4\u0dad\u0dca\u200d\u0dbb\u0dba\u0dd9\u0dc0\u0db8 \u0dd9\u0daf\u0db8\u0dd4\u0dab\u0dc5\u0dd9\u0d91\u0dc0\u0dd2\u0dba\u0dd9\u0dba\u0dd4\u0dd4\u0dba.)\n\u0db8\u0db8/\u0daf \u0dc1 \u0dc1\u0dca\u200d\u0dbb\u0db8\u0dd9\u0dbd\u0dc5\u0dc0\u0dda\u0dc5\u0dd9\u0daf\u0dbb\u0db8\u0dd4\u0daf\u0dca\u200d\u0dba\u0dbc\u0dd9\u0dc0\u0dd2\u0dc3\u0dd2\u0db1\u0dca\u0dd9\u0daf\u0dca\u200d\u0dba\u0d9a\u0dca\u0dbd\u0dc5\u0dd9\u0d87\u0d9a\u0dba\u0dd9\u0d9a\u0d9a\u0ddc\u0db1\u0dca\u0d9a\u0daf\u0dca\u0dc3\u0dd2\u0dbd\u0dbc\u0da7\u0dd9\u0d91\u0d9a \u0dbd\u0dd9\u0dba\u0dcf\u0dba\u0dc5\u0db4\u0d82\u0da0\u0dd2 \u0dc0\u0dba\u0dd9\u0dc0\u0dcf\u0dc1\u0dc5\u0dd9\u0dd2 \u0dbd\u0dbd\u0dd4\u0db8\u0dca\u0dd9\n\u0d9a\u0dbb\u0db8\u0dd2/\u0d9a\u0dbb\u0db8\u0dd4\n\u0d82\u0dda\u0dba\u0dd9:-\u0dd9.................................. \u0dd9\u0dd9.............................\n\u0daf\u0dba\u0db8\u0dca\u0db8\u0dca\u0d9a\u0dbb\u0dd4\u0d9a\u0dda\u0dd9\u0daf\u0dad\u0dca\u0d9a\u0dbb\u0dc0\u0dda\n\u0dd9\u0dd9\u0dd9\u0dd9\u0dd9\u0dd9\u0dd9\u0dd9\u0db1\u0dd2\u0dbc\u0dd9\u0db8\u0dd4\u0daf\u0dca\u200d\u0dbb\u0dc5\u0dbd\n\u0dd2\u0dc1\u0dba\u0dd9\u0dad\u0daf\u0dca\u200d\u0dba\u0dc2\u0dba\u0dbe\u0dd9\u0daf\u0dba\u0db8\u0dca\u0db8\u0dca\u0db4\u0dad\u0dca\u0d9a\u0dbb\u0dd9\u0dd9www.svf.gov.lk \u0dba\u0dda\u0dd9\u0dc1\u0dca\u200d\u0dbb\u0db8\u0dd9\u0dbd\u0dc5\u0dc0\u0dda\u0dc5\u0dd9\u0daf\u0dbb\u0db8\u0dd4\u0daf\u0dca\u200d\u0dba\u0d9a \u0dbd\u0dd9\u0d9a\u0dbd\u0dd2 \u0dd9\u0daf\u0dc0\u0dc0\u0dd2\u0d9a\u0dba\u0db1\u0dca\u0dd9\u0dbc\u0db6\u0dc5\u0dd9\u0d9c\u0dba\u0dd9\u0dc1\u0dc6\u0dbb\u0dd3\u0dd9\n\u0daf\u0dba\u0dbb,\u0dd9\u0d92\u0dd9\u0dd9\u0daf\u0db1\u0dd4\u0dbd\u0dd9\u0dc0\u0d9a\u0dc0\u0dd9\u0dc4 \u0d9a\u0dbb\u0dd9\u0d9c\u0db1\u0dca\u0dda\u0dc5\u0dd9\u0dbc\u0daf\u0dca\u200d\u0dba\u0dd9\u0daf\u0dba\u0db8\u0dca\u0db8\u0dca\u0db4\u0dad\u0dca\u0d9a\u0dbb\u0dd9\u0dd9\u0d94\u0db6\u0d9a\u0dda\u0dd9\u0dba\u0dcf \u0dc1\u0dd9\u0dbb\u0dca\u0dc2\u0dba\u0dbf\u0dba\u0d9a\u0dca\u0dd9\u0db8\u0d9f\u0dd2\u0db1\u0dca\u0dd9\u0dd2\u0d82\u0db4\u0dad\u0db4\u0dad\u0dca\u0d9a\u0dbb\u0dd9\u0d9a\u0dbb\u0db1\u0dca\u0dda.\u0dd9\u0d94\u0db6\u0dda\u0dd9\n\u0daf\u0dba\u0db8\u0dca\u0db8\u0dca\u0db4\u0dba\u0dd92019.01.24 \u0d82\u0dda\u0da7\u0dd9\u0db4\u0dca\u200d\u0dbb\u0dae\u0db8\u0dd9\u0dc0\u0db7\u0dc5\u0db4\u0d9a\u0dba,\u0dd9\u0dc1\u0dca\u200d\u0dbb\u0db8\u0dd9\u0dbd\u0dc5\u0dc0\u0dda\u0dc5\u0dd9\u0daf\u0dbb\u0db8\u0dd4\u0daf\u0dca\u200d\u0dba\u0dbc,\u0dd9\u0d9a\u0db8\u0dca\u0d9a\u0dbb\u0dd4\u0dd9\u0daf\u0db8\u0dc5\u0dba\u0dba\u0dc5\u0da0\u0dd2\u0dbe\u0dba,\u0dd9\u0daf\u0da0\u0dd2\u0d9a\u0dd997,\u0dd9\n\u0da2\u0dc5\u0dbd\u0dad\u0dca\u0d9a\u0dbb\u0dba\u0dd9\u0db4\u0dc5\u0dbb,\u0dd9\u0d9a\u0d9a\u0ddc\u0dc2\u0db9\u0dd905\u0dd9\u0dba\u0dda\u0dd9\u0dba\u0dcf \u0dc1\u0dda\u0dba\u0da7\u0dd9\u0dba\u0dcf\u0dba\u0dc5\u0db4\u0d82\u0da0\u0dd2 \u0dc0\u0dd9\u0dba\u0dc6\u0db4\u0dc7\u0dbc\u0dd9\u0db8\u0d9f\u0dd2\u0db1\u0dca\u0dd9\u0dbc\u0dc6\u0db6\u0dd3\u0db8\u0da7\u0dd9\u0dc0\u0dbc\u0dc0\u0dc4\u0dbd\u0dda\u0dd9\u0d9a\u0db8\u0db1\u0dca\u0dd9\u0dd2 \u0dbd\u0dbc\u0dc5\u0dd9\n\u0dc3\u0dd2\u0da7\u0dd2\u0db8\u0dd2.\n\u0dc0\u0db7\u0dc5\u0db4\u0d9a\u0dba,\n\u0dc1\u0dca\u200d\u0dbb\u0db8\u0dd9\u0dbd\u0dc5\u0dc0\u0dda\u0dc5\u0dd9\u0daf\u0dbb\u0db8\u0dd4\u0daf\u0dca\u200d\u0dba\u0dbc,\n\u0d9a\u0db8\u0dca\u0d9a\u0dbb\u0dd4\u0dd9\u0daf\u0db8\u0dc5\u0dba\u0dba\u0dc5\u0da0\u0dd2\u0dbe\u0dba,\n\u0daf\u0da0\u0dd2\u0d9a.97,\u0dd9\u0da2\u0dc5\u0dbd\u0dad\u0dca\u0d9a\u0dbb\u0dba\u0dd9\u0db4\u0dc5\u0dbb,\n\u0d9a\u0d9a\u0ddc\u0dc2\u0db9\u0dd905.\u0dd9\u0dd9\u0dd9(\u0db8\u0dca.\u0d9a.\u0dd9\u0daf\u0da0\u0dd2\u0d9a\u0dd9:\u0dd9011\u20132588936\u0dd9/\u0dd9\u0dc3\u0dc6\u0d9a\u0dca\u0dc0\u0dd9\u0dc4\u0daf\u0da0\u0dd2\u0d9a\u0dba\u0dd9:\u0dd9\u0dd9011\u20132588937)"
def translate_text_to_english(text): 
    translator = Translator()
    translated  = translator.translate(text , dest='en')
    return translated.text

translated_to_english = translate_text_to_english(text)

In [200]:
cleaned_text =  process_texts([translated_to_english])
print(cleaned_text)

['expenditure of a wampurakarakara  register permit of the lal estabarian apartment 1 principle n 2 in the lal establishment  permanent  labor  3 establishment of a naymator select  4 cathoraki e  5 safakehawkta ne  6 email velafipper e  7 inquiry in a veteranlator of the level zote navawardena in karape 8 investigation department mawala 9 twdychshawewa mapuracies in general   10 buffethiculam ne  11 lefatchanical e  12 the abamara secuces e  ralhalagafaniculous marketing proteans nika seyrokecrease is sent i have a lolum of lolum    do  let sleeme we  known tennasty wwwsvfgovlk is observed in wwwsvfgovlk business of the buddhasarakarawpara due to the diffire 20190202019 97 respective 97 inquiques of the department degeneration 05 i am in weather laboratory labbilitary motorcycle purchase 97 kolcoship yatthic y0112588936  carekaya yes112588937']


In [201]:
def tfidf_matrix(text_list):
    vectorizer = TfidfVectorizer() # general tfidf vectorizer that converts a list of processed texts to a tfidf matrix
    tfidf_matrix = vectorizer.fit_transform(text_list)
    return tfidf_matrix

In [202]:
# for i in range(len(data)): #loop through the data 
#     for key, value in data[i].items():
#         print(key, value)