In [57]:
import os
import codecs
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import re
from scrapper_v2 import BOOKS

In [121]:
def parse_htms(directory, lang):
    """
        Parses the entire directory and returns a DataFrame of the verses.
    """
    paths  = []
    df = pd.DataFrame(columns=['id', lang])

    for dirname, _, filenames in os.walk(directory):
        for filename in filenames:
            book_number = dirname.split('/')[-1]
            try:
                book_number = int(book_number)
            except ValueError:
                continue
            if not filename.endswith('.htm'):
                continue
            pairs = (book_number, os.path.join(dirname, filename))
            paths.append(pairs)

    for book_number, path in tqdm(paths):
        # Create ID
        book_name = BOOKS[book_number - 1]
        chapter = path.split('/')[-1].split('.')[0]
        id = book_name + str(chapter)
        soup = BeautifulSoup(codecs.open(path, 'r', encoding='utf-8', errors='ignore'), 'html.parser')
        verses = soup.select('.textBody p')[0].get_text().strip()
        verses = verses.replace('1 \n', '1 ')
        verses = verses.split('\n')

        for verse in verses:
            content = verse.split(' ')
            verse_number = content[0]
            id_ = id + verse_number
            verse_content = ' '.join(content[1:])
            sentences = re.split('[.?]', verse_content)
            
            i = 1 # Sentence id
            
            for sentence in sentences:
                if not sentence.isspace() and len(sentence) > 0:
                    id_sentence = id_ + str(i)
                    i += 1
                    temp = pd.DataFrame([[id_sentence, sentence.strip()]],
                                    columns=['id', lang])
    #                 print(temp)
                    df = df.append(temp, ignore_index=False)

    return df

In [122]:
LUG = parse_htms('lug_new', 'luganda')

100%|██████████| 1189/1189 [02:39<00:00,  7.47it/s]


In [128]:
LUG = LUG.reset_index()

In [129]:
LUG.head(200)

Unnamed: 0,index,id,luganda
0,0,MRK911,N'abagamba nti Mazima mbagamba nti Ku bano aba...
1,0,MRK921,Awo ennaku omukaaga bwe zaayitawo Yesu n'atwal...
2,0,MRK931,Engoye ze ne zaakaayakana ne zitukula nnyo; so...
3,0,MRK941,Awo Eriya ne Musa ne babalabikira; era baali b...
4,0,MRK951,"Peetero n'addamu, n'agamba Yesu nti Labbi, kye..."
...,...,...,...
195,0,MRK14361,"N'agamba nti Aba, Kitange, byonna biyinzika gy..."
196,0,MRK14371,"Awo n'ajja, n'abasanga nga beebase, n'agamba P..."
197,0,MRK14372,tobadde na maanyi ag'okutunula n'essaawa emu eti
198,0,MRK14381,"Mutunule, musabe, muleme okuyingira mu kukemeb..."


In [130]:
ENG = parse_htms('kj_new', 'english')

100%|██████████| 1189/1189 [02:35<00:00,  7.64it/s]


In [131]:
ENG.shape

(34851, 2)

In [132]:
LUG.shape

(34796, 3)

In [133]:
ENG['id'].nunique()

33570

In [134]:
LUG['id'].nunique()

33514

In [135]:
LUG

Unnamed: 0,index,id,luganda
0,0,MRK911,N'abagamba nti Mazima mbagamba nti Ku bano aba...
1,0,MRK921,Awo ennaku omukaaga bwe zaayitawo Yesu n'atwal...
2,0,MRK931,Engoye ze ne zaakaayakana ne zitukula nnyo; so...
3,0,MRK941,Awo Eriya ne Musa ne babalabikira; era baali b...
4,0,MRK951,"Peetero n'addamu, n'agamba Yesu nti Labbi, kye..."
...,...,...,...
34791,0,2SA17252,Era Amasa yali mwana wa musajja erinnya lye Is...
34792,0,2SA17261,Awo Isiraeri ne Abusaalomu ne basiisira mu nsi...
34793,0,2SA17271,"Awo olwatuuka Dawudi bwe yatuuka e Makanayimu,..."
34794,0,2SA17281,"ne baleeta ebitanda, n'ebibya, n'entamu, n'eŋŋ..."


In [136]:
lug_unique = LUG.drop_duplicates(subset='id')
lug_unique

Unnamed: 0,index,id,luganda
0,0,MRK911,N'abagamba nti Mazima mbagamba nti Ku bano aba...
1,0,MRK921,Awo ennaku omukaaga bwe zaayitawo Yesu n'atwal...
2,0,MRK931,Engoye ze ne zaakaayakana ne zitukula nnyo; so...
3,0,MRK941,Awo Eriya ne Musa ne babalabikira; era baali b...
4,0,MRK951,"Peetero n'addamu, n'agamba Yesu nti Labbi, kye..."
...,...,...,...
34791,0,2SA17252,Era Amasa yali mwana wa musajja erinnya lye Is...
34792,0,2SA17261,Awo Isiraeri ne Abusaalomu ne basiisira mu nsi...
34793,0,2SA17271,"Awo olwatuuka Dawudi bwe yatuuka e Makanayimu,..."
34794,0,2SA17281,"ne baleeta ebitanda, n'ebibya, n'entamu, n'eŋŋ..."


In [137]:
eng_unique = ENG.drop_duplicates(subset='id')
eng_unique

Unnamed: 0,id,english
0,MRK911,"And he said unto them, Verily I say unto you, ..."
0,MRK921,And after six days Jesus taketh with him Peter...
0,MRK931,"And his raiment became shining, exceeding whit..."
0,MRK941,And there appeared unto them Elias with Moses:...
0,MRK951,"And Peter answered and said to Jesus, Master, ..."
...,...,...
0,2SA17251,And Absalom made Amasa captain of the host ins...
0,2SA17261,So Israel and Absalom pitched in the land of G...
0,2SA17271,"And it came to pass, when David was come to Ma..."
0,2SA17281,"Brought beds, and basons, and earthen vessels,..."


In [138]:
# Do an inner join to keep the instersection.
corpus = pd.merge(eng_unique[['id', 'english']], 
                   lug_unique[['id', 'luganda']], 
                   how='inner', 
                   left_on='id', 
                   right_on='id')

In [139]:
corpus

Unnamed: 0,id,english,luganda
0,MRK911,"And he said unto them, Verily I say unto you, ...",N'abagamba nti Mazima mbagamba nti Ku bano aba...
1,MRK921,And after six days Jesus taketh with him Peter...,Awo ennaku omukaaga bwe zaayitawo Yesu n'atwal...
2,MRK931,"And his raiment became shining, exceeding whit...",Engoye ze ne zaakaayakana ne zitukula nnyo; so...
3,MRK941,And there appeared unto them Elias with Moses:...,Awo Eriya ne Musa ne babalabikira; era baali b...
4,MRK951,"And Peter answered and said to Jesus, Master, ...","Peetero n'addamu, n'agamba Yesu nti Labbi, kye..."
...,...,...,...
32996,2SA17251,And Absalom made Amasa captain of the host ins...,Abusaalomu n'afuula Amasa omukulu w'eggye mu k...
32997,2SA17261,So Israel and Absalom pitched in the land of G...,Awo Isiraeri ne Abusaalomu ne basiisira mu nsi...
32998,2SA17271,"And it came to pass, when David was come to Ma...","Awo olwatuuka Dawudi bwe yatuuka e Makanayimu,..."
32999,2SA17281,"Brought beds, and basons, and earthen vessels,...","ne baleeta ebitanda, n'ebibya, n'entamu, n'eŋŋ..."


In [140]:
corpus.to_csv('corpus.csv', index=False)

In [141]:
corpus.head()

Unnamed: 0,id,english,luganda
0,MRK911,"And he said unto them, Verily I say unto you, ...",N'abagamba nti Mazima mbagamba nti Ku bano aba...
1,MRK921,And after six days Jesus taketh with him Peter...,Awo ennaku omukaaga bwe zaayitawo Yesu n'atwal...
2,MRK931,"And his raiment became shining, exceeding whit...",Engoye ze ne zaakaayakana ne zitukula nnyo; so...
3,MRK941,And there appeared unto them Elias with Moses:...,Awo Eriya ne Musa ne babalabikira; era baali b...
4,MRK951,"And Peter answered and said to Jesus, Master, ...","Peetero n'addamu, n'agamba Yesu nti Labbi, kye..."


In [142]:
corpus.shape

(33001, 3)