# Installation of required packages

In [None]:
!pip3 install pyspark
!pip3 install Fuzzy
!pip3 install phonetics
!pip3 install requests
!pip3 install beautifulsoup4


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 29 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 58.5 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=d9f19340b8a47a985414cdb18ae8845051fac653b31ee6caad58852bea9c70ff
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Fuzzy
  D

# Mounting Google Drive

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used later in this
# notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing required packages and initializing the Spark Context

In [None]:
import fuzzy
import phonetics
import difflib
import requests
from difflib import SequenceMatcher
import pickle
from bs4 import BeautifulSoup as bs
from pyspark import SparkConf, SparkContext
# Single-node ("local") Spark configuration for the phonetic search job.
# The memory setting must use the exact key "spark.executor.memory" (no spaces)
# and a size string such as "1g"; a misspelled key is not a recognised setting.
conf = (
    SparkConf()
    .setMaster("local")
    .setAppName("Phonetic Search")
    .set("spark.executor.memory", "1g")
)
# getOrCreate makes this cell safe to re-run: constructing a second SparkContext
# in the same kernel raises an error. If a context already exists it is returned
# as-is and `conf` is not re-applied.
sc = SparkContext.getOrCreate(conf=conf)

# Creating Data Set

In [None]:
# Page indices 1..114; each one is substituted into the crawler URL as `sn`.
urls = list(range(1, 115))
# Distribute the indices so every page can be fetched by a Spark task.
rdd_urls = sc.parallelize(urls)

# Web Crawler for fetching data from https://quran411.com/verse-by-verse

In [None]:
def generate_N_grams(text, ngram=1, min_words=5):
    """Split ``text`` into overlapping word n-grams.

    Parameters
    ----------
    text : str
        Text to window; words are separated by single spaces.
    ngram : int, default 1
        Number of consecutive words in each n-gram.
    min_words : int, default 5
        Texts with fewer words than this are returned whole.

    Returns
    -------
    list of str
        Every run of ``ngram`` consecutive words joined by a single space, or
        ``[text]`` when the text is too short to be windowed.
    """
    words = text.split(" ")
    # Too short to window: hand the text back unchanged. Checking against
    # ``ngram`` as well prevents a text with at least ``min_words`` but fewer
    # than ``ngram`` words from producing an empty list (and being lost).
    if len(words) < max(min_words, ngram):
        return [text]
    # Zipping the word list against itself shifted by 0..ngram-1 yields one
    # tuple per sliding window.
    windows = zip(*(words[offset:] for offset in range(ngram)))
    return [" ".join(window) for window in windows]
def _cut_from_marker(text, marker):
    """Drop ``marker``, the two characters before it, and everything after it."""
    index = text.find(marker)
    if index == -1:
        return text
    # Clamp at 0 so a marker in the first two characters cannot turn the
    # slice start negative and cut from the wrong end of the string.
    return text[:max(index - 2, 0)]


def fetchDataFromWeb(urls, timeout=30):
    """Crawl one quran411.com verse-by-verse page and build phonetic records.

    Parameters
    ----------
    urls : int
        Value substituted into the page URL as ``sn``; also stored as the
        record's ``"surat"``.
    timeout : float, default 30
        Seconds to wait for the HTTP response before ``requests`` raises.

    Returns
    -------
    list of dict
        One record per 5-word n-gram of every verse, with keys ``"arabic"``
        (the n-gram text), ``"surat"``, ``"ayat"``, ``"phonemes"`` (NYSIIS),
        ``"phonemese2"`` (Metaphone) and ``"verse"`` (``"<surat>:<ayat>"``).
        Empty when the page has no ``div.ac-content`` element.
    """
    response = requests.get(
        f"https://quran411.com/verse-by-verse?sn={urls}", timeout=timeout
    )
    container = bs(response.content, "html.parser").find("div", {"class": "ac-content"})
    if container is None:
        # Nothing to parse on this page.
        return []

    # <br/> tags separate the verses; swap them for a sentinel that survives
    # text extraction so the flattened text can be split per verse.
    flattened = bs(str(container).replace("<br/>", "##"), "html.parser").text

    total_data = []
    # The piece after the final separator is discarded.
    # NOTE(review): presumably trailing non-verse content -- confirm against the page.
    for segment in flattened.split("##")[:-1]:
        segment = segment.replace("\n", "")

        # Read the whole leading verse number. Taking only the first character
        # would label verses 10, 11, ... as "1".
        number_width = len(segment) - len(segment.lstrip("0123456789"))
        if number_width == 0:
            # No leading number (or empty segment): the text cannot be
            # attributed to a verse, and a non-numeric "ayat" cannot be
            # converted with int() later on.
            continue
        ayah_num = segment[:number_width]
        # Skip the two separator characters that follow the number.
        # NOTE(review): assumes every verse is laid out as "<number><2 chars><text>".
        verse_text = segment[number_width + 2:]

        # Remove trailing page annotations that are not part of the verse.
        for marker in ("section", "End Juz"):
            verse_text = _cut_from_marker(verse_text, marker)

        for n in generate_N_grams(verse_text, 5):
            total_data.append({
                "arabic": n,
                "surat": urls,
                "ayat": ayah_num,
                "phonemes": fuzzy.nysiis(n),
                "phonemese2": phonetics.metaphone(n),
                "verse": f"{urls}:{ayah_num}",
            })
    return total_data

In [None]:
# Crawl every page index and flatten the per-page record lists into one RDD.
data = rdd_urls.flatMap(fetchDataFromWeb)

# Save data as a pickle file

In [None]:
# Write the crawled records to Google Drive as a Spark pickle file.
# NOTE(review): the query section below loads from different paths
# (romanPhonemeDistributedData / quranVerseDistributedData) -- confirm which
# dataset is the intended input.
data.saveAsPickleFile("/content/drive/MyDrive/phonemesTransData")

# Query Portion

# Importing data from pickle files and converting them into Spark RDDs

In [None]:

# Verse records (Arabic text, translations, verse key); persisted because the
# RDD is reused by the query cells.
quranic_verse_data = "/content/drive/MyDrive/quranVerseDistributedData"
rdd_quran = sc.pickleFile(quranic_verse_data).persist()

# Phonetic n-gram records used for matching.
phonetic_verse_data = "/content/drive/MyDrive/romanPhonemeDistributedData"
rdd_phonemes = sc.pickleFile(phonetic_verse_data)

# Required Preprocessing
## Most of the preprocessing was already done while creating the dataset, so little cleaning is required here

In [None]:
def preprocess_data(x):
    """Flatten one phonetic record (a dict) into a positional row.

    Returns ``[arabic, phonemes, phonemese2, ayat, surat, verse]`` where both
    phonetic codes have their spaces removed and ``ayat`` / ``surat`` are
    converted to ``int``.
    """
    def without_spaces(code):
        return code.replace(" ", "")

    return [
        x["arabic"],
        without_spaces(x["phonemes"]),
        without_spaces(x["phonemese2"]),
        int(x["ayat"]),
        int(x["surat"]),
        x["verse"],
    ]

In [None]:
# Convert every record dict into a positional row and cache the result.
# NOTE: this rebinds rdd_phonemes, so re-running the cell without reloading the
# pickle file would feed already-flattened rows back into preprocess_data.
rdd_phonemes = rdd_phonemes.map(preprocess_data).persist()

# Function for phonetic matching: we use two types of phonemes and rank the data based on the distance

# Query an ayah

In [37]:
# Romanised query text.
ayat = "kul hoo vellahoo ehed"
# Only the first six words of the query take part in matching.
ayat = ' '.join(ayat.split(" ")[:6])

# Encode the query with both phonetic algorithms and ship the codes to the workers.
phoneme_type1 = fuzzy.nysiis(ayat)
phoneme_type2 = phonetics.metaphone(ayat)
broadcastVar1, broadcastVar2 = sc.broadcast(phoneme_type1), sc.broadcast(phoneme_type2)

# Matchers reused by matchPhonemes: one for phonetic codes, one for raw text.
seq, seq2 = SequenceMatcher(), SequenceMatcher()

def matchPhonemes(phon, verse, arabic, ayat, phoneme_threshold=0.85,
                  containment_threshold=0.68, text_threshold=0.55):
    """Score one stored n-gram against the query.

    The query's NYSIIS code (read from ``broadcastVar1``) is compared with
    ``phon`` using ``SequenceMatcher.ratio()``. A row is a phonetic candidate
    when that ratio exceeds ``phoneme_threshold``, or when the query code is
    contained in ``phon`` and the ratio exceeds ``containment_threshold``.
    Candidates must then also pass a text check: the lower-cased query text
    and the lower-cased stored text must have a ratio above ``text_threshold``.

    Parameters
    ----------
    phon : str
        Phonetic code of the stored n-gram.
    verse : str
        Verse key returned with the score.
    arabic : str
        Stored n-gram text.
    ayat : str
        Query text.
    phoneme_threshold, containment_threshold, text_threshold : float
        Cut-offs described above.

    Returns
    -------
    tuple or None
        ``(phonetic_ratio, verse)`` for a match, ``None`` otherwise. Rows that
        raise while being compared (e.g. a ``None`` field) also yield ``None``.
    """
    query_phonemes = broadcastVar1.value
    try:
        seq.set_seqs(query_phonemes, phon)
        score = seq.ratio()  # computed once and reused below
        is_candidate = score > phoneme_threshold or (
            query_phonemes in phon and score > containment_threshold
        )
        if not is_candidate:
            return None
        seq2.set_seqs(ayat.lower(), arabic.lower())
        if seq2.ratio() > text_threshold:
            return (score, verse)
        return None
    except Exception:
        # Best effort: a malformed row must not abort the Spark job. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        return None

# Score every stored row (x[1] = phonetic code, x[5] = verse key, x[0] = text)
# against the query; rows that do not match come back as None and are dropped.
phonetic_matches = rdd_phonemes.map(lambda x: matchPhonemes(x[1], x[5], x[0], ayat))
phonetic_matches = phonetic_matches.filter(lambda x: x is not None)
# Keep the best score per verse. Re-keying to (verse, score) and reducing with
# max avoids collecting every n-gram of a verse into one group first.
phonetic_matches = phonetic_matches.map(lambda x: (x[1], x[0])).reduceByKey(max)
# Highest score first; elements are (verse, score).
phonetic_matches = phonetic_matches.sortBy(lambda x: x[1], False)

In [38]:
# Key the verse records by their "verse" field so they can be joined with the scores.
final_results = rdd_quran.map(lambda x: (x["verse"], x))
# Each joined element is (verse, (verse_record, score)).
final_results = final_results.join(phonetic_matches)
# join() shuffles the data and does not keep the ranking of phonetic_matches,
# so order the collected rows by score (best first) on the driver.
final_results = sorted(final_results.collect(), key=lambda row: row[1][1], reverse=True)

In [39]:
# Show the matches: each entry is (verse key, (verse record, phonetic similarity score)).
final_results

[('112:1',
  ({'SurahName': 'surah-ikhlas',
    'arabic': 'قُلْ هُوَ اللَّهُ أَحَدٌ',
    'code': '112001',
    'translation': 'Say, "He is God, the One.',
    'urdu_translation': 'تم فرماؤ وہ اللہ ہے وہ ایک ہے',
    'verse': '112:1'},
   0.875))]