## 1- Library and Data Imports

In [36]:
import numpy as np
import pandas as pd
import time

# for Web Scraping
import requests
from bs4 import BeautifulSoup

# for text cleaning and preprocessing
import re
from nltk.corpus import stopwords
import string 
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
docs_df = pd.read_json('../Data/sample_data.json')
docs_df

Unnamed: 0,title,url,published_at,crawled_at,summary,content,tags,article_type
0,الزراعة تدرس أفكارا لشركات خاصة لاستثمار مياه ...,https://husna.fm/%D9%85%D8%AD%D9%84%D9%8A/%D8%...,2021-09-01 06:55:00,2022-10-07 08:32:02.179500,كشف أمين عام وزارة الزراعة محمد الحياري لـ حسن...,[كما أكد الحياري أن التعليمات الجديدة التي ستط...,"[مياه الزيبار, وزارة الزراعة, زيت الزيتون]",News
1,الأردن والسعودية يبحثان استثناء الشاحنات الأرد...,https://husna.fm/%D9%85%D8%AD%D9%84%D9%8A/%D8%...,2021-09-02 08:55:00,2021-01-07 08:32:02.179500,يعقد اجتماع بين هيئتي النقل في الأردن والسعودي...,[وأجرى وزير النقل م.وجيه عزايزة اتصالا هاتفيا ...,"[السعودية, الأردن]",News
2,ذكرى المولد النبوي طريق نور وهداية للبشرية,https://husna.fm/%D9%85%D8%AD%D9%84%D9%8A/%D8%...,2000-03-22 08:55:00,2002-01-07 08:32:02.179500,,[يحرص المسلمون في بقاع الأرض على الاحتفال بذكر...,[ذكرى المولد النبوي],Blog


## 2- Data Preparation and Cleaning

Cleaning Functions Applied:
- removing mentions
- removing punctuation
- removing Arabic diacritics (short vowels and other harakahs)
- removing elongation
- removing stopwords (which is available in NLTK corpus)

#### 2.1 preparing data for cleaning

In [34]:
docs_df['text'] = docs_df['content'].apply(lambda x: " ".join(x))
docs_df['text'] 

0    كما أكد الحياري أن التعليمات الجديدة التي ستطب...
1    وأجرى وزير النقل م.وجيه عزايزة اتصالا هاتفيا ا...
2    يحرص المسلمون في بقاع الأرض على الاحتفال بذكرى...
Name: text, dtype: object

#### 2.2 data cleaning

In [14]:
# punctuation symbols
punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ''' + string.punctuation

# Arabic stop words with nltk
stop_words = stopwords.words()
arabic_diacritics = re.compile("""
                             ّ    | # Shadda
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)

def clean_text(txt): 
    #remove punctuations
    translator = str.maketrans('', '', punctuations)
    txt = txt.translate(translator)
    
    # remove Tashkeel
    txt = re.sub(arabic_diacritics, '', txt)
    
    # remove longation
    txt = re.sub("[إأآا]", "ا", txt)
    txt = re.sub("ى", "ي", txt)
    txt = re.sub("ؤ", "ء", txt)
    txt = re.sub("ئ", "ء", txt)
    txt = re.sub("ة", "ه", txt)
    txt = re.sub("گ", "ك", txt)
    
    # remove stopwords
    txt = ' '.join(word for word in txt.split() if word not in stop_words)
    
    return txt

In [48]:
docs_df['text_clean'] = docs_df['text'].apply(clean_text)
docs_df['summary_clean'] = docs_df['summary'].apply(clean_text)
docs_df['title_clean'] = docs_df['title'].apply(clean_text)
# + tags

display(docs_df)

Unnamed: 0,title,url,published_at,crawled_at,summary,content,tags,article_type,text,text_clean
0,الزراعة تدرس أفكارا لشركات خاصة لاستثمار مياه ...,https://husna.fm/%D9%85%D8%AD%D9%84%D9%8A/%D8%...,2021-09-01 06:55:00,2022-10-07 08:32:02.179500,كشف أمين عام وزارة الزراعة محمد الحياري لـ حسن...,[كما أكد الحياري أن التعليمات الجديدة التي ستط...,"[مياه الزيبار, وزارة الزراعة, زيت الزيتون]",News,كما أكد الحياري أن التعليمات الجديدة التي ستطب...,اكد الحياري ان التعليمات الجديده ستطبق اعتبارا...
1,الأردن والسعودية يبحثان استثناء الشاحنات الأرد...,https://husna.fm/%D9%85%D8%AD%D9%84%D9%8A/%D8%...,2021-09-02 08:55:00,2021-01-07 08:32:02.179500,يعقد اجتماع بين هيئتي النقل في الأردن والسعودي...,[وأجرى وزير النقل م.وجيه عزايزة اتصالا هاتفيا ...,"[السعودية, الأردن]",News,وأجرى وزير النقل م.وجيه عزايزة اتصالا هاتفيا ا...,واجري وزير النقل موجيه عزايزه اتصالا هاتفيا ال...
2,ذكرى المولد النبوي طريق نور وهداية للبشرية,https://husna.fm/%D9%85%D8%AD%D9%84%D9%8A/%D8%...,2000-03-22 08:55:00,2002-01-07 08:32:02.179500,,[يحرص المسلمون في بقاع الأرض على الاحتفال بذكر...,[ذكرى المولد النبوي],Blog,يحرص المسلمون في بقاع الأرض على الاحتفال بذكرى...,يحرص المسلمون بقاع الارض علي الاحتفال بذكري مو...


#### 2.3 info and preprocessing

In [108]:
def show_info_text(df_col):
    print(f"-> Number of Documents: {docs_df.shape[0]}")
    print('-' * 50, end='\n\n')

    print('-> Documents - First 150 letters')
    print()
    for i, document_i in enumerate(docs_df['text_clean']):
        print(f"Document Number {i+1}: {document_i[:150]}..")
        print()

    print('-' * 50)
    
def data_preprocessing(df_col):
    # Instantiate a TfidfVectorizer object
    global vectorizer
    vectorizer = TfidfVectorizer()
    
    # It fits the data and transform it as a vector
    X = vectorizer.fit_transform(df_col)
    # Convert the X as transposed matrix
    X = X.T.toarray()
    # Create a DataFrame and set the vocabulary as the index
    df = pd.DataFrame(X, index=vectorizer.get_feature_names())
    return df

In [29]:
show_info_text(docs_df['text_clean'])
text_clean_enc_df = data_preprocessing(docs_df['text_clean'])
text_clean_enc_df 

-> Number of Documents: 3
--------------------------------------------------

-> Documents - First 150 letters

Document Number 1: اكد الحياري ان التعليمات الجديده ستطبق اعتبارا الموسم الحالي اشترطت لترخيص المعاصر الجديده مواصفات معينه لتخفيض كميات مياه الزيبار السامه وماده الجفت ..

Document Number 2: واجري وزير النقل موجيه عزايزه اتصالا هاتفيا اليوم نظيره السعودي جاسم الصالح لحل مشكله الشاحنات الاردنيه فرضتها اللواءح التنظيميه المحدثه لدي الهيءه ال..

Document Number 3: يحرص المسلمون بقاع الارض علي الاحتفال بذكري مولد النبي الامين وسيد المرسلين محمد بن عبدالله احياء لسيرته العطره والتبصر مواعظ نحتاجها الايام لنستشف ال..

--------------------------------------------------




Unnamed: 0,0,1,2
12,0.000000,0.000000,0.035573
180,0.000000,0.097922,0.000000
256,0.071215,0.000000,0.000000
58,0.000000,0.097922,0.000000
ابو,0.000000,0.000000,0.071147
...,...,...,...
يسمع,0.000000,0.000000,0.035573
يشربون,0.000000,0.000000,0.035573
يقاتل,0.000000,0.000000,0.035573
يقول,0.000000,0.000000,0.035573


## 3- Calculating Similarities

In [154]:
def get_similar_articles(q, df):
    # Convert the query become a vector
    q = [q]
    q_vec = vectorizer.transform(q).toarray().reshape(df.shape[0],)
    
    # Calculate the similarity
    sim = {}
    for i in range(df.shape[1]):
        sim[i] = np.dot(df.loc[:, i].values, q_vec) / np.linalg.norm(df.loc[:, i]) * np.linalg.norm(q_vec)
        if np.isnan(sim[i]):
            sim[i] = 0

    # Sort the values 
    sim_sorted = list(sim.items())
    return sim_sorted

In [58]:
# Add The Query
q1 = 'مولد النبي'

# Measures
time_measure = None
most_freq_measure = None  

start_time = time.time()
sorted_docs_with_scores = get_similar_articles(q1, text_clean_enc_df)  # call function
time_measure = (time.time() - start_time) * 10**3

sorted_docs_with_scores

query: مولد النبي
Berikut artikel dengan nilai cosine similarity tertinggi: 


[(2, 0.10061669772619392), (0, 0.0), (1, 0.0)]

In [63]:
vocab_ = vectorizer.vocabulary_
print(f"number of unique words: {len(vocab_.keys())}")
most_freq_word = sorted(vocab_.items(), key=lambda x: x[1], reverse=True)[:1][0]
print('most frequent word is --> {} ({} times)'.format(most_freq_word[0], most_freq_word[1]))
score = len(vocab_.keys()) / most_freq_word[1]
print('Ratio: {:.3f}'.format(score))

most_freq_measure = most_freq_word[1]

number of unique words: 397
most frequent word is --> يوم (396 times)
Ratio: 1.003


## 4- getting top documents

In [33]:
top_5_docs = np.array(sorted_docs_with_scores, dtype='int32')[:5, 0]
top_5_docs

array([2, 0, 1])

In [65]:
# results 
print('time measure:', time_measure)
print('frequency measure:', most_freq_measure)
print('score %.3f' % score)

time measure: 1.0008811950683594
frequency measure: 396
score 1.003


## 5- Organizing Search Process

In [72]:
docs_df['text_clean'] = docs_df['text'].apply(clean_text)
docs_df['summary_clean'] = docs_df['summary'].apply(clean_text)
docs_df['title_clean'] = docs_df['title'].apply(clean_text)
# + tags

In [73]:
docs_df.head(1)

Unnamed: 0,title,url,published_at,crawled_at,summary,content,tags,article_type,text,text_clean,summary_clean,title_clean
0,الزراعة تدرس أفكارا لشركات خاصة لاستثمار مياه ...,https://husna.fm/%D9%85%D8%AD%D9%84%D9%8A/%D8%...,2021-09-01 06:55:00,2022-10-07 08:32:02.179500,كشف أمين عام وزارة الزراعة محمد الحياري لـ حسن...,[كما أكد الحياري أن التعليمات الجديدة التي ستط...,"[مياه الزيبار, وزارة الزراعة, زيت الزيتون]",News,كما أكد الحياري أن التعليمات الجديدة التي ستطب...,اكد الحياري ان التعليمات الجديده ستطبق اعتبارا...,كشف امين عام وزاره الزراعه محمد الحياري حسني ا...,الزراعه تدرس افكارا لشركات خاصه لاستثمار مياه ...


In [78]:
docs_df['tags']

0    [مياه الزيبار, وزارة الزراعة, زيت الزيتون]
1                            [السعودية, الأردن]
2                          [ذكرى المولد النبوي]
Name: tags, dtype: object

In [100]:
q_test = 'مياه الزراعة'
doc_ids = list(docs_df['tags'].index)
q_list = np.array(q_test.split(' '))
sim_score = list(np.zeros(docs_df.shape[0]))

for i, tag in enumerate(docs_df['tags']):
    print(tag)
    for str_tag in tag:
        q_list_map = np.vectorize(lambda x: x in str_tag)(q_list) 
        if True in q_list_map:
            sim_score[i] += 1
        print(q_list_map)
        
        
sim_non_sorted = list(zip(doc_ids, sim_score))
sim_sorted = sorted(sim_non_sorted, key=lambda x: x[1], reverse=True)

sim_sorted

['مياه الزيبار', 'وزارة الزراعة', 'زيت الزيتون']
[ True False]
[False  True]
[False False]
['السعودية', 'الأردن']
[False False]
[False False]
['ذكرى المولد النبوي']
[False False]


[(0, 2.0), (1, 0.0), (2, 0.0)]

In [188]:
import time
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

cols_weights = [0.1, 0.5, 0.15, 0.2]
cols = ['tags', 'text_clean', 'summary_clean', 'title_clean']
def overall(q, df):
    # print("query:", q)
    
    # note: potential class vars
    overall_time = 0
    overall_score_measure = 0
    similarities_list = []
    
    # apply search over every col of interest
    for col in cols:
        if col != 'tags':
            # 1 data preprocessing(each column)
            text_clean_enc_df = data_preprocessing(docs_df[col])

            # 2- count similarities (each column)
            time_measure = None
            most_freq_measure = None  
            start_time = time.time()
            sorted_docs_with_scores_content = get_similar_articles(q, text_clean_enc_df)  # call function
            time_measure = (time.time() - start_time) * 10**3

            # 3- results
            global vectorizer
            vocab_ = vectorizer.vocabulary_
            # print(f"number of unique words: {len(vocab_.keys())}")
            most_freq_word = sorted(vocab_.items(), key=lambda x: x[1], reverse=True)[:1][0]
            # print('most frequent word is --> {} ({} times)'.format(most_freq_word[0], most_freq_word[1]))

            score = len(vocab_.keys()) / most_freq_word[1]
            # print('Ratio: {:.3f}'.format(score))
            # print()
            # print('time measure:', time_measure)
        else:
            time_measure = None
            most_freq_measure = None  
            start_time = time.time()
            
            doc_ids = list(docs_df['tags'].index)
            q_list = np.array(q.split(' '))
            sim_score = list(np.zeros(docs_df.shape[0]))

            for i, tag in enumerate(docs_df['tags']):
                # print(tag)
                for str_tag in tag:
                    q_list_map = np.vectorize(lambda x: x in str_tag)(q_list) 
                    if True in q_list_map:
                        sim_score[i] += 1
                    # print(q_list_map)

            sim_non_sorted = list(zip(doc_ids, sim_score))
            sorted_docs_with_scores_content = sim_non_sorted
            
            time_measure = (time.time() - start_time) * 10**3
            score = 0
            
        similarities_list.append(sorted_docs_with_scores_content)
        overall_time += time_measure
        overall_score_measure += (score/(len(cols) - 1))
        
        averaged_scores_ids = np.array(resulting_simalarities)[0, :, 0]
        averaged_scores = np.average(np.array(resulting_simalarities)[:, :, 1], axis=0, weights=cols_weights)
        similarities_scores = list(zip(averaged_scores_ids, averaged_scores))
        # print('--------------------')
    
    return similarities_scores, overall_time, overall_score_measure
    
    
q1 = 'مولد النبي'
resulting_simalarities, SE_time, SE_avg_score = overall(q1, docs_df)

print("-" * 25, 'FINAL', '-' * 25)
print('resulting simalarities:')
for rs in resulting_simalarities:
    print(rs)
print()
print('search engine time taken', SE_time, 'ms')
print('search engine average score', SE_avg_score, '(uniqueness/frequency)')
print("-" * 25, '-----', '-' * 25)

------------------------- FINAL -------------------------
resulting simalarities:
(0.0, 0.0)
(1.0, 0.0)
(2.0, 0.15821931459273364)

search engine time taken 2.998828887939453 ms
search engine average score 1.021043771043771 (uniqueness/frequency)
------------------------- ----- -------------------------


  sim[i] = np.dot(df.loc[:, i].values, q_vec) / np.linalg.norm(df.loc[:, i]) * np.linalg.norm(q_vec)


In [187]:
# TEST: COMPUTING AVERAGE SIMILARITY SCORES
# averaged_scores_ids = np.array(resulting_simalarities)[0, :, 0]
# display(averaged_scores_ids)

# display(np.array(resulting_simalarities)[:, :, 1])

# averaged_scores = np.average(np.array(resulting_simalarities)[:, :, 1], axis=0, weights=cols_weights)
# display(averaged_scores)

# list(zip(averaged_scores_ids, averaged_scores))

array([0., 1., 2.])

array([[0.       , 0.       , 1.       ],
       [0.       , 0.       , 0.1006167],
       [0.       , 0.       , 0.       ],
       [0.       , 0.       , 0.       ]])

array([0.        , 0.        , 0.15821931])

[(0.0, 0.0), (1.0, 0.0), (2.0, 0.15821931459273364)]