## 1- Library and Data Imports

In [1]:
import numpy as np
import pandas as pd
import time

# for text cleaning and preprocessing
import re
from nltk.corpus import stopwords
import string 
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
docs_df = pd.read_json('../Data/husna.json')

## 2- Data Preparation

#### 2.1 preparing data for cleaning

In [3]:
# MODIFIED
docs_df = docs_df.drop(columns=['publisher', 'crawled_at', 'url', 'published_at'], axis=1)

In [4]:
docs_df_dropped = docs_df.drop(index=
                               docs_df[(docs_df['content'].str.len() == 0) & (docs_df['title'] == '')].index, axis=0)
docs_df_dropped = docs_df_dropped.reset_index(drop=True)
docs_df = docs_df_dropped

In [5]:
docs_df['text'] = docs_df['content'].apply(lambda x: " ".join(x))

## 3- Data Cleaning

In [7]:
def show_info_text(df_col):
    print(f"-> Number of Documents: {docs_df.shape[0]}")
    print('-' * 50, end='\n\n')

    print('-> Documents - First 150 letters')
    print()
    for i, document_i in enumerate(docs_df['text_clean'][:20]):
        print(f"Document Number {i+1}: {document_i[:150]}..")
        print()

    print('-' * 50)
    
def data_preprocessing(df_col):
    # Instantiate a TfidfVectorizer object
    global vectorizer
    vectorizer = TfidfVectorizer()
    
    # It fits the data and transform it as a vector
    X = vectorizer.fit_transform(df_col)
    # Convert the X as transposed matrix
    X = X.T.toarray()
    # Create a DataFrame and set the vocabulary as the index
    df = pd.DataFrame(X, index=vectorizer.get_feature_names())
    return df

### 3.1 data cleaning (ver.1)

handle:
- removing mentions
- removing punctuation
- removing Arabic diacritics (short vowels and other harakahs) 
    - حركات وشد
- removing elongation 
    - مد
- removing stopwords (which is available in NLTK corpus)
    - normal stopwords (not specific to arabic)
- remove words from languages other than arabic and english

In [6]:
docs_df_cleaned = docs_df.copy()

In [392]:
# punctuation symbols
punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ''' + string.punctuation

# Arabic stop words with nltk
stop_words = stopwords.words()
arabic_diacritics = re.compile("""
                             ّ    | # Shadda
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)

def clean_text(txt): 
    #remove punctuations
    translator = str.maketrans('', '', punctuations)
    txt = txt.translate(translator)
    
    # remove Tashkeel
    txt = re.sub(arabic_diacritics, '', txt)
    
    # remove longation
    txt = re.sub("[إأآا]", "ا", txt)
    txt = re.sub("ى", "ي", txt)
    txt = re.sub("ؤ", "ء", txt)
    txt = re.sub("ئ", "ء", txt)
    txt = re.sub("ة", "ه", txt)
    txt = re.sub("گ", "ك", txt)
    
    # remove stopwords
    txt = ' '.join(word for word in txt.split() if word not in stop_words)
    
    # remove non-arabic words, or non-numbers, or non-english words in the text
    txt = re.sub(r'[^a-zA-Z\s0-9\u0600-\u06ff\u0750-\u077f\ufb50-\ufbc1\ufbd3-\ufd3f\ufd50-\ufd8f\ufd50-\ufd8f\ufe70-\ufefc\uFDF0-\uFDFD.0-9]+'
                 ,' ', txt)
    
    return txt

In [393]:
docs_df_cleaned = docs_df.drop(columns=['_id', 'title', 'summary', 'content', 'text'])

start_time = time.time()
docs_df_cleaned['text_clean'] = docs_df['text'].apply(clean_text)
time_measure = (time.time() - start_time) * 10**3

# docs_df_cleaned['summary_clean'] = docs_df['summary'].apply(clean_text) # no need for now
docs_df_cleaned['title_clean'] = docs_df['title'].apply(clean_text)
text_clean_enc_df = data_preprocessing(docs_df_cleaned['text_clean'])

text_clean_enc_df 



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5074,5075,5076,5077,5078,5079,5080,5081,5082,5083
00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00000015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ہر,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ہمارے,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ہولوکاسٹ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ہےکہ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**NOTE** 
- ~6000 source documents -> ~5000 documents -> ~89,000 tokens
- Clean time for 'text' column: ~111 seconds
- **Problems**:
    - may not be normalized enough
    - words from other languages
    - confusing numbers (remove or keep?)
        - remove english numbers? or arabic numbers? or both?
        - should we remove words of letters mixed with numbers (E.g. COVID19)
    - links (remove or keep?)
        

### 3.1 data cleaning (ver.2)

handle:
- removing Arabic diacritics (short vowels and other harakahs)
- variation by form and spelling, based on context (Orthographic Ambiguity)
- existence of many forms for the same word (Morphological Richness)
- dialects (Dialectal Variation)
- different ways to write the same word when writing in dialectal Arabic, for which there is no agreed-upon standard
    - Orthographic Inconsistency
- removing longation and stop words
- remove words from languages other than arabic and english
   
these problems can possibly lead to immensly large vocabularies generated.

In [530]:
docs_df_cleaned2 = docs_df.copy()

In [531]:
# import the dediacritization tool
from camel_tools.utils.dediac import dediac_ar

# Reducing Orthographic Ambiguity
from camel_tools.utils.normalize import normalize_alef_maksura_ar
from camel_tools.utils.normalize import normalize_alef_ar
from camel_tools.utils.normalize import normalize_teh_marbuta_ar

# toknenization
from camel_tools.tokenizers.word import simple_word_tokenize

# Morphological Disambiguation (Maximum Likelihood Disambiguator)
from camel_tools.disambig.mle import MLEDisambiguator
mle = MLEDisambiguator.pretrained() # instantiation fo MLE disambiguator

# tokenization / lemmatization (choosing approach that best fit the project)
from camel_tools.tokenizers.morphological import MorphologicalTokenizer

import re
from nltk.corpus import stopwords

In [532]:
stop_words = stopwords.words()
tokenizer = MorphologicalTokenizer(mle, scheme='atbtok', diac=False) # atbseg scheme 
def text_clean2(txt):
    # dediacritization
    txt = dediac_ar(txt)
    
    # normalization: Reduce Orthographic Ambiguity and Dialectal Variation
    txt = normalize_alef_maksura_ar(txt)
    txt = normalize_alef_ar(txt)
    txt = normalize_teh_marbuta_ar(txt)
    
    # normalization: Reducing Morphological Variation
    tokens = simple_word_tokenize(txt)
    disambig = mle.disambiguate(tokens)
    lemmas = [d.analyses[0].analysis['lex'] for d in disambig]
    tokens = tokenizer.tokenize(lemmas)
    txt = ' '.join(tokens)
    
    # remove longation
    txt = re.sub("[إأآا]", "ا", txt)
    txt = re.sub("ى", "ي", txt)
    txt = re.sub("ؤ", "ء", txt)
    txt = re.sub("ئ", "ء", txt)
    txt = re.sub("ة", "ه", txt)
    txt = re.sub("گ", "ك", txt)
    
    # remove stopwords
    txt = ' '.join(word for word in txt.split() if word not in stop_words)
    
    # remove non-arabic words, or non-numbers, or non-english words in the text
    txt = re.sub(r'[^a-zA-Z\s0-9\u0600-\u06ff\u0750-\u077f\ufb50-\ufbc1\ufbd3-\ufd3f\ufd50-\ufd8f\ufd50-\ufd8f\ufe70-\ufefc\uFDF0-\uFDFD.0-9]+'
                 ,' ', txt)
    
    return txt

In [533]:
# apply to your text column
# df['text_clean'] = df['text_clean'].apply(dediac_ar)

docs_df_cleaned2 = docs_df.drop(columns=['_id', 'title', 'summary', 'content', 'text'])

start_time = time.time()
# docs_df_cleaned2['text_clean'] = docs_df['text'].apply(dediac_ar)
# docs_df_cleaned2['text_clean'] = docs_df_cleaned2['text_clean'].apply(ortho_normalize)
# docs_df_cleaned2['text_clean']  = docs_df_cleaned2['text_clean'].apply(simple_word_tokenize)
docs_df_cleaned2['text_clean'] = docs_df['text'].apply(text_clean2)
time_measure = (time.time() - start_time) * 10**3

In [534]:
text_clean_enc_df = data_preprocessing(docs_df_cleaned2['text_clean'])

text_clean_enc_df



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5074,5075,5076,5077,5078,5079,5080,5081,5082,5083
00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0000015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ہر,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ہمارے,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ہولوکاسٹ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ہےکہ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- ~6000 source documents -> ~5000 documents -> ~23,000 tokens
- Clean time for 'text' column: ~180 seconds
- Problems:
    - stopwords (remove or keep?)
    - normalization may have cut out too many tokens 
    - confusing numbers (remove or keep?)
        - remove english numbers? or arabic numbers? or both?
    - should we remove words of letters mixed with numbers (E.g. COVID19)
    - links (remove or keep?)
    
**NOTE** discuss with instructor before proceeding

### 3.x check results

In [396]:
# display(docs_df_cleaned.head())
i=0

In [515]:
# check results
print(f'--> {i}')
display(text_clean_enc_df.index[50*i:50*(i+1)])
display(text_clean_enc_df.index[-50*(i+1):(-50*i)-1])
i += 1

print('clean time: {:.2f} seconds'.format(time_measure * 10**-3))

--> 118


Index(['اجرامي', 'اجراميا', 'اجراميه', 'اجراه', 'اجراها', 'اجرت', 'اجرته',
       'اجرتها', 'اجره', 'اجرها', 'اجرهم', 'اجرو', 'اجروا', 'اجري', 'اجرياها',
       'اجرياهما', 'اجريت', 'اجرين', 'اجزاء', 'اجزاءه', 'اجزاءها', 'اجزم',
       'اجساد', 'اجسادهم', 'اجسام', 'اجساما', 'اجسامنا', 'اجسامهم', 'اجعلك',
       'اجل', 'اجلاء', 'اجلاءه', 'اجلاءها', 'اجلاءهم', 'اجلال', 'اجلت', 'اجلس',
       'اجله', 'اجلها', 'اجلهم', 'اجلي', 'اجماع', 'اجماعا', 'اجمال', 'اجمالي',
       'اجماليه', 'اجمع', 'اجمعت', 'اجمعوا', 'اجمعين'],
      dtype='object')

Index(['ومستوطنوه', 'ومستوطنيه', 'ومستوفيه', 'ومستوي', 'ومستويات', 'ومستوياته',
       'ومسجد', 'ومسجدها', 'ومسجل', 'ومسجله', 'ومسددين', 'ومسرحه', 'ومسرحيه',
       'ومسرحيين', 'ومسرعه', 'ومسري', 'ومسعر', 'ومسكين', 'ومسلح', 'ومسلحه',
       'ومسلسل', 'ومسلسلات', 'ومسلم', 'ومسلمات', 'ومسلمه', 'ومسلمين',
       'ومسلوقه', 'ومسمع', 'ومسميات', 'ومسنه', 'ومسنون', 'ومسوحات', 'ومسوده',
       'ومسور', 'ومسيء', 'ومسيحين', 'ومسيحيه', 'ومسيحيين', 'ومسيرات',
       'ومسيرته', 'ومسيرتهم', 'ومسيره', 'ومشابهه', 'ومشادات', 'ومشارفها',
       'ومشاركاتهم', 'ومشاركته', 'ومشاركتهم', 'ومشاركه'],
      dtype='object')

clean time: 99.80 seconds


In [None]:
vocab_ = vectorizer.vocabulary_
print(f"number of unique words: {len(vocab_.keys())}")
most_freq_word = sorted(vocab_.items(), key=lambda x: x[1], reverse=True)[:1][0]
print('most frequent word is --> {} ({} times)'.format(most_freq_word[0], most_freq_word[1]))
score = len(vocab_.keys()) / most_freq_word[1]
print('Ratio: {:.3f}'.format(score))

## 4- Apply Cleaning on Query

In [521]:
query_test = 'مولد النبي'
query_test_cleaned = text_clean2(query_test)
query_test_cleaned

'مولد نبي'

In [None]:
# OPTIONAL

# query_transformed = vectorizer.transform([query_test_cleaned])
# query_transformed = query_transformed.T.toarray()
# df = pd.DataFrame(query_transformed, index=vectorizer.get_feature_names())
# df

## 5- Calculating Similarities

In [522]:
def get_similar_articles(q, df):
    # Convert the query become a vector
    q = [q]
    q_vec = vectorizer.transform(q).toarray().reshape(df.shape[0],)
    
    # Calculate the similarity
    sim = {}
    for i in range(df.shape[1]):
        sim[i] = np.dot(df.loc[:, i].values, q_vec) / np.linalg.norm(df.loc[:, i]) * np.linalg.norm(q_vec)
        if np.isnan(sim[i]):
            sim[i] = 0

    # Sort the values 
    sim_sorted = list(sim.items())
    return sim_sorted

In [523]:
# Add The Query
q1 = 'مولد النبي'
q1_cleaned = text_clean2(query_test)

# Measures
time_measure = None
most_freq_measure = None  

start_time = time.time()
sorted_docs_with_scores = get_similar_articles(q1, text_clean_enc_df)  # call function
time_measure = (time.time() - start_time) * 10**3

sorted_docs_with_scores[:10]

  sim[i] = np.dot(df.loc[:, i].values, q_vec) / np.linalg.norm(df.loc[:, i]) * np.linalg.norm(q_vec)


[(0, 0.0),
 (1, 0.0),
 (2, 0.0),
 (3, 0.0),
 (4, 0.0),
 (5, 0.0),
 (6, 0.0),
 (7, 0.0),
 (8, 0.0),
 (9, 0.0)]

In [524]:
vocab_ = vectorizer.vocabulary_
print(f"number of unique words: {len(vocab_.keys())}")
most_freq_word = sorted(vocab_.items(), key=lambda x: x[1], reverse=True)[:1][0]
print('most frequent word is --> {} ({} times)'.format(most_freq_word[0], most_freq_word[1]))
score = len(vocab_.keys()) / most_freq_word[1]
print('Ratio: {:.3f}'.format(score))

most_freq_measure = most_freq_word[1]

number of unique words: 89018
most frequent word is --> یقول (89017 times)
Ratio: 1.000


## 6- getting top documents

In [528]:
sorted_docs_with_scores = sorted(sorted_docs_with_scores, key=lambda x: x[1], reverse=True)
top_5_docs = np.array(sorted_docs_with_scores, dtype='int32')[:5, 0]
top_5_docs

array([1697, 2152, 2608, 2527, 2147])

In [62]:
# results 
print('time measure:', time_measure)
print('frequency measure:', most_freq_measure)
print('score %.3f' % score)

time measure: 648.0000019073486
frequency measure: 89123
score 1.000


## 7- Organizing Search Process

In [72]:
docs_df['text_clean'] = docs_df['text'].apply(clean_text)
docs_df['summary_clean'] = docs_df['summary'].apply(clean_text)
docs_df['title_clean'] = docs_df['title'].apply(clean_text)
# + tags

In [73]:
docs_df.head(1)

Unnamed: 0,title,url,published_at,crawled_at,summary,content,tags,article_type,text,text_clean,summary_clean,title_clean
0,الزراعة تدرس أفكارا لشركات خاصة لاستثمار مياه ...,https://husna.fm/%D9%85%D8%AD%D9%84%D9%8A/%D8%...,2021-09-01 06:55:00,2022-10-07 08:32:02.179500,كشف أمين عام وزارة الزراعة محمد الحياري لـ حسن...,[كما أكد الحياري أن التعليمات الجديدة التي ستط...,"[مياه الزيبار, وزارة الزراعة, زيت الزيتون]",News,كما أكد الحياري أن التعليمات الجديدة التي ستطب...,اكد الحياري ان التعليمات الجديده ستطبق اعتبارا...,كشف امين عام وزاره الزراعه محمد الحياري حسني ا...,الزراعه تدرس افكارا لشركات خاصه لاستثمار مياه ...


In [78]:
docs_df['tags']

0    [مياه الزيبار, وزارة الزراعة, زيت الزيتون]
1                            [السعودية, الأردن]
2                          [ذكرى المولد النبوي]
Name: tags, dtype: object

In [100]:
q_test = 'مياه الزراعة'
doc_ids = list(docs_df['tags'].index)
q_list = np.array(q_test.split(' '))
sim_score = list(np.zeros(docs_df.shape[0]))

for i, tag in enumerate(docs_df['tags']):
    print(tag)
    for str_tag in tag:
        q_list_map = np.vectorize(lambda x: x in str_tag)(q_list) 
        if True in q_list_map:
            sim_score[i] += 1
        print(q_list_map)
        
        
sim_non_sorted = list(zip(doc_ids, sim_score))
sim_sorted = sorted(sim_non_sorted, key=lambda x: x[1], reverse=True)

sim_sorted

['مياه الزيبار', 'وزارة الزراعة', 'زيت الزيتون']
[ True False]
[False  True]
[False False]
['السعودية', 'الأردن']
[False False]
[False False]
['ذكرى المولد النبوي']
[False False]


[(0, 2.0), (1, 0.0), (2, 0.0)]

In [188]:
import time
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

cols_weights = [0.1, 0.5, 0.15, 0.2]
cols = ['tags', 'text_clean', 'summary_clean', 'title_clean']
def overall(q, df):
    # print("query:", q)
    
    # note: potential class vars
    overall_time = 0
    overall_score_measure = 0
    similarities_list = []
    
    # apply search over every col of interest
    for col in cols:
        if col != 'tags':
            # 1 data preprocessing(each column)
            text_clean_enc_df = data_preprocessing(docs_df[col])

            # 2- count similarities (each column)
            time_measure = None
            most_freq_measure = None  
            start_time = time.time()
            sorted_docs_with_scores_content = get_similar_articles(q, text_clean_enc_df)  # call function
            time_measure = (time.time() - start_time) * 10**3

            # 3- results
            global vectorizer
            vocab_ = vectorizer.vocabulary_
            # print(f"number of unique words: {len(vocab_.keys())}")
            most_freq_word = sorted(vocab_.items(), key=lambda x: x[1], reverse=True)[:1][0]
            # print('most frequent word is --> {} ({} times)'.format(most_freq_word[0], most_freq_word[1]))

            score = len(vocab_.keys()) / most_freq_word[1]
            # print('Ratio: {:.3f}'.format(score))
            # print()
            # print('time measure:', time_measure)
        else:
            time_measure = None
            most_freq_measure = None  
            start_time = time.time()
            
            doc_ids = list(docs_df['tags'].index)
            q_list = np.array(q.split(' '))
            sim_score = list(np.zeros(docs_df.shape[0]))

            for i, tag in enumerate(docs_df['tags']):
                # print(tag)
                for str_tag in tag:
                    q_list_map = np.vectorize(lambda x: x in str_tag)(q_list) 
                    if True in q_list_map:
                        sim_score[i] += 1
                    # print(q_list_map)

            sim_non_sorted = list(zip(doc_ids, sim_score))
            sorted_docs_with_scores_content = sim_non_sorted
            
            time_measure = (time.time() - start_time) * 10**3
            score = 0
            
        similarities_list.append(sorted_docs_with_scores_content)
        overall_time += time_measure
        overall_score_measure += (score/(len(cols) - 1))
        
        averaged_scores_ids = np.array(resulting_simalarities)[0, :, 0]
        averaged_scores = np.average(np.array(resulting_simalarities)[:, :, 1], axis=0, weights=cols_weights)
        similarities_scores = list(zip(averaged_scores_ids, averaged_scores))
        # print('--------------------')
    
    return similarities_scores, overall_time, overall_score_measure
    
    
q1 = 'مولد النبي'
resulting_simalarities, SE_time, SE_avg_score = overall(q1, docs_df)

print("-" * 25, 'FINAL', '-' * 25)
print('resulting simalarities:')
for rs in resulting_simalarities:
    print(rs)
print()
print('search engine time taken', SE_time, 'ms')
print('search engine average score', SE_avg_score, '(uniqueness/frequency)')
print("-" * 25, '-----', '-' * 25)

------------------------- FINAL -------------------------
resulting simalarities:
(0.0, 0.0)
(1.0, 0.0)
(2.0, 0.15821931459273364)

search engine time taken 2.998828887939453 ms
search engine average score 1.021043771043771 (uniqueness/frequency)
------------------------- ----- -------------------------


  sim[i] = np.dot(df.loc[:, i].values, q_vec) / np.linalg.norm(df.loc[:, i]) * np.linalg.norm(q_vec)


In [187]:
# TEST: COMPUTING AVERAGE SIMILARITY SCORES
# averaged_scores_ids = np.array(resulting_simalarities)[0, :, 0]
# display(averaged_scores_ids)

# display(np.array(resulting_simalarities)[:, :, 1])

# averaged_scores = np.average(np.array(resulting_simalarities)[:, :, 1], axis=0, weights=cols_weights)
# display(averaged_scores)

# list(zip(averaged_scores_ids, averaged_scores))

array([0., 1., 2.])

array([[0.       , 0.       , 1.       ],
       [0.       , 0.       , 0.1006167],
       [0.       , 0.       , 0.       ],
       [0.       , 0.       , 0.       ]])

array([0.        , 0.        , 0.15821931])

[(0.0, 0.0), (1.0, 0.0), (2.0, 0.15821931459273364)]