### Base Words (Manually Collected)

In [5]:
import gensim
import re
import numpy as np
from nltk import ngrams
import pandas as pd
import os
import sys
from pydantic import BaseModel
class Word(BaseModel):
    query: str
    id: int
    searched: bool
    scroll_ahead: int

In [2]:
arabic_news_words = list(set(["ุฎุจุฑ", "ุฃุฎุจุงุฑ", 
"ูุจุฃ", "ุฃูุจุงุก",  
"ุญุฏูุซ", "ุฃุญุงุฏูุซ",  # hadith (singular), ahadith (plural) - report, ุงูุญุฏูุซ (al-hadith) - the news (specifically Islamic traditions)
"ุตุญุงูุฉ",  # ุงูุตุญุงูุฉ (as-sihaafah) - journalism, press
"ุฅุนูุงู",  # ุงูุฅุนูุงู (al-i'lam) - media, information
"ุฌุฑูุฏุฉ", "ุฌุฑุงุฆุฏ",  # jaridah (singular), jaraaid (plural) - newspaper
"ูุฌูุฉ", "ูุฌูุงุช",  # majalah (singular), majallaat (plural) - magazine
"ููุงุฉ", "ูููุงุช",  # qanaah (singular), qanaat (plural) - channel (TV)
"ุฅุฐุงุนุฉ",  # ุฅุฐุงุนุฉ (idhaa'ah) - radio broadcast
"ุจุซ",  # ุจุซ (bath) - broadcast
"ุชูุฑูุฑ", "ุชูุงุฑูุฑ",  # taqrir (singular), taqariir (plural) - report
"ุฎุจุฑ ุนุงุฌู",  # khabar 'ajil - breaking news
"ุฃุฎุจุงุฑ ุฑูุงุถูุฉ",  # akhbaar riyadiyah - sports news
"ุฃุฎุจุงุฑ ุฌููุฉ",  # akhbaar jawwiyah - weather news
"ุณูุงุณุฉ",  # siyasah - politics
"ุงุฌุชูุงุน", "ุงุฌุชูุงุนุงุช",  # ijtimaa' (singular), ijtimaaat (plural) - meeting, conference
"ุชุตุฑูุญ", "ุชุตุฑูุญุงุช",  # tasrih (singular), tasrihaat (plural) - statement, declaration
"ุชุญููู", "ุชุญูููุงุช",  # tahqiq (singular), tahqiqaat (plural) - investigation, report  
"ููุงุจูุฉ", "ููุงุจูุงุช",  # muqabalah (singular), muqabalฤt (plural) - interview  
"ุญุฏุซ", "ุฃุญุฏุงุซ",  # hadath (singular), aุญุฏุงุซ (ahdaath) - event, happening
"ุงูุฌุฒูุฑุฉ",  # Al Jazeera
"ุงูุนุฑุจูุฉ",  # Al Arabiya
"ุจู ุจู ุณู",  # BBC Arabic
"ุณูุงู ูููุฒ",  # Sky News Arabia
"ุณู ุฅู ุฅู",  # CNN Arabic
"ููุงุฉ ุงูุดุฑูู",  # Al-Shorouk TV
"ููุงุฉ ุงูุญุฑุฉ",  # Alhurra TV
"ุฑูุณูุง ุงูููู",  # RT Arabic
"ูุฑุงูุณ 24", # France 24 (Arabic)
"DW ุนุฑุจูุฉ",
"ุฃุฎุจุงุฑ", "ุฃูุจุงุก", "ุญุฏูุซ", "ูุจุฃ",
"ุงูุฃุญุฏุงุซ ุงูุฌุงุฑูุฉ", "ุงูุดุคูู ุงูุฌุงุฑูุฉ",
"ุชูุฑูุฑ", "ุฎุจุฑ", "ุฅุจูุงุบ",
"ุงูุตุญุงูุฉ", "ุงูุฅุนูุงู",
"ุงูุฅุนูุงู", "ุงููุณุงุฆู ุงูุฅุนูุงููุฉ",
"ููุงุฉ", "ูุญุทุฉ",
"ุชููุฒููู",
"ุฅุฐุงุนุฉ",
"ุงูุฌุฒูุฑุฉ",  # Qatar
"ุจู ุจู ุณู",  # International
"ุณู ุฅู ุฅู",  # International
"ุงูุนุฑุจูุฉ",  # UAE
"ุณูุงู ูููุฒ",  # UAE
"ุงููุงูุฑุฉ ุงูุฅุฎุจุงุฑูุฉ",  # Egypt
"ON E",  # Egypt
"ุงู ูุจุณ",  # Saudi Arabia
"ุฑูุชุงูุง ุฎููุฌูุฉ",
'ุงูุจูุฏูุงุณุช ุงูุจุญุฑูู',
'ุงูุจูุฏูุงุณุช ูุตุฑ', 
'ุงูุจูุฏูุงุณุช ุฅูุฑุงู',
'ุงูุจูุฏูุงุณุช ุงูุนุฑุงู',
'ุงูุจูุฏูุงุณุช ุงูุฃุฑุฏู',
'ุงูุจูุฏูุงุณุช ุงููููุช', 
'ุงูุจูุฏูุงุณุช ูุจูุงู',
'ุงูุจูุฏูุงุณุช ุนูุงู',
'ุงูุจูุฏูุงุณุช ูุทุฑ', 
'ุงูุจูุฏูุงุณุช ุงูุณุนูุฏูุฉ', 
'ุงูุจูุฏูุงุณุช ุณูุฑูุง',
'ุงูุจูุฏูุงุณุช ุงูุฅูุงุฑุงุช ุงูุนุฑุจูุฉ ุงููุชุญุฏุฉ',
'ุงูุจูุฏูุงุณุช ุงูููู',
'ุงูุจูุฏูุงุณุช ุงููุตุฑู',
'ุงูุจูุฏูุงุณุช',
 "ุงูุฌุฒุงุฆุฑ", "ุงูุจุญุฑูู", "ูุตุฑ", "ุงูุนุฑุงู", "ุงูุฃุฑุฏู", "ุงููููุช", "ูุจูุงู", "ููุจูุง", "ููุฑูุชุงููุง", "ุงููุบุฑุจ", 
    "ุนูุงู", "ููุณุทูู", "ูุทุฑ", "ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ", "ุงูุณูุฏุงู", "ุณูุฑูุง", "ุชููุณ", "ุงูุฅูุงุฑุงุช ุงูุนุฑุจูุฉ ุงููุชุญุฏุฉ", "ุงูููู",
    "ุจูุฏูุงุณุช ุงูุฌุฒุงุฆุฑ", "ุจูุฏูุงุณุช ุงูุจุญุฑูู", "ุจูุฏูุงุณุช ูุตุฑ", "ุจูุฏูุงุณุช ุงูุนุฑุงู", "ุจูุฏูุงุณุช ุงูุฃุฑุฏู", "ุจูุฏูุงุณุช ุงููููุช", "ุจูุฏูุงุณุช ูุจูุงู", 
    "ุจูุฏูุงุณุช ููุจูุง", "ุจูุฏูุงุณุช ููุฑูุชุงููุง", "ุจูุฏูุงุณุช ุงููุบุฑุจ", "ุจูุฏูุงุณุช ุนูุงู", "ุจูุฏูุงุณุช ููุณุทูู", "ุจูุฏูุงุณุช ูุทุฑ", 
    "ุจูุฏูุงุณุช ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ", "ุจูุฏูุงุณุช ุงูุณูุฏุงู", "ุจูุฏูุงุณุช ุณูุฑูุง", "ุจูุฏูุงุณุช ุชููุณ", "ุจูุฏูุงุณุช ุงูุฅูุงุฑุงุช ุงูุนุฑุจูุฉ ุงููุชุญุฏุฉ", "ุจูุฏูุงุณุช ุงูููู",
    "ุฃุฎุจุงุฑ ุงูุฌุฒุงุฆุฑ", "ุฃุฎุจุงุฑ ุงูุจุญุฑูู", "ุฃุฎุจุงุฑ ูุตุฑ", "ุฃุฎุจุงุฑ ุงูุนุฑุงู", "ุฃุฎุจุงุฑ ุงูุฃุฑุฏู", "ุฃุฎุจุงุฑ ุงููููุช", "ุฃุฎุจุงุฑ ูุจูุงู", "ุฃุฎุจุงุฑ ููุจูุง", 
    "ุฃุฎุจุงุฑ ููุฑูุชุงููุง", "ุฃุฎุจุงุฑ ุงููุบุฑุจ", "ุฃุฎุจุงุฑ ุนูุงู", "ุฃุฎุจุงุฑ ููุณุทูู", "ุฃุฎุจุงุฑ ูุทุฑ", "ุฃุฎุจุงุฑ ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ", 
    "ุฃุฎุจุงุฑ ุงูุณูุฏุงู", "ุฃุฎุจุงุฑ ุณูุฑูุง", "ุฃุฎุจุงุฑ ุชููุณ", "ุฃุฎุจุงุฑ ุงูุฅูุงุฑุงุช ุงูุนุฑุจูุฉ ุงููุชุญุฏุฉ", "ุฃุฎุจุงุฑ ุงูููู"
]))  # Saudi Arabia]  # DW Arabic

In [3]:
df = pd.DataFrame({"query":arabic_news_words, "searched":[False for _ in arabic_news_words]})
df.head()

Unnamed: 0,query,searched
0,ุงูุฃุฑุฏู,False
1,ุฃุฎุจุงุฑ ููุณุทูู,False
2,ุงุฌุชูุงุนุงุช,False
3,ูุฌูุงุช,False
4,ููุงุฉ ุงูุญุฑุฉ,False


In [4]:
df.to_csv(r'../data/words.csv',index=False)

In [5]:
len(df)

126

### Populate Words

In [6]:
## Preprocessing function
def clean_str(text):
    search = ["ุฃ","ุฅ","ุข","ุฉ","_","-","/",".","ุ"," ู "," ูุง ",'"',"ู","'","ู","\\",'\n', '\t','&quot;','?','ุ','!']
    replace = ["ุง","ุง","ุง","ู"," "," ","","",""," ู"," ูุง","","","","ู","",' ', ' ',' ',' ? ',' ุ ',' ! ']
    
    #remove tashkeel
    p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = re.sub(p_tashkeel,"", text)
    
    #remove longation
    p_longation = re.compile(r'(.)\1+')
    subst = r"\1\1"
    text = re.sub(p_longation, subst, text)
    
    text = text.replace('ูู', 'ู')
    text = text.replace('ูู', 'ู')
    text = text.replace('ุงุง', 'ุง')
    
    for i in range(0, len(search)):
        text = text.replace(search[i], replace[i])
    
    #trim    
    text = text.strip()

    return text

In [7]:
for arabic_word in arabic_news_words:

    t_model = gensim.models.Word2Vec.load(r'../models/full_grams_cbow_300_twitter/full_grams_cbow_300_twitter.mdl')
    token = clean_str(arabic_word).replace(" ", "_")

    if token in t_model.wv:

        most_similar = t_model.wv.most_similar( token, topn=25 )
        
        for term, score in most_similar:
            term = clean_str(term).replace(" ", "_")
            
            if term != token:
                term = term.replace("_"," ")
                df = pd.read_csv(r'../data/words.csv')
                if not (term in df['query'].values):
                    df = pd.concat([df, pd.DataFrame({"query":[term], "searched":[False]})], ignore_index=True)
                    df.to_csv(r'../data/words.csv',index=False) 
                    print(term)

ุณูุฑูู
ุงูุงูุงุฑุงุช
ุงูุณุนูุฏูู
ุงูููููู
ุงุฑุจุฏ
ุจุงูุงุฑุฏู
ุจูุฑูุช
ุงูุฎููุฌ
ุงูุงุฑุฏููู
ุจุฑูุทุงููุง
ูุงุฏุจุง
ุณูุทูู ุนูุงู
ุบุฒู
ูุคุชูุฑุงุช
ููุงุกุงุช
ูุดุงูุฑุงุช
ูุฏูุงุช
ุงุฌุชูุงุน ูุฒุฑุงุก
ุฒูุงุฑุงุช
ุงูุงุฌุชูุงุนุงุช
ูุจุงุญุซุงุช
ูุคุชูุฑ
ุจุงุฌุชูุงุนุงุช
ูุงุฌุชูุงุนุงุช
ุงูุงุฌุชูุงุน
ูุดุงุทุงุช
ููุงูุดุงุช
ุงุฌุชูุงุน ูุฌูุณ
ุงุชูุงูุงุช
ุงุฌุชูุงุนุงุชู
ุงููุคุชูุฑ
ุงุฌุชูุงุน ุทุงุฑุฆ
ุงูุดุทู
ุฌูุณุงุช
ูุฏุงููุงุช
ุงุฌุชูุงุนุงุชูุง
ุงุฌุชูุงุนุง
ููุฌูุงุช
ุงููุฌูุงุช
ุงููุงู
ุจุฑูุดูุฑุงุช
ูุฌูู
ุฑูุงูุงุช
ุทุจุนุงุช
ูุชุจ ุงููุชุฑูููู
ููุชุจุงุช
ุจุฑุงูุฌ
ููุงูุงุช
ูุจุฑุงูุฌ
ูุฏููุงุช
ุฑุณููุงุช
ุฌุฑุงูุฏ
ุงูุจุฑ ููุณูุนู
ุณูุฏูุงุช
ูุชูุจุงุช
ุชุฑุฌูุงุช
ุงุบููู
ุงููุงู ููุณูุณูุงุช
ุงุดุฑุทู
ุงููุงู ูุฑุชูู
ุงูุนุงุจ
ููุงู ูุฑุงูุณ
ุงูุจู ุจู ุณู
ุงูุฌุฒูุฑู ูุจุงุดุฑ
ููุงู ุงูุฌุฒูุฑู

### Clean Up

In [9]:
df = pd.read_csv(r'../data/words.csv')
df.head()

Unnamed: 0,query,searched
0,ุงูุฃุฑุฏู,False
1,ุฃุฎุจุงุฑ ููุณุทูู,False
2,ุงุฌุชูุงุนุงุช,False
3,ูุฌูุงุช,False
4,ููุงุฉ ุงูุญุฑุฉ,False


In [10]:
df['query'][417]

'ูุชุตุฑูุญุงุช'

In [11]:
df['query'] = df['query'].str.replace(r'[^\u0600-\u06FF\u0660-\u0669]', ' ', regex=True)

df['query'][417]

'ูุชุตุฑูุญุงุช'

In [12]:
df['query'] = df['query'].str.replace(r'\s+', ' ', regex=True).str.strip()
df['query'][417]

'ูุชุตุฑูุญุงุช'

In [13]:
df.head()

Unnamed: 0,query,searched
0,ุงูุฃุฑุฏู,False
1,ุฃุฎุจุงุฑ ููุณุทูู,False
2,ุงุฌุชูุงุนุงุช,False
3,ูุฌูุงุช,False
4,ููุงุฉ ุงูุญุฑุฉ,False


In [15]:
len(df[df['query'] != ""])

1312

In [16]:
df = df[df['query'] != ""]
df.head()

Unnamed: 0,query,searched
0,ุงูุฃุฑุฏู,False
1,ุฃุฎุจุงุฑ ููุณุทูู,False
2,ุงุฌุชูุงุนุงุช,False
3,ูุฌูุงุช,False
4,ููุงุฉ ุงูุญุฑุฉ,False


In [17]:
df = df.drop_duplicates()
df.duplicated().any()

False

In [18]:
len(df)

1301

In [19]:
df.to_csv(r'../data/words.csv',index=False)

In [20]:
import pandas as pd
df = pd.read_csv(r"../data/words.csv")
df.head()

Unnamed: 0,query,searched
0,ุงูุฃุฑุฏู,False
1,ุฃุฎุจุงุฑ ููุณุทูู,False
2,ุงุฌุชูุงุนุงุช,False
3,ูุฌูุงุช,False
4,ููุงุฉ ุงูุญุฑุฉ,False


In [21]:
for row in df.values:
    print(row[0])
    break

ุงูุฃุฑุฏู


In [22]:
import pandas as pd
df = pd.read_csv(r"../data/words.csv")

In [23]:
df['scroll_ahead'] = 0
df['id'] = range(len(df))


In [24]:
df.head()

Unnamed: 0,query,searched,scroll_ahead,id
0,ุงูุฃุฑุฏู,False,0,0
1,ุฃุฎุจุงุฑ ููุณุทูู,False,0,1
2,ุงุฌุชูุงุนุงุช,False,0,2
3,ูุฌูุงุช,False,0,3
4,ููุงุฉ ุงูุญุฑุฉ,False,0,4


In [25]:
df.to_csv(r'../data/words.csv',index=False)

In [6]:
import pandas as pd
df = pd.read_csv(r"../data/words.csv")

In [7]:
data_records = [ Word.model_validate(i).model_dump() for i in df.to_dict("records")]

In [8]:
from pymongo import MongoClient
client = MongoClient('mongodb://localhost:27017/')
db = client['ArLip']
coll = db['words']

In [9]:
coll.insert_many(data_records)

InsertManyResult([ObjectId('6698c748f63a05d524256077'), ObjectId('6698c748f63a05d524256078'), ObjectId('6698c748f63a05d524256079'), ObjectId('6698c748f63a05d52425607a'), ObjectId('6698c748f63a05d52425607b'), ObjectId('6698c748f63a05d52425607c'), ObjectId('6698c748f63a05d52425607d'), ObjectId('6698c748f63a05d52425607e'), ObjectId('6698c748f63a05d52425607f'), ObjectId('6698c748f63a05d524256080'), ObjectId('6698c748f63a05d524256081'), ObjectId('6698c748f63a05d524256082'), ObjectId('6698c748f63a05d524256083'), ObjectId('6698c748f63a05d524256084'), ObjectId('6698c748f63a05d524256085'), ObjectId('6698c748f63a05d524256086'), ObjectId('6698c748f63a05d524256087'), ObjectId('6698c748f63a05d524256088'), ObjectId('6698c748f63a05d524256089'), ObjectId('6698c748f63a05d52425608a'), ObjectId('6698c748f63a05d52425608b'), ObjectId('6698c748f63a05d52425608c'), ObjectId('6698c748f63a05d52425608d'), ObjectId('6698c748f63a05d52425608e'), ObjectId('6698c748f63a05d52425608f'), ObjectId('6698c748f63a05d5242560