Building a search engine for emojis

1. Index the corpus

term - token

term - emoji index. A sparse matrix with true/false if emoji appears with term
inverted index - dictionary of terms, and a list of their appearances (emojis)

Building index:
1. collect documents (sentences with emojis)
2. tokenize the documents
3. preprocess the tokens. lowercase, cleanup, english
4. Index documents with inverted index

Each emoji has unique ID
Maintain dictionary and postings
dictionary - emoji and a pointer to the document it comes from
postings - inverted index [emoji, frequency in doc, [docID1, docID2]]


Boolean query Happy AND Sad
Answer set: rank emojis that appear with both happy and sad first; otherwise rank those with happy, then those with sad, ordered by frequency.

Tokenization
- lowercase might be bad for emojis because we need to keep names apart from words (General Motors)
- stemming and lemmatization - Porter algorithm

Intersection algorithm for Happy and Sad is O(n+m) where n and m are number of occurrences 

Tolerant retrieval
Wildcard searches like re*val would need to use re AND val. For those searches,
a k-gram index would help
phonetic correction
Levenshtein distance


Index compression
Possibly 75% less storage
Allows caching of frequently used terms
Rule of 30 - the 30 most common words account for 30% of the tokens in text. 
In the postings list, the term is the most space needed. Instead of using the emoji, use a pointer to the emoji


Scoring, term weighting, vector space model 


In [None]:
%pip install Unidecode nltk

In [23]:
import os
import csv
import json
import nltk
from unidecode import unidecode
import re
from nltk.corpus import words

# Download words corpus if not done before
#nltk.download('words')

# Set of all English words
#english_words = set(words.words())

In [24]:
import pandas as pd
import nltk
import emoji
import glob
from collections import defaultdict


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\carde\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
# Read every cleaned CSV export and stack the frames into one DataFrame.
csv_files = glob.glob('data/clean/*.csv')
li = [pd.read_csv(csv_path) for csv_path in csv_files]

df = pd.concat(li)

In [44]:
import string
from itertools import tee
from urllib.parse import urlparse
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
# Shared text-processing helpers used by clean_text.
tweet = TweetTokenizer()
ps = PorterStemmer()
# Tokens to discard: NLTK's English stop words plus every ASCII punctuation
# character (string.punctuation is iterated one character at a time).
stopwords = [*nltk.corpus.stopwords.words('english'), *string.punctuation]
# Save the index to a file


def save_index(index, filepath):
    """Serialise the index to ``filepath`` as JSON.

    Args:
        index: Mapping of term -> entry dict. Any set found in the structure
            (for example an entry's ``'emojis'`` value) is written as a JSON
            array.
        filepath: Destination path; an existing file is overwritten.

    Raises:
        TypeError: If the index contains a value that is neither
            JSON-serialisable nor a set.

    The caller's ``index`` is not modified: sets are converted on the fly
    while encoding instead of being replaced in place.
    """
    def _sets_as_lists(value):
        # json invokes this hook only for objects it cannot encode natively.
        if isinstance(value, (set, frozenset)):
            return list(value)
        raise TypeError(
            f'Object of type {type(value).__name__} is not JSON serializable'
        )

    # json.dump escapes non-ASCII characters by default, so the payload is
    # plain ASCII; the explicit encoding only removes the dependency on the
    # platform's default codec.
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(index, f, default=_sets_as_lists)

# useful for debugging
def save_index_csv(index, filepath):
    """Write the index as CSV, one row per term: ``term, count, *emojis``.

    Args:
        index: Mapping of term -> ``{'count': ..., 'emojis': iterable}``.
        filepath: Destination path; an existing file is overwritten.

    Each element of an entry's ``'emojis'`` iterable becomes its own cell,
    so rows have a variable number of columns.
    """
    with open(filepath, 'w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        for term, entry in index.items():
            # Materialise the postings first, then prepend term and count.
            cells = tuple(entry['emojis'])
            out.writerow((term, entry['count']) + cells)


# Clean tweet
def clean_text(text: str) -> list:
    """Tokenise a tweet into stemmed terms.

    Every ``http://`` / ``https://`` URL is first collapsed to its host name
    (``urlparse(...).netloc``). The text is then split with the module-level
    ``tweet`` tokenizer, tokens listed in ``stopwords`` are dropped, and the
    remaining tokens are stemmed with ``ps``.

    Args:
        text: Raw tweet text.

    Returns:
        The stemmed tokens, in their original order.
    """
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                  lambda match: urlparse(match.group()).netloc, text)
    words = tweet.tokenize(text)
    # NLTK's stop-word list is lower-case and PorterStemmer.stem lower-cases
    # its input by default, so a case-sensitive test would let "The" through
    # and index it as the term "the". Compare on the lower-cased token.
    return [ps.stem(token) for token in words if token.lower() not in stopwords]

def index_data(text, index):
    """Add one document's terms and emojis to the inverted index.

    The text is cleaned with ``clean_text``; the resulting tokens are joined
    without separators and scanned with ``emoji.analyze``. When at least one
    emoji is found, every token has its ``'count'`` incremented and receives
    one posting ``{'emoji': ..., 'offset': ...}`` per emoji found. Documents
    without emojis leave the index untouched.

    Args:
        text: Raw document text.
        index: Mapping of term -> ``{'count': int, 'emojis': list}``; updated
            in place.

    Returns:
        None.
    """
    words = clean_text(text)
    # Positions reported by emoji.analyze refer to this separator-free
    # concatenation of the tokens, which is also what char_count tracks below.
    emojis = list(emoji.analyze(''.join(words)))
    if not emojis:
        return

    char_count = 0
    for word in words:
        # Running end position of the current token in the joined string.
        char_count += len(word)
        if word not in index:
            index[word] = {'count': 0, 'emojis': []}
        entry = index[word]
        entry['count'] += 1
        for em in emojis:
            # Distance between the end of this token and the start of the emoji.
            emoji_offset = abs(char_count - em[1].start)
            entry['emojis'].append({'emoji': em[0], 'offset': emoji_offset})
# Build the inverted index over every tweet, then persist it.
index = defaultdict(dict)
# index_data is called purely for its side effect on `index`, so use a plain
# loop rather than Series.apply (which would build and discard a Series of None).
for tweet_text in df['text']:
    index_data(tweet_text, index)

# The writers below do not create missing directories.
os.makedirs('output', exist_ok=True)
save_index(index, 'output/index.json')
save_index_csv(index, 'output/index.csv')

In [42]:
from itertools import tee
# Scratch check of the "probe one tee branch, iterate the other" pattern.
a ='a👋'
s = emoji.analyze(a)
iter1, iter2 = tee(s)
try:
    # Probe iter1 only, so that iter2 still yields every token below.
    val = next(iter1)
except StopIteration:
    print("awdaw")
for x in iter2:
    print(x[1].start)

In [45]:
import json

# Read the index from a file
def read_index(filepath):
    """Load an index from a JSON file.

    Args:
        filepath: Path to a JSON document mapping each term to
            ``{'count': ..., 'emojis': [{'emoji': ..., 'offset': ...}, ...]}``.

    Returns:
        A newly built dict with the same shape. Only the ``'emoji'`` and
        ``'offset'`` keys of each posting are kept. No module-level state is
        read or modified.

    Raises:
        KeyError: If an entry lacks ``'count'``/``'emojis'`` or a posting
            lacks ``'emoji'``/``'offset'``.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    loaded = {}
    for word, info in data.items():
        loaded[word] = {
            'count': info['count'],
            'emojis': [
                {'emoji': posting['emoji'], 'offset': posting['offset']}
                for posting in info['emojis']
            ],
        }
    return loaded



In [46]:
# Reload the index from the path it was saved to ('output/index.json').
index = read_index('output/index.json')

In [90]:
import math
search = 'my flight was amazing'

collector = defaultdict(dict)

# Clean each space-separated word on its own so every index term can be traced
# back to the raw query word that produced it.
terms = []
for s in search.split(' '):
    q = clean_text(s)
    if not q:
        continue  # nothing left after cleaning (e.g. a stop word)
    terms.append((s, q[0]))

# Every surviving term gets an equal share of the query weight. The weight
# must depend on the number of terms, not on the character length of a stem.
query_tf = 1 / len(terms) if terms else 0.0
n = len(index)

for s, q in terms:
    if q not in index:
        continue
    res = index[q]
    df_t = len(res['emojis'])
    if df_t == 0:
        continue  # no postings: the idf below would divide by zero
    idf_t = math.log(n / df_t)
    for r in res['emojis']:
        # One collector entry per emoji; it remembers the first query word
        # that reached it and accumulates one increment per posting.
        if r['emoji'] not in collector:
            collector[r['emoji']] = {'query': s, 'emoji': r, 'score': query_tf * idf_t}
        else:
            collector[r['emoji']]['score'] += query_tf * idf_t
    
    # normalize tf_idf on length
    # for emoji, info in collector.items():
    #     info['score'] /= len(index[emoji]['emojis'])
    #     collector[emoji] = info


In [104]:

# # Split the query into words
# query_words = search.split()

# for x in range(5):
#     res = ''
#     for i, word in enumerate(query_words):
#         q = clean_text(word)
#         if(len(q) <= 0):
#             res += word
#             continue
#         if q[0] in collector:
#             emoji = sorted(collector[q[0]]['emojis'], key=lambda x: x['score'], reverse=True)[x]
#             print(x, emoji)
#             res += word + emoji
#     print(res)    


In [101]:
import math

search = 'my flight was amazing'

collector = defaultdict(list)
query = clean_text(search)
# A query made only of stop words cleans to an empty list; guard the division.
queryweight = 1 / len(query) if query else 0.0
n = len(index)

for q in query:
    if q not in index:
        continue
    res = index[q]
    df_t = len(res['emojis'])
    if df_t == 0:
        continue  # no postings: the idf below would divide by zero
    query_tf = queryweight
    idf_t = math.log(n / df_t)
    score = query_tf * idf_t
    collector[q].extend((r['emoji'], score) for r in res['emojis'])

# Normalize tf_idf on the number of postings and sort by score.
# NOTE(review): every posting of a word carries the same score, and sorted()
# is stable, so this keeps the postings in index order and the "top" emoji
# printed below is simply the first posting of that word.
for word, emojis in collector.items():
    postings_total = len(index[word]['emojis'])
    normalized_emojis = [(symbol, value / postings_total) for symbol, value in emojis]
    collector[word] = sorted(normalized_emojis, key=lambda x: x[1], reverse=True)

# Print top scoring emoji for each word in the search query
for word in query:
    if collector.get(word):
        print(f"Top scoring emoji for word '{word}': {collector[word][0][0]}")


Top scoring emoji for word 'flight': 😢
Top scoring emoji for word 'amaz': 😊


In [99]:
# NOTE: this cell expects `collector` to map emoji -> {'query', 'emoji', 'score'}
# (one dict per emoji), not word -> [(emoji, score), ...] lists.
top_emojis_per_word = {}

top_emoji_overall = None
# Scores are tf * log(n / df) and can be negative, so start below any real score.
top_score_overall = float('-inf')

for word in search.split(' '):
    top_emoji_for_word = None
    top_score_for_word = float('-inf')

    # The loop variable is deliberately not called `emoji`: at module level
    # that would rebind the imported `emoji` package.
    for candidate, info in collector.items():
        if info['query'] != word:
            continue
        if info['score'] > top_score_for_word:
            top_emoji_for_word = candidate
            top_score_for_word = info['score']
        if info['score'] > top_score_overall:
            top_emoji_overall = candidate
            top_score_overall = info['score']
    # Words without any matching emoji are recorded with None.
    top_emojis_per_word[word] = top_emoji_for_word


In [100]:
# Echo each query word that was looked up (the dict keys only), followed by
# the best-scoring emoji across the whole query.
for query_word in top_emojis_per_word:
    print(query_word)
print("top overall", top_emoji_overall)

my
flight
was
amazing
top overall 🙏
