Building a search engine for emojis

1. Index the corpus

term - token

term - emoji index. A sparse matrix with true/false if emoji appears with term
inverted index - dictionary of terms, and a list of their appearances (emojis)

Building index:
1. collect documents (sentences with emojis)
2. tokenize the documents
3. preprocess the tokens. lowercase, cleanup, english
4. Index documents with inverted index

Each emoji has unique ID
Maintain dictionary and postings
dictionary - emoji and pointer to document its from
postings - inverted index [emoji, frequency in doc, [docID1, docID2]]


Boolean query Happy AND Sad
Answer set rank emojis that has both happy and sad, otherwise, happy then sad, depending on frequency. 

Tokenization
- lowercase might be bad for emojis because we need to keep names apart from words (General Motors)
- stemming and lemmatization - Porter algorithm

Intersection algorithm for Happy and Sad is O(n+m) where n and m are number of occurrences 

Tolerant retrieval
Wildcard searches like re*val would need to use re AND val. for those searches, 
k-gram index woudl help
phonetic correction
lehvenstein distance


Index compression
Possibly 75% less storage
Allow use of caching frequently used terms and 
Rule of 30 - the 30 most common words account for 30% of the tokens in text. 
In the postings list, the term is the most space needed. Instead of using the emoji, use a pointer to the emoji


Scoring, term weighting, vector space model 


In [None]:
%pip install Unidecode nltk

In [1]:
import os
import csv
import json
import nltk
from unidecode import unidecode
import re
from nltk.corpus import words

# Download words corpus if not done before
#nltk.download('words')

# Set of all English words
#english_words = set(words.words())

In [2]:
import pandas as pd
import nltk
import emoji
import glob
from collections import defaultdict


In [3]:
csv_files = glob.glob('data/clean/*.csv')
li = []

for filename in csv_files:
    df = pd.read_csv(filename)
    li.append(df)

df = pd.concat(li)

In [5]:
import string
from itertools import tee
from urllib.parse import urlparse
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
ps = PorterStemmer()
tweet = TweetTokenizer()
stopwords = nltk.corpus.stopwords.words('english')
stopwords += list(string.punctuation)
# Save the index to a file


def save_index(index, filepath):
    with open(filepath, 'w') as f:
        # Convert sets to lists before saving
        for word in index:
            if isinstance(index[word]['emojis'], set):
                index[word]['emojis'] = list(index[word]['emojis'])
        json.dump(index, f)

# useful for debugging
def save_index_csv(index, filepath):
    with open(filepath, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        # Convert sets to lists before saving
        for word in index:
            emojis = list(index[word]['emojis'])
            writer.writerow([word, index[word]['count'], *emojis])


# Clean tweet
def clean_text(text: string):
    #text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', 
                   lambda x: urlparse(x.group()).netloc, text)
    words = nltk.word_tokenize(text)
    #words = tweet.tokenize(text)
    words = [ps.stem(a) for a in words if a not in stopwords]
    #words = [ps.stem(a) for a in words]
    return words

def index_data(text, index):
    #data = set(text.split(' '))
    #data = data.union(set(clean_text(text)))
    #data = data.union(set(text.translate(str.maketrans('', '', string.punctuation)).split(' ')))
    data = clean_text(text)
    emoji_iterator = [em for em in emoji.analyze("".join(data))]
    # two independent iters to check if empty in one
    iter1, iter2 = tee(emoji_iterator)
    try:
        _ = next(iter1) 
        emojis = [em for em in iter2]
        char_count = 0
        for i, word in enumerate(data):
            char_count += len(word)
            if word not in index:
                index[word] = {'count': 0, 'emojis': []}
            index[word]['count'] += 1
            for em in emojis:
                # Calculate the offset of the emoji from the word
                emoji_offset = abs(char_count - em[1].start) + 1
                index[word]['emojis'].append({'emoji': em[0], 'offset': emoji_offset})

            #index[word]['emojis'] = index[word]['emojis'].union(set(emojis))
    except StopIteration:
        pass
index = defaultdict(dict)
df['text'].apply(lambda x: index_data(x, index))
print("finished parsing")
save_index(index, 'output/index.json')
print("finished saving json")
#save_index_csv(index, 'output/index.csv')

finished parsing
finished saving json


In [105]:
import json

# Read the index from a file
def read_index(filepath):
    with open(filepath, 'r') as f:
        data = json.load(f)
        # Convert lists back to sets after loading
    for word, info in data.items():
        index[word] = {'count': info['count'], 'emojis': []}
        for emoji_info in info['emojis']:
            index[word]['emojis'].append({'emoji': emoji_info['emoji'], 'offset': emoji_info['offset']})
    return index



In [107]:
index = read_index('output/index.json')

In [18]:
import math
collector = defaultdict(dict)
def search_index(search_query):
    collector = defaultdict(dict)
    query = clean_text(search_query)
    query_weight = 1/len(query)
    print(query, query_weight)
    for i, q in enumerate(query):
        matching_emojis = index[q]

        query_tf = query_weight
        n = len(index)
        df_t = len(matching_emojis['emojis']) if matching_emojis else -1
        if df_t == -1:
            continue
        idf_t = math.log(n / df_t)

        emoji_counter = {}
        for individual_emoji in matching_emojis['emojis']:
            unique_emoji = individual_emoji['emoji']
            if unique_emoji not in emoji_counter:
                emoji_counter[unique_emoji] = {'count':1, 'offsets':[individual_emoji['offset']]}
            else:
                emoji_counter[unique_emoji]['count'] += 1
                emoji_counter[unique_emoji]['offsets'].append(individual_emoji['offset'])

        for unique_emoji, info in emoji_counter.items():
            avg_offset = sum(info['offsets']) / len(info['offsets']) if info['offsets'] else 1
            if unique_emoji not in collector:
                collector[unique_emoji] = {'query': search_query.split(' ')[i], 'raw': q, 'emoji': unique_emoji, 'score': (query_tf * idf_t * info['count']) / avg_offset}
            else:
                collector[unique_emoji]['score'] += (query_tf * idf_t *  info['count']) / avg_offset
    #normalize tf_idf on length
    for emoji, info in collector.items():
        info['score'] /= index[info['raw']]['count']
        collector[emoji] = info
    return collector



def print_top_emojis(collector, search, n_per_word=3, n_overall=5):
    query_words = search.split(' ')
    
    # Print top emojis for each word
    for word in query_words:
        word_emojis = [(emoji, info['score']) for emoji, info in collector.items() if info['query'] == word]
        word_emojis.sort(key=lambda x: x[1], reverse=True)
        top_emojis = word_emojis[:n_per_word]
        print(f"For the word '{word}', the top {n_per_word} emojis are: {top_emojis}")

    # Print top emojis overall
    all_emojis = [(emoji, info['score']) for emoji, info in collector.items()]
    all_emojis.sort(key=lambda x: x[1], reverse=True)
    top_emojis = all_emojis[:n_overall]
    print(f"The top {n_overall} emojis overall are: {top_emojis}")


In [23]:
query = "Think about how to make your insert operations more atomic"
collector = search_index(query)
# Call the function
print_top_emojis(collector, query)


['think', 'make', 'insert', 'oper', 'atom'] 0.2
For the word 'Think', the top 3 emojis are: [('😂', 0.007103785403500853), ('🙄', 0.0015273784317746176), ('😭', 0.0015263997989966838)]
For the word 'about', the top 3 emojis are: [('🕹', 3.224485002370002e-05), ('👨\u200d🔧', 1.576101604642609e-05), ('🔋', 1.161304064956572e-05)]
For the word 'how', the top 3 emojis are: [('👲🏾', 0.0006185745590016376), ('3⃣', 0.0005647854669145387), ('🚎', 0.0002952287667962361)]
For the word 'to', the top 3 emojis are: [('🎛', 0.002968251729093897), ('✴', 0.0011130943984102113), ('🇧🇲', 0.0007915337944250392)]
For the word 'make', the top 3 emojis are: [('☢️', 0.02422799616142971), ('⚛', 0.003461142308775673), ('🤼\u200d♂', 0.0007125881223949915)]
For the word 'your', the top 3 emojis are: []
For the word 'insert', the top 3 emojis are: []
For the word 'operations', the top 3 emojis are: []
For the word 'more', the top 3 emojis are: []
For the word 'atomic', the top 3 emojis are: []
The top 5 emojis overall are: 