In [None]:
%pip install Unidecode nltk, emoji, pandas

In [52]:
import pandas as pd

class DataPreprocessor:

    def __init__(self):
        self.data = []

    def load_data_from_csv(self, filepath):
        df = pd.read_csv(filepath)
        self.data.append(df)

    def combine_data(self):
        self.data = pd.concat(self.data)

In [49]:
import nltk
from nltk.stem import PorterStemmer
from collections import defaultdict

import emoji
import string
from itertools import tee
import csv
import json
import re

class Indexer:

    def __init__(self):
        self.inverted_index = defaultdict(dict)
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.stopwords += list(string.punctuation)
        self.ps = PorterStemmer()
        self.translator = str.maketrans('', '', string.punctuation)

    def _clean_text(self, text: string):
        text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
        text = text.translate(self.translator)
        words = nltk.word_tokenize(text)
        #words = [self.ps.stem(a) for a in words if a not in self.stopwords]
        return words

    def index_data(self, text):
        words = self._clean_text(text)
        try:
            emojis = []
            clean_words = []
            for word in words:
                if(emoji.purely_emoji(word)):
                    emojis.append(word)
                    clean_words.append(word)
                elif emoji.emoji_count(word) > 0:
                    extracted_emojis = "".join([em['emoji'] for em in emoji.emoji_list(word)])
                    word = emoji.replace_emoji(word, replace="")
                    emojis.append(extracted_emojis)
                    clean_words.append(word)
                    clean_words.append(extracted_emojis)
                else:
                    clean_words.append(word)

            current_words = []
            for i, word in enumerate(clean_words):
                if word not in self.inverted_index:
                    self.inverted_index[word] = {'count': 0, 'emojis': {}}
                if not emoji.purely_emoji(word):
                    current_words.append(word)
                else:    
                    self.inverted_index[word]['count'] += 1
                    for e in emojis:
                        emoji_offset = abs(clean_words.index(e) - i) + 1
                        if(emoji_offset == 1): # itself
                            continue
                        if e not in self.inverted_index[word]['emojis']:
                            self.inverted_index[word]['emojis'][e] = []
                        self.inverted_index[word]['emojis'][e].append(emoji_offset)
                    
                    for w in current_words:
                        self.inverted_index[w]['count'] += 1
                        word_offset = abs(clean_words.index(w) - i) + 1
                        if w not in self.inverted_index[w]['emojis']:
                            self.inverted_index[w]['emojis'][word] = []
                        self.inverted_index[w]['emojis'][word].append(word_offset)
                    current_words = []
        except StopIteration:
            pass

    def save_index(self, filepath):
        with open(filepath, 'w') as f:
            json.dump(self.inverted_index, f)

    def save_index_csv(self, filepath):
        with open(filepath, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            for word in self.inverted_index:
                writer.writerow([word, self.inverted_index[word]['count'], self.inverted_index[word]['emojis']])

    def read_index(self, filepath):
        self.inverted_index = defaultdict(dict)
        with open(filepath, 'r') as f:
            self.inverted_index = json.load(f)
    
# i = Indexer()
# i.index_data("good good luck 😊😊 you dawg🐕🐕")
# i.save_index_csv("output/index.csv")

In [94]:
import math

class QueryEngine:

    def __init__(self, index):
        self.index = index
        self.query_result = defaultdict(dict)

    def process_query_avg_offset(self, search_query, index):
        self.query_result = defaultdict(dict)
        query = index._clean_text(search_query)
        query_weight = 1/len(query)
        print(query, query_weight)
        for i, q in enumerate(query):
            matching_emojis = self.index[q]

            query_tf = query_weight
            n = len(self.index)
            df_t = len(matching_emojis['emojis']) if matching_emojis else -1
            if df_t == -1:
                continue
            idf_t = math.log(n / df_t)

            emoji_counter = {}
            for emo, counts in matching_emojis['emojis'].items():
                if emo not in emoji_counter:
                    emoji_counter[emo] = {'count':1, 'offsets':counts}
                else:
                    emoji_counter[emo]['count'] += 1

            for unique_emoji, info in emoji_counter.items():
                avg_offset = sum(info['offsets']) / len(info['offsets']) if info['offsets'] else 1
                if unique_emoji not in self.query_result:
                    self.query_result[unique_emoji] = {'query': search_query.split(' ')[i], 'raw': q, 'emoji': unique_emoji, 'score': (query_tf * idf_t * info['count']) / avg_offset}
                else:
                    self.query_result[unique_emoji]['score'] += (query_tf * idf_t *  info['count']) / avg_offset
        #normalize tf_idf on length
        for emoji, info in self.query_result.items():
            info['score'] /= self.index[info['raw']]['count']
            self.query_result[emoji] = info
        return self.query_result

    def process_query(self, search_query, index):
        self.query_result = defaultdict(dict)
        query = index._clean_text(search_query)
        n = len(self.index)
        d = 0.5

        for i, q in enumerate(query):
            matching_emojis = self.index[q] if q in self.index else None

            if not matching_emojis:
                continue

            # vanilla tf-idf
            term_freq = matching_emojis['count']
            
            # sublinear tf scaling
            #term_freq = 1 + math.log(term_freq)

            # max tf normalization
            #term_freq = 0.5 + 0.5 * (term_freq / max_tf)

            # custome offset based tf scaling
            df = len(matching_emojis['emojis'])
            idf = math.log(n / df)

            for emo, offset_list in matching_emojis['emojis'].items():
                #unique_emoji = individual_emoji['emoji']
                avg_offset = sum(offset_list) / len(offset_list) 
                tf = 1 + math.log(sum(math.pow(d, p) for p in offset_list))


                if emo not in self.query_result:
                    self.query_result[emo] = {'query': search_query.split(' ')[i], 'raw': q, 'emoji': emo, 'score': 0}

                self.query_result[emo]['score'] += (tf * idf *  info['count']) / avg_offset #term_freq * idf 

        # normalize scores
        for emoji, info in self.query_result.items():
            self.query_result[emoji]['score'] /=  self.index[info['raw']]['count']

        return self.query_result
    
    def print_top_emojis(self, search, n_per_word=3, n_overall=5):
        query_words = search.split(' ')
        
        # Print top emojis for each word
        for word in query_words:
            word_emojis = [(emoji, info['score']) for emoji, info in self.query_result.items() if info['query'] == word]
            word_emojis.sort(key=lambda x: x[1], reverse=True)
            top_emojis = word_emojis[:n_per_word]
            print(f"For the word '{word}', the top {n_per_word} emojis are: {top_emojis}")

        # Print top emojis overall
        all_emojis = [(emoji, info['score']) for emoji, info in self.query_result.items()]
        all_emojis.sort(key=lambda x: x[1], reverse=True)
        top_emojis = all_emojis[:n_overall]
        print(f"The top {n_overall} emojis overall are: {top_emojis}")


In [None]:

import glob
preprocessor = DataPreprocessor()

csv_files = glob.glob('data/clean/*.csv')

for filename in csv_files:
    preprocessor.load_data_from_csv(filename)
preprocessor.combine_data()

# Index data
indexer = Indexer()

preprocessor.data['text'].apply(lambda x: indexer.index_data(x))

indexer.save_index('output/index.json')
indexer.save_index_csv('output/index.csv')

In [None]:
print("reading data from file")
indexer = Indexer()
indexer.read_index("output/index.json")
    

In [106]:
# Query data
    
engine = QueryEngine(indexer.inverted_index)

query = "the weather will be sunny tomorrow"
engine.process_query_avg_offset(query, indexer)
engine.print_top_emojis(query)



['the', 'weather', 'will', 'be', 'sunny', 'tomorrow'] 0.16666666666666666
For the word 'the', the top 3 emojis are: [('😜', 1.2974020779344543e-05), ('📸', 1.2676716175715308e-05), ('💝', 1.1401735218412606e-05)]
For the word 'weather', the top 3 emojis are: []
For the word 'will', the top 3 emojis are: [('❔', 4.114805922383699e-05), ('🏤', 3.239335048636906e-05), ('🚋', 2.0621181162178596e-05)]
For the word 'be', the top 3 emojis are: [('🅱', 9.753242092682357e-06), ('♒', 4.8766210463411784e-06), ('🛬', 3.901296837072943e-06)]
For the word 'sunny', the top 3 emojis are: [('☀️', 0.0036245079061921356)]
For the word 'tomorrow', the top 3 emojis are: [('🕒', 6.725831613487132e-05), ('🕐', 6.725831613487132e-05), ('🕢', 6.725831613487132e-05)]
The top 5 emojis overall are: [('☀️', 0.0036245079061921356), ('🕒', 6.725831613487132e-05), ('🕐', 6.725831613487132e-05), ('🕢', 6.725831613487132e-05), ('⛩', 4.483887742324754e-05)]


In [None]:
import nltk
from nltk.stem import PorterStemmer
from collections import defaultdict

import emoji
import string
from itertools import tee
import csv
import json
import re

class Indexer:

    def __init__(self):
        self.inverted_index = defaultdict(dict)
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.stopwords += list(string.punctuation)
        self.ps = PorterStemmer()
        self.translator = str.maketrans('', '', string.punctuation)

    def _clean_text(self, text: string):
        text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
        text = text.translate(self.translator)
        words = nltk.word_tokenize(text)
        #words = [self.ps.stem(a) for a in words if a not in self.stopwords]
        return words

    def index_data(self, text):
        words = self._clean_text(text)
        try:
            emojis = []
            clean_words = []
            for word in words:
                if(emoji.purely_emoji(word)):
                    emojis.append(word)
                    clean_words.append(word)
                elif emoji.emoji_count(word) > 0:
                    extracted_emojis = "".join([em['emoji'] for em in emoji.emoji_list(word)])
                    print(extracted_emojis)
                    word = emoji.replace_emoji(word, replace="")
                    emojis.append(extracted_emojis)
                    clean_words.append(word)
                    clean_words.append(extracted_emojis)
                else:
                    clean_words.append(word)

            for i, word in enumerate(clean_words):
                if word not in self.inverted_index:
                    self.inverted_index[word] = {'count': 0, 'emojis': {}}
                self.inverted_index[word]['count'] += 1
                for em in emojis:
                    emoji_offset = abs(clean_words.index(em) - i) + 1
                    if em not in self.inverted_index[word]['emojis']:
                        self.inverted_index[word]['emojis'][em] = []
                    self.inverted_index[word]['emojis'][em].append(emoji_offset)
        except StopIteration:
            pass

    def save_index(self, filepath):
        with open(filepath, 'w') as f:
            json.dump(self.inverted_index, f)

    def save_index_csv(self, filepath):
        with open(filepath, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            for word in self.inverted_index:
                writer.writerow([word, self.inverted_index[word]['count'], self.inverted_index[word]['emojis']])

    def read_index(self, filepath):
        self.inverted_index = defaultdict(dict)
        with open(filepath, 'r') as f:
            self.inverted_index = json.load(f)
    
i = Indexer()
i.index_data("good good luck 😊😊 you dawg🐕🐕")
i.save_index_csv("output/index.csv")