In [None]:
%pip install Unidecode nltk, emoji, pandas

In [1]:
import pandas as pd

class DataPreprocessor:

    def __init__(self):
        self.data = []

    def load_data_from_csv(self, filepath):
        df = pd.read_csv(filepath)
        self.data.append(df)

    def combine_data(self):
        self.data = pd.concat(self.data)

In [8]:
import nltk
from nltk.stem import PorterStemmer
from collections import defaultdict

import emoji
import string
from itertools import tee
import csv
import json
import re

class Indexer:

    def __init__(self):
        self.inverted_index = defaultdict(dict)
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.stopwords += list(string.punctuation)
        self.ps = PorterStemmer()

    def _clean_text(self, text: string):
        text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
        words = nltk.word_tokenize(text)
        words = [self.ps.stem(a) for a in words if a not in self.stopwords]
        return words
    
    def _add_word_to_index(self, word, emojis, char_count):
        if word not in self.inverted_index:
            self.inverted_index[word] = {'count': 0, 'emojis': []}
        self.inverted_index[word]['count'] += 1
        for em in emojis:
            emoji_offset = abs(char_count - em[1].start) + 1
            if emoji_offset <= 40:
                self.inverted_index[word]['emojis'].append({'emoji': em[0], 'offset': emoji_offset})


    def index_data(self, text):
        words = self._clean_text(text)
        emoji_iterator = [em for em in emoji.analyze("".join(words))]

        try:
            emojis = [em for em in next(emoji_iterator, [])]
            char_count = 0
            for word in words:
                char_count += len(word)
                self._add_word_to_index(word, emojis, char_count)
        except StopIteration:
            pass

    # def _prepare_index_for_saving(self):
    #     for word in self.inverted_index:
    #         if isinstance(self.inverted_index[word]['emojis'], set):
    #             self.inverted_index[word]['emojis'] = list(self.inverted_index[word]['emojis'])

    def save_index(self, filepath):
        with open(filepath, 'w') as f:
            json.dump(self.inverted_index, f)

    def save_index_csv(self, filepath):
        with open(filepath, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            for word in self.inverted_index:
                writer.writerow([word, self.inverted_index[word]['count'], self.inverted_index[word]['emojis']])

    def read_index(self, filepath):
        self.inverted_index = defaultdict(dict)
        with open(filepath, 'r') as f:
            data = json.load(f)

        for word, info in data.items():
            self.inverted_index[word] = {'count': info['count'], 'emojis': [emoji_info for emoji_info in info['emojis']]}



In [3]:
import math

class QueryEngine:

    def __init__(self, index):
        self.index = index
        self.query_result = defaultdict(dict)

    def process_query(self, search_query, index):
        self.query_result = defaultdict(dict)
        query = index.clean_text(search_query)
        query_weight = 1/len(query)
        print(query, query_weight)
        for i, q in enumerate(query):
            matching_emojis = self.index[q]

            query_tf = query_weight
            n = len(self.index)
            df_t = len(matching_emojis['emojis']) if matching_emojis else -1
            if df_t == -1:
                continue
            idf_t = math.log(n / df_t)

            emoji_counter = {}
            for individual_emoji in matching_emojis['emojis']:
                unique_emoji = individual_emoji['emoji']
                if unique_emoji not in emoji_counter:
                    emoji_counter[unique_emoji] = {'count':1, 'offsets':[individual_emoji['offset']]}
                else:
                    emoji_counter[unique_emoji]['count'] += 1
                    emoji_counter[unique_emoji]['offsets'].append(individual_emoji['offset'])

            for unique_emoji, info in emoji_counter.items():
                avg_offset = sum(info['offsets']) / len(info['offsets']) if info['offsets'] else 1
                if unique_emoji not in self.query_result:
                    self.query_result[unique_emoji] = {'query': search_query.split(' ')[i], 'raw': q, 'emoji': unique_emoji, 'score': (query_tf * idf_t * info['count']) / avg_offset}
                else:
                    self.query_result[unique_emoji]['score'] += (query_tf * idf_t *  info['count']) / avg_offset
        #normalize tf_idf on length
        for emoji, info in self.query_result.items():
            info['score'] /= self.index[info['raw']]['count']
            self.query_result[emoji] = info
        return self.query_result

    def print_top_emojis(self, search, n_per_word=3, n_overall=5):
        query_words = search.split(' ')
        
        # Print top emojis for each word
        for word in query_words:
            word_emojis = [(emoji, info['score']) for emoji, info in self.query_result.items() if info['query'] == word]
            word_emojis.sort(key=lambda x: x[1], reverse=True)
            top_emojis = word_emojis[:n_per_word]
            print(f"For the word '{word}', the top {n_per_word} emojis are: {top_emojis}")

        # Print top emojis overall
        all_emojis = [(emoji, info['score']) for emoji, info in self.query_result.items()]
        all_emojis.sort(key=lambda x: x[1], reverse=True)
        top_emojis = all_emojis[:n_overall]
        print(f"The top {n_overall} emojis overall are: {top_emojis}")


In [9]:

import glob
preprocessor = DataPreprocessor()

csv_files = glob.glob('data/clean/*.csv')

for filename in csv_files:
    preprocessor.load_data_from_csv(filename)
preprocessor.combine_data()

# Index data
indexer = Indexer()

preprocessor.data['text'].apply(lambda x: indexer.index_data(x))

indexer.save_index('output/index.json')
indexer.save_index_csv('output/index.csv')

AttributeError: 'Indexer' object has no attribute 'clean_text'

In [6]:

# Query data
if indexer is None:
    print("reading data from file")
    indexer = Indexer()
    indexer.read_index()
    
engine = QueryEngine(indexer.inverted_index)

query = "my dog is cool"
engine.process_query(query, indexer)
engine.print_top_emojis(query)



['dog', 'cool'] 0.5
For the word 'my', the top 3 emojis are: [('😂', 0.05431188200592355), ('🐶', 0.053666308878979975), ('😎', 0.04160980075166413)]
For the word 'dog', the top 3 emojis are: [('⛳', 0.001637450329663763), ('👋', 0.0015758044414132158), ('✌️', 0.0014787656388240444)]
For the word 'is', the top 3 emojis are: []
For the word 'cool', the top 3 emojis are: []
The top 5 emojis overall are: [('😂', 0.05431188200592355), ('🐶', 0.053666308878979975), ('😎', 0.04160980075166413), ('😭', 0.01636332305277942), ('👍', 0.014559067693791318)]
