In [19]:
%pip install Unidecode nltk emoji pandas autocorrect swifter

Collecting swifter
  Downloading swifter-1.3.5.tar.gz (490 kB)
     ---------------------------------------- 0.0/490.6 kB ? eta -:--:--
     ------- ------------------------------ 102.4/490.6 kB 3.0 MB/s eta 0:00:01
     -------------------------------- ----- 419.8/490.6 kB 5.3 MB/s eta 0:00:01
     --------------------------------- ---- 430.1/490.6 kB 3.0 MB/s eta 0:00:01
     -------------------------------------- 490.6/490.6 kB 3.1 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting dask[dataframe]>=2.10.0
  Downloading dask-2023.6.1-py3-none-any.whl (1.2 MB)
     ---------------------------------------- 0.0/1.2 MB ? eta -:--:--
     ---------------------------------------  1.2/1.2 MB 24.8 MB/s eta 0:00:01
     ---------------------------------------- 1.2/1.2 MB 18.7 MB/s eta 0:00:00
Collecting ipywidgets>=7.0.0
  Downloading ipywidgets-8.0.6-py3-none-any.whl (138 kB)
     ---------------------------------

  DEPRECATION: swifter is being installed using the legacy 'setup.py install' method, because it does not have a 'pyproject.toml' and the 'wheel' package is not installed. pip 23.1 will enforce this behaviour change. A possible replacement is to enable the '--use-pep517' option. Discussion can be found at https://github.com/pypa/pip/issues/8559

[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: C:\Users\carde\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [91]:
import pandas as pd

class DataPreprocessor:

    def __init__(self):
        self.data = []

    def load_data_from_csv(self, filepath):
        df = pd.read_csv(filepath)
        self.data.append(df)

    def combine_data(self):
        self.data = pd.concat(self.data)
        self.data.reset_index(drop=True, inplace=True)

In [93]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from collections import defaultdict

import emoji
import string
from itertools import tee
import csv
import json
import re
from collections import Counter
from multiprocessing import Pool, cpu_count
class Indexer:

    def __init__(self):
        self.inverted_index = defaultdict(dict)
        self.emoji_dict = defaultdict()
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.stopwords += list(string.punctuation)
        self.ps = PorterStemmer()
        self.tknzr = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=True)
        #self.translator = str.maketrans('', '', string.punctuation)

    def _clean_text(self, text: string):
        text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
        #text = text.translate(self.translator)
        
        words = self.tknzr.tokenize(text)
        words = [self.ps.stem(a) for a in words] #if a not in self.stopwords
        return words

    def index_data(self, text):
        words = self._clean_text(text)
        try:
            emojis = []
            clean_words = []
            for word in words:
                if(emoji.purely_emoji(word)):
                    #emojis.append(word)
                    emojis.extend([em[0] for em in emoji.analyze(word, join_emoji=True)])
                    #clean_words.append(word)
                    clean_words.extend([em[0] for em in emoji.analyze(word, join_emoji=True)])
                elif emoji.emoji_count(word) > 0:
                    #extracted_emojis = "".join([em[0] for em in emoji.analyze(word, join_emoji=True)])
                    extracted_emojis = [em[0] for em in emoji.analyze(word, join_emoji=True)]
                    word = emoji.replace_emoji(word, replace="")
                    #emojis.append(extracted_emojis)
                    emojis.extend(extracted_emojis)
                    clean_words.append(word)
                    #clean_words.append(extracted_emojis)
                    clean_words.extend(extracted_emojis)
                else:
                    clean_words.append(word)

            current_words = []
            for i, word in enumerate(clean_words):
                if word not in self.inverted_index:
                    self.inverted_index[word] = {'count': 0, 'emojis': {}}
                if not emoji.purely_emoji(word):
                    current_words.append(word)
                else:    
                    if word not in self.emoji_dict:
                        self.emoji_dict[word] = 1
                    else:
                        self.emoji_dict[word] += 1
                    self.inverted_index[word]['count'] += 1
                    for e in emojis:
                        emoji_offset = abs(clean_words.index(e) - i) + 1
                        if(emoji_offset == 1): # itself
                            continue
                        if e not in self.inverted_index[word]['emojis']:
                            self.inverted_index[word]['emojis'][e] = []
                        self.inverted_index[word]['emojis'][e].append(emoji_offset)
                    
                    for w in current_words:
                        self.inverted_index[w]['count'] += 1
                        word_offset = abs(clean_words.index(w) - i) + 1
                        if w not in self.inverted_index[w]['emojis']:
                            self.inverted_index[w]['emojis'][word] = []
                        self.inverted_index[w]['emojis'][word].append(word_offset)
                    current_words = []
        except StopIteration:
            pass

    def save_metadata(self, filepath):
        with open(filepath, 'w') as f:
            json.dump(self.emoji_dict, f)

    def save_index(self, filepath):
        with open(filepath, 'w') as f:
            json.dump(self.inverted_index, f)

    def save_index_csv(self, filepath):
        with open(filepath, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            for word in self.inverted_index:
                writer.writerow([word, self.inverted_index[word]['count'], self.inverted_index[word]['emojis']])

    def read_index(self, filepath):
        self.inverted_index = defaultdict(dict)
        with open(filepath, 'r') as f:
            self.inverted_index = json.load(f)
    
    def read_meta(self, filepath):
        self.emoji_dict = defaultdict()
        with open(filepath, 'r') as f:
            self.emoji_dict = json.load(f)

    def process_data(self, data):
        with Pool(cpu_count()) as p:
            p.map(self.index_data, data)

# i = Indexer()
# i.index_data("good Good   luck😮‍💨, you dawg qhoo . aah 😮‍💨")
# i.save_index_csv("output/index.csv")
# i.save_index("output/index.json")
# i.save_metadata("output/meta.json")


In [80]:
import math
from itertools import takewhile
class QueryEngine:

    def __init__(self, index, meta):
        self.index = index
        self.meta = meta
        self.query_result = defaultdict(dict)

    def _findMedian(self, a):
        # First we sort the array
        sorted(a)
    
        # check for even case
        n = len(a)
        if n % 2 != 0:
            return float(a[int(n/2)])
    
        return float((a[int((n-1)/2)] +
                    a[int(n/2)])/2.0)
    # Vanilla (ish) tf-idf
    def process_query_tf_idf(self, search_query, cleaner, n_per_word=3, n_overall=5):
        self.query_result = defaultdict(dict)
        query = cleaner(search_query)
        print(query)
        n = len(self.index)
        for i, query_term in enumerate(query):
            postings = self.index[query_term] if query_term in self.index else None
            if postings is None:
                continue
            query_weight = 1.0
            query_tf = len([q for q in query if q == query_term])
            query_weight *= query_tf

            #n = len(self.meta)
            df_t = len(postings)
            idf_t = math.log(n / df_t)
            for emo, offset_list in postings['emojis'].items():
                median = self._findMedian(offset_list)
                if emo not in self.query_result:
                    self.query_result[emo] = {'query': search_query.split(' ')[i], 'raw': emo, 'emoji': emo, 'score': 0}
                self.query_result[emo]['score'] += (query_tf * idf_t) / median
                #print(emo, self.query_result[emo]['score'])
            # normalize
        # for emo, info in self.query_result.items():
        #     self.query_result[emo]['score'] /=  self.meta[emo]
        all_emojis = [(emoji, info['score']) for emoji, info in self.query_result.items()]
        all_emojis.sort(key=lambda x: x[1], reverse=True)
        top_emojis = all_emojis[:n_overall]
        print(f"The top {n_overall} emojis overall are:")
        for x in top_emojis:
            print(x)

    
        # Vanilla (ish) tf-idf
    def process_query_tf_idf2(self, search_query, cleaner):
        self.query_result = defaultdict(dict)
        query = cleaner(search_query)
        n = len(self.index)
        for i, query_term in enumerate(query):
            postings = self.index[query_term] if query_term in self.index else None
            if postings is None:
                continue
            query_weight = 1.0
            query_tf = len([q for q in query if q == query_term])
            query_weight *= query_tf

            #n = len(self.meta)
            df_t = len(postings)
            idf_t = math.log(n / df_t)
            for emo, offset_list in postings['emojis'].items():
                avg_offset = sum(offset_list) / len(offset_list) if offset_list else 1
                if emo not in self.query_result:
                    self.query_result[emo] = {'query': search_query.split(' ')[i], 'raw': emo, 'emoji': emo, 'score': 0}
                self.query_result[emo]['score'] += (query_tf * idf_t ) / avg_offset
            # normalize
        # for emo, info in self.query_result.items():
        #     self.query_result[emo]['score'] /=  self.meta[emo]
        return self.query_result


    def _positional_intersect(self, accumulator, newresults, k):
        if accumulator is [] or newresults is None:
            return accumulator
        
        answer = list()
        for x_em, x_offsets in accumulator['emojis'].items():
            for y_em, y_offsets in newresults['emojis'].items():
                if(x_em == y_em):
                    answer.append(x_em)
        return answer
    
    def phrase_query(self, search_query, cleaner):
        query = cleaner(search_query)
        print(query)
        results = self.index[query[0]] if query[0] in self.index else None
        for term in query[:1]:
            matches = self.index[term] if term in self.index else None
            results = self._positional_intersect(results, matches, 3)
        print(results[:5])

        


In [81]:
def print_top_emojis(query_result, search, n_per_word=3, n_overall=5):
    query_words = search.split(' ')
        
        # Print top emojis for each word
    for word in query_words:
        word_emojis = [(emoji, info['score']) for emoji, info in query_result.items() if info['query'] == word]
        word_emojis.sort(key=lambda x: x[1], reverse=True)
        top_emojis = word_emojis[:n_per_word]
        print(f"For the word '{word}', the top {n_per_word} emojis are: {top_emojis}")

        # Print top emojis overall
    all_emojis = [(emoji, info['score']) for emoji, info in query_result.items()]
    all_emojis.sort(key=lambda x: x[1], reverse=True)
    top_emojis = all_emojis[:n_overall]
    print(f"The top {n_overall} emojis overall are:")
    for x in top_emojis:
        print(x)

In [124]:

import glob
import swifter
preprocessor = DataPreprocessor()

csv_files = glob.glob('data/clean/*.csv')

for filename in csv_files:
    preprocessor.load_data_from_csv(filename)
preprocessor.combine_data()
preprocessor.data.reset_index(drop=True)
# Index data
indexer = Indexer()

preprocessor.data['text'].swifter.apply(lambda x: indexer.index_data(x))
print("preprocessor done, writing to disk")

indexer.save_index('output/index.json')
indexer.save_metadata('output/meta.json')
indexer.save_index_csv('output/index.csv')

Pandas Apply:   0%|          | 0/1516115 [00:00<?, ?it/s]

preprocessor done, writing to disk


In [24]:
print("reading data from file")
indexer = Indexer()
indexer.read_index("output/index.json")
indexer.read_meta("output/meta.json")


reading data from file


In [134]:
# Query data
engine = QueryEngine(indexer.inverted_index, indexer.emoji_dict)
query = "the food was super"
engine.phrase_query(query, indexer._clean_text)
engine.process_query_tf_idf(query, indexer._clean_text)




['the', 'food', 'wa', 'super']
['👨\u200d👩\u200d👧\u200d👦', '🛩', '3️⃣', '🚫', '💰']
['the', 'food', 'wa', 'super']
The top 5 emojis overall are:
('😯', 16.035256155404856)
('🤩', 15.506621337094806)
('👏🏾', 14.92512303695375)
('👥', 14.537457503526381)
('😵', 14.537457503526381)


In [340]:
with open("data.csv", 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            for word in self.inverted_index:
                writer.writerow([word, self.inverted_index[word]['count'], self.inverted_index[word]['emojis']])
