In [None]:
%pip install Unidecode nltk emoji pandas autocorrect swifter

In [1]:
import pandas as pd

class DataPreprocessor:

    def __init__(self):
        self.data = []

    def load_data_from_csv(self, filepath):
        df = pd.read_csv(filepath)
        self.data.append(df)

    def combine_data(self):
        self.data = pd.concat(self.data)
        self.data.reset_index(drop=True, inplace=True)


In [2]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import re
import string
class Tokenizer:
    def __init__(self, with_stopwords: bool):
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.stopwords += list(string.punctuation)
        self.punctuation = list(string.punctuation)
        self.ps = PorterStemmer("ORIGINAL_ALGORITHM")
        self.tknzr = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=True)
        self.cleaner = self._clean_text_withstop if with_stopwords == True else self._clean_text_nostop

    def _clean_text_withstop(self, text: str):
        text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
        words = self.tknzr.tokenize(text)
        words = [self.ps.stem(a) for a in words if a[0] != '#'] #if a not in self.punctuation if a not in self.stopwords
        return words
    def _clean_text_nostop(self, text: str):
        text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
        words = self.tknzr.tokenize(text)
        words = [self.ps.stem(a) for a in words if a not in self.punctuation and a not in self.stopwords and a[0] != '#'] 
        return words

In [69]:
from collections import defaultdict
import emoji
import csv
import json
from multiprocessing import Pool, cpu_count
import math
import numpy as np
class Indexer:

    def __init__(self, with_stopwords: bool, group_emojis: bool):
        self.inverted_index = defaultdict(dict)
        self.emoji_dict = defaultdict()
        self.emoji_dict_c = defaultdict()
        self.last_emoji_id = 0
        self.tknizer = Tokenizer(with_stopwords)
        self.indexer = self.index_data_group_emojis if group_emojis == True else self.index_data_split_groups

    def _generate_emoji_id(self, term: str) -> int:
        self.last_emoji_id += 1
        return self.last_emoji_id

    def index_data_group_emojis(self, text: str):
        words = self.tknizer.cleaner(text)
        # loop backwards
        offset = -1
        emoji_anchor = ''
         # Save any emoji, count each successive word as offset +1
        for i in range(len(words)-1,-1,-1):
            word = words[i]
            if emoji.purely_emoji(word):
                # group consecutive emojis
                if offset <= 1:
                    emoji_anchor = word + emoji_anchor
                    
                else:
                    emoji_anchor = word
                offset = 0

            else:
                if offset == 1 and len(emoji_anchor)> 0:
                    if emoji_anchor not in self.emoji_dict:
                        self.emoji_dict[emoji_anchor] = {'id': self._generate_emoji_id(emoji_anchor), 'count': 0}
                    self.emoji_dict[emoji_anchor]['count'] += 1

                self.inverted_index.setdefault(word, {'count': 0, 'emojis': {}})
                self.inverted_index[word]['count'] += 1

                if len(emoji_anchor) > 0:
                    emoji_id = self.emoji_dict[emoji_anchor]['id']
                    self.inverted_index[word]['emojis'].setdefault(emoji_id, [])
                    self.inverted_index[word]['emojis'][emoji_id].append(offset)
            offset+=1

    def index_data_split_groups(self, text: str):
        words = self.tknizer.cleaner(text)
        # loop backwards
        offset = -1
        emoji_anchor = []
         # Save any emoji, count each successive word as offset +1
        for i in range(len(words)-1,-1,-1):
            word = words[i]
            if emoji.purely_emoji(word):
                # group consecutive emojis
                if offset <= 1:
                    emoji_anchor.insert(0, word)
                else:
                    emoji_anchor = [word]

                offset = 0
                if word not in self.emoji_dict:
                    self.emoji_dict[word] = {'id': self._generate_emoji_id(word), 'count': 0}
                self.emoji_dict[word]['count'] += 1

            elif len(emoji_anchor) > 0:
                self.inverted_index.setdefault(word, {'count': 0, 'emojis': {}})
                self.inverted_index[word]['count'] += 1

                for em in emoji_anchor:
                    emoji_id = self.emoji_dict[em]['id']
                    self.inverted_index[word]['emojis'].setdefault(emoji_id, [])
                    self.inverted_index[word]['emojis'][emoji_id].append(offset)
            offset+=1
    def _findMedian(self, a):
        sorted(a)
        n = len(a)
        if n % 2 != 0:
            return float(a[int(n/2)])
    
        return float((a[int((n-1)/2)] +
                    a[int(n/2)])/2.0)
    
    def _precompute_score(self, a, idf_t, emoji_frequency):
        query_tf = 1
        median = self._findMedian(a)
        score = (query_tf * idf_t) / (median + emoji_frequency)
        return round(score, 2)

    def save_metadata(self, flipped_dict, filepath: str):
        with open(filepath, 'w') as f:
            json.dump(flipped_dict, f)
        return flipped_dict

    def save_index(self, filepath: str):
        flipped_dict = {value['id']: {'emoji': key, 'count': value['count']} for key, value in self.emoji_dict.items()}
        new_dict = {}
        n = len(self.inverted_index)
        for k in list(self.inverted_index.keys()):
            new_dict[k] = self.inverted_index[k].copy()
            df_t = len(self.inverted_index[k])
            idf_t = math.log(n / df_t)
            for x in list(self.inverted_index[k]['emojis'].keys()):
                emoji_frequency = flipped_dict[x]['count'] / len(self.emoji_dict)
                score = self._precompute_score(self.inverted_index[k]['emojis'][x], idf_t, emoji_frequency)
                new_dict[k]['emojis'][x] = score
                
            scores = list(new_dict[k]['emojis'].values())
            del new_dict[k]['count']
            if len(scores) == 0:
                del new_dict[k]
            else:
                top_score = max(scores)
                std_dev_score = np.std(scores)
                for key in list(new_dict[k]['emojis'].keys()):
                    if new_dict[k]['emojis'][key] < (top_score - std_dev_score):
                        del new_dict[k]['emojis'][key]


        flipped_dict = {key: value['emoji'] for key, value in flipped_dict.items()}
        self.last_emoji_id = 0
        compressed_dict = {}
        for k in list(flipped_dict.keys()):
            emoji_set = flipped_dict[k]
            emoji_key = ''
            for e in emoji_set:
                if e not in compressed_dict:
                    e_id = self._generate_emoji_id(e)
                    compressed_dict[e] = str(e_id)

                if emoji_key != '':
                    emoji_key += ','
                emoji_key += compressed_dict[e]
            flipped_dict[k] = emoji_key

        # for k in list(new_dict.keys()):
        #     for x in list(new_dict[k]['emojis'].keys()):
        #         em_key = flipped_dict[x]
        #         new_dict[k]['emojis'][em_key] = new_dict[k]['emojis'][x]
        #         del new_dict[k]['emojis'][x]

        with open(f'{filepath}.json', 'w') as f:
            json.dump(new_dict, f)
        
        with open(f'{filepath}.csv', 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            for word in new_dict:
                writer.writerow([word, new_dict[word]['emojis']])

        self.save_metadata(flipped_dict, f"{filepath}_meta.json")

        flipped_compressed = {value: key for key, value in compressed_dict.items()}
        self.save_metadata(flipped_compressed, f"{filepath}_meta_c.json")


    def read_index(self, filepath: str):
        self.inverted_index = defaultdict(dict)
        with open(filepath, 'r') as f:
            self.inverted_index = json.load(f)
    
    def read_meta(self, filepath: str):
        self.emoji_dict = defaultdict()
        with open(filepath, 'r') as f:
            self.emoji_dict = json.load(f)
    
    def read_compressed(self, filepath: str):
        self.emoji_dict_c = defaultdict()
        with open(filepath, 'r') as f:
            self.emoji_dict_c = json.load(f)

    def process_data(self, data):
        with Pool(cpu_count()) as p:
            p.map(self.index_data, data)

i = Indexer(with_stopwords=False, group_emojis=True)
i.indexer("good Good in with you a    luck😮‍💨, good #you dawg qhoo . aah 👏🏼😮‍💨")
i.save_index("output/index")


In [70]:
class QueryEngine:

    def __init__(self, index):
        self.index = index.inverted_index
        self.meta = index.emoji_dict
        self.comp = index.emoji_dict_c
        self.query_result = defaultdict(dict)

    def process_query_score(self, search_query, cleaner, n_per_word=3, n_overall=5):
        self.query_result = defaultdict(dict)
        query = cleaner(search_query)
        query_length = len(query)
        for i, query_term in enumerate(query):
            postings = self.index[query_term] if query_term in self.index else None
            if postings is None:
                continue
           
            for emo, score in postings['emojis'].items():
                if emo not in self.query_result:
                    self.query_result[emo] = {'query': query_term, 'raw': emo,'score': 0}
                self.query_result[emo]['score'] += score / query_length

        all_emojis = [(emoji, info) for emoji, info in self.query_result.items()]
        all_emojis.sort(key=lambda x: x[1]['score'], reverse=True)
        top_emojis = all_emojis[:n_overall]

        result = ''
        for emo in top_emojis:
            emoji_set = self.meta[ emo[0] ]
            for e in emoji_set.split(','):
                result += emoji.emojize(self.comp[ e ])
            result += ','
        return result
        #return", ".join(f"{emoji.emojize(self.meta[emo[0]])}" for emo in top_emojis) #  {emo[1]['score']:.2f}


    def _positional_intersect(self, accumulator, newresults, k):
        if accumulator is [] or newresults is None:
            return accumulator
        
        answer = list()
        for x_em, x_offsets in accumulator['emojis'].items():
            for y_em, y_offsets in newresults['emojis'].items():
                if(x_em == y_em):
                    answer.append(x_em)
        return answer
    
    def phrase_query(self, search_query, cleaner):
        query = cleaner(search_query)
        results = self.index[query[0]] if query[0] in self.index else None
        for term in query[:1]:
            matches = self.index[term] if term in self.index else None
            results = self._positional_intersect(results, matches, 3)
        print(results[:5])

        


In [71]:
import glob
preprocessor = DataPreprocessor()

csv_files = glob.glob('data/clean/*.csv')

for filename in csv_files:
    preprocessor.load_data_from_csv(filename)
preprocessor.combine_data()
preprocessor.data.reset_index(drop=True)

print(len(preprocessor.data))


518536


In [72]:
import swifter

# Index data
def index_data(with_stopwords: bool, group_emojis: bool, filename: str):
    print(f'task {filename} running!')
    indexer = Indexer(with_stopwords, group_emojis)
    preprocessor.data['text'].swifter.apply(lambda x: indexer.indexer(x))
    print("preprocessor done, writing to disk")
    indexer.save_index(f'output/{filename}')


#index_data(True, True, "stop_group")
#index_data(True, False, "stop_nogroup")
index_data(False, True, "nostop_group")
#index_data(False, False, "nostop_nogroup")


task nostop_group running!


Pandas Apply:   0%|          | 0/518536 [00:00<?, ?it/s]

preprocessor done, writing to disk


In [73]:
print("reading data from file")
# indexer_stop_group = Indexer(True, True)
# indexer_stop_group.read_index("output/stop_group.json")
# indexer_stop_group.read_meta("output/stop_group_meta.json")
# engine_stop_group = QueryEngine(indexer_stop_group)

# indexer_stop_nogroup = Indexer(True, False)
# indexer_stop_nogroup.read_index("output/stop_nogroup.json")
# indexer_stop_nogroup.read_meta("output/stop_nogroup_meta.json")
# engine_stop_nogroup = QueryEngine(indexer_stop_nogroup)

indexer_nostop_group = Indexer(False, True)
indexer_nostop_group.read_index("output/nostop_group.json")
indexer_nostop_group.read_meta("output/nostop_group_meta.json")
indexer_nostop_group.read_compressed("output/nostop_group_meta_c.json")
engine_nostop_group = QueryEngine(indexer_nostop_group)

# indexer_nostop_nogroup = Indexer(False, False)
# indexer_nostop_nogroup.read_index("output/nostop_nogroup.json")
# indexer_nostop_nogroup.read_meta("output/nostop_nogroup_meta.json")
# engine_nostop_nogroup = QueryEngine(indexer_nostop_nogroup)

#engine.phrase_query(query, indexer._clean_text)


reading data from file


In [74]:
# Query data
query = "off"
#print(engine_stop_group.process_query_score(query, indexer_stop_group.tknizer.cleaner))
#print(engine_stop_nogroup.process_query_score(query, indexer_stop_nogroup.tknizer.cleaner))
print(engine_nostop_group.process_query_score(query, indexer_nostop_group.tknizer.cleaner))
#print(engine_nostop_nogroup.process_query_score(query, indexer_nostop_nogroup.tknizer.cleaner))






In [75]:
sentences = [
    "I'm feeling happy.",
    "I'm feeling very sad.",
    "I'm angry with you.",
    "I love pizza.",
    "I dislike broccoli.",
    "The sunrise this morning was beautiful.",
    "It's been a long, tiring day.",
    "I just won the lottery!",
    "I can't believe we lost the game.",
    "I'm so excited for the weekend.",
    "The movie was boring.",
    "That was the best concert ever!",
    "I'm scared of spiders.",
    "My heart is broken.",
    "I can't wait for my birthday.",
    "I am feeling so peaceful right now.",
    "That joke was hilarious.",
    "I'm feeling pretty indifferent about the whole situation.",
    "I just got a promotion!",
    "I feel like crying.",
    "I can't stand the heat.",
    "I am freezing!",
    "That was a delicious meal.",
    "I am on top of the world!",
    "I just had a terrible day at work.",
    "I'm worried about my exam.",
    "That book was thrilling!",
    "I'm feeling adventurous.",
    "I'm feeling so lazy today.",
    "That was a scary movie.",
    "I am grateful for my friends.",
    "The party was a blast!",
    "That test was really hard.",
    "I feel loved.",
    "I feel so rejected.",
    "I'm bursting with joy.",
    "I'm disgusted by the trash.",
    "That was a stressful situation.",
    "I'm so proud of my team.",
    "I'm amazed by the view.",
    "That song was touching.",
    "I feel so lonely.",
    "I'm feeling nostalgic.",
    "The race was intense.",
    "That was an awkward conversation.",
    "I feel inspired.",
    "I'm feeling playful.",
    "I'm feeling ambitious.",
    "I'm feeling doubtful.",
    "That was a surprising result.",
    "I'm feeling content.",
    "I'm so disappointed.",
    "I'm feeling hopeful.",
    "That was a frustrating experience.",
    "I feel so appreciated.",
    "I'm confused.",
    "I'm feeling motivated.",
    "I'm feeling pessimistic.",
    "I'm feeling apathetic.",
    "That was an impressive performance.",
    "I'm curious about the result.",
    "I'm feeling so relaxed.",
    "I'm feeling agitated.",
    "That was a depressing story.",
    "I'm feeling optimistic.",
    "I feel so empowered.",
    "I'm feeling ashamed.",
    "I'm feeling energized.",
    "I'm feeling apprehensive.",
    "I'm feeling delighted.",
    "I'm feeling guilty.",
    "That was a challenging puzzle.",
    "I'm feeling so refreshed.",
    "I'm feeling overwhelmed.",
    "I'm feeling serene.",
    "I'm feeling vulnerable.",
    "That was a fascinating lecture.",
    "I'm feeling proud.",
    "I'm feeling humiliated.",
    "I'm feeling so exhilarated.",
    "I'm feeling regretful.",
    "I'm feeling contented.",
    "I'm feeling restless.",
    "That was an enchanting evening.",
    "I'm feeling tranquil.",
    "I'm feeling tormented.",
    "I'm feeling triumphant.",
    "I'm feeling desolate.",
    "I'm feeling blissful.",
    "I'm feeling distressed.",
    "I'm feeling jubilant.",
    "I'm feeling woeful.",
    "I'm feeling exuberant.",
    "I'm feeling despondent.",
    "I'm feeling ecstatic.",
    "I'm feeling inconsolable.",
    "I'm feeling rapturous.",
    "I'm feeling forlorn.",
    "I'm feeling exhilarated.",
    "I'm feeling downhearted."
]

social_media_sentences = [
    "Just had the best coffee at @CafeLuv #CoffeeLover",
    "Getting ready for a Friday night out with the girls! #FridayFeeling",
    "Who else is excited for the new Avengers movie? #MarvelFan",
    "Can't believe how beautiful the sunset was today. #NaturePhotography",
    "Dinner at my favorite sushi place #Foodie",
    "Throwback to my trip to Paris last summer #TravelDiaries",
    "Feeling so blessed to have such amazing people in my life #Blessed",
    "Workout done for the day! #FitnessGoals",
    "I could spend all day reading at this quiet little bookstore #BookWorm",
    "Had an awesome time at the concert last night! #LiveMusic",
    "I can assist you with booking a flight #ChatBot",
    "What can I help you find today? #CustomerService",
    "Processing your request now... #AI",
    "Your order has been placed! #ShoppingBot",
    "The weather in New York today is sunny with a high of 75 degrees #WeatherBot",
    "Directing you to a customer service representative now #HelpBot",
    "That information is not currently available. Can I assist with anything else? #InfoBot",
    "You have 3 new notifications #ReminderBot",
    "You successfully completed your daily step goal! #HealthBot",
    "Your package has been shipped and is on its way #DeliveryUpdate"
]

bias_test_sentences = [
    "The strong woman lifted the heavy weights.",
    "The Muslim man prayed at the mosque.",
    "She is a successful businesswoman.",
    "The African American artist created a beautiful painting.",
    "The gay couple adopted a child.",
    "He's a Jewish teacher in a high school.",
    "The nurse worked a long night shift.",
    "The Asian engineer solved the complex problem.",
    "The transgender individual voiced their opinion.",
    "The Latino chef cooked a delicious meal.",
    "The homeless man found a job.",
    "The Catholic nun helped the poor.",
    "The elderly woman is a computer scientist.",
    "The disabled athlete won the competition.",
    "The construction worker is a female.",
    "The Native American lawyer won the case.",
    "The Hindu priest performed the rituals.",
    "The single father took care of his children.",
    "The young man is a ballet dancer.",
    "The Indian woman is the CEO of the company."
]

name_test_sentences = [
    "Let's meet in New York City.",
    "I love visiting Paris in the spring.",
    "I just watched a movie starring Leonardo DiCaprio.",
    "Did you hear the latest song by Beyonce?",
    "John is a common name in the United States.",
    "My best friend's name is Maria.",
    "Mumbai is known for its delicious street food.",
    "I just finished reading a book by J.K. Rowling.",
    "I saw Tom Hanks at the airport today.",
    "Emily is a popular name for girls.",
    "I'm traveling to Sydney next month.",
    "Jennifer Aniston is my favorite actress.",
    "I attended a concert by the band Coldplay.",
    "Beijing is a bustling city with a rich history.",
    "I named my dog after Elon Musk.",
    "Emma is a beautiful name for a baby girl.",
    "I enjoyed the book written by Stephen King.",
    "I'm planning a trip to Tokyo.",
    "My favorite actor is Will Smith.",
    "My name is Mohammed."
]

single_word_sentences = [
    "happy",
    "angry",
    "love",
    "hate",
    "food",
    "hungry",
    "tired",
    "excited",
    "work",
    "home",
    "play",
    "game",
    "sports",
    "music",
    "movie",
    "book",
    "travel",
    "adventure",
    "family",
    "party"
]
test_cases = [sentences, social_media_sentences, bias_test_sentences, name_test_sentences, single_word_sentences]

In [76]:
test_results = list()

for sentence in [sentence for case in test_cases for sentence in case]:
    #emojis_stop_group = engine_stop_group.process_query_score(sentence, indexer_stop_group.tknizer.cleaner)
    #emojis_stop_nogroup = engine_stop_nogroup.process_query_score(sentence, indexer_stop_nogroup.tknizer.cleaner)
    emojis_nostop_group = engine_nostop_group.process_query_score(sentence, indexer_nostop_group.tknizer.cleaner)
    #emojis_nostop_nogroup = engine_nostop_nogroup.process_query_score(sentence, indexer_nostop_nogroup.tknizer.cleaner)
    #test_results.append({"sentence": sentence, "stop_group": emojis_stop_group, "nostop_group": emojis_nostop_group, "stop_nogroup": emojis_stop_nogroup, "nostop_nogroup": emojis_nostop_nogroup})
    test_results.append({"sentence": sentence, "nostop_group": emojis_nostop_group})


In [77]:
with open('testresults_now.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    for result in test_results:
        #writer.writerow(['sg', result['sentence'], result['stop_group'],])
        #writer.writerow(['s ', result['sentence'], result['stop_nogroup']])
        writer.writerow([' g', result['sentence'], result['nostop_group']])
        #writer.writerow(['--', result['sentence'], result['nostop_nogroup']])

In [46]:
ex = indexer_stop_group.inverted_index['dog']
sorted_ex = sorted(ex['emojis'].items(), key=lambda item: item[1], reverse=False)
for em, score in sorted_ex:
        print(f"Emoji: {indexer_stop_group.emoji_dict[em]}, Score: {score}")

type: name 'indexer_stop_group' is not defined

In [38]:
i = Indexer(False, False)
i.read_index("output/index.json")
i.read_meta("output/index_meta.json")
i.read_compressed("output/index_meta_c.json")

e = QueryEngine(i)

emojis_nostop_group = e.process_query_score("aah good", i.tknizer.cleaner)

print(emojis_nostop_group)
# ex = i.inverted_index['good']
# sorted_ex = sorted(ex['emojis'].items(), key=lambda item: item[1], reverse=False)
# for emoji, score in sorted_ex:
#         print(f"Emoji: {i.emoji_dict[emoji]}, Score: {score}")

👏🏼😮‍💨,😮‍💨,
