In [19]:
%pip install Unidecode nltk emoji pandas autocorrect swifter

Collecting swifter
  Downloading swifter-1.3.5.tar.gz (490 kB)
     ---------------------------------------- 0.0/490.6 kB ? eta -:--:--
     ------- ------------------------------ 102.4/490.6 kB 3.0 MB/s eta 0:00:01
     -------------------------------- ----- 419.8/490.6 kB 5.3 MB/s eta 0:00:01
     --------------------------------- ---- 430.1/490.6 kB 3.0 MB/s eta 0:00:01
     -------------------------------------- 490.6/490.6 kB 3.1 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting dask[dataframe]>=2.10.0
  Downloading dask-2023.6.1-py3-none-any.whl (1.2 MB)
     ---------------------------------------- 0.0/1.2 MB ? eta -:--:--
     ---------------------------------------  1.2/1.2 MB 24.8 MB/s eta 0:00:01
     ---------------------------------------- 1.2/1.2 MB 18.7 MB/s eta 0:00:00
Collecting ipywidgets>=7.0.0
  Downloading ipywidgets-8.0.6-py3-none-any.whl (138 kB)
     ---------------------------------

  DEPRECATION: swifter is being installed using the legacy 'setup.py install' method, because it does not have a 'pyproject.toml' and the 'wheel' package is not installed. pip 23.1 will enforce this behaviour change. A possible replacement is to enable the '--use-pep517' option. Discussion can be found at https://github.com/pypa/pip/issues/8559

[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: C:\Users\carde\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [1]:
import pandas as pd

class DataPreprocessor:

    def __init__(self):
        self.data = []

    def load_data_from_csv(self, filepath):
        df = pd.read_csv(filepath)
        self.data.append(df)

    def combine_data(self):
        self.data = pd.concat(self.data)
        self.data.reset_index(drop=True, inplace=True)

In [52]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from collections import defaultdict

import emoji
import string
from itertools import tee
import csv
import json
import re
from multiprocessing import Pool, cpu_count
class Indexer:

    def __init__(self):
        self.inverted_index = defaultdict(dict)
        self.emoji_dict = defaultdict()
        self.last_emoji_id = 0
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.stopwords += list(string.punctuation)
        self.punctuation = list(string.punctuation)
        self.ps = PorterStemmer()
        self.tknzr = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=True)

    def _clean_text(self, text: string):
        text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
        words = self.tknzr.tokenize(text)
        words = [self.ps.stem(a) for a in words if a not in self.punctuation] #if a not in self.stopwords
        return words

    def _generate_emoji_id(self, term):
        self.last_emoji_id += 1
        return self.last_emoji_id

    def index_data(self, text):
        words = self._clean_text(text)
        # loop backwards
        offset = -1
        emoji_anchor = ''
         # Save any emoji, count each successive word as offset +1
        for i in range(len(words)-1,-1,-1):
            word = words[i]
            if emoji.purely_emoji(word):
                # group consecutive emojis
                emoji_anchor = emoji.demojize(word) + emoji_anchor if offset <= 1 else emoji.demojize(word)
                offset = 0
            else:
                self.inverted_index.setdefault(word, {'count': 0, 'emojis': {}})
                self.inverted_index[word]['count'] += 1

                if len(emoji_anchor) > 0:
                    if emoji_anchor not in self.emoji_dict:
                        self.emoji_dict[emoji_anchor] = self._generate_emoji_id(emoji_anchor)
                    emoji_id = self.emoji_dict[emoji_anchor]
                    self.inverted_index[word]['emojis'].setdefault(emoji_id, [])
                    self.inverted_index[word]['emojis'][emoji_id].append(offset)
            offset+=1

    def _findMedian(self, a):
        sorted(a)
        n = len(a)
        if n % 2 != 0:
            return float(a[int(n/2)])
    
        return float((a[int((n-1)/2)] +
                    a[int(n/2)])/2.0)

    def save_metadata(self, filepath):
        flipped_dict = {value: key for key, value in self.emoji_dict.items()}

        with open(filepath, 'w') as f:
            json.dump(flipped_dict, f)

    def save_index(self, filepath):
        new_dict = {}
        for k in self.inverted_index:
            new_dict[k] = self.inverted_index[k].copy()
            for x in self.inverted_index[k]['emojis']:
                new_dict[k]['emojis'][x] = self._findMedian(self.inverted_index[k]['emojis'][x])
        with open(f'{filepath}.json', 'w') as f:
            json.dump(new_dict, f)
        
        with open(f'{filepath}.csv', 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            for word in new_dict:
                writer.writerow([word, new_dict[word]['count'], new_dict[word]['emojis']])


    def read_index(self, filepath):
        self.inverted_index = defaultdict(dict)
        with open(filepath, 'r') as f:
            self.inverted_index = json.load(f)
    
    def read_meta(self, filepath):
        self.emoji_dict = defaultdict()
        with open(filepath, 'r') as f:
            self.emoji_dict = json.load(f)

    def process_data(self, data):
        with Pool(cpu_count()) as p:
            p.map(self.index_data, data)

# i = Indexer()
# i.index_data("good Good   luck😮‍💨, good you dawg qhoo . aah 👏🏼😮‍💨")
# i.save_index("output/index")
# i.save_metadata("output/meta.json")


In [88]:
import math
class QueryEngine:

    def __init__(self, index, meta):
        self.index = index
        self.meta = meta
        self.query_result = defaultdict(dict)

    # Vanilla (ish) tf-idf
    def process_query_tf_idf(self, search_query, cleaner, n_per_word=3, n_overall=5):
        self.query_result = defaultdict(dict)
        query = cleaner(search_query)
        query_length = len(query)
        n = len(self.index)
        for i, query_term in enumerate(query):
            postings = self.index[query_term] if query_term in self.index else None
            if postings is None:
                continue
            query_weight = 1.0
            query_tf = len([q for q in query if q == query_term])
            query_weight *= query_tf

            df_t = len(postings)
            idf_t = math.log(n / df_t)
            for emo, median in postings['emojis'].items():
                if emo not in self.query_result:
                    self.query_result[emo] = {'query': query_term, 'raw': emo,'score': 0}
                self.query_result[emo]['score'] += ((query_tf * idf_t) / median) / query_length

        all_emojis = [(emoji, info) for emoji, info in self.query_result.items()]
        all_emojis.sort(key=lambda x: x[1]['score'], reverse=True)
        top_emojis = all_emojis[:n_overall]
        #print(f"The top {n_overall} emojis overall are:")
        return", ".join(f"{emoji.emojize(self.meta[emo[0]])}" for emo in top_emojis) #  {emo[1]['score']:.2f}


    def _positional_intersect(self, accumulator, newresults, k):
        if accumulator is [] or newresults is None:
            return accumulator
        
        answer = list()
        for x_em, x_offsets in accumulator['emojis'].items():
            for y_em, y_offsets in newresults['emojis'].items():
                if(x_em == y_em):
                    answer.append(x_em)
        return answer
    
    def phrase_query(self, search_query, cleaner):
        query = cleaner(search_query)
        print(query)
        results = self.index[query[0]] if query[0] in self.index else None
        for term in query[:1]:
            matches = self.index[term] if term in self.index else None
            results = self._positional_intersect(results, matches, 3)
        print(results[:5])

        


In [None]:

import glob
import swifter
preprocessor = DataPreprocessor()

csv_files = glob.glob('data/clean/*.csv')

for filename in csv_files:
    preprocessor.load_data_from_csv(filename)
preprocessor.combine_data()
preprocessor.data.reset_index(drop=True)
# Index data
indexer = Indexer()

preprocessor.data['text'].swifter.apply(lambda x: indexer.index_data(x))
print("preprocessor done, writing to disk")

indexer.save_index('output/index')
indexer.save_metadata('output/meta.json')


In [90]:
print("reading data from file")
indexer = Indexer()
indexer.read_index("output/index.json")
indexer.read_meta("output/meta.json")

engine = QueryEngine(indexer.inverted_index, indexer.emoji_dict)


reading data from file


In [76]:
# Query data
engine = QueryEngine(indexer.inverted_index, indexer.emoji_dict)
query = "Are you ready to take your resume to the next level?"
#engine.phrase_query(query, indexer._clean_text)
engine.process_query_tf_idf(query, indexer._clean_text)




'🍞, 7.88, are, 💪🏻💪🏻, 7.11, are, 🍰, 6.67, are, 🔥🔥🔥, 6.62, are, 😎💦, 6.34, you'

In [92]:
sentences = [
    "I'm feeling happy.",
    "I'm feeling very sad.",
    "I'm angry with you.",
    "I love pizza.",
    "I dislike broccoli.",
    "The sunrise this morning was beautiful.",
    "It's been a long, tiring day.",
    "I just won the lottery!",
    "I can't believe we lost the game.",
    "I'm so excited for the weekend.",
    "The movie was boring.",
    "That was the best concert ever!",
    "I'm scared of spiders.",
    "My heart is broken.",
    "I can't wait for my birthday.",
    "I am feeling so peaceful right now.",
    "That joke was hilarious.",
    "I'm feeling pretty indifferent about the whole situation.",
    "I just got a promotion!",
    "I feel like crying.",
    "I can't stand the heat.",
    "I am freezing!",
    "That was a delicious meal.",
    "I am on top of the world!",
    "I just had a terrible day at work.",
    "I'm worried about my exam.",
    "That book was thrilling!",
    "I'm feeling adventurous.",
    "I'm feeling so lazy today.",
    "That was a scary movie.",
    "I am grateful for my friends.",
    "The party was a blast!",
    "That test was really hard.",
    "I feel loved.",
    "I feel so rejected.",
    "I'm bursting with joy.",
    "I'm disgusted by the trash.",
    "That was a stressful situation.",
    "I'm so proud of my team.",
    "I'm amazed by the view.",
    "That song was touching.",
    "I feel so lonely.",
    "I'm feeling nostalgic.",
    "The race was intense.",
    "That was an awkward conversation.",
    "I feel inspired.",
    "I'm feeling playful.",
    "I'm feeling ambitious.",
    "I'm feeling doubtful.",
    "That was a surprising result.",
    "I'm feeling content.",
    "I'm so disappointed.",
    "I'm feeling hopeful.",
    "That was a frustrating experience.",
    "I feel so appreciated.",
    "I'm confused.",
    "I'm feeling motivated.",
    "I'm feeling pessimistic.",
    "I'm feeling apathetic.",
    "That was an impressive performance.",
    "I'm curious about the result.",
    "I'm feeling so relaxed.",
    "I'm feeling agitated.",
    "That was a depressing story.",
    "I'm feeling optimistic.",
    "I feel so empowered.",
    "I'm feeling ashamed.",
    "I'm feeling energized.",
    "I'm feeling apprehensive.",
    "I'm feeling delighted.",
    "I'm feeling guilty.",
    "That was a challenging puzzle.",
    "I'm feeling so refreshed.",
    "I'm feeling overwhelmed.",
    "I'm feeling serene.",
    "I'm feeling vulnerable.",
    "That was a fascinating lecture.",
    "I'm feeling proud.",
    "I'm feeling humiliated.",
    "I'm feeling so exhilarated.",
    "I'm feeling regretful.",
    "I'm feeling contented.",
    "I'm feeling restless.",
    "That was an enchanting evening.",
    "I'm feeling tranquil.",
    "I'm feeling tormented.",
    "I'm feeling triumphant.",
    "I'm feeling desolate.",
    "I'm feeling blissful.",
    "I'm feeling distressed.",
    "I'm feeling jubilant.",
    "I'm feeling woeful.",
    "I'm feeling exuberant.",
    "I'm feeling despondent.",
    "I'm feeling ecstatic.",
    "I'm feeling inconsolable.",
    "I'm feeling rapturous.",
    "I'm feeling forlorn.",
    "I'm feeling exhilarated.",
    "I'm feeling downhearted."
]

for sentence in sentences[:20]:
    emojis = engine.process_query_tf_idf(sentence, indexer._clean_text)
    print(f"{sentence} {emojis}")

I'm feeling happy. 🐪, 🌑, 🙋‍♂️, 🥲, 📖
I'm feeling very sad. 🙁, 😢💔, 😯, 🍃, 😂😂😂
I'm angry with you. 🙌🏼, 👩, 😭, 😳😂, 🅰️
I love pizza. ✔️, 💘, 👀👀, 😓, 🍕😍👌
I dislike broccoli. 😍, 🙄, 🌚, 😛, 👎🏻
The sunrise this morning was beautiful. ☀️, 🙌🏼🙏🏼, 👏🏾🙌🏾, 🙏🏻, 🤌
It's been a long, tiring day. 🌛, 🅱️, 😨, ☔, 👏
I just won the lottery! 🙁, 😭, 🎶, ⭐⭐, 🥀🔥
I can't believe we lost the game. 👏, 🚫, 🤷, 😂😭, 📱
I'm so excited for the weekend. 🔥🔥, 👍, 🔪, 🤞🏽, 🍑
The movie was boring. 🎮, 😮, 🙃, 🤙🏻, 💀
That was the best concert ever! 😚, 💋, 😛, 🤗, 🐕
I'm scared of spiders. 👀, 😲, 😭😂, 💔, 👐
My heart is broken. 👏🏻, 😭😭, 🔑, 😍😍😍😍, 👋🏼
I can't wait for my birthday. 💎🙌, 😂😂, 🙈🙈, 🍭, 😓
I am feeling so peaceful right now. ✨, 😍😍😍, 😪, 🥰, ☺️
That joke was hilarious. 😂😂😂😂😂, 🎥, 🐸, 😂😂👍, 😂❤️
I'm feeling pretty indifferent about the whole situation. 😎, 🗣️, 🙄, 🤒, 🌞
I just got a promotion! 🙏🏼🔥, 🅱️, 🚓, 🙇, 2️⃣
I feel like crying. 🥲, 🤔, 😍😍, 🔪, 😊👍🏼


In [93]:
social_media_sentences = [
    "Just had the best coffee at @CafeLuv ☕️ #CoffeeLover",
    "Getting ready for a Friday night out with the girls! 💃 #FridayFeeling",
    "Who else is excited for the new Avengers movie? 🍿 #MarvelFan",
    "Can't believe how beautiful the sunset was today. 🌅 #NaturePhotography",
    "Dinner at my favorite sushi place 🍣 #Foodie",
    "Throwback to my trip to Paris last summer 🗼 #TravelDiaries",
    "Feeling so blessed to have such amazing people in my life 🥰 #Blessed",
    "Workout done for the day! 💪 #FitnessGoals",
    "I could spend all day reading at this quiet little bookstore 📚 #BookWorm",
    "Had an awesome time at the concert last night! 🎤 #LiveMusic",
    "I can assist you with booking a flight ✈️ #ChatBot",
    "What can I help you find today? 🔍 #CustomerService",
    "Processing your request now... ⏳ #AI",
    "Your order has been placed! 🛍️ #ShoppingBot",
    "The weather in New York today is sunny with a high of 75 degrees 🌞 #WeatherBot",
    "Directing you to a customer service representative now 📞 #HelpBot",
    "That information is not currently available. Can I assist with anything else? ❓ #InfoBot",
    "You have 3 new notifications 📬 #ReminderBot",
    "You successfully completed your daily step goal! 🏃‍♀️ #HealthBot",
    "Your package has been shipped and is on its way 📦 #DeliveryUpdate"
]

for sentence in social_media_sentences:
    emojis = engine.process_query_tf_idf(sentence, indexer._clean_text)
    print(f"{sentence} {emojis}")


Just had the best coffee at @CafeLuv #CoffeeLover 🎀, 🎾, 🌚, 👌🏻, 😩👌
Getting ready for a Friday night out with the girls! 💃 #FridayFeeling 🙌🙌, 🤑, 🍭, 🙌🏻🙌🏻🙌🏻, 🤝
Who else is excited for the new Avengers movie? 🍿 #MarvelFan 🤙🏼, 🔑, 🖕🏽, 🤔, ❗
Can't believe how beautiful the sunset was today. 🌅 #NaturePhotography 🐦, ☀️, 🤙🏻, 🎉, 😍
Dinner at my favorite sushi place 🍣 #Foodie 👅, 🤙🏻, 👀, 🙄, 🚌
Throwback to my trip to Paris last summer 🗼 #TravelDiaries 🍞, 💘, ⁉️, ☺️, 😎💦
Feeling so blessed to have such amazing people in my life 🥰 #Blessed ✨, 🙏, 🍩, 🙏🏼, 🙏🏻🙏🏻🙏🏻
Workout done for the day! 💪 #FitnessGoals 😍🙌, 🍑, ☝️, 👐🏽, 🙏🏾🙏🏾🙏🏾
I could spend all day reading at this quiet little bookstore 📚 #BookWorm 👏🏼, 👇🏻, 🐥, 👇👇, 💕
Had an awesome time at the concert last night! 🎤 #LiveMusic 😔, 🍃, 🌚, 💊, 😈
I can assist you with booking a flight ✈️ #ChatBot 👉, 🙏🏼, 😍❤️, 😻, 💚
What can I help you find today? 🔍 #CustomerService 😔✌️, 👔, ✔️, 🙏🏼✨, 😁
Processing your request now... ⏳ #AI 😫, 🙏🏽, 👇👇, 🍆, 😔
Your order has been placed! 🛍️ #Shopp