[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github.com/ntl2222/HackathonAI/blob/nikos/topic_extractor.ipynb)

# Topic Extraction from an unsupervised dataset

---

### Resources

# Table of Contents

- [Data](#Data)
  - [Preprocessing](#Preprocessing)
- [Topic Extraction using FastText](#Topic-Extraction-using-FastText)

---

In [1]:
# for colab

# Data

The dataset we used is: [10000 Restaurant Reviews](#https://www.kaggle.com/datasets/joebeachcapital/restaurant-reviews) from www.kaggle.com. 

In [37]:
# %%writefile scripts/get_data.py

from pathlib import Path
import requests
import zipfile
import os

def download_data():
    dataset_dir = Path('./data/raw')
    dataset_dir.mkdir(parents=True, exist_ok=True)
    
    notEmpty = any(dataset_dir.iterdir())
    
    if notEmpty:
        print('Dataset exists.')
        
    else:
        try:
            response = requests.get('https://storage.googleapis.com/kaggle-data-sets/3697155/6410731/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20240224%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20240224T212811Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=3630dc54d8e2cee4459eceb6d3414ccb669f04b6996660b9d1a5e20d07f242fde686cf4609e222e2e0d4d34746a77c1c0115c550228a80bfb707e252614ae108f6e2b7f6fa206998100df0c3218b91bd5ad6ea64aa2921b4ecb170f123e0e9e36e9e20a0d772e1689d698fa53a1f1f0f673cc4b94b42919f970c6286bd3d2fa7ecf5e72a14a3c4ba8fd32e2074c97e178e922d8a44280914e36b8371ebc172e122d9db33e6bd83735ba3c3f106224e2eb6566d7885fd87dccd26156f7018ec0d1d4138b55b4d27ba205e5fd68e4b923b4ca8b64bced817e37f9164e3284bab015e05ec046bf635f90f18ebf1fcfcc2ab450851c441deea8700d717f33251be3a')
        
            if response.status_code == 200:
                print('Downloading dataset..')
                with open('archive.zip', 'wb') as f:
                    f.write(response.content)
        
                print('Unzipping...')
                with zipfile.ZipFile('archive.zip', 'r') as zip_ref:
                    zip_ref.extractall(dataset_dir)
                    print('Done.')
        
                os.remove('archive.zip')
    
            else:
                raise requests.exceptions.RequestException(f"Error downloading dataset. status code: {response.status_code}")
                
        except requests.exceptions.RequestException as e:
            print(e)


In [38]:
download_data()

Dataset exists.


In [3]:
csv_dir = dataset_dir / 'Restaurant-reviews.csv'

In [5]:
df = pd.read_csv(csv_dir, usecols=['Review', 'Rating']).dropna() # make sure we dont have null reviews
print(f'Unique rows: {df.index.nunique()}')
print(df)

Unique rows: 9954
                                                 Review Rating
0     The ambience was good, food was quite good . h...      5
1     Ambience is too good for a pleasant evening. S...      5
2     A must try.. great food great ambience. Thnx f...      5
3     Soumen das and Arun was a great guy. Only beca...      5
4     Food is good.we ordered Kodi drumsticks and ba...      5
...                                                 ...    ...
9994  Madhumathi Mahajan Well to start with nice cou...      3
9995  This place has never disappointed us.. The foo...    4.5
9996  Bad rating is mainly because of "Chicken Bone ...    1.5
9997  I personally love and prefer Chinese Food. Had...      4
9998  Checked in here to try some delicious chinese ...    3.5

[9954 rows x 2 columns]


In [6]:
# remove rows from column Review if they are not of type string
df = df.drop(df[df['Review'].apply(lambda x: not isinstance(x, str))].index)
# remove rows that contain only special symbols and not words
df = df[~df['Review'].str.contains(r'^[\W_]+$')]

In [7]:
# check for duplicates or missing values
duplicate_index = df.index[df.index.duplicated()]
print('Duplicates:')
print(len(duplicate_index))

print('\nMissing indexes:')
missing_index = set(range(len(df))) - set(df.index)
print(len(missing_index))

Duplicates:
0

Missing indexes:
56


In [8]:
df = df.reindex(range(len(df)))
missing_index = set(range(len(df))) - set(df.index)
df = df.dropna()
print('Missing indexes:')
print(len(missing_index))

Missing indexes:
0


## Preprocessing

In [9]:
from typing import List
import re 

def remove_url(text: str) -> str:
    text = re.sub(r"http\S+", "", text)
    return text

* We would also like to handle the emojis that occur in the reviews but without deleting them completely, since they carry a great deal of information in their context. We will instead replace them with the corrensponding text.

In [10]:
import demoji

def replace_emoji(text: str) -> str:
    emojis = demoji.findall(text)

    for emoji in emojis:
        text = text.replace(emoji, ' ' + emojis[emoji].split(':')[0])

    return text

In [11]:
review = df.values[65][0]

print('Before:')
print(review)
print('\nAfter:')
print(replace_emoji(review))

Before:
Best place to hangout...😊
Food is really great...
Thanks Papiya for the service...😊
Staff was reallly co-operative...
Ambience is really great, especially PDR(Private Dining Room) is awesome...😍👌🏻

After:
Best place to hangout... smiling face with smiling eyes
Food is really great...
Thanks Papiya for the service... smiling face with smiling eyes
Staff was reallly co-operative...
Ambience is really great, especially PDR(Private Dining Room) is awesome... smiling face with heart-eyes OK hand


* Next we will perform some standard pre-processing steps (like tokenization, removing stop words, etc.) to prepare our reviews to be fed to the model.

In [12]:
def tokenize(text: str) -> List[str]:
    text = text.lower()
    text = text.split(' ')

    return text

In [13]:
review, _ = next(iter(df.values))

print('Before:')
print(review)
print('\nAfter:')
print(tokenize(review))    

Before:
The ambience was good, food was quite good . had Saturday lunch , which was cost effective .
Good place for a sate brunch. One can also chill with friends and or parents.
Waiter Soumen Das was really courteous and helpful.

After:
['the', 'ambience', 'was', 'good,', 'food', 'was', 'quite', 'good', '.', 'had', 'saturday', 'lunch', ',', 'which', 'was', 'cost', 'effective', '.\ngood', 'place', 'for', 'a', 'sate', 'brunch.', 'one', 'can', 'also', 'chill', 'with', 'friends', 'and', 'or', 'parents.\nwaiter', 'soumen', 'das', 'was', 'really', 'courteous', 'and', 'helpful.']


In [14]:
# ntlk.download()
from nltk.corpus import stopwords

def remove_stopwords(text: List[str]) -> List[str]:
    text = [words for words in text if words not in stopwords.words('english')]

    return text

In [15]:
print('Before:')
print(review)
print('\nAfter:')
print(remove_stopwords(tokenize(review)))     

Before:
The ambience was good, food was quite good . had Saturday lunch , which was cost effective .
Good place for a sate brunch. One can also chill with friends and or parents.
Waiter Soumen Das was really courteous and helpful.

After:
['ambience', 'good,', 'food', 'quite', 'good', '.', 'saturday', 'lunch', ',', 'cost', 'effective', '.\ngood', 'place', 'sate', 'brunch.', 'one', 'also', 'chill', 'friends', 'parents.\nwaiter', 'soumen', 'das', 'really', 'courteous', 'helpful.']


In [16]:
# python -m spacy download en_core_web_sm
import spacy

sp = spacy.load("en_core_web_sm")

In [17]:
def lemmatization(text: List[str]) -> List[str]:

    text = ' '.join(text)
    token = sp(text)
    text = [word.lemma_ for word in token]
    
    return text

In [18]:
print('Before:')
print(review)
print('\nAfter:')
print(lemmatization(tokenize(review)))     

Before:
The ambience was good, food was quite good . had Saturday lunch , which was cost effective .
Good place for a sate brunch. One can also chill with friends and or parents.
Waiter Soumen Das was really courteous and helpful.

After:
['the', 'ambience', 'be', 'good', ',', 'food', 'be', 'quite', 'good', '.', 'have', 'saturday', 'lunch', ',', 'which', 'be', 'cost', 'effective', '.', '\n', 'good', 'place', 'for', 'a', 'sate', 'brunch', '.', 'one', 'can', 'also', 'chill', 'with', 'friend', 'and', 'or', 'parent', '.', '\n', 'waiter', 'soumen', 'das', 'be', 'really', 'courteous', 'and', 'helpful', '.']


* Now putting everything together

In [19]:
import nltk
import spacy
import demoji
from nltk.corpus import stopwords
from nltk import pos_tag
# nltk.download('averaged_perceptron_tagger')
import re
from typing import List

class TextCleaner():
    def __init__(self, text: str = None, remove_stops: bool = True, remove_verbs: bool = False):
        self.remove_verbs = remove_verbs
        self.remove_stops = remove_stops
        self.tokens = []

        sp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

    def tokenizer(self, text: str) -> List[str]:
        '''Transforms input text to lowercase and splits it to tokens.
           Applies lemmatization if lemma=True'''
        doc = sp(text)
        tokens = []

        for token in doc:
            # Check if the token is not a punctuation or whitespace and is not empty
            if not token.is_punct and not token.is_space and token.text.strip():
                # Check if the token is a noun
                if self.remove_verbs:
                    if token.pos_.startswith('N'):                        
                        lemma_token = token.lemma_.lower()
                        tokens.append(lemma_token)
                    elif not token.pos_.startswith('N'):
                        continue   
                else:
                    lemma_token = token.lemma_.lower()
                    tokens.append(lemma_token)                
    
        return tokens

    def _remove_stopwords(self, tokens: List[str]) -> List[str]:
        '''Removes stop words'''
        return [word for word in tokens if word not in stopwords.words('english')]

    def _demoji_replace(self, text: str) -> str:
        '''Replaces emojis with text'''
        emojis = demoji.findall(text)
        for emoji in emojis:
            text = text.replace(emoji, ' ' + emojis[emoji].split(':')[0])
    
        return text

    def clean(self, text: str) -> str:
        '''Performs a full transformation of the input text'''
        # Remove urls
        clean_text = re.sub(r"http\S+", "", text)
        # Replace emojis
        clean_text = self._demoji_replace(clean_text)
        # Remove punctuation
        clean_text = re.sub(r'[^\w\s]', '', clean_text)
        # Tokenize & lemmatization
        tokens = self.tokenizer(clean_text)
        # Remove stop words
        if self.remove_stops:
            tokens = self._remove_stopwords(tokens)
        # Join tokens back into a single string
        cleaned_text = " ".join(tokens)
        # self.tokens = tokens
        return cleaned_text


In [20]:
df['Review'] = df['Review'].astype(str)

# f = TextCleaner(remove_verbs=True)
# df['cleaned-reviews'] = df['Review'].map(lambda review: f.clean(review))
# df.to_csv(dataset_dir / 'topic_df.csv', index=False)

In [21]:
# f = TextCleaner(remove_verbs=False)
# df['cleaned-reviews'] = df['Review'].map(lambda review: f.clean(review))
# df.to_csv(dataset_dir / 'clean_df.csv', index=False)

# Custom Vocabulary

In [22]:
import pandas as pd
df = pd.read_csv(dataset_dir / 'topic_df.csv')
df

Unnamed: 0,Review,Rating,cleaned-reviews
0,"The ambience was good, food was quite good . h...",5,ambience food lunch cost place sate brunch fri...
1,Ambience is too good for a pleasant evening. S...,5,ambience evening experience kudo service
2,A must try.. great food great ambience. Thnx f...,5,food ambience service recommendation music bac...
3,Soumen das and Arun was a great guy. Only beca...,5,guy behavior sincerety food course place
4,Food is good.we ordered Kodi drumsticks and ba...,5,food goodwe drumstick basket mutton biryani thank
...,...,...,...
9882,I am amazed at the quality of food and service...,4,quality food service place ambience location p...
9883,The food was amazing. Do not forget to try 'Mo...,4.5,food sizzler staff chicken town heart
9884,We ordered from here via swiggy:\n\nWe ordered...,4,swiggy mushroom quantity dish paneer gravy dis...
9885,I have been to this place on a sunday with my ...,1,place friend meal time friend moment 215pm man...


In [23]:
df.loc[:, 'cleaned-reviews'] = df['cleaned-reviews'].astype(str)
reviews = df['cleaned-reviews'].values.tolist()

In [24]:
import torchtext
from torchtext.vocab import vocab
import gensim.corpora as corpora

from collections import Counter, OrderedDict
from typing import List, Dict, Union

class CustomVocab(torchtext.vocab.Vocab):
    def __init__(self, document: Union[List[str], str]):
        super(CustomVocab, self).__init__(None)
        
        self.rawText = document
        self.tokens = []
        self.word_freqs = []
        self.vocab = self._create_vocab(document)
        self.id2words = []
        self.bow = self._bag_of_words(document)

    def __len__(self):
        return len(self.tokens)

    def _create_vocab(self, document):
        tokens = self._get_tokens(document)
        orderedDict = self._get_word_freq(tokens)
        
        vocab = torchtext.vocab.vocab(ordered_dict=orderedDict, min_freq=1)
        vocab.set_default_index(-1)
        return vocab

    def _get_tokens(self, document):
        if isinstance(document, str):
            document = [document]

        tokens = []
        for word in document:
            token = word.split(' ')
            tokens.extend(token)

        self.tokens = tokens
        return tokens

    def _get_word_freq(self, tokens):       

        counter = Counter(tokens)
        sort_counter = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        self.word_freqs = sort_counter
        return OrderedDict(counter)

    def _bag_of_words(self, document):
        words = []
        for doc in document:
            words.append([token for token in doc.split(' ')])
        # print(words)
        self.id2words = corpora.Dictionary(words)
        
        return [self.id2words.doc2bow(word) for word in words]


In [25]:
vocab = CustomVocab(reviews)

In [26]:
tokens = vocab.tokens
freq = vocab.word_freqs
stoi = vocab.get_stoi()
bow = vocab.bow
id2word = vocab.id2words

In [27]:
len(bow), len(id2word)

(9887, 7819)

# Latent Dirichlet Allocation (LDA)
We use an LDA model from gensim library to find relevant topics in our dataset.

In [28]:
from gensim.models import LdaMulticore
import os

In [29]:
num_topics = 1
num_cores = os.cpu_count()
lda_model = LdaMulticore(corpus=bow, id2word=id2word,
                         num_topics=num_topics, iterations=100,
                         workers=num_cores)

topics = lda_model.print_topics()
topics = topics[0][1]
topics

'0.047*"place" + 0.047*"food" + 0.020*"service" + 0.016*"time" + 0.016*"chicken" + 0.015*"one" + 0.014*"taste" + 0.012*"ambience" + 0.011*"restaurant" + 0.010*"staff"'

# Topic Extraction using FastText

In [30]:
clean_df = pd.read_csv(dataset_dir / 'clean_df.csv')
clean_df.loc[:, 'cleaned-reviews'] = clean_df['cleaned-reviews'].astype(str)

In [31]:
fasttext_topics = []
clean_reviews = clean_df['cleaned-reviews'].values.tolist()

In [None]:
from gensim.models import FastText
fasttext_model = FastText(clean_reviews, vector_size=100, window=5, min_count=1, workers=4, sg=1)

In [None]:
import numpy as np

In [None]:
labels = {
    0 : 'food',
    1 : 'service',
    2 : 'atmosphare'
}

clean_df['topic'] = None
clean_df = clean_df.drop(columns=['cleaned-reviews'])

for row, review in enumerate(clean_reviews):
    prob = []
    for topic in topics:
        prob.append(fasttext_model.wv.n_similarity(review, topic))
    clean_df.loc[row, 'topic'] = labels[np.argmax(prob)]

In [None]:
clean_df.to_csv(dataset_dir / 'clean_df.csv', index=False)

In [None]:
clean_df