<a href="https://colab.research.google.com/github/MasslessAI/narratelab/blob/master/exp/exp_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Dependencies

In [25]:
# Install the third-party packages this notebook relies on (run once per fresh runtime).
!pip install spacy keybert[spacy] sentence-transformers redditcleaner psaw pandas loguru
# A later cell calls spacy.load("en_core_web_trf"); uncomment the next line if
# that model is not already installed in this environment.
# !python -m spacy download en_core_web_trf

Collecting loguru
  Downloading loguru-0.5.3-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 2.5 MB/s 
Installing collected packages: loguru
Successfully installed loguru-0.5.3


# Scrape Reddit Data

Use the Pushshift API to quickly search for submissions whose title contains a wh-word or a related question cue:

```
what|when|where|who|whom|which|whose|why|how|wonder|want|anyone
```

Only text (self) submissions are retrieved. Deleted or removed submissions, and submissions whose author account has been deleted, are filtered out.

We keep the following fields:

* title
* selftext
* author
* permalink
* num_comments <font color='blue'>*</font>
* score <font color='blue'>*</font>
* upvote_ratio <font color='blue'>*</font>
* total_awards_received <font color='blue'>*</font>

<font color='blue'>*</font> *Reserved for future ranking purposes*

In [27]:
import datetime as dt
from datetime import date, datetime
from loguru import logger
import pandas as pd
from psaw import PushshiftAPI
import time
import os
import redditcleaner
import re
import base64

# Pushshift client used for the search requests below.
api = PushshiftAPI()

# Scrape window as Unix timestamps: from 2019-01-01 up to "now".
# For a fixed upper bound use e.g. int(dt.datetime(2020, 3, 1).timestamp()).
start_epoch = int(dt.datetime(2019, 1, 1).timestamp())
end_epoch = int(time.time())

# Running count of submissions kept so far.
total = 0

SUBREDDIT = 'nft'

# The output file name embeds the subreddit and the window's start/end dates.
_window_start, _window_end = (
    datetime.fromtimestamp(epoch).strftime("%Y_%m_%d")
    for epoch in (start_epoch, end_epoch)
)
DATA_FILE_NAME = 'reddit_submission_{}_{}_{}.tsv'.format(SUBREDDIT, _window_start, _window_end)
while True:
    # Query parameters for the next page: self-posts in SUBREDDIT created
    # inside (start_epoch, end_epoch) whose title matches one of the cue
    # words, oldest first, restricted to the fields we keep.
    search_kwargs = {
        'after': start_epoch,
        'before': end_epoch,
        'title': "what|when|where|who|whom|which|whose|why|how|wonder|want|anyone",
        'is_self': True,
        'is_original_content': True,
        'subreddit': SUBREDDIT,
        'filter': ['title', 'selftext', 'author', 'permalink', 'num_comments', 'score',
                   'total_awards_received', 'upvote_ratio'],
        'sort': 'asc',
        'sort_type': 'created_utc',
        'limit': 500,
    }
    gen = list(api.search_submissions(**search_kwargs))

    # Nothing returned: stop paging.
    if not gen:
        break

    def submission_filter(submission):
        """Return True when a submission record should be kept.

        A record is rejected when it lacks a 'title', 'selftext' or 'author'
        key, when its author is "[deleted]", or when its selftext is exactly
        "[removed]" or "[deleted]".
        """
        required_keys = ('title', 'selftext', 'author')
        if not all(key in submission for key in required_keys):
            return False
        if submission['author'] == "[deleted]":
            return False
        return submission['selftext'] not in ("[removed]", "[deleted]")

    def prepare_data(data):
        """Project a raw submission dict onto the fixed set of exported fields.

        A field that is absent from ``data`` or mapped to ``None`` falls back
        to its default, so every returned record carries the same keys in the
        same order (this keeps the rows written to the CSV/TSV well-formed).
        Keys of ``data`` that are not exported are ignored.
        """
        defaults = {
            'title': '',
            'selftext': '',
            'author': '',
            'permalink': '',
            'num_comments': 0,
            'score': 0,
            'total_awards_received': 0,
            'upvote_ratio': 1.0,
            'created_utc': None,
        }

        return {
            field: data[field] if field in data and data[field] is not None else fallback
            for field, fallback in defaults.items()
        }


    # Filter on the RAW payloads first. prepare_data() back-fills every field
    # with a default, so running submission_filter() on its output would make
    # the filter's missing-key checks unreachable and let records that have no
    # title, selftext or author through as empty strings.
    raw_records = [item.d_ for item in gen]
    items = [prepare_data(record) for record in raw_records if submission_filter(record)]

    df = pd.DataFrame(items)

    # clean data
    def clean(text):
        """Normalise a Reddit title or body into plain ASCII text.

        Steps, in order: run ``redditcleaner.clean`` with the listed features
        switched off, drop non-ASCII characters, unwrap markdown links to
        their link text, remove bare URLs, remove hashtags, delete text inside
        parentheses and square brackets, blank out leftover punctuation, and
        collapse runs of whitespace.

        :param text: raw text of a submission field.
        :return: the cleaned text (it is not stripped, so it may start or end
            with a single space).
        """
        # remove reddit styles
        text = redditcleaner.clean(
            text, quote=False, bullet_point=False, link=False, strikethrough=False, spoiler=False, code=False,
            superscript=False, table=False)

        # refer to https://towardsdatascience.com/cleaning-text-data-with-python-b69b47b97b76
        # Drop every non-ASCII character. After this the text is pure ASCII,
        # so no separate non-ASCII pass is needed further down.
        text = text.encode('ascii', 'ignore').decode()

        # Unwrap markdown links, keeping only the link text. The negated
        # character classes keep each match inside a single link; a greedy
        # ".+" would swallow everything between the first "[" and the last ")"
        # when a text contains two links.
        # This runs BEFORE hashtag removal: a "#fragment)" in the link target
        # would otherwise be eaten together with the closing parenthesis and
        # the link would no longer be recognised.
        text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)

        # Remove other urls
        text = re.sub(r"http\S+", " ", text)

        # Remove hashtags
        text = re.sub(r"#\S+", " ", text)

        # Remove text inside parentheses and square brackets. Raw strings are
        # used so "\(" / "\[" reach the regex engine instead of being treated
        # as (invalid) Python string escapes.
        text = re.sub(r"\(.*?\)", " ", text)
        text = re.sub(r"\[.*?\]", " ", text)

        # Blank out tabs, stray brackets/parentheses, double quotes,
        # asterisks, colons and backslashes.
        text = re.sub(r'[\t()[\]\"*:\\]', ' ', text)

        # Collapse runs of two or more whitespace characters into one space.
        text = re.sub(r'\s{2,}', " ", text)

        return text

    # A whole page can be filtered out. In that case `items` is empty and `df`
    # has no columns, so cleaning or writing it would raise; skip both.
    if items:
        df['title'] = df['title'].map(clean)
        df['selftext'] = df['selftext'].map(clean)

        # Append to the TSV (mode='a' creates the file when it does not exist
        # yet) and emit the header row only on that first write.
        # quoting=3 is csv.QUOTE_NONE.
        write_header = not os.path.isfile(DATA_FILE_NAME)
        df.to_csv(DATA_FILE_NAME, sep='\t', mode='a', header=write_header, index=False, quoting=3)

    # Advance the cursor from the last RAW submission of the page, not from
    # the filtered rows: otherwise trailing submissions that were filtered out
    # are fetched again on the next request, and a fully filtered page would
    # fail on items[-1].
    start_epoch = gen[-1].d_['created_utc']
    total += len(items)
    logger.info('Added {} Total {} Last created_utc {}'.format(
        len(items), total, date.fromtimestamp(start_epoch)))

    # Pause between requests.
    time.sleep(3)

2021-07-08 22:56:40.867 | INFO     | __main__:<module>:127 - Added 472 Total 472 Last created_utc 2021-03-13
2021-07-08 22:56:45.943 | INFO     | __main__:<module>:127 - Added 330 Total 802 Last created_utc 2021-04-09
2021-07-08 22:56:50.728 | INFO     | __main__:<module>:127 - Added 305 Total 1107 Last created_utc 2021-05-17
2021-07-08 22:56:55.378 | INFO     | __main__:<module>:127 - Added 281 Total 1388 Last created_utc 2021-07-08


# Load Scraped Reddit Data

Load the submissions, filter out ads, and classify them by wh-word.

From observation, some **submissions are ads**, which often contain the following symbols/phrases:

```
'-', ':', ';' 
your business(es)
help you
```

<mark>TODO</mark> 
 - [ ] Add more ad-indicative phrases
 - [ ] Use num_comments to filter ads

In [None]:
df = pd.read_csv("reddit_submission_nft_2019_01_01_2021_07_08.tsv", sep='\t', quoting=3)

# data cleanup
print('before clean-up # rows: {}'.format(len(df)))

# Symbols and phrases that, from observation, mark a title as an advertisement.
AD_INDICATIVE_SYMBOLS = ['-', ':', ';']
AD_INDICATIVE_PHRASES = ["your business", "your businesses", "help you"]


def _looks_like_ad(title):
    """Return True when a lower-cased title contains an ad-indicative symbol or phrase."""
    return (any(symbol in title for symbol in AD_INDICATIVE_SYMBOLS)
            or any(phrase in title for phrase in AD_INDICATIVE_PHRASES))


# A title that was written out empty is read back as NaN (a float), on which
# .lower() raises AttributeError. Such rows cannot be classified by title
# anyway, so drop them before doing any string work.
df = df.dropna(subset=['title'])
titles = df['title'].astype(str).str.lower()

# The only filter applied here is the ad heuristic: a row is kept when its
# title contains none of the ad-indicative symbols or phrases.
is_ad = titles.map(_looks_like_ad).astype(bool)

# Index labels of the rows that survive the ad filter.
cleaned_rows = df.index[(~is_ad).to_numpy()].tolist()

# reset_index() renumbers the rows and keeps the previous labels in an
# 'index' column.
df = df[~is_ad].reset_index()
print('after clean-up # rows: {}'.format(len(df)))

# Classify submissions by wh-words in title

In [None]:
WH_WORD_LIST = ["what", "when", "where", "who", "whom", "which", "whose", "why", "how", "wonder", "want", "has anyone"]

# Cue words whose inflected forms ("wondering", "wanted") should still count.
_INFLECTED_CUES = {"wonder", "want"}


def _cue_pattern(cue):
    """Compile a whole-word pattern for one cue (allowing word suffixes for inflected cues)."""
    suffix = r"\w*" if cue in _INFLECTED_CUES else ""
    return re.compile(r"\b" + re.escape(cue) + suffix + r"\b")


# (cue, compiled pattern) pairs in WH_WORD_LIST order; the order is the
# priority used when a title contains several cues.
_WH_PATTERNS = [(cue, _cue_pattern(cue)) for cue in WH_WORD_LIST]


def classify_title(title):
    """Return the first cue from WH_WORD_LIST that occurs in ``title`` as a word.

    Matching is case-insensitive and on word boundaries. A plain substring
    test would label "show" or "however" as "how" and "whole" as "who", and,
    because "who" is tested before "whom" and "whose", those two categories
    could never be assigned.

    :param title: submission title; non-string values (e.g. NaN) are
        converted with ``str`` first.
    :return: the matching cue word, or 'NO_WH_WORD' when none matches.
    """
    lowered = str(title).lower()
    for cue, pattern in _WH_PATTERNS:
        if pattern.search(lowered):
            return cue
    return 'NO_WH_WORD'


title_cat = [classify_title(title) for title in df['title']]

df['title_cat'] = title_cat

# Pick a wh-word category

In [None]:
# Keep only the rows whose title was classified as 'how'. .copy() detaches the
# selection from df, and reset_index() renumbers it while keeping the previous
# index as a column.
is_selected = df['title_cat'] == 'how'
df_selection = df.loc[is_selected].copy().reset_index()

# Clustering submission titles using a sentence encoder

In [20]:
import spacy
from keybert import KeyBERT
import tensorflow_hub

spacy.prefer_gpu()

# Load the transformer pipeline without the components listed below.
_EXCLUDED_COMPONENTS = ['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']
nlp = spacy.load("en_core_web_trf", exclude=_EXCLUDED_COMPONENTS)

# Alternative embedding back-ends (currently disabled):
#   kw_model = KeyBERT(model=nlp)
#   embedding_model = tensorflow_hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
#   kw_model = KeyBERT(model=embedding_model)
kw_model = KeyBERT(model="paraphrase-MiniLM-L6-v2")

# Sample submission body used to try out keyword extraction.
doc = """
         I've been doing more research on how to sell NFT art and I'm hoping someone can help me out. I want the owner to be able to view the art . Is this possible as an NFT? Or is there a way to attach it to a copy that they can zoom into?
      """

# Extract key phrases of one to three words from the sample and show them.
keywords = kw_model.extract_keywords(
    doc,
    highlight=True,
    keyphrase_ngram_range=(1, 3),
)
print(keywords)

[('sell nft art', 0.7819), ('nft art', 0.7597), ('nft art hoping', 0.739), ('art possible nft', 0.7353), ('sell nft', 0.6575)]
