<a href="https://colab.research.google.com/github/MasslessAI/narratelab/blob/master/exp/exp_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Dependencies

In [18]:
from google.colab import drive

drive.mount('gdrive', force_remount=True)

root_dir = '/content/gdrive/'

gdrive_path = root_dir + 'MyDrive/narratelab/exp_1'

!pip install spacy keybert[spacy] sentence-transformers redditcleaner psaw pandas loguru bertopic[spacy]
# !python -m spacy download en_core_web_trf

Mounted at gdrive
Collecting bertopic[spacy]
[?25l  Downloading https://files.pythonhosted.org/packages/f6/9e/16678af67081452c01fcaeca5fd734a1033be2da0e9d40815ee742588ef4/bertopic-0.8.1-py2.py3-none-any.whl (53kB)
[K     |████████████████████████████████| 61kB 3.6MB/s 
Collecting plotly<4.14.3,>=4.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/9d/2e/69579c3db25fa4f85d70a10f8a98d52c2b4a0dcbd153e8f17f425761bef4/plotly-4.14.2-py2.py3-none-any.whl (13.2MB)
[K     |████████████████████████████████| 13.2MB 6.9MB/s 
[?25hCollecting umap-learn>=0.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/75/69/85e7f950bb75792ad5d666d86c5f3e62eedbb942848e7e3126513af9999c/umap-learn-0.5.1.tar.gz (80kB)
[K     |████████████████████████████████| 81kB 8.6MB/s 
[?25hCollecting hdbscan>=0.8.27
[?25l  Downloading https://files.pythonhosted.org/packages/32/bb/59a75bc5ac66a9b4f9b8f979e4545af0e98bb1ca4e6ae96b3b956b554223/hdbscan-0.8.27.tar.gz (6.4MB)
[K     |████████████████

# Scrap Reddit Data

Using pushshift api to quickly search for submissions where title contains wh-words.

```
what|when|where|who|whom|which|whose|why|how|wonder|want|anyone
```

Only retrieve submissions that are text submissions, and filter out deleted submissions and submissions whose author is banned.

We keep the following fields:

* title
* selftext
* author
* permalink
* num_comments <font color='blue'>*</font>
* score <font color='blue'>*</font>
* upvote_ratio <font color='blue'>*</font>
* total_awards_received <font color='blue'>*</font>

<font color='blue'>*</font> *Used for future ranking purpose*

In [8]:
import datetime as dt
from datetime import date, datetime
from loguru import logger
import pandas as pd
from psaw import PushshiftAPI
import time
import os
import redditcleaner
import re
import base64

api = PushshiftAPI()

start_epoch = int(dt.datetime(2019, 1, 1).timestamp())
#end_epoch = int(dt.datetime(2020, 3, 1).timestamp())
end_epoch = int(time.time())
total = 0

SUBREDDIT = 'nft'
DATA_FILE_NAME = gdrive_path + '/reddit_submission_{}_{}_{}.tsv'.format(
    SUBREDDIT, datetime.fromtimestamp(start_epoch).strftime("%Y_%m_%d"),
    datetime.fromtimestamp(end_epoch).strftime("%Y_%m_%d"))
while True:
    gen = list(
        api.search_submissions(
            after=start_epoch, before=end_epoch,
            title="what|when|where|who|whom|which|whose|why|how|wonder|want|anyone", is_self=True,
            is_original_content=True, subreddit=SUBREDDIT,
            filter=['title', 'selftext', 'author', 'permalink', 'num_comments', 'score', 'total_awards_received',
                    'upvote_ratio'],
            sort='asc', sort_type='created_utc', limit=500))

    if len(gen) == 0:
        break

    def submission_filter(submission):
        if 'title' not in submission:
            return False
        if 'selftext' not in submission:
            return False
        if 'author' not in submission:
            return False
        if submission['author'] == "[deleted]":
            return False
        if any(submission['selftext'] == x for x in ["[removed]", "[deleted]"]):
            return False
        return True

    def prepare_data(data):
        # some of the fields may be missing
        # must manually set an init value to avoid
        # generating invalid csv
        _data = {
            'title': '',
            'selftext': '',
            'author': '',
            'permalink': '',
            'num_comments': 0,
            'score': 0,
            'total_awards_received': 0,
            'upvote_ratio': 1.0,
            'created_utc': None
        }

        for key in _data:
            if key in data and data[key] is not None:
                _data[key] = data[key]

        return _data


    items = map(prepare_data, [item.d_ for item in gen])

    items = list(filter(submission_filter, items))

    df = pd.DataFrame(items)

    # clean data
    def clean(text):
        # remove reddit styles
        text = redditcleaner.clean(
            text, quote=False, bullet_point=False, link=False, strikethrough=False, spoiler=False, code=False,
            superscript=False, table=False)

        # refer to https://towardsdatascience.com/cleaning-text-data-with-python-b69b47b97b76
        # Remove unicode characters
        text = text.encode('ascii', 'ignore').decode()

        # Remove Hashtags
        text = re.sub("#\S+", " ", text)

        # Remove markdown links
        text = re.sub(r"\[(.+)\]\(.+\)", r"\1", text)

        # Remove other urls
        text = re.sub(r"http\S+", " ", text)

        # remove text inside brackets
        text = re.sub("\(.*?\)"," ", text)
        text = re.sub("\[.*?\]"," ", text)

        # remove quotes
        # remove brackets
        # remove semicolon
        text = re.sub(r'[\t()[\]\"*:\\]',' ', text)

        # remove non-ascii chars
        text = re.sub(r"[^\x00-\x7F]+",' ', text)

         # Replace the over spaces
        text = re.sub('\s{2,}', " ", text)

        return text

    df['title'] = df['title'].map(clean)
    df['selftext'] = df['selftext'].map(clean)

    if not os.path.isfile(DATA_FILE_NAME):
        df.to_csv(DATA_FILE_NAME, sep='\t', header='column_names', index=False, quoting=3)
    else:  # else it exists so append without writing the header
        df.to_csv(DATA_FILE_NAME, sep='\t', mode='a', header=False, index=False, quoting=3)

    start_epoch = items[-1]['created_utc']
    total += len(items)
    logger.info('Added {} Total {} Last created_utc {}'.format(
        len(items), total, date.fromtimestamp(start_epoch)))

    time.sleep(3)

2021-07-12 18:49:01.085 | INFO     | __main__:<module>:126 - Added 472 Total 472 Last created_utc 2021-03-13
2021-07-12 18:49:09.629 | INFO     | __main__:<module>:126 - Added 330 Total 802 Last created_utc 2021-04-09
2021-07-12 18:49:14.898 | INFO     | __main__:<module>:126 - Added 305 Total 1107 Last created_utc 2021-05-17
2021-07-12 18:49:24.007 | INFO     | __main__:<module>:126 - Added 298 Total 1405 Last created_utc 2021-07-12


# Load Scrapped Reddit Data

Load submissions, filter out ads, and classified them by wh-words

From observation, some **submissions are ads**, which often contain the following symbols/phrases

```
'-', ':', ';' 
your business(es)
help you
case study
```

<mark>TODO</mark> 
 - [ ] Add more ad-indicative phrases
 - [ ] Use num_comments to filter ads

In [12]:
df = pd.read_csv(gdrive_path +"/reddit_submission_nft_2019_01_01_2021_07_12.tsv", sep='\t', quoting=3)

# data cleanup
print('before clean-up # rows: {}'.format(len(df)))
cleaned_rows = []

AD_INDICATIVE_PHRASES = ["your business", "your businesses", "help you", "case study"]

for index, row in df.iterrows():
    row_title = row['title'].lower()
    if not any(x in ['-', ':', ';'] for x in row_title) and not any(phrase in row_title for phrase in AD_INDICATIVE_PHRASES):
        '''
        1. must contain '?'
        2. can only contain alphanumeric, punctuations and space
        3. should not contain '-', ':', ';' which indicates ads
        '''
        cleaned_rows.append(index)

df = df[df.index.isin(cleaned_rows)].reset_index()
print('after clean-up # rows: {}'.format(len(df)))

before clean-up # rows: 1405
after clean-up # rows: 1274


# Classified submissions by wh-words in title

In [13]:
WH_WORD_LIST = ["what", "when", "where", "who", "whom", "which", "whose", "why", "how", "wonder", "want", "has anyone"]

title_cat = []
for index, row in df.iterrows():
    title_cat.append('NO_WH_WORD')
    for wh_word in WH_WORD_LIST:
        if wh_word in row['title'].lower():
            title_cat[-1] = wh_word
            break

df['title_cat'] = title_cat

# Pick a wh-word category

In [14]:
df_selection = df[df['title_cat'] == 'how'].copy().reset_index()

In [17]:
print(df_selection.head(3).to_markdown())

|    |   level_0 |   index | title                                                                                     | selftext                                                                                                                                                                                                                                                                                            | author               | permalink                                                                 |   num_comments |   score |   total_awards_received |   upvote_ratio |   created_utc | title_cat   |
|---:|----------:|--------:|:------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Topic Modeling with BERTopic

In [19]:
from bertopic import BERTopic

topic_model = BERTopic()
topics, _ = topic_model.fit_transform(df_selection['title'])
topic_model.get_topic_info()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=690.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3673.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=629.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=122.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=229.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=90895153.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=53.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466081.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=516.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=190.0, style=ProgressStyle(description_…






Unnamed: 0,Topic,Count,Name
0,-1,97,-1_gas_do_nfts_to
1,0,66,0_art_artist_nft_digital
2,1,58,1_work_nfts_this_nft
3,2,36,2_sell_nfts_selling_nft
4,3,31,3_mint_mintable_delete_without
5,4,31,4_tokens_crypto_blockchain_game
6,5,23,5_much_money_nfts_price
7,6,17,6_opensea_marketplace_nft_gateway
8,7,16,7_copyrighttrademark_copyright_rights_via
9,8,11,8_foundation_invite_creator_invited


In [21]:
topic_model.visualize_topics()

In [23]:
topic_model.visualize_heatmap()

In [25]:
topic_model.visualize_hierarchy()