<a href="https://colab.research.google.com/github/MasslessAI/narratelab/blob/master/exp/exp_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Dependencies

In [2]:
from google.colab import drive

drive.mount('gdrive', force_remount=True)

root_dir = '/content/gdrive/'

gdrive_path = root_dir + 'MyDrive/narratelab/exp_1'

!pip install spacy keybert[spacy] sentence-transformers redditcleaner psaw pandas loguru bertopic[spacy]
# !python -m spacy download en_core_web_trf

Mounted at gdrive
Collecting keybert[spacy]
  Downloading https://files.pythonhosted.org/packages/cd/12/b72f6ce98984157cb0db83baf56a7c8f9eb4df4494d671234dccb630f8a3/keybert-0.4.0.tar.gz
Collecting sentence-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/3b/fd/8a81047bbd9fa134a3f27e12937d2a487bd49d353a038916a5d7ed4e5543/sentence-transformers-2.0.0.tar.gz (85kB)
[K     |████████████████████████████████| 92kB 4.3MB/s 
[?25hCollecting redditcleaner
  Downloading https://files.pythonhosted.org/packages/f9/8a/7491757daaf8f3381f736473018880c9e89defd44b9ebbf48a83c172e5ff/redditcleaner-1.1.2-py3-none-any.whl
Collecting psaw
  Downloading https://files.pythonhosted.org/packages/01/fe/e2f43241ff7545588d07bb93dd353e4333ebc02c31d7e0dc36a8a9d93214/psaw-0.1.0-py3-none-any.whl
Collecting loguru
[?25l  Downloading https://files.pythonhosted.org/packages/6d/48/0a7d5847e3de329f1d0134baf707b689700b53bd3066a5a8cfd94b3c9fc8/loguru-0.5.3-py3-none-any.whl (57kB)
[K     |██████████

# Scrap Reddit Data

Using pushshift api to quickly search for submissions where title contains question-indicative phrases/words.

refer to [Content Ideas From Reddit](https://timothywangdev.github.io/knowledge/Content%20Marketing/content-ideas-from-reddit)

In [None]:
QUESTION_WORDS = [
  "what",
  "when",
  "where",
  "who",
  "whom",
  "which",
  "whose",
  "why",
  "how",
  "wonder",
  "want",
  "anyone",
  "tips",
  "advice",
  "suggestion",
  "suggestions",
  "suggest",
  "ideas on",
  "need help",
  "challenge",
  "challenges",
  "can't stand",
  "struggle",
  "struggling",
  "figure out",
  "help me",
  "hardest part",
  "appreciate"
]

Only retrieve submissions that are text submissions, and filter out deleted submissions and submissions whose author is banned.

We keep the following fields:

* title
* selftext
* author
* permalink
* num_comments <font color='blue'>*</font>
* score <font color='blue'>*</font>
* upvote_ratio <font color='blue'>*</font>
* total_awards_received <font color='blue'>*</font>

<font color='blue'>*</font> *Used for future ranking purpose*

In [5]:
import datetime as dt
from datetime import date, datetime
from loguru import logger
import pandas as pd
from psaw import PushshiftAPI
import time
import os
import redditcleaner
import re
import base64

api = PushshiftAPI()

start_epoch = int(dt.datetime(2019, 1, 1).timestamp())
#end_epoch = int(dt.datetime(2020, 3, 1).timestamp())
end_epoch = int(time.time())
total = 0

SUBREDDIT = 'nft'
DATA_FILE_NAME = gdrive_path + '/reddit_submission_{}_{}_{}.tsv'.format(
    SUBREDDIT, datetime.fromtimestamp(start_epoch).strftime("%Y_%m_%d"),
    datetime.fromtimestamp(end_epoch).strftime("%Y_%m_%d"))
while True:
    gen = list(
        api.search_submissions(
            after=start_epoch, before=end_epoch,
            title='|'.join(QUESTION_WORDS), is_self=True,
            is_original_content=True, subreddit=SUBREDDIT,
            filter=['title', 'selftext', 'author', 'permalink', 'num_comments', 'score', 'total_awards_received',
                    'upvote_ratio'],
            sort='asc', sort_type='created_utc', limit=500))

    if len(gen) == 0:
        break

    def submission_filter(submission):
        if 'title' not in submission:
            return False
        if 'selftext' not in submission:
            return False
        if 'author' not in submission:
            return False
        if submission['author'] == "[deleted]":
            return False
        if any(submission['selftext'] == x for x in ["[removed]", "[deleted]"]):
            return False
        return True

    def prepare_data(data):
        # some of the fields may be missing
        # must manually set an init value to avoid
        # generating invalid csv
        _data = {
            'title': '',
            'selftext': '',
            'author': '',
            'permalink': '',
            'num_comments': 0,
            'score': 0,
            'total_awards_received': 0,
            'upvote_ratio': 1.0,
            'created_utc': None
        }

        for key in _data:
            if key in data and data[key] is not None:
                _data[key] = data[key]

        return _data


    items = map(prepare_data, [item.d_ for item in gen])

    items = list(filter(submission_filter, items))

    df = pd.DataFrame(items)

    # clean data
    def clean(text):
        # remove reddit styles
        text = redditcleaner.clean(
            text, quote=False, bullet_point=False, link=False, strikethrough=False, spoiler=False, code=False,
            superscript=False, table=False)

        # refer to https://towardsdatascience.com/cleaning-text-data-with-python-b69b47b97b76
        # Remove unicode characters
        text = text.encode('ascii', 'ignore').decode()

        # Remove Hashtags
        text = re.sub("#\S+", " ", text)

        # Remove markdown links
        text = re.sub(r"\[(.+)\]\(.+\)", r"\1", text)

        # Remove other urls
        text = re.sub(r"http\S+", " ", text)

        # remove text inside brackets
        text = re.sub("\(.*?\)"," ", text)
        text = re.sub("\[.*?\]"," ", text)

        # remove quotes
        # remove brackets
        # remove semicolon
        text = re.sub(r'[\t()[\]\"*:\\]',' ', text)

        # remove non-ascii chars
        text = re.sub(r"[^\x00-\x7F]+",' ', text)

         # Replace the over spaces
        text = re.sub('\s{2,}', " ", text)

        return text

    df['title'] = df['title'].map(clean)
    df['selftext'] = df['selftext'].map(clean)

    if not os.path.isfile(DATA_FILE_NAME):
        df.to_csv(DATA_FILE_NAME, sep='\t', header='column_names', index=False, quoting=3)
    else:  # else it exists so append without writing the header
        df.to_csv(DATA_FILE_NAME, sep='\t', mode='a', header=False, index=False, quoting=3)

    start_epoch = items[-1]['created_utc']
    total += len(items)
    logger.info('Added {} Total {} Last created_utc {}'.format(
        len(items), total, date.fromtimestamp(start_epoch)))

    time.sleep(3)

2021-07-14 17:19:51.936 | INFO     | __main__:<module>:126 - Added 472 Total 472 Last created_utc 2021-03-13
2021-07-14 17:19:57.428 | INFO     | __main__:<module>:126 - Added 330 Total 802 Last created_utc 2021-04-09
2021-07-14 17:20:06.018 | INFO     | __main__:<module>:126 - Added 305 Total 1107 Last created_utc 2021-05-17
2021-07-14 17:20:11.498 | INFO     | __main__:<module>:126 - Added 313 Total 1420 Last created_utc 2021-07-14


# Load Scrapped Reddit Data

Load submissions, filter out ads, and classified them by wh-words

From observation, some **submissions are ads**, which often contain the following symbols/phrases

```
'-', ':', ';' 
your business(es)
help you
case study
ALL CAPITAL chars
how i
dollar symbol $ (e.g turn $100 into 100k)
```

<mark>TODO</mark> 
 - [ ] Add more ad-indicative phrases
 - [ ] Use num_comments to filter ads

In [49]:
df = pd.read_csv(gdrive_path +"/reddit_submission_nft_2019_01_01_2021_07_12.tsv", sep='\t', quoting=3)

# data cleanup
print('before clean-up # rows: {}'.format(len(df)))
cleaned_rows = []

AD_INDICATIVE_PHRASES = ["your business", "your businesses", "help you", "case study"]

for index, row in df.iterrows():
    row_title = row['title'].lower()
    if not any(x in ['-', ':', ';'] for x in row_title) and not any(phrase in row_title for phrase in AD_INDICATIVE_PHRASES) and "?" in row_title:
        '''
        1. must contain '?'
        2. can only contain alphanumeric, punctuations and space
        3. should not contain '-', ':', ';' which indicates ads
        '''
        cleaned_rows.append(index)

df = df[df.index.isin(cleaned_rows)].reset_index()
print('after clean-up # rows: {}'.format(len(df)))

before clean-up # rows: 2812
after clean-up # rows: 1939


# Classified submissions by wh-words in title

In [51]:
WH_WORD_LIST = ["what", "when", "where", "who", "whom", "which", "whose", "why", "how", "wonder", "want", "has anyone"]

title_cat = []
for index, row in df.iterrows():
    title_cat.append('NO_WH_WORD')
    for wh_word in WH_WORD_LIST:
        if wh_word in row['title'].lower():
            title_cat[-1] = wh_word
            break

df['title_cat'] = title_cat

# Pick a wh-word category

In [52]:
df_selection = df[df['title_cat'] == 'how'].copy().reset_index()

In [53]:
print(df_selection.head(100).to_markdown())

|    |   level_0 |   index | title                                                                                                                                                                                                                                                     | selftext                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

# Topic Modeling with BERTopic

In [77]:
from bertopic import BERTopic

topic_model = BERTopic(n_gram_range=(1, 1), nr_topics='auto', calculate_probabilities=True, min_topic_size=5)
topics, probabilities = topic_model.fit_transform(df_selection['title'])
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,107,-1_art_artist_be_like
1,0,80,0_sell_worth_selling_collectors
2,1,59,1_mint_mintable_minted_blockchain
3,2,43,2_gif_put_art_buy
4,3,40,3_sandbox_protocol_pros_traction
5,4,39,4_gas_money_make_assets
6,5,37,5_foundation_crypto_creator_owner
7,6,34,6_gaming_link_tweet_packs
8,7,24,7_secure_rarible_token_fees
9,8,22,8_copyright_copyrighttrademark_authenticated_p...


In [78]:
from collections import defaultdict

docs_by_topics = defaultdict(list)

for index, topic in enumerate(topics):
  docs_by_topics[topic].append({
      'docIdx': index,
      'prob': probabilities[index][topic+1]
  })

# sort doc by their topic prob
for index, topic in enumerate(topics):
  docs_by_topics[topic].sort(key=lambda x: x['prob'], reverse=True)


In [81]:
for doc in docs_by_topics[3][:20]:
  print(doc['prob'], df_selection['title'][doc['docIdx']])


0.019 how does this work?
0.019 How might this work?
0.017 Is anyone else just trying this for fun and seeing how it goes?
0.017 how does this work?
0.016 How might this work?
0.016 How does staking NFT's work? Any info appreciated!
0.015 How does staking NFT's work? Any info appreciated!
0.014 How do I get more traction on my NFTs?
0.014 New NFT! Please tell me how I did?
0.013 Can someone explain how NFTs work for non functional, reproducible content?
0.013 How would making a textbook as an NFT work?
0.013 Question for the pros How does hatchable NFTs work?
0.013 How do I get more traction on my NFTs?
0.013 How do I do it?
0.013 New NFT! Please tell me how I did?
0.013 Can someone explain how NFTs work for non functional, reproducible content?
0.013 How do I do it?
0.012 How would making a textbook as an NFT work?
0.012 How do I start?
0.012 Question for the pros How does hatchable NFTs work?


In [71]:
topics_over_time = topic_model.topics_over_time(df_selection['title'], topics, df_selection['created_utc'], nr_bins=20)
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)

In [72]:
topic_model.visualize_topics()

In [None]:
topic_model.visualize_heatmap()

In [None]:
topic_model.visualize_hierarchy()