<a href="https://colab.research.google.com/github/MasslessAI/narratelab/blob/master/exp/exp_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Dependencies

In [1]:
from google.colab import drive

drive.mount('gdrive', force_remount=True)

root_dir = '/content/gdrive/'

gdrive_path = root_dir + 'MyDrive/narratelab/exp_1'

!pip install spacy keybert[spacy] sentence-transformers redditcleaner psaw pandas loguru bertopic[spacy] distinctipy colour
# !python -m spacy download en_core_web_trf

Mounted at gdrive


# Scrap Reddit Data

Using pushshift api to quickly search for submissions where title contains **question-indicative phrases/words**. Only retrieve submissions whose **num_comments > 1**, this filters out most of the ads.

refer to [Content Ideas From Reddit](https://timothywangdev.github.io/knowledge/Content%20Marketing/content-ideas-from-reddit)

In [2]:
import datetime as dt
from datetime import date, datetime
from loguru import logger
import pandas as pd
from psaw import PushshiftAPI
import time
import os
import redditcleaner
import re
import base64
import IPython
from distinctipy import distinctipy
import random
from colour import Color

api = PushshiftAPI()

QUESTION_WORDS = [
  "what",
  "when",
  "where",
  "who",
  "whom",
  "which",
  "whose",
  "why",
  "how",
  "wonder",
  "want",
  "is anyone",
  "does anyone",
  "any tips",
  "advice",
  "suggestion",
  "suggestions",
  "suggest",
  "ideas on",
  "need help",
  "needs help",
  "need your help",
  "serious help",
  "please help",
  "challenge",
  "challenges",
  "can't stand",
  "struggle",
  "struggling",
  "can't figure out",
  "help me",
  "hardest part",
  "would appreciate",
  "would really appreciate",
  "any guidance",
  "no idea",
  "confused with",
  "new to",
  "is there any way"
]

# (Optional) Do some experiments on question words and psaw filters

In [192]:

# do some exps on question words and psaw filters
_QUESTION_WORDS=["ideas on", "struggle with", "suggestion"]
title_query = '|'.join(map(lambda x: '"{}"'.format(x), _QUESTION_WORDS))
print(title_query)

gen = list(
        api.search_submissions(
            title=title_query,
            is_self=True,
            is_original_content=True, 
            subreddit='content_marketing',
            num_comments=">1",
            filter=['title', 'selftext', 'author', 'permalink', 'num_comments', 'score', 'total_awards_received',
                    'upvote_ratio'],
            sort='asc', sort_type='created_utc', limit=500))

titles = [item.d_['title'] for item in gen]

# generate distinct hsl colors
bg_color_list = distinctipy.get_colors(50)
inverted_color_list = distinctipy.invert_colors(bg_color_list)
text_color_list = [distinctipy.get_text_color(color) for color in bg_color_list]

# convert colors to hex
bg_color_list = [Color(rgb=color).hex for color in bg_color_list]
inverted_color_list = [Color(rgb=color).hex for color in inverted_color_list]
text_color_list = [Color(rgb=color).hex for color in text_color_list]

def colored(word, color_idx=0):
  color = bg_color_list[color_idx]
  return '<span style="display: inline-block; font-weight: 600; font-size: 16px; border-radius: 15%; padding-left: 5px; padding-right: 5px; margin: 4px 2px; background-color: {}; color:{};">{}</span>'.format(bg_color_list[color_idx], text_color_list[color_idx], word)
  
colored_titles = []
for title in titles:
  _title = title.lower()
  for idx, question_word in enumerate(_QUESTION_WORDS):
    _title = _title.replace(question_word, colored(question_word, color_idx = idx))
  colored_titles.append('<li>' + _title + "</li>")

IPython.display.HTML("<ol>" +' '.join(colored_titles) + "</ol>")

"ideas on"|"struggle with"|"suggestion"


Only retrieve submissions that are text submissions, and filter out deleted submissions and submissions whose author is banned.

We keep the following fields:

* title
* selftext
* author
* permalink
* num_comments <font color='blue'>*</font>
* score <font color='blue'>*</font>
* upvote_ratio <font color='blue'>*</font>
* total_awards_received <font color='blue'>*</font>

<font color='blue'>*</font> *Used for future ranking purpose*

In [7]:
start_epoch = int(dt.datetime(2019, 1, 1).timestamp())
#end_epoch = int(dt.datetime(2020, 3, 1).timestamp())
end_epoch = int(time.time())
total = 0

SUBREDDIT = 'nft'
DATA_FILE_NAME = gdrive_path + '/reddit_submission_{}_{}_{}.tsv'.format(
    SUBREDDIT, datetime.fromtimestamp(start_epoch).strftime("%Y_%m_%d"),
    datetime.fromtimestamp(end_epoch).strftime("%Y_%m_%d"))

title_query = '|'.join(map(lambda x: '"{}"'.format(x), QUESTION_WORDS))
while True:
    gen = list(
        api.search_submissions(
            after=start_epoch, 
            before=end_epoch,
            title=title_query,
            is_self=True,
            is_original_content=True,
            subreddit=SUBREDDIT,
            num_comments=">1",
            filter=['title', 'selftext', 'author', 'permalink', 'num_comments', 'score', 'total_awards_received',
                    'upvote_ratio'],
            sort='asc', 
            sort_type='created_utc', 
            limit=500))

    if len(gen) == 0:
        break

    def submission_filter(submission):
        if 'title' not in submission:
            return False
        if 'selftext' not in submission:
            return False
        if len(submission['selftext']) == 0:
            # if submission is deleted, the psaw returns empty str
            return False
        if 'author' not in submission:
            return False
        if submission['author'] == "[deleted]":
            return False
        if any(submission['selftext'] == x for x in ["[removed]", "[deleted]"]):
            return False
        return True

    def prepare_data(data):
        # some of the fields may be missing
        # must manually set an init value to avoid
        # generating invalid csv
        _data = {
            'title': '',
            'selftext': '',
            'author': '',
            'permalink': '',
            'num_comments': 0,
            'score': 0,
            'total_awards_received': 0,
            'upvote_ratio': 1.0,
            'created_utc': None
        }

        for key in _data:
            if key in data and data[key] is not None:
                _data[key] = data[key]

        return _data


    items = map(prepare_data, [item.d_ for item in gen])

    items = list(filter(submission_filter, items))

    df = pd.DataFrame(items)

    # clean data
    def clean(text):
        # remove reddit styles
        text = redditcleaner.clean(
            text, quote=False, bullet_point=False, link=False, strikethrough=False, spoiler=False, code=False,
            superscript=False, table=False)

        # refer to https://towardsdatascience.com/cleaning-text-data-with-python-b69b47b97b76
        # Remove unicode characters
        text = text.encode('ascii', 'ignore').decode()

        # Remove Hashtags
        text = re.sub("#\S+", " ", text)

        # Remove markdown links
        text = re.sub(r"\[(.+)\]\(.+\)", r"\1", text)

        # Remove other urls
        text = re.sub(r"http\S+", " ", text)

        # remove text inside brackets
        text = re.sub("\(.*?\)"," ", text)
        text = re.sub("\[.*?\]"," ", text)

        # remove quotes
        # remove brackets
        # remove semicolon
        text = re.sub(r'[\t()[\]\"*:\\]',' ', text)

        # remove non-ascii chars
        text = re.sub(r"[^\x00-\x7F]+",' ', text)

         # Replace the over spaces# if submission is deleted, the psaw returns NaN, must check if 
            # it's a valid string
        text = re.sub('\s{2,}', " ", text)

        return text

    df['title'] = df['title'].map(clean)
    df['selftext'] = df['selftext'].map(clean)

    if not os.path.isfile(DATA_FILE_NAME):
        df.to_csv(DATA_FILE_NAME, sep='\t', header='column_names', index=False, quoting=3)
    else:  # else it exists so append without writing the header
        df.to_csv(DATA_FILE_NAME, sep='\t', mode='a', header=False, index=False, quoting=3)

    start_epoch = items[-1]['created_utc']
    total += len(items)
    logger.info('Added {} Total {} Last created_utc {}'.format(
        len(items), total, date.fromtimestamp(start_epoch)))

    time.sleep(3)

2021-07-15 03:46:24.496 | INFO     | __main__:<module>:125 - Added 317 Total 317 Last created_utc 2021-04-14
2021-07-15 03:46:32.429 | INFO     | __main__:<module>:125 - Added 277 Total 594 Last created_utc 2021-05-15
2021-07-15 03:46:38.281 | INFO     | __main__:<module>:125 - Added 273 Total 867 Last created_utc 2021-07-11
2021-07-15 03:46:45.058 | INFO     | __main__:<module>:125 - Added 11 Total 878 Last created_utc 2021-07-14


# Load Scrapped Reddit Data

Load submissions, filter out ads, and classified them by wh-words

From observation, some **submissions are ads**, which often contain the following symbols/phrases

```
'-', ':', ';' 
your business(es)
help you
case study
ALL CAPITAL chars
how i
dollar symbol $ (e.g turn $100 into 100k)
step-by-step
here is how
here's how
part 1/2/3
top [number] things
[number] reasons
ultimate guide
cheat sheet
cheatsheet
infographic
by [year]
ama
```

<mark>TODO</mark> 
 - [x] Add more ad-indicative phrases
 - [x] Use num_comments to filter ads

In [10]:
df = pd.read_csv(gdrive_path +"/reddit_submission_nft_2019_01_01_2021_07_15.tsv", sep='\t', quoting=3)

# data cleanup
print('before clean-up # rows: {}'.format(len(df)))
cleaned_rows = []

AD_INDICATIVE_PHRASES = [
  "your business", 
  "your businesses", 
  "help you", 
  "case study"
  "how i",
  "$",
  "step-by-step",
  "here is how",
  "here's how",
  "part 1",
  "part 2",
  "part 3",
  "ultimate guide",
  "cheatsheet",
  "infographic",
  "ama"
]

for index, row in df.iterrows():
    row_title = row['title'].lower()
    if not any(x in ['-', ':', ';'] for x in row_title) and not any(phrase in row_title for phrase in AD_INDICATIVE_PHRASES) and "?" in row_title:
        '''
        1. must contain '?'
        2. can only contain alphanumeric, punctuations and space
        3. should not contain '-', ':', ';' which indicates ads
        '''
        cleaned_rows.append(index)

df = df[df.index.isin(cleaned_rows)].reset_index()
print('after clean-up # rows: {}'.format(len(df)))

print(df.head(5).to_markdown())

before clean-up # rows: 878
after clean-up # rows: 516
|    |   index | title                                                                                          | selftext                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   | author       | permalink                                                                |   num_comments |   score |   total_awards_received |   upvote_ratio |   created_utc |
|---:|--------:|:-----------------------------------------------------------------------------------------------|:----------------------------------------------------------------

# Classified submissions by wh-words in title, extract keywords and aggregate stats

In [12]:
import pprint
from keybert import KeyBERT

kw_model = KeyBERT()

title_cat = []
keywords = []
for index, row in df.iterrows():
    title_cat.append('NO_WH_WORD')
    for wh_word in QUESTION_WORDS:
        if wh_word in row['title'].lower():
            title_cat[-1] = wh_word
            break
    doc_text = row['title'] + " " + row['selftext']
    doc_keywords = kw_model.extract_keywords(doc_text)

    keywords.append([keyword[0] for keyword in doc_keywords])

df['title_cat'] = title_cat
df['keywords'] = keywords

cat_stats = {
    'cat': QUESTION_WORDS,
    'num_docs': [],
    'total_score': [],
    'total_comments': []
}

# print out number of docs per category
for wh_word in QUESTION_WORDS:
    _df = df[df['title_cat'] == wh_word]
    cat_stats['num_docs'].append(len(_df))
    cat_stats['total_score'].append(_df['score'].sum())
    cat_stats['total_comments'].append(_df['num_comments'].sum())
  
cat_stats_df = pd.DataFrame(data=cat_stats)

print(cat_stats_df.to_markdown())
  

|    | cat                     |   num_docs |   total_score |   total_comments |
|---:|:------------------------|-----------:|--------------:|-----------------:|
|  0 | what                    |        168 |           224 |              773 |
|  1 | when                    |         19 |            20 |               99 |
|  2 | where                   |         30 |            45 |              180 |
|  3 | who                     |         17 |            20 |               55 |
|  4 | whom                    |          0 |             0 |                0 |
|  5 | which                   |         36 |            41 |              196 |
|  6 | whose                   |          0 |             0 |                0 |
|  7 | why                     |         38 |            42 |              258 |
|  8 | how                     |        173 |           202 |              883 |
|  9 | wonder                  |          0 |             0 |                0 |
| 10 | want                 

# Pick a wh-word category

In [14]:
df_selection = df[df['title_cat'] == 'how'].copy().reset_index()
print(df_selection.head(10).to_markdown())

|    |   level_0 |   index | title                                                          | selftext                                                                                                                                                                                                                                                                                                                                                                                                                                                                | author               | permalink                                                               |   num_comments |   score |   total_awards_received |   upvote_ratio |   created_utc | title_cat   | keywords                                                  |
|---:|----------:|--------:|:---------------------------------------------------------------|:-------------------------------------------------------------------------------------------------

# Topic Modeling with BERTopic

In [15]:
from bertopic import BERTopic

topic_model = BERTopic(n_gram_range=(1, 1), nr_topics='auto', calculate_probabilities=True, min_topic_size=5)
topics, probabilities = topic_model.fit_transform(df_selection['title'])
topic_info_df = topic_model.get_topic_info()
print(topic_info_df.to_markdown())



|    |   Topic |   Count | Name                                 |
|---:|--------:|--------:|:-------------------------------------|
|  0 |      -1 |      42 | -1_nfts_collectors_nft_my            |
|  1 |       0 |      32 | 0_sell_nfts_buy_nft                  |
|  2 |       1 |      25 | 1_art_artist_much_afford             |
|  3 |       2 |      17 | 2_work_nfts_understanding_traction   |
|  4 |       3 |      15 | 3_mint_mintable_minting_tokens       |
|  5 |       4 |      11 | 4_item_make_packs_can                |
|  6 |       5 |      10 | 5_copyright_nfts_ownership_copy      |
|  7 |       6 |       7 | 6_foundation_upvoted_wallet_waitlist |
|  8 |       7 |       7 | 7_crypto_rarible_gift_unlockable     |
|  9 |       8 |       7 | 8_marketplace_opensea_build_launch   |


In [18]:
from collections import defaultdict

docs_by_topics = defaultdict(list)

print(max(topics), min(topics))
for index, topic in enumerate(topics):
  docs_by_topics[topic].append({
      'docIdx': index,
      'prob': probabilities[index][topic],
      'num_comments':df_selection['num_comments'][index],
      'score': df_selection['score'][index],
      'title': df_selection['title'][index],
      'keywords': df_selection['keywords'][index]
  })

# sort doc by their topic prob
for index, topic in enumerate(topics):
  docs_by_topics[topic].sort(key=lambda x: x['prob'], reverse=True)


8 -1


In [19]:

for idx, row in topic_info_df.iterrows():
  print(row.to_frame().T)
  topic_docs_df = pd.DataFrame(docs_by_topics[row['Topic']])
  print(topic_docs_df.head(50).to_markdown())



  Topic Count                       Name
0    -1    42  -1_nfts_collectors_nft_my
|    |   docIdx |   prob |   num_comments |   score | title                                                                                            | keywords                                                      |
|---:|---------:|-------:|---------------:|--------:|:-------------------------------------------------------------------------------------------------|:--------------------------------------------------------------|
|  0 |       16 |      0 |              7 |       1 | How much are these Street Fighter NFTs worth?                                                    | ['cards', 'card', 'probability', 'fighter', 'worth']          |
|  1 |       18 |      0 |              7 |       1 | How do I find my NFTs token ID and address?                                                      | ['nfts', 'id', 'token', 'address', 'thanks']                  |
|  2 |       23 |      0 |              4 |       

In [20]:
topics_over_time = topic_model.topics_over_time(df_selection['title'], topics, df_selection['created_utc'], nr_bins=20)
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)

In [21]:
topic_model.visualize_topics()

In [22]:
topic_model.visualize_heatmap()

In [23]:
topic_model.visualize_hierarchy()