In [99]:
import collections
import csv
import itertools
import matplotlib.pyplot as plt
import json
import os
import pathlib
import re

import pandas as pd


def extract_words_numbers(s):
    return re.findall(r"[a-zA-Z]+|[0-9]+|\$+", s)


def split_at_capital(s):
    return re.sub(r"([a-z])([A-Z])", r"\1 \2", s).split()


def tokenize(s):
    return [x.lower() for y in extract_words_numbers(s) for x in split_at_capital(y)]


def slug_to_url(slug):
    return f"https://opensea.io/collection/{slug}"

In [32]:
print(extract_words_numbers("aa12 Bc! #Net"))
print(split_at_capital("SuperCoolNFT"))
print(tokenize("moneyTake123 #BadApple$$"))

['aa', '12', 'Bc', 'Net']
['Super', 'Cool', 'NFT']
['money', 'take', '123', 'bad', 'apple', '$$']


In [3]:
import snowflake.connector as sf_conn

_SNOWFLAKE_CONTEXT = None


def snowflake_run_query(sql):
    global _SNOWFLAKE_CONTEXT
    if _SNOWFLAKE_CONTEXT is None:
        _SNOWFLAKE_CONTEXT = sf_conn.connect(
            user="xiang_wu",
            password=os.getenv("SNOWFLAKE_PASSWORD"),
            account="opensea.us-east-1",
        )

    cs = _SNOWFLAKE_CONTEXT.cursor()
    cs.execute(sql)
    return cs.fetch_pandas_all()

In [4]:
MIN_30D_SALES = 3

ALL_COLLECTION_SQL = f"""
SELECT
  a.ID,
  a.NAME,
  a.DEFAULT_CHAIN_ID,
  a.SLUG,
  a.DESCRIPTION
FROM
  PC_FIVETRAN_DB.PUBLIC.API_COLLECTION a
  LEFT JOIN PC_FIVETRAN_DB.PUBLIC.API_COLLECTIONSTATS b
  ON a.ID = b.COLLECTION_ID
WHERE
  a.IS_DELISTED = FALSE
  AND a.HIDDEN = FALSE
  AND a.IS_NSFW = FALSE
  AND a.BLACKLISTED = FALSE
  AND b.THIRTY_DAY_SALES >= {MIN_30D_SALES}
"""

print(ALL_COLLECTION_SQL)


SELECT
  a.ID,
  a.NAME,
  a.DEFAULT_CHAIN_ID,
  a.SLUG,
  a.DESCRIPTION
FROM
  PC_FIVETRAN_DB.PUBLIC.API_COLLECTION a
  LEFT JOIN PC_FIVETRAN_DB.PUBLIC.API_COLLECTIONSTATS b
  ON a.ID = b.COLLECTION_ID
WHERE
  a.IS_DELISTED = FALSE
  AND a.HIDDEN = FALSE
  AND a.IS_NSFW = FALSE
  AND a.BLACKLISTED = FALSE
  AND b.THIRTY_DAY_SALES >= 3



In [5]:
RUN_SQL = False
ALL_COLLECTION_DF_FILE = pathlib.Path(f"/Users/shinewu/tmp/all_collection.pickle")

if RUN_SQL:
    ALL_COLLECTION_DF = snowflake_run_query(ALL_COLLECTION_SQL)
    ALL_COLLECTION_DF.to_pickle(ALL_COLLECTION_DF_FILE)
else:
    ALL_COLLECTION_DF = pd.read_pickle(ALL_COLLECTION_DF_FILE)

In [6]:
ALL_COLLECTION_DF.iloc[100:120]

Unnamed: 0,ID,NAME,DEFAULT_CHAIN_ID,SLUG,DESCRIPTION
100,13214902,Nekumaru,3,nekumaru,Leave the fake reality and enter the world of ...
101,8412150,Life Is A Play.,1,life-is-a-play,"Since I was a child, I have always dreamt of c..."
102,1344058,Long Neckie Fellas,1,long-neckie-fellas,"Long Neckie Fellas are a fine art, hand drawn,..."
103,3290993,Heartbreak Bear Genesis Collection,1,heartbreak-bear-official,Holder count is not accurate as staking is LIV...
104,7058342,TIMEPieces Slices of TIME (Slices),1,timepieces-sot-slices,"Slices of TIME is a two-part collection, inspi..."
105,2324896,the Plug by LOGIK,1,the-plug-by-logik,LOGIK left St. Louis in pursuit of a dream. Ar...
106,7947796,Balloon_noname,2,balloon-noname,누군가의 손에서 벗어나고 나서야\n비로소 하늘을 보게된\n비로소 행복을 느끼게된\n...
107,15529016,G-Star RAW 3301 Community Token,1,g-star-raw-3301-community-token,This is the G-Star RAW 3301 Community Token gi...
108,884580,Save the Martians,1,save-the-martians,"15,000 Martians are being forced off their hom..."
109,13105884,CANIGH Cannabis Art,1,canigh-cannabis-art,#25 1/1 Unique Cannabis epoxy Art. \nMy fathe...


In [7]:
KEYWORDS = """
APR
APY
Backed
Bond
Buyback
Casino
Coin
Currency
CUSIP
Debt
DeFi
Disbursement
Distribution
Dividend
Entitled
Exchange
Finance
Financial
Forex
Fraction
Gains
ICO
IDO
IEO
Insurance
Interest
Interest Bearing
Invest
Liquidity
Mining
Monetize
Monthly
Offering
Ownership
Passive Income
Payout
Profit
Profit Share
Redistribution
Returns
Rev Share
Revenue Share
ROI
Royalties
Royalty
Securities
Share
STO
Stock
USD
Weekly
Yield
""".lower().split(
    "\n"
)[
    1:-1
]

KEYWORDS = [x.split(" ") for x in KEYWORDS]
SINGLE_KEYWORDS = {x[0] for x in KEYWORDS if len(x) == 1}
DOUBLE_KEYWORDS = {tuple(x) for x in KEYWORDS if len(x) == 2}

print(KEYWORDS)

[['apr'], ['apy'], ['backed'], ['bond'], ['buyback'], ['casino'], ['coin'], ['currency'], ['cusip'], ['debt'], ['defi'], ['disbursement'], ['distribution'], ['dividend'], ['entitled'], ['exchange'], ['finance'], ['financial'], ['forex'], ['fraction'], ['gains'], ['ico'], ['ido'], ['ieo'], ['insurance'], ['interest'], ['interest', 'bearing'], ['invest'], ['liquidity'], ['mining'], ['monetize'], ['monthly'], ['offering'], ['ownership'], ['passive', 'income'], ['payout'], ['profit'], ['profit', 'share'], ['redistribution'], ['returns'], ['rev', 'share'], ['revenue', 'share'], ['roi'], ['royalties'], ['royalty'], ['securities'], ['share'], ['sto'], ['stock'], ['usd'], ['weekly'], ['yield']]


In [8]:
def model_v0(tokens):
    for kw in SINGLE_KEYWORDS:
        if any(token == kw for token in tokens):
            return kw

    for kw_pair in DOUBLE_KEYWORDS:
        if any(x == kw_pair for x in zip(tokens[:-1], tokens[1:])):
            return " ".join(kw_pair)

    return ""


def model_v0_label(row):
    for field in ["NAME", "DESCRIPTION"]:
        tokens = tokenize(str(row[field]))
        flag = model_v0(tokens)
        if flag:
            return flag
    return "SAFE"

In [20]:
desc = """
If you buy all these collections for $200k I will buy for double in the near future. It's a rarest collection of my NFT's
"""
tokens = tokenize(desc)
print(tokens)
model_v0(tokens)

['if', 'you', 'buy', 'all', 'these', 'collections', 'for', '200', 'k', 'i', 'will', 'buy', 'for', 'double', 'in', 'the', 'near', 'future', 'it', 's', 'a', 'rarest', 'collection', 'of', 'my', 'nft', 's']


''

In [10]:
ALL_COLLECTION_DF["label"] = ALL_COLLECTION_DF.apply(model_v0_label, axis=1)

In [11]:
def print_full(x):
    pd.set_option("display.max_rows", None)
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", 2000)
    pd.set_option("display.float_format", "{:20,.2f}".format)
    pd.set_option("display.max_colwidth", None)
    print(x)
    pd.reset_option("display.max_rows")
    pd.reset_option("display.max_columns")
    pd.reset_option("display.width")
    pd.reset_option("display.float_format")
    pd.reset_option("display.max_colwidth")

In [30]:
POSITIVES = ALL_COLLECTION_DF.query("label == 'buyback'")
print(
    f"Positive rate = {len(POSITIVES)} / {len(ALL_COLLECTION_DF)} = {len(POSITIVES) / len(ALL_COLLECTION_DF)}"
)
for row in itertools.islice(POSITIVES.itertuples(), 5):
    print(f"Link: https://opensea.io/collection/{row.SLUG}")
    print("Description: {", row.DESCRIPTION, "}")
    print("Label:", row.label)
    print("\n====================\n")

Positive rate = 3 / 20744 = 0.00014462013112225222
Link: https://opensea.io/collection/klaycasper
Description: { W E   A R E    K L A Y C A S E R !  !  !  !  !  !
- it's PUNK'S BUYBACK PROTOCOL NFT }
Label: buyback


Link: https://opensea.io/collection/voyagersgame-genesis-pass
Description: { PLEASE READ LATEST ANNOUNCEMENT ON: 
https://voyagersgame.com 

We are currently issuing refunds through buyback and burn.  }
Label: buyback


Link: https://opensea.io/collection/iceverse-official
Description: { Iceverse is an innovative NFT and Metaverse project. Owning an Ice Bear will grant access to our P2E World and holders will get rewarded through our $WRLD buyback system.

https://iceversenft.io/ }
Label: buyback




In [242]:
# Cases don't matter, tokenizer will lower everything.
# Occurrences of terms on the same line are tallies in a single counter.
# "-" at the beginning merges current line with previous one.
TERMS_MONEY = """
Dollar, Dollars, Income, Incomes
Entitled, Entitlement, Entitlements
Gain, Gains, Payout, Profit, Profits, Return, Returns, Upside, Upsides
Financial Freedom, Financially Free
Make Money, Makes Money, Making Money, Money Maker, Money Making, Monetize
Never Lose, Never Loses, Never Losing, No Loss
Rev Share, Revenue Share, Revenue Sharing
Retire Early, Retiring Early, Early Retirement
"""

TERMS_TIME = """
Hourly, Every Hour
Daily, Every Day
Every Week, Weekly
Every Month, Monthly
Every Quarter, Quarterly
Annual, Anually, Every Year, Yearly
You Buy, I Will Buy, Buy This
"""

TERMS_HYPE = """
Chance, Chances, Opportunity, Opportunities
Enormous, Significant, Substantial
Good, Better, Best, Big, Bigger, Biggest, Great, Greater, Greatest, Large, Larger, Largest
Absolutely, Certainly, Guaranteed, No Doubt, Surely 
To The Moon
Alltime, All time, In A Lifetime, In Ages, Of The Year 
"""

TERMS_GAMBLE = """
Always Win, Always Winning
Bet, Betting, Bets
Casino
Odds
"""

TERMS_CRYPTO = """
Coin
DeFi
ICO, IDO, IEO
Liquidity Pool
Mine, Mining
Stake, Staking
Yield
"""

TERMS_FIN = """
Approved, Certified, Registered, Safe
APY, ROI
Bond, Bonds, Collateral, Debt, Debts, Finance, Financed, Financial, Financing
- Loan, Loans, Securities, Stock, Stocks
Buy back, Buyback
Currency, Forex, USD
CUSIP
Disbursement, Distribution, Dividend, Interest, Interest Bearing, Redistribution
Exchange, Invest, Investing, Investment, Investor
Fraction
Insurance
Liquidity Provider, Provide Liquidity, Providing Liquidity, Provides Liquidity
- Liquidity Taker, Take Liquidity, Taking Liquidity, Takes Liquidity
- Make Market, Making Market, Makes Market, Market Maker, Market Making
Offering
Ownership
Royalties, Royalty
"""

TERMS_ALL = "\n".join([TERMS_MONEY, TERMS_TIME, TERMS_GAMBLE, TERMS_CRYPTO, TERMS_FIN])

TERMS_CONFIG = {
    "MONEY": TERMS_MONEY,
    "TIME": TERMS_TIME,
    "HYPE": TERMS_HYPE,
    "GAMBLE": TERMS_GAMBLE,
    "CRYPTO": TERMS_CRYPTO,
    "FIN": TERMS_FIN
}

def parse_terms(terms):
    lines = terms.split("\n")
    forward = {}
    backward = collections.defaultdict(set)
    cur_id = -1
    line_i = 0
    for line in lines:
        if not line:
            continue

        if line_i == 0:
            assert line[0] != "-"
        if line[0] != "-":
            cur_id += 1
        if line[0] == "-":
            line = line[1:]

        items = [x.strip() for x in line.lower().split(",")]
        for item in items:
            if not item:
                continue
            seq = tuple(item.split(" "))
            forward[seq] = cur_id
            backward[cur_id].add(seq)
            
        line_i += 1

    return forward, backward

def build_terms():
    ret = {}
    for k, terms in TERMS_CONFIG.items():
        forward = parse_terms(terms)[0]
        max_i = max(forward.values())
        ret[k] = forward, (max_i + 1)
    return ret

TERMS_PARSER = build_terms()


def backward_to_list(backward):
    backward_list = [None] * len(backward)
    for k, v in backward.items():
        backward_list[k] = list(v)
    return backward_list

In [243]:
forward, backward = parse_terms(TERMS_ALL)
print(forward)
print(backward)
for k, v in backward.items():
    print(k, '->', v)

print(json.dumps(backward_to_list(backward)))

{('dollar',): 0, ('dollars',): 0, ('income',): 0, ('incomes',): 0, ('entitled',): 1, ('entitlement',): 1, ('entitlements',): 1, ('gain',): 2, ('gains',): 2, ('payout',): 2, ('profit',): 2, ('profits',): 2, ('return',): 2, ('returns',): 2, ('upside',): 2, ('upsides',): 2, ('financial', 'freedom'): 3, ('financially', 'free'): 3, ('make', 'money'): 4, ('makes', 'money'): 4, ('making', 'money'): 4, ('money', 'maker'): 4, ('money', 'making'): 4, ('monetize',): 4, ('never', 'lose'): 5, ('never', 'loses'): 5, ('never', 'losing'): 5, ('no', 'loss'): 5, ('rev', 'share'): 6, ('revenue', 'share'): 6, ('revenue', 'sharing'): 6, ('retire', 'early'): 7, ('retiring', 'early'): 7, ('early', 'retirement'): 7, ('hourly',): 8, ('every', 'hour'): 8, ('daily',): 9, ('every', 'day'): 9, ('every', 'week'): 10, ('weekly',): 10, ('every', 'month'): 11, ('monthly',): 11, ('every', 'quarter'): 12, ('quarterly',): 12, ('annual',): 13, ('anually',): 13, ('every', 'year'): 13, ('yearly',): 13, ('you', 'buy'): 14, (

In [244]:
print(TERMS_PARSER)

{'MONEY': ({('dollar',): 0, ('dollars',): 0, ('income',): 0, ('incomes',): 0, ('entitled',): 1, ('entitlement',): 1, ('entitlements',): 1, ('gain',): 2, ('gains',): 2, ('payout',): 2, ('profit',): 2, ('profits',): 2, ('return',): 2, ('returns',): 2, ('upside',): 2, ('upsides',): 2, ('financial', 'freedom'): 3, ('financially', 'free'): 3, ('make', 'money'): 4, ('makes', 'money'): 4, ('making', 'money'): 4, ('money', 'maker'): 4, ('money', 'making'): 4, ('monetize',): 4, ('never', 'lose'): 5, ('never', 'loses'): 5, ('never', 'losing'): 5, ('no', 'loss'): 5, ('rev', 'share'): 6, ('revenue', 'share'): 6, ('revenue', 'sharing'): 6, ('retire', 'early'): 7, ('retiring', 'early'): 7, ('early', 'retirement'): 7}, 8), 'TIME': ({('hourly',): 0, ('every', 'hour'): 0, ('daily',): 1, ('every', 'day'): 1, ('every', 'week'): 2, ('weekly',): 2, ('every', 'month'): 3, ('monthly',): 3, ('every', 'quarter'): 4, ('quarterly',): 4, ('annual',): 5, ('anually',): 5, ('every', 'year'): 5, ('yearly',): 5, ('you

In [245]:
def featureize(tokens):
    ret = {}
    for k, (parser, width) in TERMS_PARSER.items():
        counter = [0] * width
        for i in range(len(tokens)):
            for term_tuple, counter_i in parser.items():
                if tuple(tokens[i:(i + len(term_tuple))]) == term_tuple:
                    counter[counter_i] += len(term_tuple)
        ret[k] = [float(x) / len(tokens) for x in counter]
    return ret     

In [246]:
featureize(tokenize("Get rich every day with interest bearing"))

{'MONEY': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 'TIME': [0.0, 0.2857142857142857, 0.0, 0.0, 0.0, 0.0, 0.0],
 'HYPE': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 'GAMBLE': [0.0, 0.0, 0.0, 0.0],
 'CRYPTO': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 'FIN': [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.42857142857142855,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0]}

In [247]:
def featurize_df(row):
    all_text = str(row['NAME']) + " 0 " + str(row["DESCRIPTION"])
    return row['SLUG'], featureize(tokenize(all_text))


ALL_FEATURES = ALL_COLLECTION_DF.apply(featurize_df, axis=1)

In [248]:
ALL_FEATURES_JOINED = dict(zip(ALL_COLLECTION_DF.ID, ALL_FEATURES))

In [249]:
def positive_heur(features):
    money = features['MONEY']
    time = features['TIME']
    hype = features['HYPE']
    gamble = features['GAMBLE']
    crypto = features['CRYPTO']
    fin = features['FIN']
    
    if sum(money) > 0.1 or sum(gamble) > 0.1 or sum(fin) > 0.1:
        return True
    if sum(time) > 0.05 or sum(hype) > 0.05:
        if sum(money) > 0.05 or sum(gamble) > 0.05 or sum(fin) > 0.05:
            return True
        
    return False

In [250]:
postive_flags = {k: positive_heur(v[1]) for k, v in ALL_FEATURES_JOINED.items()}
print(sum(postive_flags.values()))

22


In [251]:
for k, v in ALL_FEATURES_JOINED.items():
    if postive_flags[k]:
        print(slug_to_url(v[0]))
        print(v)
        print()

https://opensea.io/collection/market-maker
('market-maker', {'MONEY': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'TIME': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'HYPE': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'GAMBLE': [0.0, 0.0, 0.0, 0.0], 'CRYPTO': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'FIN': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6666666666666666, 0.0, 0.0, 0.0]})

https://opensea.io/collection/passive-income-nft-33-2
('passive-income-nft-33-2', {'MONEY': [0.18181818181818182, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'TIME': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'HYPE': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'GAMBLE': [0.0, 0.0, 0.0, 0.0], 'CRYPTO': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'FIN': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]})

https://opensea.io/collection/investman-justforfun
('investman-justforfun', {'MONEY': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'TIME': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'HYPE': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'GAMBLE': [0.0, 0.0,