In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from text_analysis import stem_sentence, remove_urls, remove_users, remove_retweets, TextCleaner
from data import load_data, array_to_df
from sklearn.metrics import precision_recall_curve, average_precision_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from scipy.sparse import lil_matrix, csr_matrix
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import pickle
import os
import re

# Data Prep

In [2]:
df = load_data("labelled-tweets-20-09-2021.json")

In [3]:
def prepare_inputs(df, col):
    tfidf_vect = TfidfVectorizer(max_features=5000)
    tfidf = tfidf_vect.fit_transform(df[col])
    unigram_counter = CountVectorizer(stop_words=stopwords.words("english"), ngram_range=(1,1))
    unigrams = unigram_counter.fit_transform(df[col])
    bigram_counter = CountVectorizer(stop_words=stopwords.words("english"), ngram_range=(2,2))
    bigrams = bigram_counter.fit_transform(df[col])
    unibigram_counter = CountVectorizer(stop_words=stopwords.words("english"), ngram_range=(1,2))
    unibigrams = unibigram_counter.fit_transform(df[col])
    return tfidf, unigrams, bigrams, unibigrams

In [4]:
unigram_counter = CountVectorizer(stop_words=stopwords.words("english"), ngram_range=(1,1))
unigrams = unigram_counter.fit_transform(df["text"])
unigram_counter.get_feature_names()



['00',
 '000',
 '00005',
 '0003',
 '002',
 '003',
 '005',
 '0097',
 '00am',
 '00michael02',
 '01',
 '018',
 '01_protocol',
 '01s',
 '02',
 '03',
 '04',
 '05',
 '050',
 '06',
 '06b',
 '07',
 '08',
 '085',
 '09',
 '0b',
 '0clckyp7zt',
 '0dpisxbivs',
 '0gtpqqoake',
 '0jfbi6gvti',
 '0kfpai4nre',
 '0mstnxgyfr',
 '0oocz3rrkt',
 '0ot35tcmw1',
 '0r6bbdp5ih',
 '0rz10qkqvr',
 '0v45vjmrpw',
 '0x00108',
 '0x4c756b65',
 '0x650d',
 '0x_clem',
 '0x_lucas',
 '0x_meow',
 '0x_tigerswami',
 '0xalena',
 '0xalice_',
 '0xaugustus',
 '0xb8adb3c41fb203a75f6952e91a8f26c83deaf2f0',
 '0xbebis_',
 '0xdazai',
 '0xdippur',
 '0xedenau',
 '0xgoober',
 '0xhalfinney',
 '0xminion',
 '0xmjs',
 '0xpolygon',
 '0xsisyphus',
 '0xthespaniard',
 '0xtuba',
 '0xwari',
 '0xwives',
 '0y6vcxsrj7',
 '0y8knkh9a1',
 '10',
 '100',
 '1000',
 '10000',
 '100000',
 '1000x',
 '100k',
 '100m',
 '100x',
 '105',
 '106',
 '10k',
 '10m',
 '10no',
 '10x',
 '11',
 '112',
 '113',
 '114',
 '115',
 '118',
 '11am',
 '11dlepjyez',
 '11th',
 '12',
 '120

In [5]:
tfidf_raw, unigrams_raw, bigrams_raw, unibigrams_raw = prepare_inputs(df, "text")

In [6]:
def remove_numerical_values(string):
    return re.sub("\\d+(?=\\s|$)", "", string)

In [7]:
df["text_clean"] = df["text"].apply(lambda text: remove_urls(text.lower()))
df["text_clean"] = df["text_clean"].apply(remove_users)
df["text_clean"] = df["text_clean"].apply(remove_retweets)
df["text_clean"] = df["text_clean"].apply(remove_numerical_values)

In [8]:
unigram_counter_clean = CountVectorizer(stop_words=stopwords.words("english"), ngram_range=(1,1))
unigrams_clean = unigram_counter_clean.fit_transform(df["text_clean"])
unigram_counter_clean.get_feature_names()

['00',
 '000',
 '00am',
 '01',
 '01s',
 '02',
 '04',
 '050',
 '06b',
 '07',
 '08',
 '09',
 '0b',
 '0x650d',
 '0xb8adb3c41fb203a75f6952e91a8f26c83deaf2f',
 '10',
 '100',
 '1000',
 '1000x',
 '100k',
 '100m',
 '100x',
 '105',
 '106',
 '10k',
 '10m',
 '10no',
 '10x',
 '11',
 '112',
 '113',
 '115',
 '118',
 '11am',
 '11th',
 '12',
 '120mins',
 '12am',
 '12noon',
 '12pm',
 '13',
 '130',
 '13b',
 '13bn',
 '13th',
 '14',
 '140',
 '140m',
 '14b',
 '15',
 '150k',
 '150x',
 '155',
 '15m',
 '15p',
 '15th',
 '16',
 '16th',
 '17',
 '170k',
 '17th',
 '17x',
 '18',
 '180mins',
 '185',
 '19',
 '1975',
 '1b',
 '1bn',
 '1f',
 '1inch',
 '1inchcommunity',
 '1k',
 '1m',
 '1million',
 '1pm',
 '1s',
 '1st',
 '1x',
 '20',
 '200',
 '200k',
 '200x',
 '2013',
 '2015',
 '2017',
 '2018',
 '2020',
 '2020s',
 '2021',
 '2022',
 '2035',
 '20m',
 '20x',
 '211005_release',
 '22',
 '23',
 '2300',
 '230php',
 '23rd',
 '24',
 '2400',
 '24h',
 '24hrs',
 '24k',
 '24th',
 '24x',
 '25',
 '252k',
 '25k',
 '25th',
 '25x',
 '26',


In [9]:
tfidf_clean, unigrams_clean, bigrams_clean, unibigrams_clean = prepare_inputs(df, "text_clean")

In [10]:
def stem_sentences(string):
    words = stem_sentence(string)
    return " ".join(words)

df["text_stemmed"] = df["text_clean"].apply(stem_sentences)
df["text_stemmed"]

0       join us discuss team understand certif legal p...
1       onlin live learn platform hope train rise mana...
2       congratul portfolio compani batch 25 rais 3m r...
3                                     mani strong insight
4       long set huberman lab podcast consider melaton...
                              ...                        
2118                                  rt rt offend someon
2119    rt new wolf pattern stuff like mousepad shirt ...
2120                           buck could buy new desktop
2121           rt look crowd come sofi stadium today game
2122    import observ would like add someth proactiv c...
Name: text_stemmed, Length: 2123, dtype: object

In [11]:
df["text_stemmed"].head(10)

0    join us discuss team understand certif legal p...
1    onlin live learn platform hope train rise mana...
2    congratul portfolio compani batch 25 rais 3m r...
3                                  mani strong insight
4    long set huberman lab podcast consider melaton...
5    go back life without anxieti know hotel withou...
6    rt andreessen horowitz put massiv new crypto f...
7                              huberman lab chang life
8    horribl infuri unfair tragic sorri loss good r...
9                            rt former cs love discuss
Name: text_stemmed, dtype: object

In [12]:
df["text"].head(20)

0     Join us for a discussion with the B-Lab team t...
1     "@newcampushq is an online, live learning plat...
2     Congratulations to portfolio company, @Resonad...
3     So many strong insights 👇👇 @FirstbaseHQ https:...
4     @louisanicola_ Long set of Huberman Lab podcas...
5     @ShaneMac @eightsleep Can’t go back to life wi...
6     RT @BloombergTV: How Andreessen Horowitz is pu...
7     @jenntejada Huberman Lab changed my life. @hub...
8     @joelle_emerson Horrible. Infuriating, unfair,...
9     RT @diarahmanTO: As a former CS, I am loving t...
10    And we just had a last minute addition of the ...
11    RT @kshenster: I'm discussing “Enterprise GTM:...
12    The whole world of Customer Success is changin...
13                    @btcarroccio You probably were!!!
14                    @nealkhosla No disagreement here.
15    We're not running a process, but we are talkin...
16    RT @memdotai: Say goodbye to copy-paste.\n\nIn...
17                      @SheilaSidhu This is gre

In [13]:
df["text"].loc[9]

'RT @diarahmanTO: As a former CS, I am loving this discussion https://t.co/igyjB8CYZX'

In [14]:
preprocess = Pipeline([('text_cleaner', TextCleaner())])
preprocess.transform(df[["text"]]).head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


Unnamed: 0,text
0,join us discuss team understand certif legal p...
1,onlin live learn platform hope train rise mana...
2,congratul portfolio compani batch 25 rais 3m r...
3,mani strong insight
4,long set huberman lab podcast consider melaton...
5,go back life without anxieti know hotel withou...
6,andreessen horowitz put massiv new crypto fund...
7,huberman lab chang life
8,horribl infuri unfair tragic sorri loss good r...
9,former cs love discuss


In [15]:
tfidf_stem, unigrams_stem, bigrams_stem, unibigrams_stem = prepare_inputs(df, "text_stemmed")

In [16]:
topics = array_to_df(df["topics"])
topics

Unnamed: 0,crypto,early stage,NFT,defi,dex,yield,lending,presale,oracles,giveaway
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
2118,False,False,False,False,False,False,False,False,False,False
2119,False,False,False,False,False,False,False,False,False,False
2120,False,False,False,False,False,False,False,False,False,False
2121,False,False,False,False,False,False,False,False,False,False


In [17]:
topics.sum()

crypto         677
early stage     36
NFT            130
defi           248
dex             50
yield           46
lending         22
presale          9
oracles          2
giveaway        69
dtype: int64

In [18]:
topic_cols = ["crypto", "NFT", "defi"]

# Models

In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB

In [20]:
transformer_pipeline = Pipeline([
    ('text_cleaner', TextCleaner(remove_urls=True)),
    ('vectorizer', CountVectorizer(stop_words=stopwords.words("english"))),
    ('tfidf', TfidfTransformer()),
])


In [21]:
transformer_pipeline.fit_transform(df[["text"]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


<1x1 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

## SVC

In [22]:
svc_pipeline = Pipeline([
    ('text_cleaner', TextCleaner(remove_urls=True)),
    ('vectorizer', CountVectorizer(stop_words=stopwords.words("english"))),
    ('tfidf', TfidfTransformer()),
    ('classifier', SVC())
])

svc_search_params = {
    "text_cleaner__stem": [True, False],
    "vectorizer__ngram_range":[(1,1), (1,2), (2,2)],
    "classifier__kernel":["linear", "rbf"],
    "classifier__class_weight": ["balanced", None]
}

svc_search = GridSearchCV(svc_pipeline, svc_search_params, cv=4, n_jobs=-1, scoring="recall")

In [23]:
sgd_pipeline = Pipeline([
    ('text_cleaner', TextCleaner(remove_urls=True)),
    ('vectorizer', CountVectorizer(stop_words=stopwords.words("english"))),
    ('tfidf', TfidfTransformer()),
    ('classifier', SGDClassifier())
])

sgd_search_params = {
    "text_cleaner__stem": [True, False],
    "vectorizer__ngram_range":[(1,1), (1,2), (2,2)],
    "classifier__kernel":["linear", "rbf"],
}

sgd_search = GridSearchCV(sgd_pipeline, sgd_search_params, cv=4, n_jobs=-1, scoring="recall")

In [24]:
nb_pipeline = Pipeline([
    ('text_cleaner', TextCleaner(remove_urls=True)),
    ('vectorizer', CountVectorizer(stop_words=stopwords.words("english"))),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB())
])

nb_search_params = {
    "text_cleaner__stem": [True, False],
    "vectorizer__ngram_range":[(1,1), (1,2), (2,2)],
}

nb_search = GridSearchCV(nb_pipeline, nb_search_params, cv=4, n_jobs=-1, scoring="recall")

In [25]:
searches = [
    ("SVC", svc_search), 
    #("SGD", sgd_search), 
    ("MNB", nb_search)]

def sort_by_recall(df):
    return df.sort_values("recall", ascending=False)

def get_accuracy_precision_recall_f1(labels,pred):
    acc = accuracy_score(labels,pred)
    prec = precision_score(labels,pred,average='weighted') #,average='micro'
    recal = recall_score(labels,pred,average='weighted')
    f1 = f1_score(labels,pred,average='weighted')
    return acc,prec,recal,f1

def train_test_clf(x, y, clf):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
    print("train",y_train)
    print("test",y_test)
    clf.fit(x_train, y_train)
    predictions = clf.predict(x_test)
    accuracy, precision, recall, f1 = get_accuracy_precision_recall_f1(y_test, predictions)
    return accuracy, precision, recall, f1

def search_best(x, y):
    results_df = pd.DataFrame(columns=[ "classifier", "accuracy", "precision", "recall", "f1"])
    for name, search in searches:
        print("Fitting", name)
        accuracy, precision, recall, f1 = train_test_clf(x, y, search)
        results_df.loc[len(results_df.index)] = [name, accuracy, precision, recall, f1]
        print("Best params:", search.best_params_)
    return sort_by_recall(results_df)


In [26]:
search_best(df[["text"]], topics[["crypto"]])

Fitting SVC
train       crypto
1505   False
516     True
1461    True
507     True
815     True
...      ...
581    False
1070   False
1981   False
288    False
648    False

[1592 rows x 1 columns]
test       crypto
189    False
1952   False
1694   False
742    False
1214    True
...      ...
1676   False
1228   False
290    False
1574    True
1139   False

[531 rows x 1 columns]


96 fits failed out of a total of 96.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
64 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\micdu\Code\microservices\project-beacon\tweets-analysis-service\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\micdu\Code\microservices\project-beacon\tweets-analysis-service\venv\lib\site-packages\sklearn\pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\micdu\Code\microservices\project-beacon\tweets-analysis-service\venv\lib\site-packages\sklearn\svm\_base.py", line 190, in fit
    X, y = self

ValueError: Found input variables with inconsistent numbers of samples: [1, 1592]

In [None]:
topics["crypto"]

In [None]:
train_test_clf(df[["text"]], topics[["crypto"]], svc_pipeline)

In [None]:
topics[["crypto"]]

In [None]:
search_best(df["text"], topics["defi"])

In [None]:
search_best(df["text"], topics["NFT"])

Bayesian, MLP and SVM

In [None]:
svc_parameter_space = {
    "kernel":["linear", "rbf"]
}

def gscv_svc_factory():
    svc_parameter_space = {
        "kernel":["linear", "rbf"],
        "class_weight": ["balanced", None]
    }
    return GridSearchCV(SVC(), svc_parameter_space, n_jobs=-1, cv=2)


In [None]:
train_test_clf(tfidf_raw, topics["crypto"], gscv_svc_factory(), lambda cv: print("best params: ", cv.best_params_))

In [None]:
train_test_clf(unigrams_raw, topics["crypto"], gscv_svc_factory(), lambda cv: print("best params: ", cv.best_params_))

In [None]:
train_test_clf(bigrams_raw, topics["crypto"], gscv_svc_factory(), lambda cv: print("best params: ", cv.best_params_))

In [None]:
train_test_clf(unibigrams_raw, topics["crypto"], gscv_svc_factory(), lambda cv: print("best params: ", cv.best_params_))

In [None]:
named_datasets = [
    (tfidf_raw, "tfidf_raw"),
    (unigrams_raw, "unigrams_raw"),
    (bigrams_raw, "bigrams_raw"),
    (unibigrams_raw, "unibigrams_raw"),
    (tfidf_clean, "tfidf_clean"),
    (unigrams_clean, "unigrams_clean"),
    (bigrams_clean, "bigrams_clean"),
    (unibigrams_clean, "unibigrams_clean"),
    (tfidf_stem, "tfidf_stem"),
    (unigrams_stem, "unigrams_stem"),
    (bigrams_stem, "bigrams_stem"),
    (unibigrams_stem, "unibigrams_stem"),
]

def train_test_repeat(x, y, clf_factory, n_iterations):
    results_df = pd.DataFrame(columns=[ "iteration", "accuracy", "precision", "recall", "f1"])
    for i in range(n_iterations):
        results = train_test_clf(x, y, clf_factory())
        results = np.asarray(results)
        results = np.insert(results, 0, i)
        results_df.loc[len(results_df.index)] = results
    return results_df

def train_test(clf_factory, datasets, col, dataset_transformer=False):
    results_df = pd.DataFrame(columns=["dataset", "iteration", "accuracy", "precision", "recall", "f1"])
    for dataset, name in datasets:
        data = dataset
        if dataset_transformer:
            data = dataset_transformer
        train_test_df = train_test_repeat(data, topics[col], clf_factory, 5)
        train_test_df["dataset"] = name
        results_df = results_df.append(train_test_df, ignore_index=True)
    return results_df

## Results for SVC with different datasets

### Crypto topic

In [None]:
svc_crypto_results = train_test(gscv_svc_factory, named_datasets, "crypto")
svc_crypto_results.head(10)

### Defi topic

In [None]:
svc_defi_results = train_test(gscv_svc_factory, named_datasets, "defi")
svc_crypto_results.sort_values("recall", ascending=False).head(10)

### NFT topic

In [None]:
svc_nft_results = train_test(gscv_svc_factory, named_datasets, "NFT")
svc_crypto_results.sort_values("recall", ascending=False).head(10)

## Gaussian Naive Bayesian

In [None]:
from sklearn.naive_bayes import GaussianNB

def gscv_gnb_factory():
    gnb_parameter_space = {
    }
    return GridSearchCV(GaussianNB(), gnb_parameter_space, n_jobs=-1, cv=2)

train_test_clf(words, topics["crypto"], gscv_gnb_factory())

# Reducing dimension

In [None]:
def sort_by_recall(df):
    return df.sort_values("recall", ascending=False)

n_dimensions = [2500, 1000, 500, 200, 100, 50, 20]
def train_test_reduced_dimensions(clf_factory, datasets, col, n_components):
    results_df = pd.DataFrame(columns=["n_components", "dataset", "iteration", "accuracy", "precision", "recall", "f1"])
    train_df = train_test(gscv_svc_factory, named_datasets, "crypto")
    train_df["n_components"] = "-1"
    results_df = results_df.append(train_df, ignore_index=True)
    for dataset, name in datasets:
        for n in n_components:
            svd = TruncatedSVD(n_components=n, n_iter=7)
            x = svd.fit_transform(dataset)
            train_df = train_test_repeat(x, topics[col], clf_factory, 5)
            train_df["dataset"] = name
            train_df["n_components"] = n
            results_df = results_df.append(train_df, ignore_index=True)
    return sort_by_recall(results_df)
    

## Crypto topic

### SVC

In [None]:
train_results_crypto_dimensions = train_test_reduced_dimensions(gscv_svc_factory, named_datasets, "crypto", n_dimensions)
train_results_crypto_dimensions.head(10)

### GNB

In [None]:
train_results_crypto_dimensions_gnb = train_test_reduced_dimensions(gscv_gnb_factory, named_datasets, "crypto", n_dimensions)
train_results_crypto_dimensions_gnb.head(5)

### OneVsRestClassifier

In [None]:
from sklearn.multiclass import OneVsRestClassifier

def ovsr_svc_gnb_factory():
    parameter_space = {
    }
    return GridSearchCV(OneVsRestClassifier(SVC(kernel="linear")), parameter_space, n_jobs=-1, cv=2)

In [None]:
train_results_crypto_dimensions_gnb = train_test_reduced_dimensions(ovsr_svc_gnb_factory, named_datasets, ["crypto", "NFT", "defi"], n_dimensions)
train_results_crypto_dimensions_gnb.head(5)

## NFT topic

In [None]:
train_results_nft_dimensions_svc = train_test_reduced_dimensions(gscv_svc_factory, named_datasets, "NFT", n_dimensions)
train_results_nft_dimensions_svc.head(5)

## Defi topic

In [None]:
train_results_defi_dimensions_svc = train_test_reduced_dimensions(gscv_svc_factory, named_datasets, "defi", n_dimensions)
train_results_defi_dimensions_svc.head(5)

Increase are marginals, but reducing dimensionnality to 100-200 seems to yield better results

# Building final models

In [None]:
save_folder = "models"

def train_and_save(topic ,vectorizer, dimension_reductor, model):
    dataset = vectorizer.fit_transform(df["text"])
    save_model(vectorizer, "vectorizer", topic)
    dataset = dimension_reductor.fit_transform(dataset)
    save_model(dimension_reductor, "reducer", topic)
    model.fit(dataset, topics[topic])
    save_model(model, "model", topic)

def save_model(model, role, topic):
    filename = get_filename(role, topic)
    folder = save_folder
    if not os.path.exists(folder):
        os.makedirs(folder)
    filepath = folder +"/"+filename
    with open(filepath, 'wb') as file:
        pickle.dump(model, file)

def get_filename(role, topic):
    return role+"-"+topic+".pkl"


In [None]:
train_and_save(
    "crypto", 
    CountVectorizer(stop_words=stopwords.words("english"), ngram_range=(1,1)),
    TruncatedSVD(n_components=100, n_iter=7),
    SVC(kernel="linear")
)

In [None]:
train_and_save(
    "NFT", 
    CountVectorizer(stop_words=stopwords.words("english"), ngram_range=(1,1)),
    TruncatedSVD(n_components=100, n_iter=7),
    SVC(kernel="linear")
)

In [None]:
train_and_save(
    "defi", 
    CountVectorizer(stop_words=stopwords.words("english"), ngram_range=(1,1)),
    TruncatedSVD(n_components=200, n_iter=7),
    SVC(kernel="linear")
)

# Building pipeline

In [None]:
from sklearn.pipeline import Pipeline
save_folder = "models"

def get_filename(role, topic):
    return role + "-" + topic + ".pkl"

def load_pipeline(label):
    vectorizer = load_pipe("vectorizer", label)
    reducer = load_pipe("reducer", label)
    model = load_pipe("model", label)
    return Pipeline([vectorizer, reducer, model])
    
def load_pipe(role, label):
    return (role, load_model(role, label))
    
def load_model(role, label):
    filename = get_filename(role, label)
    folder = save_folder
    filepath = folder +"/"+filename
    model = None
    with open(filepath, 'rb') as file:
        model = pickle.load(file)
    return model

In [None]:
crypto_labeller = load_pipeline("crypto")
predictions_crypto = crypto_labeller.predict(unigrams_raw)
get_accuracy_precision_recall_f1(y_test, predictions_crypto)

In [None]:
defi_labeller = load_pipeline("defi")
defi_labeller.predict([
    "This is a tweet about Bitcoin", 
    "Check out this defi project", 
    "lending is insane", 
    "on ethereum"
])

In [None]:
nft_labeller = load_pipeline("NFT")
nft_labeller.predict([
    "This is a tweet about Bitcoin", 
    "Check out this defi project", 
    "I love NFTs", 
    "I bought this cryptopunk",
    "rarity"
])