In [70]:
 import json

from ruleset import Ruleset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from text_analysis import get_words
from data import load_data, array_to_df
from sklearn.metrics import precision_recall_curve, average_precision_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from scipy.sparse import lil_matrix, csr_matrix
from nltk.corpus import stopwords
from sklearn.decomposition import TruncatedSVD
import pickle
import os

# Data Prep

In [2]:
df = load_data("labelled-tweets-26-08-2021.json")

In [3]:
df["words"] = df.apply(lambda row: get_words(row["text"]), axis=1)
df

Unnamed: 0,text,topics,words
0,Join us for a discussion with the B-Lab team t...,[],"[join, us, discuss, team, understand, certif, ..."
1,"""@newcampushq is an online, live learning plat...",[],"[newcampushq, onlin, live, learn, platform, ho..."
2,"Congratulations to portfolio company, @Resonad...",[],"[congratul, portfolio, compani, resonadolab, b..."
3,So many strong insights 👇👇 @FirstbaseHQ https:...,[],"[mani, strong, insight, firstbasehq]"
4,@louisanicola_ Long set of Huberman Lab podcas...,[],"[long, set, huberman, lab, podcast, consider, ..."
...,...,...,...
1470,RT @0x_clem: Teleportr is live! You can now br...,[crypto],"[teleportr, live, bridg, eth, optimismpbc, low..."
1471,RT @optimismPBC: Numba 2! This time by the fan...,[crypto],"[optimismpbc, numba, time, fantast, gigamesh, ..."
1472,RT @jay_veekay: @davidzmorris I have minted an...,"[crypto, dex, NFT, defi]","[davidzmorri, mint, nft, optimismpbc, part, un..."
1473,RT @darren_ditto: Day in the life of a crypto ...,"[crypto, dex, defi, yield]","[day, life, crypto, user, rebal, lp, posit, un..."


In [4]:
topics = array_to_df(df["topics"])

In [5]:
words = array_to_df(df["words"])

In [6]:
swords = csr_matrix(words)
swords

<1475x3931 sparse matrix of type '<class 'numpy.bool_'>'
	with 13119 stored elements in Compressed Sparse Row format>

In [7]:
topics.sum()

crypto         455
early stage     30
NFT             86
defi           134
dex             29
yield           27
lending         11
presale          6
oracles          2
dtype: int64

In [8]:
topic_cols = ["crypto", "NFT", "defi"]

In [9]:
tfidf_vect = TfidfVectorizer(max_features=5000)

In [10]:
sentences = df["words"].map(lambda words: " ".join(words))
tfidf_words = tfidf_vect.fit_transform(sentences)
tfidf_words

<1475x3926 sparse matrix of type '<class 'numpy.float64'>'
	with 13096 stored elements in Compressed Sparse Row format>

In [11]:
tfidf_vect_sentences = TfidfVectorizer(max_features=5000)
tfidf = tfidf_vect_sentences.fit_transform(df["text"])
tfidf

<1475x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 22758 stored elements in Compressed Sparse Row format>

In [12]:
# CountVectorizer
unigram_counter = CountVectorizer(stop_words=stopwords.words("english"), ngram_range=(1,1))
unigrams = unigram_counter.fit_transform(df["text"])
unigrams

<1475x6161 sparse matrix of type '<class 'numpy.int64'>'
	with 16375 stored elements in Compressed Sparse Row format>

In [62]:
unigram_counter.get_feature_names()

['00',
 '000',
 '00005',
 '0003',
 '002',
 '003',
 '005',
 '00am',
 '01',
 '01s',
 '02',
 '05',
 '06',
 '06b',
 '07',
 '08',
 '085',
 '0b',
 '0dpisxbivs',
 '0gtpqqoake',
 '0jfbi6gvti',
 '0kfpai4nre',
 '0ot35tcmw1',
 '0rz10qkqvr',
 '0v45vjmrpw',
 '0x4c756b65',
 '0x650d',
 '0x_clem',
 '0x_lucas',
 '0x_meow',
 '0xalena',
 '0xalice_',
 '0xaugustus',
 '0xbebis_',
 '0xdazai',
 '0xdippur',
 '0xedenau',
 '0xhalfinney',
 '0xminion',
 '0xmjs',
 '0xpolygon',
 '0xsisyphus',
 '0xthespaniard',
 '0xtuba',
 '0xwari',
 '0y6vcxsrj7',
 '10',
 '100',
 '1000',
 '10000',
 '100000',
 '1000x',
 '100k',
 '100m',
 '100x',
 '106',
 '10k',
 '10m',
 '10no',
 '10x',
 '11',
 '112',
 '113',
 '114',
 '115',
 '118',
 '11am',
 '11dlepjyez',
 '11th',
 '12',
 '12noon',
 '12pm',
 '13',
 '1350hrki8p',
 '13bn',
 '14',
 '15',
 '150k',
 '150x',
 '155',
 '1559',
 '157',
 '15p',
 '16',
 '166',
 '17',
 '17jpmujmya',
 '185',
 '19',
 '1975',
 '1996',
 '1b',
 '1bn',
 '1imuesxphs',
 '1inc',
 '1inch',
 '1k',
 '1m',
 '1pm',
 '1rdh6winq

In [13]:
bigram_counter = CountVectorizer(stop_words=stopwords.words("english"), ngram_range=(2,2))
bigrams = bigram_counter.fit_transform(df["text"])
bigrams

<1475x13879 sparse matrix of type '<class 'numpy.int64'>'
	with 15467 stored elements in Compressed Sparse Row format>

In [14]:
unibigram_counter = CountVectorizer(stop_words=stopwords.words("english"), ngram_range=(1,2))
unibigrams = unibigram_counter.fit_transform(df["text"])
unibigrams

<1475x20040 sparse matrix of type '<class 'numpy.int64'>'
	with 31842 stored elements in Compressed Sparse Row format>

# Models

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC, LinearSVC

Bayesian, MLP and SVM

In [16]:
def get_accuracy_precision_recall_f1(labels,pred):
    acc = accuracy_score(labels,pred)
    prec = precision_score(labels,pred,average='weighted') #,average='micro'
    recal = recall_score(labels,pred,average='weighted')
    f1 = f1_score(labels,pred,average='weighted')
    return acc,prec,recal,f1

def plotResults(trueLabels, anomalyScores, returnPreds= False): 
    preds= pd.concat([trueLabels, anomalyScores], axis=1)
    preds.columns= ['trueLabel', 'anomalyScore']
    
    precision, recall, thresholds= precision_recall_curve(preds['trueLabel'], preds['anomalyScore'])
    average_precision= average_precision_score(preds['trueLabel'], preds['anomalyScore'])
    plt.step(recall, precision, color='k', alpha=0.7, where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.3, color='k')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recallcurve: ''AveragePrecision= {0:0.2f}'.format(average_precision))
    
    fpr, tpr, thresholds= roc_curve(preds['trueLabel'], preds['anomalyScore'])
    areaUnderROC= auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr, color='r', lw=2, label='ROC curve')
    plt.plot([0, 1], [0, 1], color='k', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('TruePositive Rate')
    plt.title('Receiveroperating characteristic: ''Area under the curve= {0:0.2f}'.format(areaUnderROC))
    plt.legend(loc="lower right")
    plt.show()
    
    if returnPreds==True:
        return preds, average_precision
    else:
        return average_precision
    
def anomalyScores(originalDF, reducedDF):
    loss= np.sum((np.array(originalDF) - np.array(reducedDF))**2, axis=1)
    loss= pd.Series(data=loss, index=originalDF.index)
    loss= (loss-np.min(loss)) / (np.max(loss) -np.min(loss))
    return loss

In [17]:
def train_test_clf(x, y, clf, callback=False):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.75, stratify=y)
    clf.fit(x_train, y_train)
    predictions = clf.predict(x_test)
    accuracy, precision, recall, f1 = get_accuracy_precision_recall_f1(y_test, predictions)
    if callback:
        callback(clf)
    return accuracy, precision, recall, f1

In [18]:
svc_parameter_space = {
    "kernel":["linear", "rbf"]
}

def gscv_svc_factory():
    svc_parameter_space = {
        "kernel":["linear", "rbf"]
    }
    return GridSearchCV(SVC(), svc_parameter_space, n_jobs=-1, cv=2)

train_test_clf(swords, topics["crypto"], gscv_svc_factory(), lambda cv: print("best params: ", cv.best_params_))

best params:  {'kernel': 'linear'}


(0.8229448961156278,
 0.8259420054353444,
 0.8229448961156278,
 0.8090205538864541)

In [19]:
train_test_clf(tfidf_words, topics["crypto"], gscv_svc_factory(), lambda cv: print("best params: ", cv.best_params_))

best params:  {'kernel': 'linear'}


(0.7904245709123758,
 0.8159120164534361,
 0.7904245709123758,
 0.7569681238303242)

In [20]:
train_test_clf(tfidf, topics["crypto"], gscv_svc_factory(), lambda cv: print("best params: ", cv.best_params_))

best params:  {'kernel': 'linear'}


(0.7967479674796748,
 0.8184383855780407,
 0.7967479674796748,
 0.7670480818979423)

In [21]:
train_test_clf(unigrams, topics["crypto"], gscv_svc_factory(), lambda cv: print("best params: ", cv.best_params_))

best params:  {'kernel': 'linear'}


(0.8112014453477868,
 0.8104093732669462,
 0.8112014453477868,
 0.7971379373616038)

In [22]:
train_test_clf(bigrams, topics["crypto"], gscv_svc_factory(), lambda cv: print("best params: ", cv.best_params_))

best params:  {'kernel': 'rbf'}


(0.7407407407407407, 0.7251153137874392, 0.7407407407407407, 0.712494403111144)

In [23]:
train_test_clf(unibigrams, topics["crypto"], gscv_svc_factory(), lambda cv: print("best params: ", cv.best_params_))

best params:  {'kernel': 'rbf'}


(0.7524841915085817,
 0.7391330638388849,
 0.7524841915085817,
 0.7313164242215394)

In [24]:
named_datasets = [
    (tfidf, "tfidf"),
    (unigrams, "unigrams"),
    (bigrams, "bigrams"),
    (unibigrams, "unibigrams")
]

def train_test_repeat(x, y, clf_factory, n_iterations):
    results_df = pd.DataFrame(columns=[ "iteration", "accuracy", "precision", "recall", "f1"])
    for i in range(n_iterations):
        results = train_test_clf(x, y, clf_factory())
        results = np.asarray(results)
        results = np.insert(results, 0, i)
        results_df.loc[len(results_df.index)] = results
    return results_df

def train_test(clf_factory, datasets, col, dataset_transformer=False):
    results_df = pd.DataFrame(columns=["dataset", "iteration", "accuracy", "precision", "recall", "f1"])
    for dataset, name in datasets:
        data = dataset
        if dataset_transformer:
            data = dataset_transformer
        train_test_df = train_test_repeat(data, topics[col], clf_factory, 5)
        train_test_df["dataset"] = name
        results_df = results_df.append(train_test_df, ignore_index=True)
    return results_df

## Results for SVC with different datasets

### Crypto topic

In [25]:
svc_crypto_results = train_test(gscv_svc_factory, named_datasets, "crypto")
svc_crypto_results.groupby(["dataset"]).max()

Unnamed: 0_level_0,iteration,accuracy,precision,recall,f1
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bigrams,4.0,0.740741,0.724438,0.740741,0.719913
tfidf,4.0,0.813008,0.820229,0.813008,0.794456
unibigrams,4.0,0.763324,0.752774,0.763324,0.752539
unigrams,4.0,0.848238,0.852,0.848238,0.838498


### Defi topic

In [26]:
svc_defi_results = train_test(gscv_svc_factory, named_datasets, "defi")
svc_defi_results.groupby(["dataset"]).max()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0_level_0,iteration,accuracy,precision,recall,f1
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bigrams,4.0,0.909666,0.917833,0.909666,0.867523
tfidf,4.0,0.908762,0.825849,0.908762,0.865324
unibigrams,4.0,0.913279,0.920834,0.913279,0.875984
unigrams,4.0,0.921409,0.922663,0.921409,0.894454


### NFT topic

In [27]:
svc_nft_results = train_test(gscv_svc_factory, named_datasets, "NFT")
svc_nft_results.groupby(["dataset"]).max()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0_level_0,iteration,accuracy,precision,recall,f1
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bigrams,4.0,0.942186,0.945532,0.942186,0.915016
tfidf,4.0,0.943089,0.946334,0.943089,0.917168
unibigrams,4.0,0.948509,0.94875,0.948509,0.930292
unigrams,4.0,0.957543,0.954418,0.957543,0.948173


## Gaussian Naive Bayesian

In [28]:
from sklearn.naive_bayes import GaussianNB

def gscv_gnb_factory():
    gnb_parameter_space = {
    }
    return GridSearchCV(GaussianNB(), gnb_parameter_space, n_jobs=-1, cv=2)

train_test_clf(words, topics["crypto"], gscv_gnb_factory())

(0.5799457994579946,
 0.6900454106347774,
 0.5799457994579946,
 0.5933942100953262)

# Reducing dimension

In [55]:
def sort_by_recall(df):
    return df.sort_values("recall", ascending=False)

n_dimensions = [2500, 1000, 500, 200, 100, 50, 20]
def train_test_reduced_dimensions(clf_factory, datasets, col, n_components):
    results_df = pd.DataFrame(columns=["n_components", "dataset", "iteration", "accuracy", "precision", "recall", "f1"])
    train_df = train_test(gscv_svc_factory, named_datasets, "crypto")
    train_df["n_components"] = "-1"
    results_df = results_df.append(train_df, ignore_index=True)
    for dataset, name in datasets:
        for n in n_components:
            svd = TruncatedSVD(n_components=n, n_iter=7)
            x = svd.fit_transform(dataset)
            train_df = train_test_repeat(x, topics[col], clf_factory, 5)
            train_df["dataset"] = name
            train_df["n_components"] = n
            results_df = results_df.append(train_df, ignore_index=True)
    return sort_by_recall(results_df)
    

## Crypto topic

### SVC

In [56]:
train_results_crypto_dimensions = train_test_reduced_dimensions(gscv_svc_factory, named_datasets, "crypto", n_dimensions)
train_results_crypto_dimensions.head(5)

Unnamed: 0,n_components,dataset,iteration,accuracy,precision,recall,f1
147,100,unibigrams,2.0,0.849142,0.84964,0.849142,0.841339
75,100,unigrams,0.0,0.847335,0.844891,0.847335,0.841846
59,2500,unigrams,4.0,0.846432,0.847395,0.846432,0.837944
83,50,unigrams,3.0,0.842818,0.84606,0.842818,0.832488
79,100,unigrams,4.0,0.841915,0.839112,0.841915,0.836035


### GNB

In [57]:
train_results_crypto_dimensions_gnb = train_test_reduced_dimensions(gscv_gnb_factory, named_datasets, "crypto", n_dimensions)
train_results_crypto_dimensions_gnb.head(5)

Unnamed: 0,n_components,dataset,iteration,accuracy,precision,recall,f1
6,-1,unigrams,1.0,0.832882,0.833296,0.832882,0.822537
9,-1,unigrams,4.0,0.829268,0.835835,0.829268,0.814817
7,-1,unigrams,2.0,0.827462,0.835536,0.827462,0.811939
0,-1,tfidf,0.0,0.804878,0.80979,0.804878,0.785334
5,-1,unigrams,0.0,0.803975,0.81889,0.803975,0.779349


### OneVsRestClassifier

In [113]:
from sklearn.multiclass import OneVsRestClassifier

def ovsr_svc_gnb_factory():
    parameter_space = {
    }
    return GridSearchCV(OneVsRestClassifier(SVC(kernel="linear")), parameter_space, n_jobs=-1, cv=2)

In [114]:
train_results_crypto_dimensions_gnb = train_test_reduced_dimensions(ovsr_svc_gnb_factory, named_datasets, ["crypto", "NFT", "defi"], n_dimensions)
train_results_crypto_dimensions_gnb.head(5)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

## NFT topic

In [58]:
train_results_nft_dimensions_svc = train_test_reduced_dimensions(gscv_svc_factory, named_datasets, "NFT", n_dimensions)
train_results_nft_dimensions_svc.head(5)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,n_components,dataset,iteration,accuracy,precision,recall,f1
77,100,unigrams,2.0,0.974706,0.973222,0.974706,0.973264
79,100,unigrams,4.0,0.9729,0.971192,0.9729,0.970838
78,100,unigrams,3.0,0.971093,0.969061,0.971093,0.969175
71,200,unigrams,1.0,0.969286,0.966918,0.969286,0.96695
83,50,unigrams,3.0,0.968383,0.965865,0.968383,0.965495


## Defi topic

In [60]:
train_results_defi_dimensions_svc = train_test_reduced_dimensions(gscv_svc_factory, named_datasets, "defi", n_dimensions)
train_results_defi_dimensions_svc.head(5)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,n_components,dataset,iteration,accuracy,precision,recall,f1
65,500,unigrams,0.0,0.922313,0.909964,0.922313,0.903871
74,200,unigrams,4.0,0.922313,0.910865,0.922313,0.913479
146,100,unibigrams,1.0,0.922313,0.908839,0.922313,0.907025
58,2500,unigrams,3.0,0.921409,0.922663,0.921409,0.894454
76,100,unigrams,1.0,0.915989,0.900241,0.915989,0.903535


Increase are marginals, but reducing dimensionnality to 100-200 seems to yield better results

# Building final models

In [76]:
save_folder = "models"

def train_and_save(topic ,vectorizer, dimension_reductor, model):
    dataset = vectorizer.fit_transform(df["text"])
    save_model(vectorizer, "vectorizer", topic)
    dataset = dimension_reductor.fit_transform(dataset)
    save_model(dimension_reductor, "reducer", topic)
    model.fit(dataset, topics[topic])
    save_model(model, "model", topic)

def save_model(model, role, topic):
    filename = get_filename(role, topic)
    folder = save_folder
    if not os.path.exists(folder):
        os.makedirs(folder)
    filepath = folder +"/"+filename
    with open(filepath, 'wb') as file:
        pickle.dump(model, file)

def get_filename(role, topic):
    return role+"-"+topic+".pkl"


In [77]:
train_and_save(
    "crypto", 
    CountVectorizer(stop_words=stopwords.words("english"), ngram_range=(1,1)),
    TruncatedSVD(n_components=100, n_iter=7),
    SVC(kernel="linear")
)

In [78]:
train_and_save(
    "NFT", 
    CountVectorizer(stop_words=stopwords.words("english"), ngram_range=(1,1)),
    TruncatedSVD(n_components=100, n_iter=7),
    SVC(kernel="linear")
)

In [79]:
train_and_save(
    "defi", 
    CountVectorizer(stop_words=stopwords.words("english"), ngram_range=(1,1)),
    TruncatedSVD(n_components=200, n_iter=7),
    SVC(kernel="linear")
)

# Building pipeline

In [84]:
from sklearn.pipeline import Pipeline

def load_pipeline(label):
    vectorizer = load_pipe("vectorizer", label)
    reducer = load_pipe("reducer", label)
    model = load_pipe("model", label)
    return Pipeline([vectorizer, reducer, model])
    
def load_pipe(role, label):
    return (role, load_model(role, label))
    
def load_model(role, label):
    filename = get_filename(role, label)
    folder = save_folder
    filepath = folder +"/"+filename
    model = None
    with open(filepath, 'rb') as file:
        model = pickle.load(file)
    return model

In [108]:
crypto_labeller = load_pipeline("crypto")
crypto_labeller.predict([
    "This is a tweet about Bitcoin", 
    "Check out my crypto project", 
    "I like soup", 
    "Ethereum",
    "I like lending on AAVE protocol"
])

array([ True,  True, False,  True, False])

In [109]:
defi_labeller = load_pipeline("defi")
defi_labeller.predict([
    "This is a tweet about Bitcoin", 
    "Check out this defi project", 
    "lending is insane", 
    "on ethereum"
])

array([False,  True, False, False])

In [110]:
nft_labeller = load_pipeline("NFT")
nft_labeller.predict([
    "This is a tweet about Bitcoin", 
    "Check out this defi project", 
    "I love NFTs", 
    "I bought this cryptopunk",
    "rarity"
])

array([False, False,  True, False, False])