## Import data from S3

In [1]:
from utils.db_conn import query
import pandas as pd
import requests

# get all the file names from db
list_ids = [id[0] for id in query('get_all_videos_ids', [])]

# load the data into a list
data = []
for id in list_ids:
    url = f'https://youtube-joao-crypto.s3.eu-central-1.amazonaws.com/{id}.txt'
    response = requests.get(url)
    for t in response.text.split('\n'):
        # we only want text or sentences with numbers
        if any(map(str.isdigit, t)):
            data.append(t)

data

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): youtube-joao-crypto.s3.eu-central-1.amazonaws.com:443
DEBUG:urllib3.connectionpool:https://youtube-joao-crypto.s3.eu-central-1.amazonaws.com:443 "GET /4qIoXEvkbpY.txt HTTP/1.1" 200 6213
DEBUG:charset_normalizer:Encoding detection: ascii is most likely the one.
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): youtube-joao-crypto.s3.eu-central-1.amazonaws.com:443
DEBUG:urllib3.connectionpool:https://youtube-joao-crypto.s3.eu-central-1.amazonaws.com:443 "GET /GMxYBzbI_mo.txt HTTP/1.1" 200 7397
DEBUG:charset_normalizer:Encoding detection: ascii is most likely the one.
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): youtube-joao-crypto.s3.eu-central-1.amazonaws.com:443
DEBUG:urllib3.connectionpool:https://youtube-joao-crypto.s3.eu-central-1.amazonaws.com:443 "GET /NWbjiHBtKCI.txt HTTP/1.1" 200 7245
DEBUG:charset_normalizer:Encoding detection: ascii is most likely the one.
DEBUG:urllib3.connectionp

['currently sitting at 29',
 '740 dollars so continues to battle',
 'shorting off of the 21 ema targeting',
 '26 000 and as low down as 19 thousand',
 'into that 21 ema on a weekly time frame',
 "but anything that's below this 313",
 'having a look at the s p 500',
 'above that 9 ema',
 "since the 11th of may so it's been",
 "and i've got it on the 15-minute chart",
 '15-minute chart and just have the normal',
 '200 ema and the',
 '21 ema over here which are coming',
 'coming down at the 21k area below and',
 "103.37 it's now coming down to",
 '101.56 where i would anticipate that uh',
 'price holding above the 9 ema which is',
 'start to close above this 31 500 level',
 'showed at 37 000. so if i move back to',
 'area between 36 and 38 000 is going to',
 'range is sitting between 27 000 and the',
 'top of the box at 31 500 so get above',
 'target higher towards the 34 000 zone',
 'about 50 if you look at things like',
 'and especially above that 200 ema then',
 "21 ema on a weekly cha

## Save the data to txt

In [2]:
with open('utils/prodigy/data.txt', 'w') as f:
    for t in data:
        f.write(t + '\n')

## Use spacy NER module to clean text

In [3]:
import spacy

nlp = spacy.load("utils/prodigy/model/model-best", disable=["tagger", "parser", "attribute_ruler", "lemmatizer"])

def clean_text(text):
    text = text.replace('000', '')
    text = text.replace('00', '')
    text = text.replace('k', '')
    text = text.replace('.', '')

    return text

final = []
for line in data:
    doc = nlp(line)
    final.append(" ".join([clean_text(word.text)+'k' if word.ent_type_ else word.text for word in doc]))

final

['currently sitting at 29k',
 '740 dollars so continues to battle',
 'shorting off of the 21 ema targeting',
 '26k k and as low down as 19k thousand',
 'into that 21 ema on a weekly time frame',
 "but anything that 's below this 313",
 'having a look at the s p 500',
 'above that 9 ema',
 "since the 11th of may so it 's been",
 "and i 've got it on the 15 - minute chart",
 '15 - minute chart and just have the normal',
 '200 ema and the',
 '21 ema over here which are coming',
 'coming down at the 21k area below and',
 "103.37 it 's now coming down to",
 '101.56 where i would anticipate that uh',
 'price holding above the 9 ema which is',
 'start to close above this 31k 5k level',
 'showed at 37k k . so if i move back to',
 'area between 36k and 38k k is going to',
 'range is sitting between 27k k and the',
 'top of the box at 31k 5k so get above',
 'target higher towards the 34k k zone',
 'about 50 if you look at things like',
 'and especially above that 200 ema then',
 "21 ema on a wee

## Remove stopwords

In [22]:
from nltk.corpus import stopwords

stopwords = list(stopwords.words('english'))

## Vectorize data with TFIDF

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_text = TfidfVectorizer(use_idf=True, min_df=3, max_df=0.8,
                             stop_words=stopwords, ngram_range=(1, 2))
# Fit and transform to our data
# vectors_text is going to be used later in the NMF algorithm
vectors_text = tfidf_text.fit_transform(final)

In [24]:
tfidf = pd.DataFrame(vectors_text[0].T.todense(), index=tfidf_text.get_feature_names(), columns=["TF-IDF"])
# Sort from the more important to least important
tfidf = tfidf.sort_values('TF-IDF', ascending=False)
tfidf



Unnamed: 0,TF-IDF
sitting,0.655247
currently,0.592654
29k,0.468414
00,0.000000
major weekly,0.000000
...,...
back know,0.000000
back january,0.000000
back december,0.000000
back 50,0.000000


## Decomposition with NMF

In [29]:
from sklearn.decomposition import NMF

nmf_text_model = NMF(n_components=3, random_state=42)
w_text_matrix = nmf_text_model.fit_transform(vectors_text)
w_text_matrix # rows are documents, columns are topics

array([[7.08895963e-04, 3.12208385e-02, 0.00000000e+00],
       [9.58326493e-05, 5.92610946e-03, 1.39633691e-03],
       [6.07068632e-03, 5.26869712e-03, 1.45235459e-03],
       ...,
       [3.00326860e-02, 3.79012681e-03, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 1.06029165e-17],
       [0.00000000e+00, 2.95096721e-04, 9.56610449e-04]])

## Topics

In [30]:
dicts = {}
new_list = []

for topic, word_vector in enumerate(nmf_text_model.components_):
    largest = word_vector.argsort()[::-1]
    dicts["Tópico " + str(topic+1)] = new_list

    for i in range(0, 5):
        new_list.append(tfidf_text.get_feature_names()[largest[i]])
        if i == 4:
            new_list = []

df_topicos = pd.DataFrame.from_dict(dicts)
df_topicos



Unnamed: 0,Tópico 1,Tópico 2,Tópico 3
0,200,30k,50
1,week,2018,50 week
2,moving,bitcoin,sma
3,day,back,percent
4,average,right,week
