In [30]:
import re
import numpy as np
import pandas as pd
import spacy
import string
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from hdbscan import HDBSCAN
from umap import UMAP
from mpl_toolkits.mplot3d import Axes3D
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from functools import reduce
%matplotlib inline


In [31]:
file = pd.read_csv('text_classifier.csv', usecols=['id', 'title', 'text', 'Tags'])

In [32]:
file.head()

Unnamed: 0,id,title,text,Tags
0,https://hackernoon.com/whats-the-best-smartpho...,What the Best Smartphone for keeping your Conv...,"In the world of today, personal data like our ...",Blockchain
1,https://hackernoon.com/whats-the-best-smartpho...,What the Best Smartphone for keeping your Conv...,"In the world of today, personal data like our ...",Bitcoin
2,https://hackernoon.com/whats-the-best-smartpho...,What the Best Smartphone for keeping your Conv...,"In the world of today, personal data like our ...",Cryptocurrency
3,https://hackernoon.com/whats-the-best-smartpho...,What the Best Smartphone for keeping your Conv...,"In the world of today, personal data like our ...",Education
4,https://hackernoon.com/blockchain-what-the-hel...,Blockchain: What the Hell is a Merkle Tree?,No you can find this kind of a tree in the for...,Blockchain


In [33]:
file.Tags.nunique()


407

In [34]:
#create a dataframe copy to remove duplicated id's

file_unique_urls = file.copy()
file_unique_urls.drop_duplicates(subset='id', inplace=True)

In [35]:
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
for token in doc:
    print(token.pos_)

PROPN
VERB
VERB
ADP
VERB
PROPN
NOUN
ADP
SYM
NUM
NUM


In [36]:
def words_tokenizer(text,repetidos=False):
    nlp = spacy.load('en_core_web_sm')
    tokens = nlp(text)
    filtered_tokens = []
    for word in tokens:
        lemma = word.lemma_.lower().strip()
        pos = word.pos_
        if lemma not in STOP_WORDS and re.search ('^[a-zA-Z]+$', lemma) and pos == 'NOUN':
            if lemma == 'datum':
                continue
            else:
                filtered_tokens.append(lemma)
    if repetidos: return list(filtered_tokens)
    return list(set(filtered_tokens))


In [41]:
token_words = words_tokenizer(file_unique_urls.text[0])

In [42]:
token_words

['mechanism',
 'microphone',
 'fracas',
 'solution',
 'space',
 'limit',
 'log',
 'blockchain',
 'area',
 'place',
 'architecture',
 'giant',
 'face',
 'success',
 'decentralization',
 'conversation',
 'sale',
 'encryption',
 'internet',
 'tip',
 'art',
 'array',
 'imagining',
 'prompt',
 'home',
 'verge',
 'transmit',
 'file',
 'decline',
 'phone',
 'acquisition',
 'detail',
 'percent',
 'list',
 'extent',
 'knowledge',
 'operation',
 'cell',
 'feature',
 'isn',
 'sharing',
 'number',
 'intermediary',
 'privacy',
 'help',
 'client',
 'transaction',
 'company',
 'application',
 'shoulder',
 'security',
 'grade',
 'rest',
 'bank',
 'thousand',
 'information',
 'sidechain',
 'protection',
 'crypto',
 'access',
 'messaging',
 'unit',
 'work',
 'naysayer',
 'hand',
 'opposite',
 'mindset',
 'capsule',
 'pc',
 'setting',
 'contract',
 'control',
 'article',
 'total',
 'boundary',
 'root',
 'forum',
 'time',
 'case',
 'server',
 'order',
 'failure',
 'fortune',
 'cryptocurrency',
 'generatio

In [41]:
#display tags

with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
    print(file['Tags'].value_counts())

Latest Tech Stories               67
Cryptocurrency                    49
Hackernoon Top Story              29
Artificial Intelligence           24
Crypto                            22
Blockchain                        17
Machine Learning                  16
Startups                          15
Coding                            15
Ai                                14
Programming                       13
Software Development               9
Deep Learning                      9
Data Science                       9
Ethereum                           8
Bitcoin                            8
Education                          7
Libra                              6
Facebook Libra                     6
Startup                            6
Facebook                           6
Javascript                         6
Latest Tech Story                  4
Futurism                           3
Calibra                            3
Venture Capital                    3
Cryptocurrency Investment          3
B

In [42]:
#start preprocessing
nlp = spacy.load('en_core_web_sm')
parser = English()

In [43]:
#create a words tokenizer

def words_tokenizer(text):
    tokens = nlp(text)
    filtered_tokens = []
    for word in tokens:
        lemma = word.lemma_.lower().strip()
        pos = word.pos_
        if lemma not in STOP_WORDS and re.search ('^[a-zA-Z]+$', lemma):
            if pos == 'NOUN' and pos == 'PROPN':
                continue
            filtered_tokens.append(lemma)
    return list(set(filtered_tokens))

In [44]:
words_tokenizer(file_unique_urls.text.values[1])

['science',
 'chance',
 'hierarchy',
 'blockchain',
 'place',
 'limitation',
 'search',
 'opinion',
 'man',
 'piece',
 'child',
 'comment',
 'leaf',
 'terminal',
 'parent',
 'machine',
 'improvement',
 'series',
 'programming',
 'merkle',
 'zombie',
 'bit',
 'woman',
 'uniform',
 'range',
 'control',
 'root',
 'time',
 'case',
 'order',
 'cryptocurrency',
 'beginning',
 'leafs',
 'uniformity',
 'computer',
 'size',
 'wikipedia',
 'trouble',
 'goal',
 'idea',
 'data',
 'scratch',
 'way',
 'hash',
 'topic',
 'graph',
 'channel',
 'pseudo',
 'forest',
 'party',
 'kind',
 'process',
 'joke',
 'couldn',
 'question',
 'structure',
 'password',
 'rule',
 'type',
 'organisation',
 'function',
 'facebook',
 'tree',
 'street',
 'verification',
 'source',
 'system',
 'term',
 'output',
 'road',
 'screen',
 'content',
 'point',
 'node',
 'wild',
 'problem',
 'prerequisite',
 'xkcd',
 'patient',
 'peer',
 'block',
 'requirement',
 'example',
 'language',
 'doesn',
 'collision',
 'link',
 'input',
 

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [48]:
#vectorization of token words and document-term matrix creation.

tfidf_vectorizer = TfidfVectorizer(max_df= 0.9, tokenizer=words_tokenizer)
terms_matrix = tfidf_vectorizer.fit_transform(file_unique_urls.text)
print("shape of the matrix:", terms_matrix.shape)


shape of the matrix: (101, 4414)


In [49]:
#Feature words in docs

terms = tfidf_vectorizer.get_feature_names()
terms

['ability',
 'abolition',
 'abraham',
 'absence',
 'absorption',
 'abstract',
 'abstraction',
 'abundance',
 'abuse',
 'abuzz',
 'ac',
 'academic',
 'acceleration',
 'accelerator',
 'accent',
 'acceptance',
 'access',
 'accessibility',
 'accident',
 'accommodate',
 'accord',
 'accordance',
 'account',
 'accountability',
 'accountant',
 'accounting',
 'accuracy',
 'accusation',
 'achievement',
 'acquisition',
 'act',
 'action',
 'activite',
 'activity',
 'actor',
 'ad',
 'adam',
 'adaptation',
 'addiction',
 'addition',
 'address',
 'addressing',
 'ade',
 'adio',
 'adjust',
 'administration',
 'admission',
 'ado',
 'adobe',
 'adopter',
 'adoption',
 'adrenaline',
 'adult',
 'advance',
 'advancement',
 'advantage',
 'advent',
 'adventure',
 'adversity',
 'advert',
 'advertise',
 'advertisement',
 'advertiser',
 'advertising',
 'advice',
 'adviser',
 'advisor',
 'advocacy',
 'advocate',
 'aes',
 'aesthetic',
 'affair',
 'affiliate',
 'affinity',
 'affordability',
 'afterall',
 'aftermath'

In [112]:
#Density matrix

doc_term_matrix = terms_matrix.todense()

In [125]:
#create dataframe

df_words = pd.DataFrame(doc_term_matrix, columns=terms, index=file_unique_urls.title)
df_words.head(20)

Unnamed: 0_level_0,ability,abolition,abraham,absence,absorption,abstract,abstraction,abundance,abuse,abuzz,...,zip,zombie,zone,zsh,zuboff,zuck,zuckerberg,zug,zulu,zymergen
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
What the Best Smartphone for keeping your Conversations Private?,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.07128,0.0,0.0,0.0
Blockchain: What the Hell is a Merkle Tree?,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.153608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Data Signals vs. Noise: Misleading Metrics and Misconceptions About Crypto-Asset Analytics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A crypto-trader diary week 1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Understanding Cryptocurrency Development,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
How Irrational Exuberance aligned Dot-Com and Crypto bubbles.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Security tokens: The New Blockchain Era,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ethereum.org's New Art: \nRethinking Identity as Community,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
What Does India's Potential Crypto Ban Mean for Startups?,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Ultimate Guide To Understanding EOS Accounts,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [127]:
for index, row in df_words.iterrows():
    print (row['title'])

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.05642879588582751
0.0
0.0
0.0
0.0
0.10005806932926355
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.08241100879837845
0.05530763200973759
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.10354613779934818
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


TypeError: Cannot index by location index with a non-integer key

In [106]:
v_list = []
for val in(df_words.T.values):
    v_list.append(val[np.argsort(val)[-5:]])
    
len(v_list)
    
    

4413

In [107]:
df_words

Unnamed: 0_level_0,ability,abolition,abraham,absence,absorption,abstract,abstraction,abundance,abuse,abuzz,...,zip,zombie,zone,zsh,zuboff,zuck,zuckerberg,zug,zulu,zymergen
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.071280,0.000000,0.0,0.000000
0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.153608,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000


In [247]:
#Cosine similarity

from sklearn.metrics.pairwise import cosine_similarity


In [248]:
dist = 1-cosine_similarity(doc_term_matrix)


In [249]:
dist.shape, dist.dtype

((101, 101), dtype('float64'))

In [250]:
import warnings
from umap import UMAP

In [251]:
def getUmap(file, n_components):
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        umap = UMAP(n_components=n_components, n_epochs=15 , random_state=42).fit_transform(file)
        return pd.DataFrame(umap, columns=[f'emb_{i+1}' for i in range(n_components)])


In [252]:
umap_df= getUmap(tfidf_matrix(file_unique_urls), 3)

NameError: name 'tfidf_matrix' is not defined

In [236]:
from hdbscan import HDBSCAN
def getClusters(umap):
    hdbscan = HDBSCAN(min_cluster_size=5)
    clusters = hdbscan.fit_predict(umap_df)
    return clusters

plt.scatter(umap_df.emb_1, umap_df.emb_2, c=getClusters(umap_df))

getClusters()
file_unique_urls['n_clusters'] = word_clusters


AttributeError: 'numpy.ndarray' object has no attribute 'emb_1'

In [232]:
#draw plot for umap

df = getUmap(df_words, 3).head()
df.head()


Unnamed: 0,emb_1,emb_2,emb_3
0,0.452819,0.664919,-1.699222
1,1.04485,-0.478647,-0.601621
2,0.750031,-1.483364,1.331195
3,-0.442296,-0.558052,-1.806393
4,0.498861,-1.201659,-0.993574


In [233]:
plt.scatter(df.emb_1, df.emb_2, c=getClusters(umap_df))
plt.show(block=False)
plt.pause(6)
plt.close(1)

NameError: name 'params' is not defined

AttributeError: 'numpy.ndarray' object has no attribute 'emb_1'

In [188]:
def articlesinClusters(clusters):  
    return file_unique_urls.title[word_clusters==clusters]

articlesinClusters(1)

NameError: name 'word_clusters' is not defined

In [189]:
articlesinClusters(0)

NameError: name 'word_clusters' is not defined

In [190]:
top_words = articlesinClusters(0).T.sum(axis=1).sort_values(ascending=False)
top_words.head()




NameError: name 'word_clusters' is not defined

In [191]:
def textinClusters(clusters):
    return file_unique_urls.text[word_clusters==clusters]

In [192]:
hdbscan = HDBSCAN(min_cluster_size=5)

clusters = hdbscan.fit(umap_df)
clusters.outlier_scores_


array([0.5350725 , 0.32310482, 0.34174089, 0.50025183, 0.3010297 ,
       0.40482602, 0.25357992, 0.31761138, 0.44206666, 0.33947189,
       0.47653853, 0.14984562, 0.01698264, 0.        , 0.01698264,
       0.07474402, 0.        , 0.09029781, 0.16942869, 0.25850435,
       0.12487042, 0.03438951, 0.24550487, 0.30506118, 0.53949859,
       0.07670499, 0.        , 0.        , 0.        , 0.16589736,
       0.20118901, 0.24803142, 0.20729519, 0.48924596, 0.20863965,
       0.54353319, 0.08315253, 0.08252991, 0.        , 0.09029781,
       0.13328542, 0.11654823, 0.05290244, 0.15513227, 0.28033061,
       0.05372332, 0.        , 0.09377754, 0.08757641, 0.28113803,
       0.01700199, 0.19257058, 0.19257058, 0.47167579, 0.07094543,
       0.03443124, 0.46078495, 0.22941698, 0.23494697, 0.05602902,
       0.27506356, 0.        , 0.27082615, 0.2304009 , 0.13931623,
       0.        , 0.29616051, 0.27082615, 0.28313896, 0.22786327,
       0.33292944, 0.        , 0.13269786, 0.21836554, 0.12393

In [193]:
def wordcloud(text):
    wordcloud = WordCloud(max_font_size=50, background_color="white").generate(text)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()



    

In [194]:
def textWordcloud(lista):
    return wordcloud(' '.join(words_tokenizer(reduce((lambda x,y:x+y),lista))))


       

In [195]:
def clusterWordcloud(dataset):
    for x in set(dataset.n_clusters):
        textWordcloud(dataset[dataset.n_clusters==x].text)
        
        
        

In [215]:
def getDfCluster(cluster):
    return df_words[getClusters(umap_df)==cluster]


s = list(getDfCluster(1).T.sum(axis=1).sort_values(ascending=False).index[:20])
s



['people',
 'person',
 'tech',
 'machine',
 'year',
 'group',
 'sense',
 'venture',
 'startup',
 'work',
 'world',
 'learning',
 'business',
 'role',
 'team',
 'end',
 'job',
 'experience',
 'time',
 'thing']

In [197]:
top_words_cluster_min1= getDfCluster(-1).T.sum(axis=1).sort_values(ascending=False)
top_words_cluster_0 = getDfCluster(0).T.sum(axis=1).sort_values(ascending=False)
top_words_cluster_1= getDfCluster(1).T.sum(axis=1).sort_values(ascending=False)

print(top_words_cluster_min1[:5])
print(top_words_cluster_0[:5])
print(top_words_cluster_1[:5])

example    3.694261
way        3.608138
time       3.555486
thing      3.490835
project    3.450790
dtype: float64
ethereum       1.600581
blockchain     1.600377
transaction    1.555163
bitcoin        1.535256
currency       1.435997
dtype: float64
people     1.766872
person     1.704963
tech       1.679810
machine    1.670389
year       1.660100
dtype: float64


In [198]:
keywords= []

for i in range(len(df_words)):
    keywords.append(nlp(df_words.iloc[i,:].sort_values(ascending=False)[:5].index[0])[0])

keywords

[bank,
 chance,
 fee,
 age,
 difference,
 crypto,
 chance,
 website,
 bank,
 resource,
 fee,
 aren,
 age,
 profit,
 fee,
 website,
 eye,
 crypto,
 search,
 wouldn,
 crypto,
 search,
 basis,
 search,
 crypto,
 economy,
 crypto,
 credit,
 crypto,
 credit,
 focus,
 crypto,
 wouldn,
 profit,
 eye,
 basis,
 economy,
 fee,
 crypto,
 focus,
 chance,
 chance,
 economy,
 chance,
 chance,
 practice,
 crypto,
 resource,
 eye,
 aren,
 search,
 profit,
 chance,
 practice,
 wouldn,
 search,
 practice,
 post,
 resource,
 address,
 communication,
 eye,
 difference,
 understanding,
 record,
 age,
 human,
 human,
 focus,
 age,
 eye,
 communication,
 resource,
 difference,
 age,
 age,
 wouldn,
 communication,
 eye,
 chance,
 mind,
 program,
 resource,
 age,
 focus,
 practice,
 communication,
 year,
 future,
 bank,
 search,
 communication,
 chance,
 credit,
 age,
 focus,
 eye,
 practice,
 credit,
 profit,
 chance]

In [199]:
for word in df_words:
    print(nlp(word))

ability
access
account
action
activity
address
advantage
age
ai
algorithm
analysis
answer
app
application
approach
area
aren
article
asset
attention
bank
basis
benefit
bit
bitcoin
blockchain
business
capital
case
challenge
chance
change
code
coin
communication
community
company
computer
concept
contract
control
core
cost
country
course
credit
crypto
cryptocurrencie
cryptocurrency
currency
customer
data
day
decision
design
detail
developer
development
difference
doesn
economy
ecosystem
effort
end
environment
ethereum
event
example
exchange
experience
eye
facebook
fact
factor
feature
fee
field
focus
form
founder
friend
fund
future
goal
government
group
growth
hand
hour
human
idea
impact
individual
industry
information
instance
intelligence
interest
internet
investment
investor
issue
job
kind
knowledge
language
learning
level
life
line
list
look
lot
machine
market
medium
method
mind
model
money
month
need
network
news
number
operation
opinion
opportunity
option
order
party
pattern
payment

In [200]:
nlp(df_words.iloc[0,:].sort_values(ascending=False)[:5].index[0])[0].pos_

'NOUN'

In [201]:
token_words = []

for i in range(len(df_words)):
    token_words.append(df_words.iloc[i,:].sort_values(ascending=False)[:5])
    
                    
token_words


[bank             0.131199
 aren             0.131199
 communication    0.131199
 resource         0.131199
 difference       0.131199
 Name: What the Best Smartphone for keeping your Conversations Private?, dtype: float64,
 chance     0.210877
 search     0.210877
 goal       0.198601
 science    0.194916
 opinion    0.194916
 Name: Blockchain: What the Hell is a Merkle Tree?, dtype: float64,
 fee           0.159557
 crypto        0.159557
 age           0.159557
 difference    0.159557
 period        0.156291
 Name: Data Signals vs. Noise: Misleading Metrics and Misconceptions About Crypto-Asset Analytics, dtype: float64,
 age       0.145410
 aren      0.145410
 profit    0.145410
 view      0.142433
 plan      0.142433
 Name: A crypto-trader diary  week 1, dtype: float64,
 difference    0.157972
 address       0.154737
 core          0.154737
 mind          0.154737
 look          0.151678
 Name: Understanding Cryptocurrency Development, dtype: float64,
 crypto     0.162421
 profit 

In [202]:
words = df_words.iloc[0,:].sort_values(ascending=False)[:20].index
words

Index(['bank', 'aren', 'communication', 'resource', 'difference', 'crypto',
       'knowledge', 'address', 'operation', 'situation', 'news', 'hour',
       'privacy', 'record', 'action', 'ai', 'rule', 'purpose', 'detail',
       'ethereum'],
      dtype='object')

In [203]:
keywords = []
for word in words:
    token_type = nlp(word)[0].pos_
    if token_type != 'NOUN' and token_type!= 'PROPN':
        continue
    
    keywords.append(word)

keywords

['bank',
 'aren',
 'communication',
 'resource',
 'difference',
 'knowledge',
 'address',
 'operation',
 'situation',
 'news',
 'hour',
 'privacy',
 'record',
 'action',
 'rule',
 'purpose',
 'detail']