In [25]:
import re
import numpy as np
import pandas as pd
import spacy
import string
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from hdbscan import HDBSCAN
from umap import UMAP
from mpl_toolkits.mplot3d import Axes3D
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from functools import reduce
%matplotlib inline


In [26]:
file = pd.read_csv('text_classifier.csv', usecols=['id', 'title', 'text', 'Tags'])

In [27]:
file.head()

Unnamed: 0,id,title,text,Tags
0,https://hackernoon.com/whats-the-best-smartpho...,What the Best Smartphone for keeping your Conv...,"In the world of today, personal data like our ...",Blockchain
1,https://hackernoon.com/whats-the-best-smartpho...,What the Best Smartphone for keeping your Conv...,"In the world of today, personal data like our ...",Bitcoin
2,https://hackernoon.com/whats-the-best-smartpho...,What the Best Smartphone for keeping your Conv...,"In the world of today, personal data like our ...",Cryptocurrency
3,https://hackernoon.com/whats-the-best-smartpho...,What the Best Smartphone for keeping your Conv...,"In the world of today, personal data like our ...",Education
4,https://hackernoon.com/blockchain-what-the-hel...,Blockchain: What the Hell is a Merkle Tree?,No you can find this kind of a tree in the for...,Blockchain


In [28]:
file.Tags.nunique()


407

In [29]:
#create a dataframe copy to remove duplicated id's

file_unique_urls = file.copy()
file_unique_urls.drop_duplicates(subset='id', inplace=True)

In [30]:
file_unique_urls.text.dtype

dtype('O')

In [31]:
#display tags

with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
    print(file['Tags'].value_counts())

Latest Tech Stories               67
Cryptocurrency                    49
Hackernoon Top Story              29
Artificial Intelligence           24
Crypto                            22
Blockchain                        17
Machine Learning                  16
Startups                          15
Coding                            15
Ai                                14
Programming                       13
Deep Learning                      9
Data Science                       9
Software Development               9
Bitcoin                            8
Ethereum                           8
Education                          7
Libra                              6
Javascript                         6
Startup                            6
Facebook                           6
Facebook Libra                     6
Latest Tech Story                  4
Cryptocurrency Investment          3
Calibra                            3
Blockchain Development             3
Venture Capital                    3
P

In [172]:
#start preprocessing
nlp = English()


In [178]:
#create a words tokenizer

def words_tokenizer(text):
    tokens = nlp(text)
    filtered_tokens = []
    for word in tokens:
        lemma = word.lemma_.lower().strip()
        pos = word.pos
        if lemma not in STOP_WORDS and re.search ('^[a-zA-Z]+$', lemma) and (pos == 'NOUN' and pos == 'PROPN'):
            filtered_tokens.append(lemma)
    return list(set(filtered_tokens))

In [179]:
words_tokenizer(file_unique_urls.text.values[1])

[]

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [36]:
#vectorization of token words and document-term matrix creation.

tfidf_vectorizer = TfidfVectorizer(min_df=0.15, max_df= 0.9, tokenizer=words_tokenizer)
terms_matrix = tfidf_vectorizer.fit_transform(file_unique_urls.text)
print("shape of the matrix:", terms_matrix.shape)


shape of the matrix: (101, 458)


In [37]:
#Feature words in docs

terms = tfidf_vectorizer.get_feature_names()
terms

['ability',
 'able',
 'access',
 'accord',
 'account',
 'achieve',
 'act',
 'action',
 'activity',
 'add',
 'address',
 'advance',
 'advantage',
 'age',
 'ago',
 'algorithm',
 'allow',
 'analysis',
 'answer',
 'app',
 'application',
 'approach',
 'area',
 'aren',
 'article',
 'artificial',
 'ask',
 'asset',
 'attempt',
 'attention',
 'available',
 'away',
 'bad',
 'bank',
 'base',
 'basis',
 'begin',
 'believe',
 'benefit',
 'big',
 'billion',
 'bitcoin',
 'bite',
 'blockchain',
 'book',
 'break',
 'bring',
 'build',
 'business',
 'buy',
 'capital',
 'case',
 'cause',
 'certain',
 'challenge',
 'chance',
 'change',
 'check',
 'choose',
 'clear',
 'close',
 'code',
 'coin',
 'come',
 'common',
 'communication',
 'community',
 'company',
 'compare',
 'complete',
 'complex',
 'computer',
 'concept',
 'concern',
 'connect',
 'consider',
 'content',
 'continue',
 'contract',
 'control',
 'core',
 'cost',
 'country',
 'course',
 'cover',
 'create',
 'credit',
 'crypto',
 'cryptocurrencies',


In [38]:
#Density matrix

doc_term_matrix = terms_matrix.todense()

In [39]:
#create dataframe

df_words = pd.DataFrame(doc_term_matrix, columns=terms, index=file_unique_urls.title)
df_words.head(-10)


Unnamed: 0_level_0,ability,able,access,accord,account,achieve,act,action,activity,add,...,win,word,work,world,worry,worth,wouldn,write,wrong,year
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
What the Best Smartphone for keeping your Conversations Private?,0.000000,0.061334,0.072445,0.000000,0.000000,0.081310,0.000000,0.084368,0.000000,0.000000,...,0.000000,0.071357,0.043265,0.050761,0.000000,0.000000,0.000000,0.000000,0.072445,0.050761
Blockchain: What the Hell is a Merkle Tree?,0.000000,0.092491,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.065244,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Data Signals vs. Noise: Misleading Metrics and Misconceptions About Crypto-Asset Analytics,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.089989,...,0.000000,0.000000,0.058625,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
A crypto-trader diary week 1,0.000000,0.000000,0.000000,0.072502,0.073534,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.046581,0.000000,0.000000,0.090834,0.000000,0.072502,0.077998,0.000000
Understanding Cryptocurrency Development,0.000000,0.000000,0.000000,0.081641,0.082802,0.000000,0.000000,0.000000,0.000000,0.080513,...,0.000000,0.086510,0.000000,0.000000,0.106382,0.000000,0.000000,0.000000,0.000000,0.000000
How Irrational Exuberance aligned Dot-Com and Crypto bubbles.,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.081960,...,0.000000,0.088065,0.053395,0.000000,0.000000,0.104122,0.000000,0.000000,0.000000,0.062646
Security tokens: The New Blockchain Era,0.000000,0.000000,0.104459,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.106082,0.102890,0.000000,0.073193,0.126525,0.000000,0.000000,0.000000,0.000000,0.073193
Ethereum.org's New Art: \nRethinking Identity as Community,0.000000,0.082344,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.089160,...,0.000000,0.095801,0.058085,0.068150,0.000000,0.000000,0.000000,0.090409,0.000000,0.068150
What Does India's Potential Crypto Ban Mean for Startups?,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.113540,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.068313
The Ultimate Guide To Understanding EOS Accounts,0.000000,0.000000,0.000000,0.000000,0.102641,0.000000,0.000000,0.126790,0.000000,0.099804,...,0.000000,0.000000,0.065020,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [40]:
words = df_words.columns
print(words)

Index(['ability', 'able', 'access', 'accord', 'account', 'achieve', 'act',
       'action', 'activity', 'add',
       ...
       'win', 'word', 'work', 'world', 'worry', 'worth', 'wouldn', 'write',
       'wrong', 'year'],
      dtype='object', length=458)


In [41]:
v_list = []
for val in(df_words.T.values):
    v_list.append(val[np.argsort(val)[-5:]])
    
len(v_list)
    
    

458

In [42]:
df_words

Unnamed: 0_level_0,ability,able,access,accord,account,achieve,act,action,activity,add,...,win,word,work,world,worry,worth,wouldn,write,wrong,year
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
What the Best Smartphone for keeping your Conversations Private?,0.000000,0.061334,0.072445,0.000000,0.000000,0.081310,0.000000,0.084368,0.000000,0.000000,...,0.000000,0.071357,0.043265,0.050761,0.000000,0.000000,0.000000,0.000000,0.072445,0.050761
Blockchain: What the Hell is a Merkle Tree?,0.000000,0.092491,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.065244,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Data Signals vs. Noise: Misleading Metrics and Misconceptions About Crypto-Asset Analytics,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.089989,...,0.000000,0.000000,0.058625,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
A crypto-trader diary week 1,0.000000,0.000000,0.000000,0.072502,0.073534,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.046581,0.000000,0.000000,0.090834,0.000000,0.072502,0.077998,0.000000
Understanding Cryptocurrency Development,0.000000,0.000000,0.000000,0.081641,0.082802,0.000000,0.000000,0.000000,0.000000,0.080513,...,0.000000,0.086510,0.000000,0.000000,0.106382,0.000000,0.000000,0.000000,0.000000,0.000000
How Irrational Exuberance aligned Dot-Com and Crypto bubbles.,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.081960,...,0.000000,0.088065,0.053395,0.000000,0.000000,0.104122,0.000000,0.000000,0.000000,0.062646
Security tokens: The New Blockchain Era,0.000000,0.000000,0.104459,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.106082,0.102890,0.000000,0.073193,0.126525,0.000000,0.000000,0.000000,0.000000,0.073193
Ethereum.org's New Art: \nRethinking Identity as Community,0.000000,0.082344,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.089160,...,0.000000,0.095801,0.058085,0.068150,0.000000,0.000000,0.000000,0.090409,0.000000,0.068150
What Does India's Potential Crypto Ban Mean for Startups?,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.113540,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.068313
The Ultimate Guide To Understanding EOS Accounts,0.000000,0.000000,0.000000,0.000000,0.102641,0.000000,0.000000,0.126790,0.000000,0.099804,...,0.000000,0.000000,0.065020,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [44]:
#Cosine similarity

from sklearn.metrics.pairwise import cosine_similarity


In [45]:
dist = 1-cosine_similarity(doc_term_matrix)


In [46]:
dist.shape, dist.dtype

((101, 101), dtype('float64'))

In [47]:
import warnings
from umap import UMAP

In [48]:
def getUmap(dataset, n_components):
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        umap = UMAP(n_components=n_components, n_epochs=15 , random_state=42).fit_transform(dataset)
        return umap


In [23]:
#draw plot for umap

umap_df = getUmap(df_words, 3)
umap_df

array([[ 1.73767138e+00, -8.79085481e-01, -6.38789952e-01],
       [-7.29743958e-01,  1.26263118e+00, -6.78081632e-01],
       [ 1.29205370e+00, -6.11839712e-01,  1.75277162e+00],
       [ 5.57603776e-01,  1.65462518e+00,  2.62813896e-01],
       [ 5.32841921e-01,  3.40831816e-01, -1.26164210e+00],
       [ 1.47629011e+00, -2.28825524e-01,  1.88518405e+00],
       [ 2.37368584e+00, -6.97284639e-01,  4.08992916e-01],
       [-4.95299429e-01, -8.48308146e-01,  6.78063214e-01],
       [ 1.61243773e+00, -1.47570562e+00,  1.09065211e+00],
       [ 9.33211207e-01,  2.44097903e-01, -1.48942029e+00],
       [ 2.64426470e+00,  6.39723301e-01,  3.28687504e-02],
       [ 1.41967130e+00,  1.20038474e+00,  2.36656934e-01],
       [ 1.89478803e+00, -1.23177660e+00,  1.68533087e+00],
       [ 1.60134528e-02,  9.02651072e-01,  1.61034298e+00],
       [ 1.80125558e+00, -6.98018849e-01, -3.87517959e-01],
       [ 1.22465992e+00,  1.11575261e-01, -5.52717030e-01],
       [ 1.10446441e+00,  1.16286623e+00

In [49]:
from hdbscan import HDBSCAN

In [50]:
def getClusters(umap):
    hdbscan = HDBSCAN(min_cluster_size=5)
    clusters = hdbscan.fit_predict(umap_df)
    return clusters

plt.scatter(umap_df.emb_1, umap_df.emb_2, c=getClusters(umap_df))

getClusters()
file_unique_urls['n_clusters'] = word_clusters

AttributeError: 'numpy.ndarray' object has no attribute 'emb_1'

In [51]:
def articlesinClusters(clusters):  
    return file_unique_urls.title[word_clusters==clusters]

articlesinClusters(1)

NameError: name 'word_clusters' is not defined

In [52]:
articlesinClusters(0)

NameError: name 'word_clusters' is not defined

In [53]:
top_words = articlesinClusters(0).T.sum(axis=1).sort_values(ascending=False)
top_words.head()




NameError: name 'word_clusters' is not defined

In [54]:
def textinClusters(clusters):
    return file_unique_urls.text[word_clusters==clusters]

In [55]:
hdbscan = HDBSCAN(min_cluster_size=5)

clusters = hdbscan.fit(umap_df)
clusters.outlier_scores_


array([0.4253456 , 0.35532371, 0.04647314, 0.29386543, 0.36706536,
       0.15210612, 0.37472886, 0.20601671, 0.37215532, 0.39799675,
       0.5011646 , 0.22291907, 0.32157521, 0.55493545, 0.34286837,
       0.11109901, 0.19198117, 0.        , 0.291091  , 0.24193491,
       0.        , 0.31336062, 0.09045879, 0.34140609, 0.13746242,
       0.03204937, 0.        , 0.13746242, 0.2136784 , 0.06111122,
       0.14842616, 0.10651304, 0.09848337, 0.22291907, 0.        ,
       0.07616841, 0.        , 0.04901706, 0.10620711, 0.2097445 ,
       0.        , 0.14957594, 0.08877376, 0.        , 0.20506682,
       0.        , 0.03204937, 0.        , 0.11869007, 0.51232067,
       0.37290551, 0.25989151, 0.21417596, 0.20063013, 0.25497454,
       0.25702026, 0.10975763, 0.18756791, 0.22058194, 0.284066  ,
       0.07568533, 0.09848337, 0.26480996, 0.13444096, 0.24244119,
       0.03180324, 0.24826583, 0.20547392, 0.11444227, 0.07317399,
       0.3515925 , 0.07345366, 0.10222407, 0.        , 0.38906

In [57]:
def wordcloud(text):
    wordcloud = WordCloud(max_font_size=50, background_color="white").generate(text)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()



    

In [58]:
def textWordcloud(lista):
    return wordcloud(' '.join(words_tokenizer(reduce((lambda x,y:x+y),lista))))


       

In [59]:
def clusterWordcloud(dataset):
    for x in set(dataset.n_clusters):
        textWordcloud(dataset[dataset.n_clusters==x].text)
        
        
        

In [160]:
def getDfCluster(cluster):
    return df_words[getClusters(umap_df)==cluster]

(getDfCluster(-1))



Unnamed: 0_level_0,ability,able,access,accord,account,achieve,act,action,activity,add,...,win,word,work,world,worry,worth,wouldn,write,wrong,year
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
What the Best Smartphone for keeping your Conversations Private?,0.000000,0.061334,0.072445,0.000000,0.000000,0.081310,0.000000,0.084368,0.000000,0.000000,...,0.000000,0.071357,0.043265,0.050761,0.000000,0.000000,0.000000,0.000000,0.072445,0.050761
Blockchain: What the Hell is a Merkle Tree?,0.000000,0.092491,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.065244,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Data Signals vs. Noise: Misleading Metrics and Misconceptions About Crypto-Asset Analytics,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.089989,...,0.000000,0.000000,0.058625,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
A crypto-trader diary week 1,0.000000,0.000000,0.000000,0.072502,0.073534,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.046581,0.000000,0.000000,0.090834,0.000000,0.072502,0.077998,0.000000
Understanding Cryptocurrency Development,0.000000,0.000000,0.000000,0.081641,0.082802,0.000000,0.000000,0.000000,0.000000,0.080513,...,0.000000,0.086510,0.000000,0.000000,0.106382,0.000000,0.000000,0.000000,0.000000,0.000000
How Irrational Exuberance aligned Dot-Com and Crypto bubbles.,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.081960,...,0.000000,0.088065,0.053395,0.000000,0.000000,0.104122,0.000000,0.000000,0.000000,0.062646
Security tokens: The New Blockchain Era,0.000000,0.000000,0.104459,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.106082,0.102890,0.000000,0.073193,0.126525,0.000000,0.000000,0.000000,0.000000,0.073193
Ethereum.org's New Art: \nRethinking Identity as Community,0.000000,0.082344,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.089160,...,0.000000,0.095801,0.058085,0.068150,0.000000,0.000000,0.000000,0.090409,0.000000,0.068150
What Does India's Potential Crypto Ban Mean for Startups?,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.113540,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.068313
The Ultimate Guide To Understanding EOS Accounts,0.000000,0.000000,0.000000,0.000000,0.102641,0.000000,0.000000,0.126790,0.000000,0.099804,...,0.000000,0.000000,0.065020,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [163]:
top_words_cluster_min1= getDfCluster(-1).T.sum(axis=1).sort_values(ascending=False)
top_words_cluster_0 = getDfCluster(0).T.sum(axis=1).sort_values(ascending=False)
top_words_cluster_1= getDfCluster(1).T.sum(axis=1).sort_values(ascending=False)

print(top_words_cluster_min1[:5])
print(top_words_cluster_0[:5])
print(top_words_cluster_1[:5])




read    3.145342
like    3.107169
use     3.079257
time    3.056297
way     2.990833
dtype: float64
model         0.900004
learn         0.876937
experience    0.840412
machine       0.838258
datum         0.834201
dtype: float64
crypto      0.679713
exchange    0.649534
increase    0.635209
numb        0.615511
offer       0.585318
dtype: float64


In [112]:
keywords= []

for i in range(len(df_words)):
    keywords.append(nlp(df_words.iloc[i,:].sort_values(ascending=False)[:5].index[0])[0])

keywords

[file,
 size,
 age,
 profit,
 remove,
 cause,
 remove,
 particular,
 country,
 resource,
 credit,
 resource,
 age,
 remove,
 monitor,
 website,
 size,
 cause,
 near,
 ago,
 website,
 credit,
 basis,
 remove,
 speed,
 credit,
 particular,
 credit,
 age,
 credit,
 entire,
 strong,
 profit,
 profit,
 website,
 credit,
 cause,
 remove,
 communication,
 strong,
 chance,
 remove,
 speed,
 chance,
 monitor,
 difference,
 resource,
 website,
 eye,
 near,
 strong,
 profit,
 speed,
 fall,
 didn,
 basis,
 ready,
 home,
 near,
 file,
 communication,
 eye,
 difference,
 file,
 remove,
 basis,
 advance,
 book,
 economy,
 forward,
 resource,
 age,
 forward,
 difference,
 near,
 remove,
 difference,
 entire,
 advance,
 chance,
 home,
 didn,
 near,
 forward,
 monitor,
 home,
 communication,
 read,
 present,
 didn,
 difference,
 communication,
 didn,
 credit,
 didn,
 didn,
 economy,
 cause,
 pretty,
 profit,
 chance]

In [106]:
for word in df_words:
    print(nlp(word))

ability
able
access
accord
account
achieve
act
action
activity
add
address
advance
advantage
age
ago
algorithm
allow
analysis
answer
app
application
approach
area
aren
article
artificial
ask
asset
attempt
attention
available
away
bad
bank
base
basis
begin
believe
benefit
big
billion
bitcoin
bite
blockchain
book
break
bring
build
business
buy
capital
case
cause
certain
challenge
chance
change
check
choose
clear
close
code
coin
come
common
communication
community
company
compare
complete
complex
computer
concept
concern
connect
consider
content
continue
contract
control
core
cost
country
course
cover
create
credit
crypto
cryptocurrencies
cryptocurrency
currency
current
customer
datum
day
deal
decentralize
decide
decision
deep
depend
describe
design
detail
develop
develope
developer
development
didn
difference
different
difficult
digital
discuss
distribute
doe
doesn
don
drive
easy
economy
ecosystem
effort
enable
end
engineer
ensure
entire
environment
etc
ethereum
event
example
exchange
ex

In [125]:
nlp(df_words.iloc[0,:].sort_values(ascending=False)[:5].index[0])[0].pos_

'NOUN'

In [133]:
token_words = []

for i in range(len(df_words)):
    token_words.append(df_words.iloc[i,:].sort_values(ascending=False)[:5])
    
                    
token_words


[file          0.089583
 advance       0.089583
 home          0.089583
 difference    0.089583
 remove        0.089583
 Name: What the Best Smartphone for keeping your Conversations Private?, dtype: float64,
 size       0.135091
 chance     0.135091
 content    0.132325
 thank      0.132325
 hear       0.132325
 Name: Blockchain: What the Hell is a Merkle Tree?, dtype: float64,
 age           0.121388
 difference    0.121388
 period        0.118903
 introduce     0.118903
 tend          0.118903
 Name: Data Signals vs. Noise: Misleading Metrics and Misconceptions About Crypto-Asset Analytics, dtype: float64,
 profit    0.096449
 pretty    0.096449
 age       0.096449
 lose      0.094474
 aren      0.094474
 Name: A crypto-trader diary  week 1, dtype: float64,
 remove        0.108606
 ready         0.108606
 fall          0.108606
 difference    0.108606
 hear          0.106382
 Name: Understanding Cryptocurrency Development, dtype: float64,
 cause      0.110558
 fall       0.110558
 e

In [132]:
words = df_words.iloc[0,:].sort_values(ascending=False)[:20].index
words

Index(['file', 'advance', 'home', 'difference', 'remove', 'entire', 'near',
       'communication', 'resource', 'remember', 'decentralize', 'ago', 'hear',
       'release', 'sound', 'aren', 'transfer', 'knowledge', 'integrate',
       'etc'],
      dtype='object')

In [140]:
keywords = []
for word in words:
    token_type = nlp(word)[0].pos_
    if token_type != 'NOUN' and token_type!= 'PROPN':
        continue
    
    keywords.append(word)

keywords

['file',
 'advance',
 'home',
 'difference',
 'communication',
 'resource',
 'release',
 'aren',
 'transfer',
 'knowledge']