In [127]:
import nltk
import re
import pandas as pd
import spacy
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer


In [128]:
# Load only the four columns used in this notebook from the articles CSV.
file = pd.read_csv(
    'text_classifier.csv',
    usecols=['id', 'title', 'text', 'Tags'],
)

In [130]:
# Preview the first five rows of the loaded frame.
file.head(n=5)

Unnamed: 0,id,title,text,Tags
0,https://hackernoon.com/whats-the-best-smartpho...,What the Best Smartphone for keeping your Conv...,"In the world of today, personal data like our ...",Blockchain
1,https://hackernoon.com/whats-the-best-smartpho...,What the Best Smartphone for keeping your Conv...,"In the world of today, personal data like our ...",Bitcoin
2,https://hackernoon.com/whats-the-best-smartpho...,What the Best Smartphone for keeping your Conv...,"In the world of today, personal data like our ...",Cryptocurrency
3,https://hackernoon.com/whats-the-best-smartpho...,What the Best Smartphone for keeping your Conv...,"In the world of today, personal data like our ...",Education
4,https://hackernoon.com/blockchain-what-the-hel...,Blockchain: What the Hell is a Merkle Tree?,No you can find this kind of a tree in the for...,Blockchain


In [131]:
# Number of distinct values in the Tags column.
file['Tags'].nunique()


407

In [136]:
# Keep one row per `id` (the first occurrence); drop_duplicates returns a new
# frame, so `file` itself is left untouched.
file_unique_urls = file.drop_duplicates(subset='id')

In [150]:
# Check the dtype of the article text column.
file_unique_urls['text'].dtype

dtype('O')

In [138]:
# Print how many rows carry each tag, with pandas' row/column display limits lifted
# for the duration of the `with` block only.
with pd.option_context('display.max_columns', None, 'display.max_rows', None):
    print(file.Tags.value_counts())

Cryptocurrency                 15
Blockchain                     10
Crypto                          6
Hackernoon Top Story            4
Artificial Intelligence         4
Coding                          4
Machine Learning                4
Data Science                    3
Startups                        3
Ai                              3
Ethereum                        2
Programming                     2
Education                       2
Bitcoin                         2
Venture Capital                 2
Startup                         2
Tech Debt                       2
Javascript                      2
Ieo                             2
Crowdsourcing                   1
Futurism                        1
Dapps                           1
Latest Tech Stories             1
Futurist                        1
Libra                           1
React                           1
Weak Ai                         1
Market Making                   1
Programming Languages           1
Project Manage

In [146]:
# Preprocessing setup.
# `parser` is a bare English language object; `nlp` is the `en_core_web_sm` pipeline
# loaded from disk.
parser = English()
nlp = spacy.load('en_core_web_sm')

In [147]:
#create a words tokenizer

def words_tokenizer(text, unique=True):
    """Tokenize ``text`` into lower-cased, purely alphabetic lemmas.

    Each token produced by the module-level ``parser`` is reduced to its lemma,
    lower-cased and stripped. A lemma is kept only when it is not in
    ``STOP_WORDS``, consists solely of ASCII letters, and does not end in 'ly'.

    Parameters
    ----------
    text : str
        Raw document text.
    unique : bool, default True
        When True (the original behaviour) each lemma is returned once, in
        order of first appearance. When False every occurrence is returned,
        which preserves per-document term counts if the result is fed to a
        count-based vectorizer.

    Returns
    -------
    list of str
        The filtered lemmas.
    """
    filtered_tokens = []
    for word in parser(text):
        # `lemma_` is an empty string when the pipeline has no lemmatizer
        # (e.g. a blank English() on spaCy v3); fall back to the surface form
        # so the tokenizer does not silently return nothing.
        lemma = (word.lemma_ or word.text).lower().strip()
        # NOTE(review): the 'ly' rule presumably targets adverbs, but it also
        # drops words such as 'apply' or 'family' -- confirm this is intended.
        if lemma in STOP_WORDS or lemma.endswith('ly'):
            continue
        if re.search('^[a-zA-Z]+$', lemma):
            filtered_tokens.append(lemma)
    if unique:
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # result is reproducible across runs (set() ordering is hash-dependent).
        return list(dict.fromkeys(filtered_tokens))
    return filtered_tokens

In [157]:
# Try the tokenizer on the third de-duplicated article.
words_tokenizer(file_unique_urls['text'].values[2])

['predictor',
 'bitcoin',
 'know',
 'complex',
 'network',
 'watson',
 'theory',
 'enigma',
 'measure',
 'human',
 'score',
 'door',
 'subjective',
 'disaster',
 'variable',
 'exception',
 'real',
 'crowdsourced',
 'provider',
 'specific',
 'space',
 'cornerstone',
 'new',
 'speculation',
 'mislead',
 'explanation',
 'significant',
 'performance',
 'crypto',
 'area',
 'lack',
 'describe',
 'regardless',
 'bullet',
 'change',
 'bake',
 'people',
 'rigor',
 'misinterpret',
 'signal',
 'role',
 'serve',
 'linear',
 'technique',
 'assistant',
 'evaluate',
 'model',
 'deep',
 'offer',
 'dissect',
 'require',
 'markets',
 'meaningful',
 'sophisticate',
 'social',
 'today',
 'fantasy',
 'add',
 'recognize',
 'tool',
 'work',
 'technical',
 'ridiculous',
 'asset',
 'trader',
 'demographic',
 'vendor',
 'tailor',
 'factor',
 'public',
 'time',
 'simple',
 'easy',
 'impact',
 'generation',
 'prefer',
 'element',
 'sell',
 'mention',
 'hard',
 'complicated',
 'list',
 'golden',
 'flaw',
 'proof',

In [158]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [159]:
# Vectorise the de-duplicated article texts into a TF-IDF document-term matrix.
# Tokens come from words_tokenizer; float min_df / max_df are document-frequency
# proportions, so very rare and near-ubiquitous terms are dropped.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=words_tokenizer,
    min_df=0.15,
    max_df=0.9,
)
terms_matrix = tfidf_vectorizer.fit_transform(file_unique_urls['text'])
print("shape of the matrix:", terms_matrix.shape)


shape of the matrix: (101, 458)


In [160]:
# Feature words (vocabulary terms) kept by the vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() when it exists and fall back on older releases.
# .tolist() keeps `terms` a plain list of str, as before.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()
print("The number of feature words in all the texts is", len(terms))

The number of feature words in all the texts is 458


In [161]:
# Dense document-term matrix.
# toarray() yields a regular numpy.ndarray; todense() would yield a numpy.matrix,
# which recent scikit-learn input validation rejects.
doc_term_matrix = terms_matrix.toarray()

In [63]:
# Wrap the dense matrix in a DataFrame: rows are indexed by article title,
# columns are labelled with the vocabulary terms.
df_words = pd.DataFrame(
    data=doc_term_matrix,
    index=file_unique_urls['title'],
    columns=terms,
)
df_words.head()


Unnamed: 0_level_0,ability,able,access,accord,account,achieve,act,action,activity,actually,...,win,word,work,world,worry,worth,wouldn,write,wrong,year
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
What the Best Smartphone for keeping your Conversations Private?,0.0,0.017943,0.190742,0.0,0.0,0.023787,0.0,0.024682,0.0,0.020269,...,0.0,0.020875,0.037971,0.0297,0.0,0.0,0.0,0.0,0.042387,0.0297
Blockchain: What the Hell is a Merkle Tree?,0.0,0.034817,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039332,...,0.0,0.0,0.02456,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Data Signals vs. Noise: Misleading Metrics and Misconceptions About Crypto-Asset Analytics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01168,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A crypto-trader diary week 1,0.0,0.0,0.0,0.022479,0.022799,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.028884,0.0,0.0,0.140812,0.0,0.089915,0.048365,0.0
Understanding Cryptocurrency Development,0.0,0.0,0.0,0.016341,0.049721,0.0,0.0,0.0,0.0,0.0,...,0.0,0.051948,0.0,0.0,0.021294,0.0,0.0,0.0,0.0,0.0


In [None]:
#Cosine similarity

from sklearn.metrics.pairwise import cosine_similarity


In [64]:
# Pairwise cosine distance between documents (1 - cosine similarity).
dist = 1.0 - cosine_similarity(doc_term_matrix)


In [72]:
# Inspect the shape and dtype of the distance matrix.
(dist.shape, dist.dtype)

((101, 101), dtype('float64'))

SyntaxError: invalid syntax (<ipython-input-76-7e39275dad0e>, line 1)