In [1]:
import pandas as pd
import numpy as np
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import nltk
from datetime import datetime
lmtizer=nltk.stem.WordNetLemmatizer()



In [2]:
news = pd.read_csv('bitcoin_news.csv')
news.head()

Unnamed: 0,Date,Summary,Tags,Title
0,2018-09-04T08:00:04-04:00,The Tuesday edition of our daily roundup is ja...,"ATM,Blockchain,Brave,Deutsche Börse,Ethereum,G...","The Daily: Wirex Adds Ethereum, Deutsche Börse..."
1,2018-08-02T17:15:26-04:00,Japan’s SBI Group is reportedly planning to cr...,"ATM,BCH,Bitcoin,bitcoin cash,BTC,BTM,coin asse...","SBI Plans Derivatives Platform, Huobi Eyes 30%..."
2,2018-09-05T01:15:31-04:00,Non-custodial crypto trading platform Shapeshi...,"AML,anti-money laundering,Bitcoin,BTC,crypto,C...",Shapeshift Moves to Membership Model Requiring...
3,2018-08-04T04:55:54-04:00,"Intercontinental Exchange (ICE), owner of argu...","401k,Bitcoin,Boston Consulting Group,Canada,CF...",NYSE Owner: Bitcoin Should Be in Retirement Fu...
4,2018-08-06T03:45:21-04:00,Thailand’s central bank has announced the rule...,"bank of thailand,banks,Bitcoin,Bot,BTC,commerc...",Bank of Thailand Green-Lights Financial Compan...


# Date Mutation

In [3]:
news['Datetime'] = list(map(lambda x: datetime.strptime(x[0:10],'%Y-%m-%d'), news['Date']))
news.head()

Unnamed: 0,Date,Summary,Tags,Title,Datetime
0,2018-09-04T08:00:04-04:00,The Tuesday edition of our daily roundup is ja...,"ATM,Blockchain,Brave,Deutsche Börse,Ethereum,G...","The Daily: Wirex Adds Ethereum, Deutsche Börse...",2018-09-04
1,2018-08-02T17:15:26-04:00,Japan’s SBI Group is reportedly planning to cr...,"ATM,BCH,Bitcoin,bitcoin cash,BTC,BTM,coin asse...","SBI Plans Derivatives Platform, Huobi Eyes 30%...",2018-08-02
2,2018-09-05T01:15:31-04:00,Non-custodial crypto trading platform Shapeshi...,"AML,anti-money laundering,Bitcoin,BTC,crypto,C...",Shapeshift Moves to Membership Model Requiring...,2018-09-05
3,2018-08-04T04:55:54-04:00,"Intercontinental Exchange (ICE), owner of argu...","401k,Bitcoin,Boston Consulting Group,Canada,CF...",NYSE Owner: Bitcoin Should Be in Retirement Fu...,2018-08-04
4,2018-08-06T03:45:21-04:00,Thailand’s central bank has announced the rule...,"bank of thailand,banks,Bitcoin,Bot,BTC,commerc...",Bank of Thailand Green-Lights Financial Compan...,2018-08-06


# Tags

In [4]:
tags = sum(list(map(lambda x: lmtizer.lemmatize(x).split(','), news.Tags[news.Tags.isnull() == False])), [])

In [5]:
tag = []
count = []

for i in set(tags):
    num = tags.count(i)
    tag.append(i)
    count.append(num)
    
df_tag = pd.DataFrame({'tag':tag, 'count':count}).sort_values(by = 'count', ascending = False)

In [6]:
df_tag

Unnamed: 0,tag,count
18165,Bitcoin,2776
7132,N-Featured,2341
10985,Cryptocurrency,1752
13895,N-Economy,997
14037,BTC,992
18727,crypto,721
7375,bitcoin cash,719
16297,Cryptocurrencies,584
9331,Digital Currency,580
14415,BCH,567


# Titles

In [7]:
title = ' '.join(news['Title'])
title = list(map(lambda x: lmtizer.lemmatize(x), simple_preprocess(title)))
title = nltk.pos_tag(title, tagset='universal')

In [8]:
title

[('the', 'DET'),
 ('daily', 'ADJ'),
 ('wirex', 'NOUN'),
 ('add', 'VERB'),
 ('ethereum', 'NOUN'),
 ('deutsche', 'NOUN'),
 ('börse', 'NOUN'),
 ('embrace', 'NOUN'),
 ('blockchain', 'NOUN'),
 ('sbi', 'NOUN'),
 ('plan', 'NOUN'),
 ('derivative', 'ADJ'),
 ('platform', 'NOUN'),
 ('huobi', 'NOUN'),
 ('eye', 'NOUN'),
 ('korean', 'ADJ'),
 ('market', 'NOUN'),
 ('thai', 'NOUN'),
 ('crypto', 'NOUN'),
 ('atm', 'NOUN'),
 ('unveiled', 'VERB'),
 ('shapeshift', 'ADJ'),
 ('move', 'NOUN'),
 ('to', 'PRT'),
 ('membership', 'NOUN'),
 ('model', 'NOUN'),
 ('requiring', 'VERB'),
 ('user', 'ADJ'),
 ('information', 'NOUN'),
 ('nyse', 'NOUN'),
 ('owner', 'NOUN'),
 ('bitcoin', 'NOUN'),
 ('should', 'VERB'),
 ('be', 'VERB'),
 ('in', 'ADP'),
 ('retirement', 'NOUN'),
 ('fund', 'NOUN'),
 ('credit', 'NOUN'),
 ('card', 'NOUN'),
 ('retail', 'ADJ'),
 ('store', 'NOUN'),
 ('bank', 'NOUN'),
 ('of', 'ADP'),
 ('thailand', 'NOUN'),
 ('green', 'ADJ'),
 ('light', 'ADJ'),
 ('financial', 'ADJ'),
 ('company', 'NOUN'),
 ('for', 'ADP'),


In [9]:
nouns_title = [i for i in title if i[1] == 'NOUN']
noun_title = []
noun_title_count = []

noun_title_counting_list = list(map(lambda x: x[0], nouns_title))

for i in set(nouns_title):
    noun_title.append(i[0])
    noun_title_count.append(noun_title_counting_list.count(i[0]))
    
df_nouns_title = pd.DataFrame({'noun_title':noun_title, 'noun_title_count':noun_title_count}).sort_values(by = 'noun_title_count', ascending = False)

In [10]:
df_nouns_title.head(50)

Unnamed: 0,noun_title,noun_title_count
1640,bitcoin,2333
1819,exchange,658
3402,cryptocurrency,548
269,crypto,491
1060,blockchain,428
2355,market,387
2612,bank,335
4156,cash,304
1400,wallet,231
3678,price,216


In [11]:
adjectives_title = [i for i in title if i[1] == 'ADJ']
adjective_title = []
adjective_title_count = []

adjective_title_counting_list = list(map(lambda x: x[0], adjectives_title))

for i in set(adjectives_title):
    adjective_title.append(i[0])
    adjective_title_count.append(adjective_title_counting_list.count(i[0]))
    
df_adjectives_title = pd.DataFrame({'adjective_title':adjective_title, 'adjective_title_count':adjective_title_count}).sort_values(by = 'adjective_title_count', ascending = False)

In [12]:
df_adjectives_title

Unnamed: 0,adjective_title,adjective_title_count
314,bitcoin,584
620,new,510
2203,crypto,252
589,update,144
326,u,119
1149,south,118
1504,japanese,117
1802,chinese,110
1557,korean,106
1116,central,104


# Summaries

In [13]:
summary = ' '.join(news['Summary'])
summary = list(map(lambda x: lmtizer.lemmatize(x), simple_preprocess(summary)))
summary = nltk.pos_tag(summary, tagset='universal')

In [14]:
summary

[('the', 'DET'),
 ('tuesday', 'ADJ'),
 ('edition', 'NOUN'),
 ('of', 'ADP'),
 ('our', 'PRON'),
 ('daily', 'ADJ'),
 ('roundup', 'NOUN'),
 ('is', 'VERB'),
 ('jam', 'ADJ'),
 ('packed', 'VERB'),
 ('with', 'ADP'),
 ('tidbit', 'NOUN'),
 ('snippet', 'NOUN'),
 ('and', 'CONJ'),
 ('other', 'ADJ'),
 ('term', 'NOUN'),
 ('used', 'VERB'),
 ('to', 'PRT'),
 ('describe', 'VERB'),
 ('story', 'NOUN'),
 ('in', 'ADP'),
 ('brief', 'NOUN'),
 ('that', 'ADP'),
 ('deserve', 'NOUN'),
 ('telling', 'VERB'),
 ('on', 'ADP'),
 ('this', 'DET'),
 ('day', 'NOUN'),
 ('september', 'VERB'),
 ('we', 'PRON'),
 ('bring', 'VERB'),
 ('you', 'PRON'),
 ('news', 'NOUN'),
 ('of', 'ADP'),
 ('major', 'ADJ'),
 ('security', 'NOUN'),
 ('trading', 'NOUN'),
 ('firm', 'NOUN'),
 ('boarding', 'VERB'),
 ('the', 'DET'),
 ('blockchain', 'NOUN'),
 ('bandwagon', 'NOUN'),
 ('bitcoin', 'NOUN'),
 ('atm', 'NOUN'),
 ('proliferating', 'VERB'),
 ('in', 'ADP'),
 ('europe', 'NOUN'),
 ('and', 'CONJ'),
 ('brave', 'VERB'),
 ('browser', 'NOUN'),
 ('hitting', '

In [15]:
nouns_summary = [i for i in summary if i[1] == 'NOUN']
noun_summary = []
noun_summary_count = []

noun_summary_counting_list = list(map(lambda x: x[0], nouns_summary))

for i in set(nouns_summary):
    noun_summary.append(i[0])
    noun_summary_count.append(noun_summary_counting_list.count(i[0]))
    
df_nouns_summary = pd.DataFrame({'noun_summary':noun_summary, 'noun_summary_count':noun_summary_count}).sort_values(by = 'noun_summary_count', ascending = False)

In [21]:
df_nouns_summary.head(50)

Unnamed: 0,noun_summary,noun_summary_count
6790,bitcoin,4257
5294,ha,2591
7573,cryptocurrency,1991
2805,exchange,1790
5037,company,977
996,market,977
1550,week,899
7242,bank,809
7768,currency,786
1513,year,751


In [17]:
adjectives_summary = [i for i in summary if i[1] == 'ADJ']
adjective_summary = []
adjective_summary_count = []

adjective_summary_counting_list = list(map(lambda x: x[0], adjectives_summary))

for i in set(adjectives_summary):
    adjective_summary.append(i[0])
    adjective_summary_count.append(adjective_summary_counting_list.count(i[0]))
    
df_adjectives = pd.DataFrame({'adjective_summary':adjective_summary, 'adjective_summary_count':adjective_summary_count}).sort_values(by = 'adjective_summary_count', ascending = False)

In [19]:
df_adjectives.head(50)

Unnamed: 0,adjective_summary,adjective_summary_count
1110,new,1290
559,bitcoin,720
819,digital,712
2844,financial,614
3822,many,535
1479,last,406
521,more,397
2725,recent,392
344,past,388
4007,other,367
