# 1. Import Libraries

In [3]:
from google.colab import drive
drive.mount('/content/drive/')
path= "/content/drive/MyDrive/APESTGTSTONK/assets/btc_all_news.csv"

Mounted at /content/drive/


In [53]:
import pandas as pd 
import re 
import string
import nltk
import sklearn
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [54]:
btc_headlines= pd.read_csv(path)

In [55]:
btc_headlines.tail()

Unnamed: 0,date,headline,hookline,all_content
16652,20150629,\nBlockchain Technology To Disrupt Data Harves...,,It goes without saying that most consumers a...
16653,20150626,\nUsing the Blockchain for the Digitization of...,Think about it like this. If a business were t...,Think about it like this. If a business were t...
16654,20150620,\nHow the Blockchain can be Applied to Ownership,While mainstream media is just now catching on...,While mainstream media is just now catching on...
16655,20150612,\nBlockstream Releases Working Sidechain Proto...,Sidechains have been in development for almost...,Sidechains have been in development for almost...
16656,20150609,"\nRed Cross Donations Go Astray, Time To Embra...",The current financial infrastructure and ecosy...,The current financial infrastructure and ecosy...


# 2. Data Cleaning for BTC News

**Date**
* Sort date 

**Data cleaning:**
* Remove missing values
* Make text all lower case
* Remove special characters
* Tokenize text
* Remove stop words
* Stemming 

## Data Cleaning 1 
- Remove missing values
- Make text lower case 
- Remove special characters
- Remove punctuations
- Remove stop words

**Data Deduplication**

In [56]:
btc_headlines.drop_duplicates(inplace=True)

In [57]:
btc_headlines.sample(15)

Unnamed: 0,date,headline,hookline,all_content
16082,20160611,\nGovernments & Central Banks are Now Funding ...,Governments are showing increasing interest in...,Governments are showing increasing interest in...
14497,20171203,\nJapan Increases Lead – Approves Another Four...,The Japanese Financial Services Agency has app...,The Japanese Financial Services Agency has app...
3493,20180225,\nNew Bitcoin Embassy Opens in the United States,A mooted Bitcoin Embassy is the first of its k...,A mooted Bitcoin Embassy is the first of its k...
13909,20180214,\nKorean Government Answers Petition Against U...,The South Korean government has officially res...,The South Korean government has officially res...
6195,20151005,"\nGemini bitcoin exchange, a first look",The long awaited launch of the Gemini bitcoin ...,The long awaited launch of the Gemini bitcoin ...
2840,20180910,\nTired of Bank Bailouts and Hyperinflation? B...,People often wonder why a cryptocurrency with ...,People often wonder why a cryptocurrency with ...
4637,20170704,"\nRollout of 260,000+ Bitcoin-Accepting Stores...",Three months after Recruit Lifestyle partnered...,Three months after Recruit Lifestyle partnered...
13035,20180516,\nWorld’s Second Largest Search Engine Bans Cr...,"Melissa Alsoszatai-Petheo, of Microsoft’s Bing...","Melissa Alsoszatai-Petheo, of Microsoft’s Bing..."
252,20210609,\nEl Salvador Bitcoin Law Making BTC Legal Ten...,The bill to make bitcoin legal tender in El Sa...,The bill to make bitcoin legal tender in El Sa...
4765,20170531,\nA 'Segwit2x Working Group' Has Submitted a B...,Just recently Bitcoin.com reported on a Bitcoi...,Just recently Bitcoin.com reported on a Bitcoi...


In [58]:
btc_headlines.sort_values("date", inplace= True)

In [59]:
btc_headlines.isna().sum()

date           0
headline       0
hookline       3
all_content    1
dtype: int64

In [60]:
btc_headlines.dropna(inplace= True)
btc_headlines.isna().sum()

date           0
headline       0
hookline       0
all_content    0
dtype: int64

In [61]:
lemmatizer = WordNetLemmatizer()
#decided to lemmatise instead of stem.. 

In [62]:
#decided not to remove stopwords- realised it removes negation words such as "not". eg. "bitocoin not in correction yet" -> "bitcoin correction". might be misleading. 
#thinking to tackle the issue of overly common words with tf-idf instd of stopwords.

def clean_text(text):
    text = " ".join([lemmatizer.lemmatize(w.lower()) for w in word_tokenize(re.sub('[^a-zA-Z]+', ' ', str(text).replace("<br />", ""))) ])
    text= re.sub('[%s]' % re.escape(string.punctuation), '', text) #remove punctuation and words with numbers
    return text

In [63]:
btc_headlines['headline_clean'] = pd.DataFrame(btc_headlines['headline'].apply(clean_text))
btc_headlines['hookline_clean'] = pd.DataFrame(btc_headlines['hookline'].apply(clean_text))
btc_headlines['all_content_clean'] = pd.DataFrame(btc_headlines['all_content'].apply(clean_text))

In [64]:
btc_headlines.iloc[1,6]

'coinbase announced the completion of a mm series c round this week at an approximate valuation of mm this is the largest funding round of any bitcoin company to date and put the amount of vc money invested into the bitcoin ecosystem for at of s total already around mm last year video ceo brian armstrong talk about the funding round on cnbc more interesting than the amount of money raised however is the list of investor who participated in this round the nyse bbva and usaa bank all chipped in along with former citigroup ceo vikram pandit and former thompson reuters ceo thomas glocer also participating wa the capital arm of docomo japan s largest mobile telco pando daily the nyse tweeted we will work with coinbase to bring additional transparency to bitcoin pricing with the nasdaq aligning itself with the winklevoss etf and reportedly working on some blockchain based project of their own it seems like wall street is really starting to warm up to the idea of digital currency check this p

## Stop Words Removal
- Using TF-IDF to find common words 
- Adding common words into stopword list
- Remove common stop words 

In [65]:
#hookline_clean=  ''.join(btc_headlines['hookline_clean'])
#hookline_clean

In [66]:
btc_by_date = btc_headlines.groupby(['date'])['hookline_clean'].apply(lambda x: ','.join(x)).reset_index()
btc_by_date

Unnamed: 0,date,hookline_clean
0,20150118,this week saw the beginning of alleged silk ro...
1,20150125,coinbase announced the completion of a mm seri...
2,20150201,last week i linked to a countdown timer that c...
3,20150222,a few week ago the new hampshire house of repr...
4,20150301,in a move reminiscent of that time wikileaks h...
...,...,...
2258,20210823,the central bank of nigeria cbn ha threatened ...
2259,20210824,global cryptocurrency exchange binance ha hire...
2260,20210825,abey foundation the organization that oversees...
2261,20210826,brazil and argentina are present in the recent...


In [67]:
#TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

tv= TfidfVectorizer(stop_words= 'english') 
data_tv= tv.fit_transform(btc_by_date.hookline_clean)
data_dtm= pd.DataFrame(data_tv.toarray(), columns= tv.get_feature_names())
data_dtm.index= btc_by_date.index
data_dtm

Unnamed: 0,aadhaar,aaron,aarp,aave,aayog,ab,abacha,abaco,abandon,abandoned,abandoning,abandonment,abatement,abating,abbc,abbreviation,abc,abcc,abch,abcon,abcripto,abdol,abducted,abdulaziz,abdulhassan,abdulmalik,abe,abel,aberdeen,abey,abeychain,abide,abiding,abif,abigail,ability,abkhazia,able,abn,abnormal,...,zimswitch,zinnov,zipping,zirp,ziv,zjbc,zk,zkcp,zkin,zkswap,zlcassic,zloty,zodia,zombie,zone,zongolica,zoning,zonte,zoo,zooko,zoom,zorro,zrx,zschaepitz,zse,ztohoven,zubr,zuckerberg,zuckerman,zug,zulutrade,zuma,zurich,zwahlen,zwet,zwl,zy,zyra,zyskind,zytara
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2259,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2260,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.101391,0.202782,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.039996,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [68]:
data= data_dtm.transpose()
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,2223,2224,2225,2226,2227,2228,2229,2230,2231,2232,2233,2234,2235,2236,2237,2238,2239,2240,2241,2242,2243,2244,2245,2246,2247,2248,2249,2250,2251,2252,2253,2254,2255,2256,2257,2258,2259,2260,2261,2262
aadhaar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaron,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aarp,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aave,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aayog,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [69]:

# Find the top 30 words said by each day
top_dict = {}
for c in data.columns:
    top = data[c].sort_values(ascending=False).head(30)
    top_dict[c]= list(zip(top.index, top.values))

top_dict

{0: [('ulbricht', 0.47748654884398645),
  ('trial', 0.31718806750384254),
  ('operator', 0.2479541227633149),
  ('manhattan', 0.22074897620793124),
  ('courthouse', 0.22074897620793124),
  ('dread', 0.22074897620793124),
  ('counsel', 0.2045938968890272),
  ('arguing', 0.20251009260927133),
  ('robert', 0.1921200207931631),
  ('pirate', 0.1921200207931631),
  ('mysterious', 0.1748619941042114),
  ('ross', 0.15748515553141054),
  ('founded', 0.15588615821365706),
  ('silk', 0.15242127452191398),
  ('began', 0.15195361773275415),
  ('beginning', 0.14842516412502985),
  ('road', 0.14842516412502985),
  ('interesting', 0.13992693266553236),
  ('note', 0.135337159086816),
  ('saw', 0.12566438143008263),
  ('site', 0.1206449492096246),
  ('alleged', 0.1206449492096246),
  ('marketplace', 0.11693019572157487),
  ('free', 0.1157761031064799),
  ('federal', 0.10465371474057093),
  ('online', 0.09491238937177665),
  ('week', 0.0602298705267795),
  ('wa', 0.05927678976853826),
  ('exposed', 0.0),

In [70]:
# Print the top 15 words said by each day
for category, top_words in top_dict.items():
    print(category)
    print(', '.join([word for word, count in top_words[0:14]]))
    print('---')

    #find common words , consider them as stop words 

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
shady, preying, scammy, monetas, sketchy, mmm, refrain, btm, btms, canton, suisse, people, xapo, lucrative
---
597
draft, israeli, bitcoin, conclave, nanopayments, congresswoman, satoshipay, framework, byrne, corrected, ita, awaits, misconception, mired
---
598
ey, loss, bitcoin, nuisance, caused, accept, btms, ecommerce, zurich, demise, assist, compromise, press, release
---
599
breitbart, politiken, anders, maced, samuelsen, cancelled, milo, yiannopoulos, downtown, apparel, berkeley, wearing, concurrently, penned
---
600
surbitcoin, attacker, venezuelan, undetectable, keystroke, keylogger, banesco, relayed, copying, verifying, validate, validated, pin, sends
---
601
hearing, volatility, earful, continuation, bucharest, unknowingly, worldwide, singularity, uncommon, postpone, romania, month, excellent, sixth
---
602
victimless, bitcoin, litecoin, crime, bu, silver, surveillance, segwit, lifeblood, undiscussed, pool, caus

In [71]:
# Look at the most common top words --> add them to the stop word list
# Counter makes it easy to count values in a list see more on https://docs.python.org/3/library/collections.html#collections.Counter
from collections import Counter

# Let's first pull out the top 30 words for each category
words = []
for category in data.columns:
    top = [word for (word, count) in top_dict[category]]
    for t in top:
        words.append(t)
        
Counter(words).most_common()



[('bitcoin', 1090),
 ('ha', 544),
 ('crypto', 344),
 ('press', 307),
 ('cryptocurrency', 307),
 ('release', 298),
 ('com', 260),
 ('exchange', 224),
 ('bch', 164),
 ('bank', 161),
 ('cash', 143),
 ('market', 131),
 ('mining', 127),
 ('peer', 112),
 ('blockchain', 106),
 ('wallet', 97),
 ('token', 97),
 ('new', 86),
 ('company', 85),
 ('asset', 85),
 ('price', 84),
 ('service', 82),
 ('tax', 80),
 ('cryptocurrencies', 78),
 ('btc', 72),
 ('million', 71),
 ('technology', 69),
 ('trading', 67),
 ('nft', 66),
 ('revolution', 64),
 ('wa', 58),
 ('block', 57),
 ('payment', 55),
 ('fee', 55),
 ('defi', 55),
 ('gold', 54),
 ('card', 54),
 ('caused', 54),
 ('wright', 54),
 ('etf', 54),
 ('currency', 52),
 ('digital', 52),
 ('fork', 52),
 ('people', 51),
 ('atm', 49),
 ('coin', 49),
 ('china', 48),
 ('project', 48),
 ('sec', 48),
 ('court', 48),
 ('week', 46),
 ('developer', 45),
 ('billion', 45),
 ('investment', 44),
 ('transaction', 44),
 ('city', 44),
 ('slp', 44),
 ('network', 43),
 ('fund',

In [72]:
# If more than 30 days  have it as a top word, exclude it from the list 
add_stop_words = [word for word, count in Counter(words).most_common() if count > 30]
add_stop_words

['bitcoin',
 'ha',
 'crypto',
 'press',
 'cryptocurrency',
 'release',
 'com',
 'exchange',
 'bch',
 'bank',
 'cash',
 'market',
 'mining',
 'peer',
 'blockchain',
 'wallet',
 'token',
 'new',
 'company',
 'asset',
 'price',
 'service',
 'tax',
 'cryptocurrencies',
 'btc',
 'million',
 'technology',
 'trading',
 'nft',
 'revolution',
 'wa',
 'block',
 'payment',
 'fee',
 'defi',
 'gold',
 'card',
 'caused',
 'wright',
 'etf',
 'currency',
 'digital',
 'fork',
 'people',
 'atm',
 'coin',
 'china',
 'project',
 'sec',
 'court',
 'week',
 'developer',
 'billion',
 'investment',
 'transaction',
 'city',
 'slp',
 'network',
 'fund',
 'future',
 'account',
 'year',
 'march',
 'security',
 'dogecoin',
 'usd',
 'miner',
 'ethereum',
 'volume',
 'tether',
 'coinbase',
 'financial',
 'satoshi',
 'state',
 'game',
 'india',
 'hashrate',
 'august',
 'ulbricht',
 'pool',
 'announced',
 'ripple',
 'node',
 'support',
 'hardware',
 'statement',
 'segwit',
 'money',
 'world',
 'paper',
 'investor',
 '

In [73]:
stop = stopwords.words('english')
stop.extend(add_stop_words)
stop

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [74]:
#removing stop words 
def clean_text2(text):
    return " ".join([word for word in text.split() if word not in (stop)])

In [75]:
btc_headlines['headline_clean'] = pd.DataFrame(btc_headlines['headline_clean'].apply(clean_text2))
btc_headlines['hookline_clean'] = pd.DataFrame(btc_headlines['hookline_clean'].apply(clean_text2))
btc_headlines['all_content_clean'] = pd.DataFrame(btc_headlines['all_content_clean'].apply(clean_text2))

In [76]:
btc_headlines.iloc[1,6]

'completion mm series c round approximate valuation mm largest funding round date put amount vc invested ecosystem total already around mm last video ceo brian armstrong talk funding round cnbc interesting amount raised however list participated round nyse bbva usaa chipped along former citigroup ceo vikram pandit former thompson reuters ceo thomas glocer also participating capital arm docomo japan largest mobile telco pando daily nyse tweeted work bring additional transparency pricing nasdaq aligning winklevoss reportedly working seems like wall street really starting warm idea check page tomorrow lunar either gon na really awesome really big letdown crowdfunding software lighthouse finally launched playing around seems pretty slick already software used development protocol early adopter olivier janssens pledged donate platform download lighthouse winklevii plan launch fully regulated fully compliant u named gemini know twin nyt dealbook sign notified advance version go live gemini w

In [77]:
btc_headlines.to_csv("/content/drive/MyDrive/APESTGTSTONK/assets/clean1_all_btc.csv")
