In [106]:
#Import required packages
import pandas as pd
import numpy as np
import pickle
import re


import nltk
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim import corpora, models, similarities, matutils

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# Initial DataFrame import: load the pickled corpus of news-article text.
# NOTE: pickle.load can execute arbitrary code while deserialising, so this
# path should only ever point at a pickle produced by this project.
corpus_pickle_path = '../news_articles_scrape/data_frames/bitcoin_news_text_dates_combined_df.pickle'
with open(corpus_pickle_path, 'rb') as corpus_file:
    bitcoin_news_df = pickle.load(corpus_file)

### Approach to preprocessing

1- Remove capitalization and punctuation

2- Remove overfit words/phrases (including source names, format-specific words (e.g., one source listed the day of the week in the first line of every article), and phrases contained in every article –usually a header/footer); 

3- Remove short words (words less than 3 characters long); 

4- Remove stop words.

5- Convert numbers into words, or remove them

6- Expand abbreviations

7- Text canonicalization

# Clean Real News Text

In [None]:
#Lower-case text and remove punctuation (periods are kept)
# Characters stripped from the text. The period is not in this set, so decimal
# points and sentence-ending periods survive. The backslash is written as '\\':
# the previous literal spelled it as '\,' which is an invalid escape sequence
# (a SyntaxWarning on Python 3.12+).
_PUNCTUATION_TO_STRIP = "\n“”!()-[]{};:'\"\\,<>/?@#$%^&*_~–"
_STRIP_PUNCTUATION_TABLE = str.maketrans('', '', _PUNCTUATION_TO_STRIP)


def remove_punctuation_and_lower_case(string):
    """Return ``string`` lower-cased with selected punctuation deleted.

    Every character listed in ``_PUNCTUATION_TO_STRIP`` (including the newline
    and the curly quotes) is removed without a replacement, so words joined by
    one of them are fused: "high-end" becomes "highend".

    Parameters
    ----------
    string : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text; an empty string stays empty.
    """
    # A single translate pass replaces the old per-character str.replace loop,
    # which rescanned the whole text once for every punctuation character found.
    return string.lower().translate(_STRIP_PUNCTUATION_TABLE)

In [None]:
# Text preprocessing: lower-case every article and strip punctuation.
# Digits and periods are still present after this step.
bitcoin_news_df['text'] = bitcoin_news_df['text'].apply(remove_punctuation_and_lower_case)

In [None]:
#Keep only the sentences that contain a number
def sentences_with_string(string):
    """Return the sentences of ``string`` that contain at least one digit.

    The text is split with ``nltk.sent_tokenize`` and a sentence is kept when
    the regular expression ``\\d`` matches anywhere in it. Sentence order is
    preserved; an empty list is returned when no sentence qualifies.
    """
    return [
        sentence
        for sentence in nltk.sent_tokenize(string)
        if re.search(r'\d', sentence)
    ]

In [None]:
# For every article, store the list of its sentences that mention a number.
bitcoin_news_df['sent_with_num'] = bitcoin_news_df['text'].apply(sentences_with_string)

In [None]:
#Remove number-bearing tokens and remaining punctuation from the text column
import re
import string


def alphanumeric(text):
    """Replace every run of word characters that contains a digit with a space."""
    # Raw string: '\w' and '\d' inside a plain literal are invalid escape
    # sequences and trigger a SyntaxWarning on Python 3.12+.
    return re.sub(r'\w*\d\w*', ' ', text)


def punc_lower(text):
    """Lower-case ``text`` and replace each ``string.punctuation`` character with a space."""
    return re.sub('[%s]' % re.escape(string.punctuation), ' ', text.lower())


bitcoin_news_df['text'] = bitcoin_news_df['text'].map(alphanumeric).map(punc_lower)

# Delete whatever is still neither a word character nor whitespace
# (string.punctuation only covers ASCII punctuation).
bitcoin_news_df['text'] = bitcoin_news_df['text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Collapse every run of whitespace into a single space and trim both ends.
bitcoin_news_df['text'] = bitcoin_news_df['text'].apply(lambda x: ' '.join(x.split()))

In [None]:
# Preview the first five rows to inspect the cleaned text column.
bitcoin_news_df.head(5)

### Tokenization
`process of splitting the given text into smaller pieces called tokens`

In [None]:
# Import the word tokenizer and build the English stop-word set that is used
# to filter the tokenized articles.
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# downloaded beforehand (nltk.download('stopwords')) - confirm on a fresh setup.
stop_words = set(stopwords.words('english'))

In [None]:
# Split each article into word tokens, then drop every token that appears in
# the English stop-word set.
bitcoin_news_df['tokenized_text'] = (
    bitcoin_news_df['text']
    .apply(word_tokenize)
    .apply(lambda tokens: [token for token in tokens if token not in stop_words])
)

In [114]:
# Preview the first five rows after tokenization.
# NOTE(review): the saved output of this cell already lists columns such as
# tfid_vec_text, stemmed_text and pos_text, which are only created by cells
# further down - the output was captured after out-of-order execution.
bitcoin_news_df.head(5)

Unnamed: 0,text,date,sent_with_num,tokenized_text,tfid_vec_text,stemmed_text,lemmatized_text,pos_text,tokenized_sent_with_num
0,initial coin offerings icos have raised billio...,2019-05-31 14:34:00+01:00,[initial coin offerings icos have raised 20 bi...,"[initial, coin, offerings, icos, raised, billi...","(0, 127992)\t0.024305504438058914\n (0, 127...","[initi, coin, offer, ico, rais, billion, sinc,...","[initial, coin, offering, icos, raised, billio...","[[(initial, JJ)], [(coin, NN)], [(offerings, N...","[[initial, coin, offerings, icos, have, raised..."
1,watts miners started its operation in the begi...,2019-05-26 13:10:00+01:00,[],"[watts, miners, started, operation, beginning,...","(0, 128438)\t0.10179798676343357\n (0, 1283...","[watt, miner, start, oper, begin, cryptotechno...","[watt, miner, started, operation, beginning, c...","[[(watts, NN)], [(miners, NNS)], [(started, VB...",[]
2,google call screen promises to help you deal w...,2019-06-01 20:37:00+01:00,[bitcoin mining consumes a lot of energy with ...,"[google, call, screen, promises, help, deal, c...","(0, 128398)\t0.040594752388170174\n (0, 128...","[googl, call, screen, promis, help, deal, call...","[google, call, screen, promise, help, deal, ca...","[[(google, NN)], [(call, NN)], [(screen, NN)],...","[[bitcoin, mining, consumes, a, lot, of, energ..."
3,friday may most of the top cryptocurrencies ar...,,[friday may 31 — most of the top 20 cryptocurr...,"[friday, may, top, cryptocurrencies, reporting...","(0, 128331)\t0.05846390674965055\n (0, 1276...","[friday, may, top, cryptocurr, report, moder, ...","[friday, may, top, cryptocurrencies, reporting...","[[(friday, NN)], [(may, MD)], [(top, NN)], [(c...","[[friday, may, 31, —, most, of, the, top, 20, ..."
4,highend swiss watchmaker franck muller has par...,2019-06-02 11:30:19+00:00,[highend swiss watchmaker franck muller has pa...,"[highend, swiss, watchmaker, franck, muller, p...","(0, 128167)\t0.03309387246976487\n (0, 1279...","[highend, swiss, watchmak, franck, muller, par...","[highend, swiss, watchmaker, franck, muller, p...","[[(highend, NN)], [(swiss, JJ)], [(watchmaker,...","[[highend, swiss, watchmaker, franck, muller, ..."


In [108]:
#Create Column to store tokenized sentence with numbers. Will be used during sentiment analysis
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
bitcoin_news_df['tokenized_sent_with_num'] = np.nan

In [None]:
# Tokenize every number-bearing sentence of every article. Each cell ends up
# holding a list of token lists (one per sentence), or an empty list when the
# article has no such sentence.
# One .apply replaces the earlier row loop, which wrote through chained
# indexing (df[col][i] = value): that pattern raises SettingWithCopyWarning,
# silently does nothing under pandas copy-on-write, and only addressed the
# right rows while the index was exactly 0..n-1.
bitcoin_news_df['tokenized_sent_with_num'] = bitcoin_news_df['sent_with_num'].apply(
    lambda sentences: [word_tokenize(sentence) for sentence in sentences]
)

### TF-IDF Vectorization
- Gives the relative importance of a term in a corpus (text data)

In [None]:
#tfid_vectorization function
def tfid_vectorization(df, column_to_vectorize=None, vectorized_name=None):
    """Fit a TF-IDF vocabulary on one text column and store a vector per row.

    Parameters
    ----------
    df : DataFrame
        Frame holding the documents; it is modified in place.
    column_to_vectorize : column label
        Column of ``df`` whose values are fed to ``TfidfVectorizer``. The
        ``None`` default is looked up as-is, so callers are expected to pass
        a real column name.
    vectorized_name : column label
        Column of ``df`` that receives the encoded documents.

    Returns
    -------
    DataFrame
        The same ``df`` object, now containing ``vectorized_name``. Each cell
        is the result of ``vectorizer.transform`` on a one-document list.
    """
    documents = df[column_to_vectorize]

    # Learn the vocabulary and IDF weights from the whole column first.
    tfidf = TfidfVectorizer()
    tfidf.fit(documents)

    def encode(document):
        # transform() expects an iterable of documents, hence the 1-item list.
        return tfidf.transform([document])

    df[vectorized_name] = documents.apply(encode)

    print('Tfid Vectorization Completed \n')

    return df

In [None]:
# TF-IDF encode the cleaned article text into the 'tfid_vec_text' column.
# The trailing ';' keeps the returned frame out of the cell output.
tfid_vectorization(bitcoin_news_df, column_to_vectorize='text', vectorized_name='tfid_vec_text');

### Lexicon Normalization
- Convert all variants of a word into a single normalized form. As a feature-engineering step for text, this maps a high-dimensional feature space (N different features) onto a lower-dimensional one (1 feature), which is desirable for any ML model.

### Stemming using NLTK
- Stemming is a process of linguistic normalization that reduces words to their root form by chopping off derivational affixes. For example, "connection", "connected" and "connecting" all reduce to the common stem "connect".

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()  # not used by the stemming step in this cell


def _stem_tokens(tokens):
    """Return the Porter stem of every token, in the original order."""
    return [stemmer.stem(token) for token in tokens]


# Stem the stop-word-filtered tokens of every article.
bitcoin_news_df['stemmed_text'] = bitcoin_news_df['tokenized_text'].apply(_stem_tokens)

### Lemmatization using NLTK

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return the WordNet lemma of every token, in the original order."""
    return [lemmatizer.lemmatize(token) for token in tokens]


# Lemmatize the stop-word-filtered tokens of every article.
bitcoin_news_df['lemmatized_text'] = bitcoin_news_df['tokenized_text'].apply(_lemmatize_tokens)

### POS and Chunking Text
- Helps overcome a weakness of the bag-of-words model, which fails to capture sentence structure and therefore sometimes misses the intended meaning of a word.

In [None]:
#POS Text
def _pos_tag_tokens(tokens):
    """Part-of-speech tag one article's tokens in a single tagger call.

    Tagging the whole token list at once lets the tagger see neighbouring
    words. The previous code called ``nltk.pos_tag([y])`` for each word on
    its own, so every word was tagged without any context (and the tagger
    was invoked once per token).

    Each ``(token, tag)`` pair is still wrapped in its own one-element list,
    so the column keeps the layout it had before:
    ``[[(tok, tag)], [(tok, tag)], ...]``.
    """
    return [[tagged_token] for tagged_token in nltk.pos_tag(tokens)]


bitcoin_news_df['pos_text'] = bitcoin_news_df['tokenized_text'].apply(_pos_tag_tokens)

In [140]:
#Create empty column to add POS of Tokenized Sentences with Numbers
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
bitcoin_news_df['pos_sent_with_num'] = np.nan

In [150]:
# POS-tag every tokenized number-bearing sentence. Each cell ends up holding
# one list of (token, tag) pairs per sentence.
# One .apply replaces the earlier row loop, which wrote through chained
# indexing (df[col][i] = value): that pattern raises SettingWithCopyWarning,
# silently does nothing under pandas copy-on-write, and only addressed the
# right rows while the index was exactly 0..n-1.
bitcoin_news_df['pos_sent_with_num'] = bitcoin_news_df['tokenized_sent_with_num'].apply(
    lambda sentences: [nltk.pos_tag(tokens) for tokens in sentences]
)

In [155]:
# Order the articles chronologically, modifying the frame in place.
# Rows without a date end up last (pandas' default na_position).
bitcoin_news_df.sort_values(by='date', inplace=True)

In [162]:
# Reset the row index to 0..n-1; drop=True discards the old index instead of
# keeping it as a column.
bitcoin_news_df = bitcoin_news_df.reset_index(drop=True)

In [164]:
# Persist the fully processed frame so the modeling step can load it.
processed_pickle_path = './data_frames/bitcoin_news_df_processed_for_modeling.pickle'
with open(processed_pickle_path, 'wb') as processed_file:
    pickle.dump(bitcoin_news_df, processed_file)