In [1]:
#Import required packages
import pandas as pd
import numpy as np
import pickle
import re


import nltk
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import datetime, timedelta


from gensim import corpora, models, similarities, matutils

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



In [2]:
# Initial dataframe import: read the pickled corpus of news article text.
# NOTE(review): pickle.load can execute arbitrary code, so only load this file
# if it comes from a trusted source.
with open('../news_articles_scrape/data_frames/bitcoin_news_text_dates_combined_df.pickle', 'rb') as corpus_pickle:
    bitcoin_news_df = pickle.load(corpus_pickle)

### Approach to preprocessing

1- Remove Articles Without Dates

2- Remove capitalization and punctuation

3- Remove overfit words/phrases (including source names, format-specific words (e.g., one source listed the day of the week in the first line of every article), and phrases contained in every article –usually a header/footer); 

4- Remove short words (words less than 3 characters long); 

5- Remove stop words.

6- Convert numbers into words or remove numbers

7- Expand abbreviations

8- Text canonicalization

# Clean Real News Text

In [3]:
# Remove punctuation (except for the period) and lower-case the text.

# Characters that get stripped. The period is intentionally not listed, so it
# is kept in the output. The backslash is written as "\\" because a bare "\,"
# is an invalid escape sequence that newer Python versions warn about.
_PUNCTUATION_TO_STRIP = '''\n“”!()-[]{};:'"\\,<>/?@#$%^&*_~–'''

# Built once: maps every character above to "delete".
_STRIP_PUNCTUATION_TABLE = str.maketrans('', '', _PUNCTUATION_TO_STRIP)


def remove_punctuation_and_lower_case(string):
    """Lower-case ``string`` and delete the listed punctuation characters.

    Newlines, curly quotes, the en dash and the ASCII punctuation in
    ``_PUNCTUATION_TO_STRIP`` are removed outright (not replaced by a space).
    Periods and digits are left untouched.

    Parameters
    ----------
    string : str
        Text to clean.

    Returns
    -------
    str
        The lower-cased text without the stripped characters.
    """
    # A single translate() pass replaces the original per-character
    # str.replace() loop, which rescanned the whole text for every
    # punctuation character it met.
    return string.lower().translate(_STRIP_PUNCTUATION_TABLE)

In [4]:
# Text preprocessing: lower-case every article and strip punctuation
# (periods and digits are kept at this stage).
bitcoin_news_df['text'] = bitcoin_news_df['text'].apply(remove_punctuation_and_lower_case)

In [5]:
# Keep only the sentences that contain a number
def sentences_with_string(string):
    """Return the sentences of ``string`` that contain at least one digit.

    The text is split into sentences with ``nltk.sent_tokenize``; a sentence
    is kept when the regex ``\\d`` matches anywhere in it. Sentence order is
    preserved, and an empty list is returned when nothing qualifies.
    """
    return [
        sentence
        for sentence in nltk.sent_tokenize(string)
        if re.search(r'\d', sentence)
    ]

In [6]:
# Store, per article, the list of sentences that contain a digit.
bitcoin_news_df['sent_with_num'] = bitcoin_news_df['text'].apply(sentences_with_string)

In [7]:
# Remove number-bearing tokens and punctuation from the text column, then
# normalise whitespace.
import re
import string

# Raw-string patterns: the earlier non-raw '\w*\d\w*' literal relied on
# invalid escape sequences, which newer Python versions warn about.
_TOKEN_WITH_DIGIT = re.compile(r'\w*\d\w*')
_ASCII_PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation))
_NOT_WORD_OR_SPACE = re.compile(r'[^\w\s]')


def alphanumeric(x):
    """Replace every word-character run that contains a digit with a space."""
    return _TOKEN_WITH_DIGIT.sub(' ', x)


def punc_lower(x):
    """Lower-case ``x`` and replace each ASCII punctuation character with a space."""
    return _ASCII_PUNCTUATION.sub(' ', x.lower())


def _normalize_article_text(text):
    """Apply the full clean-up to one article, in the original step order."""
    text = punc_lower(alphanumeric(text))
    # Drop anything left that is neither a word character nor whitespace
    # (i.e. punctuation outside string.punctuation).
    text = _NOT_WORD_OR_SPACE.sub('', text)
    # Collapse every whitespace run to one space and trim both ends.
    return ' '.join(text.split())


# One pass over the column instead of four separate map/apply passes.
bitcoin_news_df['text'] = bitcoin_news_df['text'].map(_normalize_article_text)

In [8]:
# Keep rows with a date and remove all else.
# A row is dropped when its date equals either placeholder string below.
has_usable_date = (
    (bitcoin_news_df.date != '0001-11-30 00:00:00+00:00')
    & (bitcoin_news_df.date != 'None')
)
# .copy() gives an independent frame rather than a slice of the original, so
# assigning to its columns afterwards does not trigger pandas'
# SettingWithCopyWarning.
bitcoin_news_df = bitcoin_news_df[has_usable_date].copy()

In [9]:
# Strip timezone info by cutting the last six characters off each date's
# string form.
# NOTE(review): this assumes every value ends in a six-character UTC offset
# such as "+00:00" — confirm against the data.
date_text = bitcoin_news_df.date.astype(str).str[:-6]

# Convert all string dates to datetime
bitcoin_news_df.date = pd.to_datetime(date_text)

In [10]:
# Reset the dataframe row index: replace the current index with a fresh
# 0..n-1 range. drop=True discards the old index instead of keeping it as a
# column.
bitcoin_news_df = bitcoin_news_df.reset_index(drop=True)

In [11]:
# Sort the dataframe by date.
# The index is renumbered after sorting so that index labels match row
# positions (0..n-1 in date order); sorting alone would leave the labels in
# their pre-sort order. Reassigning instead of using inplace=True keeps the
# cell safe to re-run.
bitcoin_news_df = bitcoin_news_df.sort_values('date').reset_index(drop=True)

In [12]:
# Round date to nearest hour
def hour_rounder(t):
    """Round ``t`` to the nearest hour based on its minute field.

    Minutes 0-29 round down and minutes 30-59 round up. Seconds and
    microseconds are zeroed and play no part in the rounding decision.
    """
    top_of_hour = t.replace(minute=0, second=0, microsecond=0)
    hours_to_add = t.minute // 30  # 1 when minute >= 30, otherwise 0
    return top_of_hour + timedelta(hours=hours_to_add)



In [13]:
# Round every article timestamp to the nearest hour.
bitcoin_news_df['date'] = bitcoin_news_df['date'].apply(hour_rounder)

### Tokenization
`process of splitting the given text into smaller pieces called tokens`

In [None]:
# Set-up for tokenizing individual texts and removing stop words
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# NLTK's English stop-word list, held as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

In [None]:
def _drop_stop_words(tokens):
    """Return the tokens that are not in ``stop_words``, keeping their order."""
    return [token for token in tokens if token not in stop_words]


# Tokenize Words
tokenized_articles = bitcoin_news_df['text'].apply(word_tokenize)

# Remove Stop Words
bitcoin_news_df['tokenized_text'] = tokenized_articles.apply(_drop_stop_words)

In [None]:
# Show the first five rows of the frame for a quick visual check.
bitcoin_news_df.head(5)

In [None]:
# Create the column that will store the tokenized sentences with numbers.
# Will be used during sentiment analysis.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
bitcoin_news_df['tokenized_sent_with_num'] = np.nan

In [None]:
def _tokenize_sentences(sentences):
    """Word-tokenize each sentence, returning one token list per sentence."""
    return [word_tokenize(sentence) for sentence in sentences]


# Series.apply keeps each result on the row it was computed from. The earlier
# loop wrote through df[col][count] with a positional counter: that is chained
# assignment (unreliable, and a no-op under pandas copy-on-write), and it
# addresses rows by index label, which need not equal the row position.
bitcoin_news_df['tokenized_sent_with_num'] = (
    bitcoin_news_df['sent_with_num'].apply(_tokenize_sentences)
)

### TF-IDF Vectorization
- Gives the relative importance of a term within a document, weighted against how common the term is across the corpus (text data)

In [None]:
# tfid_vectorization function
def tfid_vectorization(df, column_to_vectorize=None, vectorized_name=None):
    """Fit a TF-IDF vectorizer on one column and store each row's encoding.

    Parameters
    ----------
    df : DataFrame
        Frame holding the documents; it is modified in place.
    column_to_vectorize : column label
        Column of ``df`` containing the text documents.
    vectorized_name : column label
        Column of ``df`` that receives the encoded documents.

    Returns
    -------
    DataFrame
        The same ``df`` object that was passed in.
    """
    documents = df[column_to_vectorize]

    # Build the vocabulary from the whole column before encoding any row.
    tfidf = TfidfVectorizer()
    tfidf.fit(documents)

    def encode(document):
        # Each document is wrapped in a one-element list and encoded
        # separately, so every cell holds the result for that row alone.
        return tfidf.transform([document])

    df[vectorized_name] = documents.apply(encode)

    print('Tfid Vectorization Completed \n')

    return df

In [None]:
# TF-IDF vectorization of the document text; the trailing semicolon keeps the
# notebook from echoing the returned dataframe.
tfid_vectorization(
    bitcoin_news_df,
    column_to_vectorize='text',
    vectorized_name='tfid_vec_text',
);

### Lexicon Normalization
- Convert all variant forms of a word into a single normalized form. As a feature-engineering step for text, this maps high-dimensional features (N different surface forms) to a low-dimensional space (1 feature), which is desirable for any ML model.

### Stemming using NLTK
- Stemming is a process of linguistic normalization that reduces words to their root form by chopping off derivational affixes. For example, 'connection', 'connected', and 'connecting' all reduce to the common word 'connect'.

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


def _stem_tokens(tokens):
    """Return the Porter stem of every token, in the original order."""
    return list(map(stemmer.stem, tokens))


# Stemming Text
bitcoin_news_df['stemmed_text'] = bitcoin_news_df['tokenized_text'].apply(_stem_tokens)

### Lemmatization using NLTK

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return the WordNet lemma of every token, in the original order."""
    return list(map(lemmatizer.lemmatize, tokens))


# Lemmatizing Text
bitcoin_news_df['lemmatized_text'] = bitcoin_news_df['tokenized_text'].apply(_lemmatize_tokens)

### POS and Chunking Text
- Helps overcome a weakness of the bag-of-words model, which fails to capture the structure of sentences and therefore sometimes misses their intended meaning.

In [None]:
def _tag_tokens_individually(tokens):
    """Return one ``nltk.pos_tag`` result per token, each tagged on its own."""
    # NOTE(review): every token is passed to pos_tag as a one-word list, so
    # the tagger never sees neighbouring words — confirm this is intended.
    return [nltk.pos_tag([token]) for token in tokens]


# POS-tag the tokenized article text
bitcoin_news_df['pos_text'] = bitcoin_news_df['tokenized_text'].apply(_tag_tokens_individually)

In [None]:
# Create an empty column to hold the POS tags of the tokenized sentences with
# numbers. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
bitcoin_news_df['pos_sent_with_num'] = np.nan

In [None]:
def _tag_sentences(tokenized_sentences):
    """POS-tag each tokenized sentence, returning one tagged list per sentence."""
    return [nltk.pos_tag(sentence) for sentence in tokenized_sentences]


# Series.apply keeps each result on the row it was computed from. The earlier
# loop wrote through df[col][count] with a positional counter: that is chained
# assignment (unreliable, and a no-op under pandas copy-on-write), and it
# addresses rows by index label, which need not equal the row position.
bitcoin_news_df['pos_sent_with_num'] = (
    bitcoin_news_df['tokenized_sent_with_num'].apply(_tag_sentences)
)

In [None]:
# Save the updated dataframe as a pickle for the modeling stage.
with open('./data_frames/bitcoin_news_df_processed_for_modeling.pickle', 'wb') as processed_pickle:
    pickle.dump(bitcoin_news_df, processed_pickle)