In [1]:
#Import required packages
import pandas as pd
import numpy as np
import pickle
import re


import nltk
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim import corpora, models, similarities, matutils

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



In [8]:
# Initial DataFrame import: read the pickled corpus of news-article text.
# NOTE: pickle.load can execute arbitrary code -- only open pickles produced by this project.
corpus_path = '../news_articles_scrape/data_frames/bitcoin_news_text_dates_combined_df.pickle'
with open(corpus_path, 'rb') as corpus_file:
    bitcoin_news_df = pickle.load(corpus_file)

### Approach to preprocessing

1- Remove capitalization and punctuation

2- Remove overfit words/phrases (including source names, format-specific words (e.g., one source listed the day of the week in the first line of every article), and phrases contained in every article –usually a header/footer); 

3- Remove short words (words less than 3 characters long); 

4- Remove stop words.

5- Convert numbers into words, or remove numbers

6- Expand abbreviations

7- Text canonicalization

# Clean Real News Text

### Text Column Without Numbers

In [12]:
# Character offset of the first digit in the text of the article at index 0.
# The raw-string pattern avoids the invalid "\d" escape-sequence warning, and the
# guard displays None instead of raising AttributeError when no digit is present.
first_digit = re.search(r"\d", bitcoin_news_df.text[0])
first_digit.start() if first_digit else None

43

In [18]:
# Preview the text of the article at index 0.
bitcoin_news_df['text'][0]

'initial coin offerings icos have raised 20 billion since the start of 2017 which is 18 billion more than the previous year according to a recent study by financial research firm autonomous research the study dubbed crypto utopia explores the cryptocurrency industry over the past year focusing on icos and the regulation to which they are exposed per the study 12 billion has been raised through icos in the course of 2018 while last year they raised 7 billion the icos of blockchain protocol eos and messaging app telegram are responsible for almost half of all ico funds in 2018 at 4 2 billion and 1 7 billion respectively though over 300 crypto funds have been launched to invest in crypto assets a vast majority of funds are concentrated within a small minority of organizations according to autonomous the research notes that icos are often exposed to fraud and scams which form 20 percent of project white papers while phishing and hacking are responsible for stealing 15 percent of all crypto

In [13]:
# Text preprocessing: lower-case each article and replace every ASCII punctuation
# character with a space. Digits are kept at this stage -- the number-stripping
# pass (regex '\w*\d\w*' replaced by ' ') is currently disabled.
import re
import string

_ascii_punct = re.compile('[%s]' % re.escape(string.punctuation))


def punc_lower(x):
    """Return ``x`` lower-cased with each punctuation character replaced by a space."""
    return _ascii_punct.sub(' ', x.lower())


bitcoin_news_df['text'] = bitcoin_news_df.text.map(punc_lower)

In [15]:
# Delete any remaining character that is neither a word character nor whitespace.
leftover_symbols = re.compile(r'[^\w\s]')
bitcoin_news_df['text'] = bitcoin_news_df['text'].map(lambda doc: leftover_symbols.sub('', doc))

In [17]:
# Normalize whitespace: trim both ends and collapse every run of whitespace
# into a single space (words stay separated by exactly one space).
def _squeeze_whitespace(doc):
    return ' '.join(doc.split())


bitcoin_news_df['text'] = bitcoin_news_df['text'].map(_squeeze_whitespace)

In [21]:
# #Remove words shorter than 3 characters (keeps only words of 3 or more characters)
# bitcoin_news_df['text']= bitcoin_news_df['text'].str.findall('\w{3,}').str.join(' ')

### The text will be tokenized twice for the following reasons. 
We will first tokenize the text with numbers included. Once the text is tokenized with numbers, the 5 words preceding and following each number will be extracted for analysis. 

After this is complete, all numbers will be removed from the tokenized text. 

### Tokenization
`process of splitting the given text into smaller pieces called tokens`

In [19]:
# Build the set of English stop words used to filter the tokenized articles.
from nltk.corpus import stopwords

stop_words = {word for word in stopwords.words('english')}

In [20]:
# Split each cleaned article into a list of word tokens.
from nltk.tokenize import word_tokenize

bitcoin_news_df['tokenized_text'] = bitcoin_news_df['text'].map(word_tokenize)

In [21]:
# Filter the stop words out of every article's token list.
def _drop_stop_words(tokens):
    return [token for token in tokens if token not in stop_words]


bitcoin_news_df['tokenized_text'] = bitcoin_news_df['tokenized_text'].map(_drop_stop_words)

### TF-IDF Vectorization
- Gives the relative importance of a term in a corpus (text data)

In [None]:
#tfid_vectorization function
def tfid_vectorization(df, column_to_vectorize=None, vectorized_name=None):
    """Add a column of per-document TF-IDF vectors to ``df``.

    A ``TfidfVectorizer`` is fitted on every document in
    ``df[column_to_vectorize]`` and each document's encoded vector is stored
    in ``df[vectorized_name]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the documents. It is modified in place.
    column_to_vectorize : str
        Name of the column containing the text of each document.
    vectorized_name : str
        Name of the column that receives the encoded vectors.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``vectorized_name`` added (or overwritten).
        Each cell holds one single-row sparse matrix, exactly as a
        per-document ``transform([text])`` call would produce.

    Raises
    ------
    ValueError
        If either column name is left as ``None``.
    """
    if column_to_vectorize is None or vectorized_name is None:
        raise ValueError(
            'Both column_to_vectorize and vectorized_name must be provided.'
        )

    # Documents to encode
    article = df[column_to_vectorize]

    # Learn the vocabulary / IDF weights and encode the whole corpus in one pass
    # instead of issuing one transform() call per document.
    tfidf_matrix = TfidfVectorizer().fit_transform(article)

    # Store each document as its own single-row sparse matrix. The object array is
    # filled element by element so the sparse rows are never coerced to dense data.
    n_documents = tfidf_matrix.shape[0]
    encoded_rows = np.empty(n_documents, dtype=object)
    for position in range(n_documents):
        encoded_rows[position] = tfidf_matrix[position:position + 1]
    df[vectorized_name] = encoded_rows

    print('Tfid Vectorization Completed \n')

    return df

In [None]:
# Encode the 'text' column with TF-IDF and store the vectors in 'tfid_vec_text'.
tfid_vectorization(
    bitcoin_news_df,
    column_to_vectorize='text',
    vectorized_name='tfid_vec_text',
);

### Lexicon Normalization
- Convert all variants of a word into a single normalized form. As part of text feature engineering, this maps high-dimensional features (N different features) onto a lower-dimensional space (1 feature), which is desirable for any ML model. 

### Stemming using NLTK
- Stemming is a process of linguistic normalization that reduces words to their root form by chopping off derivational affixes. For example, the words connection, connected, and connecting all reduce to the common stem 'connect'.

In [None]:
from nltk.stem import PorterStemmer

# Stemming text: replace every token with its Porter stem.
# (The unused word_tokenize import and the stray WordNetLemmatizer instance were
# removed from this cell; the lemmatization cell creates its own lemmatizer.)
stemmer = PorterStemmer()
bitcoin_news_df['stemmed_text'] = bitcoin_news_df['tokenized_text'].apply(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)

### Lemmatization using NLTK

In [None]:
from nltk.stem import WordNetLemmatizer

# Lemmatizing text: map every token of every article through WordNet.
# lemmatize() is called without a part-of-speech argument.
lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]


bitcoin_news_df['lemmatized_text'] = bitcoin_news_df['tokenized_text'].map(_lemmatize_tokens)

### POS and Chunking Text
- Helps overcome a weakness of the bag-of-words model, which fails to capture the structure of sentences and therefore sometimes misses a word's appropriate meaning. 

In [None]:
# POS-tag each article's tokens.
# The whole token list is tagged in a single call so the tagger can use the
# neighbouring tokens as context; tagging one word at a time discards that
# context and costs one tagger call per token. Each (token, tag) pair is still
# wrapped in its own single-element list, so the column keeps its existing
# shape: [[(token, tag)], [(token, tag)], ...].
bitcoin_news_df['pos_text'] = bitcoin_news_df['tokenized_text'].apply(
    lambda tokens: [[tagged] for tagged in nltk.pos_tag(tokens)]
)

In [None]:
# Write the updated DataFrame (with the new preprocessing columns) to disk.
output_path = './data_frames/bitcoin_news_df_preprocessed.pickle'
with open(output_path, 'wb') as output_file:
    pickle.dump(bitcoin_news_df, output_file)