# Journals

This notebook is used for processing daily journals.

In [None]:
import pandas as pd
import re
import datetime

# These will be specific to the journal data file used
# Path of the plain-text journal export that the parsing cell opens.
JOURNAL_FILENAME = 'journals.txt'
# Three slash-separated digit groups; the parsing cell passes them to
# datetime as (day, month, year) in capture order.
HEADER_REGEX = r'(\d+)\/(\d+)\/(\d+)'
# Captures everything after the first '-' on a bulleted line.
GRATEFUL_REGEX = r'\-(.*)'

Let's put the journals into two separate dataframes. One will include the bulleted grateful items for each day. The other will include the raw paragraphs.

In [None]:
# Rows collected while scanning the journal file. Each row is a
# (date, weekday, text, words) tuple matching the dataframe columns below.
raw_paragraphs = []
raw_gratefuls = []

# Date context for lines that appear before the first dated header.
# `weekday` is initialised so such lines no longer raise NameError.
date = 'Empty'
weekday = None

# Section headers (not to be included)
headers = ['\n', '#journal\n', '#daily\n']

# Import journal data from text file
with open(JOURNAL_FILENAME) as file:
    for line in file:
        # Lines ending in a colon are skipped entirely.
        if line.endswith(':\n'):
            continue

        # A line only counts as a dated header when it mentions 'Journal' AND
        # actually contains a date; otherwise it is handled as ordinary text
        # instead of crashing on a failed regex match.
        header_match = re.search(HEADER_REGEX, line) if 'Journal' in line else None

        if header_match:
            raw_date = header_match.groups()
            date = '/'.join(raw_date)
            # NOTE(review): the groups are read as day/month/year -- confirm
            # this matches the date format used in the journal file.
            weekday = datetime.datetime(int(raw_date[2]),
                                        int(raw_date[1]),
                                        int(raw_date[0])).weekday()
        elif line.startswith('-'):
            # Bulleted "grateful" item: keep only the text after the dash.
            text = re.search(GRATEFUL_REGEX, line).group(1)
            word_list = text.split()
            raw_gratefuls.append((date, weekday, text, word_list))
        elif line not in headers:
            # Anything else that is not a section header is a raw paragraph.
            word_list = line.split()
            raw_paragraphs.append((date, weekday, line, word_list))

# Convert to dataframes
grateful = pd.DataFrame(raw_gratefuls, columns=['date', 'weekday', 'text', 'words'])
paragraph = pd.DataFrame(raw_paragraphs, columns=['date', 'weekday', 'text', 'words'])

In [None]:
# Print out dataframes to see data
print(paragraph)
print(grateful)
# NOTE(review): in a notebook only the last expression of a cell is rendered,
# so the grateful.head() result is presumably discarded -- confirm intent.
grateful.head()
paragraph.head()

## Tokenization

Let's convert each of the paragraphs into a list of tokens.

Sources:
- [1] http://pythondata.com/text-analytics-visualization/

In [None]:
import nltk
from string import punctuation

# Download nltk resources
# These are fetched before the corpus/tokenizer objects below are used.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Use nltk to set up stopwords, stemmer, and lemmatizer
# `stop` and `wnl` are read as globals by tokenizer() below.
stop = nltk.corpus.stopwords.words('english')
# NOTE(review): `porter` is not referenced anywhere in the visible notebook.
porter = nltk.stem.PorterStemmer()
wnl = nltk.stem.WordNetLemmatizer() 

def tokenizer(text):
    """Split *text* into lowercase, lemmatized word tokens.

    The text is split into sentences and then words with nltk. A token is
    dropped when it is an English stop word, a clitic or quote fragment
    produced by the tokenizer, or contains no ASCII letter (which covers
    punctuation and numbers). Remaining tokens are lowercased and lemmatized.

    Relies on the notebook-level globals ``stop`` (stop-word list) and
    ``wnl`` (WordNet lemmatizer).

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of lowercase lemma strings in their original order.
    """
    # Set membership is O(1); the global `stop` is a list.
    stop_words = set(stop)
    # Fragments emitted by word_tokenize that carry no meaning on their own.
    junk = {u"'s", u"n't", u"...", u"''", u'``', u'\u2014', u'\u2026', u'\u2013'}

    filtered_tokens = []
    for sent in nltk.sent_tokenize(text):
        for token in nltk.word_tokenize(sent):
            lowered = token.lower()
            if lowered in stop_words or lowered in junk:
                continue
            # Requiring a letter also removes pure punctuation and numbers.
            if not re.search('[a-zA-Z]', token):
                continue
            # Lemmatize the lowercased form: the lemmatizer's lookup is
            # case-sensitive, so lemmatizing before lowercasing left
            # capitalized words (e.g. "Dogs") unreduced.
            filtered_tokens.append(wnl.lemmatize(lowered))

    return filtered_tokens

In [None]:
# Use the map method to create new columns with tokens
# Each row's 'text' string is passed through tokenizer(); the resulting list
# is stored in the new 'tokens' column of the same dataframe.
grateful['tokens'] = grateful['text'].map(tokenizer)
paragraph['tokens'] = paragraph['text'].map(tokenizer)

## Keywords

Let's get the most common tokens for each text and use those as keywords.

In [None]:
from collections import Counter

def keywords(tokens, num=2):
    """Return the *num* most frequent tokens with their counts.

    Args:
        tokens: Iterable of hashable tokens.
        num: Maximum number of (token, count) pairs to return.

    Returns:
        A list of (token, count) tuples, most frequent first.
    """
    frequencies = Counter()
    frequencies.update(tokens)
    top_tokens = frequencies.most_common(num)
    return top_tokens

# Use the map method to create new columns with keywords
# keywords() is called with its default num=2, so each cell holds up to two
# (token, count) pairs taken from that row's 'tokens' list.
grateful['keywords'] = grateful['tokens'].map(keywords)
paragraph['keywords'] = paragraph['tokens'].map(keywords)

## More Stats

Let's get some more statistics for each text, such as the length of the text.

In [None]:
def num_words(text):
    """Return the number of whitespace-separated words in *text*."""
    words = text.split()
    return len(words)

# Use the map method to create new columns with word counts
# The count is taken from the raw 'text' column, not from 'tokens', so stop
# words and punctuation attached to words are included.
grateful['num_words'] = grateful['text'].map(num_words)
paragraph['num_words'] = paragraph['text'].map(num_words)

## Plot

Make some plots of the statistics we have extracted

In [None]:
import matplotlib.pyplot as plt

# Histogram for the number of entries per day of the week
# 'weekday' holds datetime.weekday() values computed in the parsing cell.
# NOTE(review): both calls run in the same cell with alpha=0.5, so they are
# presumably meant to overlay on one set of axes -- confirm the rendering.
grateful['weekday'].plot.hist(alpha=0.5)
paragraph['weekday'].plot.hist(alpha=0.5)

## Pre-processing

We use the TensorFlow data API to read in the text data line by line, separating it into test, validation, and training sets, and shuffling and batching it where appropriate.

Sources:

- [2] Tensorflow for Deep Learning

In [None]:
from collections import Counter

def build_dataset(words, n_words):
    """Encode a word sequence as integer ids over a capped vocabulary.

    Args:
        words: Sequence of word strings.
        n_words: Vocabulary size, including the 'UNK' entry at index 0.

    Returns:
        A tuple ``(data, count, dictionary, reversed_dictionary)``:
        ``data`` is the list of ids for ``words`` (0 for out-of-vocabulary
        words); ``count`` is ``['UNK', <number of id-0 words>]`` followed by
        the ``n_words - 1`` most common ``(word, frequency)`` pairs;
        ``dictionary`` maps word to id; ``reversed_dictionary`` maps id to word.
    """
    unk_entry = ['UNK', -1]
    count = [unk_entry] + Counter(words).most_common(n_words - 1)

    # Ids are handed out in order of appearance in `count`.
    dictionary = {}
    for entry in count:
        dictionary[entry[0]] = len(dictionary)

    # Words outside the vocabulary fall back to id 0 (the 'UNK' slot).
    data = [dictionary.get(word, 0) for word in words]
    unk_entry[1] = data.count(0)

    reversed_dictionary = {index: word for word, index in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

# List of all words in gratefuls
# Flattens the per-row 'words' lists into one flat list of raw words.
grateful_list = [word for words in grateful['words'].tolist() for word in words]

# NOTE(review): the returned (data, count, dictionary, reversed_dictionary)
# tuple is not assigned to anything here -- confirm whether it should be kept.
build_dataset(grateful_list, 200)

## Word Embedding

Make a word embedding from our text data.


In [None]:
import math

import tensorflow as tf

# NOTE(review): build_dataset() above is called with a vocabulary of 200;
# confirm which size is intended before training.
vocab_size = 1000
embed_size = 32
# Number of examples per training step. The original cell used `batch_size`
# without defining it; 128 is a placeholder value -- TODO confirm.
batch_size = 128

# Placeholders for one batch of input word ids and their label ids.
# (The original created `train_inputs` twice; the duplicate is removed.)
with tf.name_scope('inputs'):
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size,])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

# Embedding matrix initialised uniformly in [-1, 1), plus the lookup of the
# rows selected by the current batch of input ids.
with tf.name_scope('embeddings'):
    embeddings = tf.Variable(tf.random_uniform([vocab_size, embed_size], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

# Weight matrix with one row per vocabulary entry, scaled by 1/sqrt(embed_size).
with tf.name_scope('weights'):
    nce_weights = tf.Variable(tf.truncated_normal([vocab_size, embed_size], stddev=1.0 / math.sqrt(embed_size)))

# One bias per vocabulary entry, initialised to zero.
with tf.name_scope('biases'):
    nce_bias = tf.Variable(tf.zeros([vocab_size]))

