# Journals

This notebook is used for processing daily journals.

In [None]:
import pandas as pd
import re
import datetime

# These will be specific to the journal data file used
# Name of the text file the journal entries are read from.
JOURNAL_FILENAME = 'journals.txt'
# Matches a date written as three slash-separated numbers; each number is
# captured in its own group.
HEADER_REGEX = r'(\d+)\/(\d+)\/(\d+)'
# Matches a hyphen and captures everything after it up to the end of the line.
GRATEFUL_REGEX = r'\-(.*)'

Let's put the journals into two separate dataframes. One will include the bulleted grateful items for that day. The other will include the raw paragraphs. 

In [None]:
raw_paragraphs = []
raw_gratefuls = []

# Date and weekday of the entry currently being read. Both start as
# placeholders so that text found before the first dated header is still
# recorded (with weekday None) instead of raising NameError.
date = 'Empty'
weekday = None

# Section headers (not to be included)
headers = ['\n', '#journal\n', '#daily\n']

# Import journal data from text file
with open(JOURNAL_FILENAME) as file:
    for line in file:
        # The last line of a file may lack its newline; classify it using a
        # newline-terminated copy so it is treated like every other line.
        terminated = line if line.endswith('\n') else line + '\n'

        # Lines ending in a colon are skipped entirely
        if terminated.endswith(':\n'):
            continue

        # A dated header must mention 'Journal' AND contain a date. Lines that
        # merely contain the word fall through and are handled as normal text
        # rather than crashing on a failed regex match.
        header = re.search(HEADER_REGEX, line) if 'Journal' in line else None

        if header is not None:
            raw_date = header.groups()
            date = '/'.join(raw_date)
            # The three numbers are read as day/month/year.
            # NOTE(review): confirm this order matches the journal file.
            weekday = datetime.datetime(int(raw_date[2]),
                                        int(raw_date[1]),
                                        int(raw_date[0])).weekday()
        elif line.startswith('-'):
            # Bulleted grateful item: keep the text after the leading hyphen
            item = re.search(GRATEFUL_REGEX, line).group(1)
            raw_gratefuls.append((date, weekday, item))
        elif terminated not in headers:
            # Anything else that is not a section header is a raw paragraph
            raw_paragraphs.append((date, weekday, line))

# Convert to dataframes
grateful = pd.DataFrame(raw_gratefuls, columns=['date', 'weekday', 'text'])
paragraph = pd.DataFrame(raw_paragraphs, columns=['date', 'weekday', 'text'])

In [None]:
# Print out dataframes to see data
print(paragraph)
print(grateful)
# NOTE(review): with default notebook settings only the value of the last
# expression in a cell is rendered, so the result of this first head() call
# is likely discarded and only paragraph.head() is shown.
grateful.head()
paragraph.head()

## Tokenization

Let's convert each of the paragraphs into a list of tokens.

Sources:
- [1] http://pythondata.com/text-analytics-visualization/

In [None]:
import nltk
from string import punctuation

# Download nltk resources.
# 'punkt_tab' is the tokenizer data that recent NLTK releases (3.9 and later)
# look up in sent_tokenize/word_tokenize; 'punkt' is kept for older releases.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

# Use nltk to set up stopwords, stemmer, and lemmatizer
stop = nltk.corpus.stopwords.words('english')
porter = nltk.stem.PorterStemmer()
wnl = nltk.stem.WordNetLemmatizer()

def tokenizer(text):
    """Split *text* into lowercase, lemmatized word tokens.

    The text is split into sentences and then into word tokens with nltk.
    A token is dropped when its lowercase form is an English stop word, when
    it is punctuation, when it is one of the contraction/quote/dash fragments
    listed below, or when it contains no ASCII letter.

    Surviving tokens are lowercased *before* lemmatization. The WordNet
    lemmatizer is case-sensitive, so lemmatizing first would leave
    capitalized words (e.g. at the start of a sentence) unlemmatized and
    make "Dogs" and "dogs" count as different tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase token strings, in order of appearance.
    """
    # Fragments produced by the word tokenizer that carry no meaning alone
    noise = {u"'s", u"n't", u"...", u"''", u'``', u'\u2014', u'\u2026', u'\u2013'}

    filtered_tokens = []
    for sent in nltk.sent_tokenize(text):
        for token in nltk.word_tokenize(sent):
            lowered = token.lower()
            if lowered in stop or token in punctuation or token in noise:
                continue

            lemma = wnl.lemmatize(lowered)
            # Keep only tokens that contain at least one letter
            if re.search('[a-zA-Z]', lemma):
                filtered_tokens.append(lemma)

    return filtered_tokens

In [None]:
# Tokenize the text of every row and store the result in a new 'tokens' column
for _frame in (grateful, paragraph):
    _frame['tokens'] = _frame['text'].map(tokenizer)

## Keywords

Let's get the most common tokens for each text, and use those as keywords.

In [None]:
from collections import Counter

def keywords(tokens, num=2):
    """Return the *num* most frequent tokens together with their counts.

    Args:
        tokens: An iterable of hashable tokens.
        num: How many entries to return (default 2).

    Returns:
        A list of ``(token, count)`` tuples, most frequent first.
    """
    frequencies = Counter()
    frequencies.update(tokens)
    return frequencies.most_common(num)

# Derive the keywords of every row from its tokens and store them in a new
# 'keywords' column
for _frame in (grateful, paragraph):
    _frame['keywords'] = _frame['tokens'].map(keywords)