In [21]:
from nltk.parse import CoreNLPParser
import pandas as pd

In [29]:
def tokenize_and_pos(data, output):
    """Tokenize text and get POS for each token.

    Reads the 'text' column of an excel file, tokenizes every entry and
    tags each token with its part of speech through a CoreNLP server
    reached at http://localhost:9000, then writes the result to a new
    excel file.

    Keyword arguments:
    data -- an excel file containing text and formality label; it must
            provide the columns 'id', 'text' and 'score'
    output -- an excel file containing tokens, POS, and formality; it is
              written with the columns 'id', 'text', 'token', 'pos',
              'score' and without the DataFrame index
    """
    data_df = pd.read_excel(data)
    parser = CoreNLPParser(url='http://localhost:9000')
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    tokens, pos_tags = [], []

    # Walk the column values directly rather than looking rows up by the
    # integer labels 0..n-1, so the loop does not depend on the index.
    for text in data_df['text']:
        # Get tokenized tokens
        token_list = list(parser.tokenize(text))
        tokens.append(token_list)

        # Get pos tags; only the tag of each (token, tag) pair is kept.
        # With no tokens there is nothing to tag, so skip the request.
        if token_list:
            pos_list = [pos for _, pos in pos_tagger.tag(token_list)]
        else:
            pos_list = []
        pos_tags.append(pos_list)

    # Work on an explicit copy: assigning new columns to a selection of
    # data_df triggers pandas' SettingWithCopyWarning.
    output_df = data_df[['id', 'text', 'score']].copy()
    output_df['token'] = tokens
    output_df['pos'] = pos_tags
    # Order columns
    columns = ['id', 'text', 'token', 'pos', 'score']
    output_df = output_df[columns]
    output_df.to_excel(output, index=False)

In [30]:
# Tokenize and POS-tag the answers data: read answers.xlsx and write the
# result to tokenized_answers.xlsx.
answers_data, answers_output = 'answers.xlsx', 'tokenized_answers.xlsx'
tokenize_and_pos(data=answers_data, output=answers_output)

In [31]:
# Tokenize and POS-tag the blog data: read blog.xlsx and write the
# result to tokenized_blog.xlsx.
blog_data, blog_output = 'blog.xlsx', 'tokenized_blog.xlsx'
tokenize_and_pos(data=blog_data, output=blog_output)

In [32]:
# Tokenize and POS-tag the email data: read email.xlsx and write the
# result to tokenized_email.xlsx.
email_data, email_output = 'email.xlsx', 'tokenized_email.xlsx'
tokenize_and_pos(data=email_data, output=email_output)

In [33]:
# Tokenize and POS-tag the news data: read news.xlsx and write the
# result to tokenized_news.xlsx.
news_data, news_output = 'news.xlsx', 'tokenized_news.xlsx'
tokenize_and_pos(data=news_data, output=news_output)