In [13]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#### In this notebook, I will create several new columns in our data representing post characteristics, as well as tokenize and lemmatize the text. The dataframes will then be combined into one and shuffled.

In [2]:
# Show up to 100 rows and 100 columns when a dataframe is displayed
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

In [76]:
# Load the two per-subreddit CSV exports (SA first, then GT)
sa_data, gt_data = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/SA_data.csv', '../data/GT_data.csv')
)

In [77]:
# remove unnamed columns (the row index that was written out when the CSVs were saved)
# errors='ignore' keeps this cell safe to re-run: an in-place drop raises KeyError
# the second time because the column is already gone.
sa_data = sa_data.drop(columns='Unnamed: 0', errors='ignore')
gt_data = gt_data.drop(columns='Unnamed: 0', errors='ignore')

In [78]:
# Normalise the post body to lower case in both frames
sa_data = sa_data.assign(selftext=sa_data['selftext'].str.lower())
gt_data = gt_data.assign(selftext=gt_data['selftext'].str.lower())

In [149]:
# NOTE(review): `book_data` is only created in the "Combine Dataframes" section
# further down, so this inspection cell raises NameError on a fresh
# Restart & Run All; its execution count (149) shows it was run out of order.
book_data.loc[116,:]

selftext                                 a
subreddit               Stormlight_Archive
title                          Test2 (row)
tokenized_text                           a
tokenized_text_clean                      
char_count                               1
word_count                               1
tagged_tokens                           []
lemmatized_words                          
Name: 116, dtype: object

### Add Tokenized Text, Word Count and Character Count Columns

In [79]:
# Instantiate tokenizer that keeps runs of word characters (letters, digits, underscore)
# Raw string: '\w' in a normal string literal is an invalid escape sequence and
# triggers a DeprecationWarning/SyntaxWarning on recent Python versions.
tokenizer = RegexpTokenizer(r'\w+')

In [80]:
# tokenize sa selftext into a new column (tokens re-joined with single spaces)
# Element-wise apply replaces the row-by-row .loc loop: it is much faster and
# does not depend on the frame having a default 0..n-1 index.
sa_data['tokenized_text'] = sa_data['selftext'].apply(
    lambda text: ' '.join(tokenizer.tokenize(text))
)

In [81]:
# tokenize gt selftext into a new column (tokens re-joined with single spaces)
# Element-wise apply replaces the row-by-row .loc loop: it is much faster and
# does not depend on the frame having a default 0..n-1 index.
gt_data['tokenized_text'] = gt_data['selftext'].apply(
    lambda text: ' '.join(tokenizer.tokenize(text))
)

In [82]:
# remove english stop words from tokenized text
stop_words = stopwords.words('english')
# Membership tests against a set are O(1); scanning the list for every token is not.
# `stop_words` itself is left as a list for any later code that expects one.
stop_word_set = set(stop_words)


def remove_stop_words(text):
    """Return `text` with English stop words removed, tokens re-joined by single spaces."""
    return ' '.join(word for word in text.split() if word not in stop_word_set)


gt_data['tokenized_text_clean'] = gt_data['tokenized_text'].apply(remove_stop_words)
sa_data['tokenized_text_clean'] = sa_data['tokenized_text'].apply(remove_stop_words)

In [83]:
# Character length of each (lower-cased) post body
gt_data['char_count'] = gt_data['selftext'].map(len)
sa_data['char_count'] = sa_data['selftext'].map(len)

In [84]:
# Number of whitespace-separated words in each post body
def _n_words(post):
    return len(post.split())

gt_data['word_count'] = gt_data['selftext'].map(_n_words)
sa_data['word_count'] = sa_data['selftext'].map(_n_words)

### Combine Dataframes

In [97]:
# Stack the GT rows on top of the SA rows to form a single book dataframe
book_data = pd.concat(objs=[gt_data, sa_data])

In [100]:
# Shuffle book data so rows from the two subreddits are interleaved instead of
# appearing as two contiguous groups.
# random_state pins the permutation so the row order (and the CSV written at the
# end of the notebook) is identical on every Restart & Run All.
book_data = book_data.sample(frac=1, random_state=42).reset_index(drop=True)

### Lemmatize Cleaned Text

In [103]:
# Instantiate Lemmatizer
# Shared WordNetLemmatizer instance; custom_lemmatize() below calls wn.lemmatize on it.
wn = WordNetLemmatizer()

In [114]:
# tag tokens with part of speech symbol
# pos_tag_sents tags the whole batch with a single tagger, whereas calling
# nltk.pos_tag once per row sets the tagger up again for every post.
# Wrapping in a Series with the frame's index stores one list of
# (word, tag) tuples per row.
book_data['tagged_tokens'] = pd.Series(
    nltk.pos_tag_sents(book_data['tokenized_text_clean'].str.split()),
    index=book_data.index,
)

In [116]:
# lemmatizing function written collaboratively with DSI-919 cohort
def custom_lemmatize(word, tag):
    """Lemmatize `word` using the WordNet part of speech implied by `tag`.

    Parameters
    ----------
    word : str
        Token to lemmatize.
    tag : str
        Part-of-speech tag; only its first character is used to pick the
        WordNet category ('J' adjective, 'V' verb, 'N' noun, 'R' adverb).

    Returns
    -------
    str
        The result of `wn.lemmatize(word, pos)` when the tag maps to a WordNet
        category; otherwise `word` unchanged (including when `tag` is empty
        or None).
    """
    # An empty or missing tag has no first character to look up; pass the word through
    # instead of raising IndexError/TypeError on tag[0].
    if not tag:
        return word

    pos_by_prefix = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV
    }
    pos = pos_by_prefix.get(tag[0])

    # Tags outside the four mapped categories are left un-lemmatized
    return wn.lemmatize(word, pos) if pos else word

In [124]:
# Build a space-joined string of lemmas for every row from its (word, tag) pairs
book_data['lemmatized_words'] = book_data['tagged_tokens'].map(
    lambda pairs: ' '.join(
        [custom_lemmatize(token, pos_tag) for token, pos_tag in pairs]
    )
)

In [140]:
# write new dataframe to csv
# NOTE(review): with the default index=True the row index is written as an extra
# unnamed first column, so anything reading this file back gets an 'Unnamed: 0'
# column (as the input CSVs at the top of this notebook did). Passing index=False
# would avoid that, but confirm downstream readers do not rely on the column first.
book_data.to_csv('../data/book_data.csv')