##### Imports

In [32]:
import pandas as pd

import string
import re
import nltk 
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer

from gensim.models import Phrases

In [4]:
# Fetch every NLTK resource this notebook relies on, so it also runs on a fresh environment:
#   punkt     -> nltk.word_tokenize
#   stopwords -> stopwords.words('english')
#   wordnet   -> WordNetLemmatizer
# 'averaged_perceptron_tagger' is kept from the original cell.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' (tokenizer) and
# 'omw-1.4' (lemmatizer) as well; add them here if a LookupError names them.
for nltk_resource in ('averaged_perceptron_tagger', 'punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jeromecohen/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [5]:
# Read the scraped posts, one CSV per subreddit.
posts_br_df, posts_comp_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/fortniteBR.csv', 'data/fortniteCompetitive.csv')
)
# Read the scraped comments for the same two subreddits (these files have no extension).
comments_br_df, comments_comp_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/FortNiteBR_Comments', 'data/fortniteCompComments')
)

##### Deletion of Rows and Columns

Drop the extra index columns that were produced by the scraping script.

In [6]:
# Discard the leftover 'Unnamed: ...' index columns from each raw frame.
posts_br_df = posts_br_df.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis='columns')
posts_comp_df = posts_comp_df.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis='columns')
comments_br_df = comments_br_df.drop(
    ['Unnamed: 0', 'Unnamed: 0.1',  'Unnamed: 0.1.1'],
    axis='columns',
)

In [7]:
# Same clean-up for the competitive-subreddit comments: remove the leftover index columns.
comments_comp_df = comments_comp_df.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis='columns')

Merge the two posts dataframes together, and do the same with the two comments dataframes, to make cleaning easier. We can always separate them again using the 'subreddit' column.

In [8]:
# Stack the two subreddits into a single frame per content type.
# ignore_index=True gives every row a fresh, unique label. Without it each input
# keeps the default 0..n-1 labels it got from read_csv, so the merged frame would
# contain duplicated index values (label-based lookups would then return two rows).
posts_df = pd.concat([posts_br_df, posts_comp_df], ignore_index=True)
comments_df = pd.concat([comments_br_df, comments_comp_df], ignore_index=True)

The APIs used return deleted comments and posts. When an author or a text is deleted, its value gets replaced with '[deleted]'. We remove these instances from the dataframe. Based on prior inspection (not shown), if the selftext / body is deleted the author is not necessarily deleted. However, if the author is deleted the selftext / body always is.

In [9]:
# Remove rows whose author OR text has been replaced with '[deleted]'.
# Checking the author alone is not enough: a selftext / body can be deleted while
# its author is still present, so the text column is tested explicitly as well.
# .copy() makes each result an independent frame, so the column assignments in the
# following cells do not operate on a slice (avoids SettingWithCopyWarning).
posts_df = posts_df[
    (posts_df['author'] != '[deleted]') & (posts_df['selftext'] != '[deleted]')
].copy()
comments_df = comments_df[
    (comments_df['author'] != '[deleted]') & (comments_df['body'] != '[deleted]')
].copy()

We will now clean the text of posts and comments. This includes removing newline characters and special space characters (such as non-breaking and zero-width spaces). We are doing this separately from the cleaning / tokenizing since we will use the normal-cased and un-tokenized sentences for sentiment analysis.

In [55]:
def first_clean(text):
    """Light, case-preserving clean-up of a raw post / comment text.

    Replaces newlines and a few HTML entities with a single space each, leaving
    casing and wording untouched so the result can still be used for sentiment
    analysis.

    Parameters
    ----------
    text : object
        Raw cell value. Non-string values are converted with ``str``; missing
        values (``None`` or float NaN) are treated as empty text.

    Returns
    -------
    str
        The cleaned text ('' for missing values).
    """
    # Missing values would otherwise turn into the literal words 'None' / 'nan'.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    # Newlines: the escaped two-character form (backslash + n) and real line breaks.
    text = re.sub(r'\\n|\r?\n', ' ', text)

    # HTML entities, including their trailing ';' so no stray semicolon is left:
    #   &amp;#x200B; / &#x200B;  zero-width space
    #   &amp;nbsp;   / &nbsp;    non-breaking space
    #   &amp; / &amp             escaped ampersand
    # Matching whole entities also stops 'nbsp' being cut out of ordinary words.
    text = re.sub(r'&(?:amp;)?(?:#x200B|nbsp);|&amp;?', ' ', text)

    return text

In [60]:
# Run the light clean-up over every free-text column, overwriting it in place.
comments_df['body'] = comments_df['body'].map(first_clean)
posts_df['selftext'] = posts_df['selftext'].map(first_clean)
posts_df['title'] = posts_df['title'].map(first_clean)

##### Tokenization and Lemmatization

In [34]:
# English stopword list from the NLTK corpus; tokens found in it are discarded during cleaning.
stopwords_english = stopwords.words('english')
# Shared WordNet lemmatizer instance used for every token.
wordnet_lemmatizer = WordNetLemmatizer()

In [35]:
def clean_text(text):
    """Convert a raw text into a list of lemmatized content tokens.

    Steps: lowercase, strip URLs, tokenize with ``nltk.word_tokenize``, drop
    stopwords and punctuation-only tokens, lemmatize, then drop tokens of
    length 3 or smaller.

    Relies on the module-level ``stopwords_english`` and ``wordnet_lemmatizer``.

    Parameters
    ----------
    text : object
        Text to clean; converted with ``str`` first.

    Returns
    -------
    list of str
        The surviving lemmatized tokens, in their original order.
    """
    text = str(text).lower()

    # Remove links wherever they appear. The previous pattern was anchored to the
    # start of a line and consumed everything up to the end of that line, so it
    # missed links inside a sentence and wiped out texts that began with a link.
    text = re.sub(r'https?://\S+', '', text)

    tokens = nltk.word_tokenize(text)

    # A token counts as punctuation when every character is a punctuation mark.
    # (A plain `word in string.punctuation` is a substring test, which lets
    # tokens such as '....' or '____' through.)
    punctuation_chars = set(string.punctuation)
    lemmas = [
        wordnet_lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stopwords_english
        and not all(char in punctuation_chars for char in word)
    ]

    # Remove words of length 3 or smaller.
    return [lemma for lemma in lemmas if len(lemma) > 3]

In [36]:
# Tokenize / lemmatize each text column into a parallel '*_clean' column of token lists.
comments_df['body_clean'] = comments_df['body'].map(clean_text)
posts_df['selftext_clean'] = posts_df['selftext'].map(clean_text)
posts_df['title_clean'] = posts_df['title'].map(clean_text)

We will remove posts / comments that have fewer than 5 tokens after cleaning. The dataset currently contains many one-word comments like 'nice' that only add noise to the models we will be using. By removing short selftext, we are also removing posts that only have titles (these are normally image posts, whose selftext is NaN in our dataset). Titles only need to contain 3 or more tokens.

In [37]:
# Number of tokens left in each cleaned document.
comments_df['body_length'] = comments_df['body_clean'].map(len)
posts_df['selftext_length'] = posts_df['selftext_clean'].map(len)
posts_df['title_length'] = posts_df['title_clean'].map(len)

# Keep comments with at least 5 tokens, and posts whose selftext has at least
# 5 tokens and whose title has at least 3.
has_long_body = comments_df['body_length'] >= 5
has_long_selftext = posts_df['selftext_length'] >= 5
has_long_title = posts_df['title_length'] >= 3
comments_df = comments_df[has_long_body]
posts_df = posts_df[has_long_selftext & has_long_title]

In [38]:
# The token-count columns were only needed for filtering; remove them again.
comments_df = comments_df.drop(['body_length'], axis='columns')
posts_df = posts_df.drop(['title_length', 'selftext_length'], axis='columns')

##### Creating Bigrams and Trigrams

In [39]:
# Pool every tokenized document (post titles, post selftexts, comment bodies)
# into one training corpus for the phrase models.
docs = pd.concat(
    [posts_df[column] for column in ('title_clean', 'selftext_clean')]
    + [comments_df['body_clean']]
)

# Bigram model: min_count=10 is passed explicitly.
bigram = Phrases(docs, min_count=10)

# Trigram model: trained on the bigram-transformed corpus with gensim's default settings.
bigram_docs = bigram[docs]
trigram = Phrases(bigram_docs)

In [40]:
def add_ngram(doc):
    """Apply the phrase models to one tokenized document.

    ``doc`` is expected to be a list of tokens (a tokenized sentence). The
    bigram model is applied first and the trigram model is applied to its
    output; the transformed document is returned.
    """
    with_bigrams = bigram[doc]
    return trigram[with_bigrams]

In [41]:
# Add a '*_ngrams' column next to each '*_clean' column with the phrase models applied.
comments_df['body_ngrams'] = comments_df['body_clean'].map(add_ngram)
posts_df['selftext_ngrams'] = posts_df['selftext_clean'].map(add_ngram)
posts_df['title_ngrams'] = posts_df['title_clean'].map(add_ngram)

In [42]:
# Persist both processed frames as pickles (comments first, then posts).
for frame, pickle_path in ((comments_df, 'data/comments'), (posts_df, 'data/posts')):
    frame.to_pickle(pickle_path)