##### Imports

In [2]:
import pandas as pd

import string
import nltk 
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer

from gensim.models import Phrases

In [None]:
nltk.download('averaged_perceptron_tagger')

In [3]:
posts_br_df = pd.read_csv('data/fortniteBR.csv')
posts_comp_df = pd.read_csv('data/fortniteCompetitive.csv')
comments_br_df = pd.read_csv('data/FortNiteBR_Comments')
comments_comp_df = pd.read_csv('data/fortniteCompComments')
patches_df = pd.read_csv('data/patches.csv')

##### Deletion of Rows and Columns

Drop extra indexes that were a result of scraping script

In [4]:
posts_br_df = posts_br_df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
posts_comp_df = posts_comp_df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
comments_br_df = comments_br_df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1',  'Unnamed: 0.1.1'])

In [5]:
comments_comp_df = comments_comp_df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

Merge the posts dataframe together and do the same with comments to make cleaning easier. We can always separate them again due to the 'subreddit' column. 

In [6]:
posts_df = pd.concat([posts_br_df, posts_comp_df])
comments_df = pd.concat([comments_br_df, comments_comp_df])

The APIs used return deleted comments and posts. When a author or text is deleted, the text gets replaced with '[deleted]'. We remove these instances from the dataframe. Based on prior inspection not shown, if the selftext / body is deleted the author is not necessarily deleted. However, if the author is deleted the selftext / body always is

In [7]:
posts_df = posts_df[posts_df['author'] != '[deleted]']
comments_df = comments_df[comments_df['author'] != '[deleted]']

We will now clean the text of posts and commments. This includes removing newline characters and space characters. We are doing this separate from the cleaning / tokenizing since we will use the normal-cased and un-tokenized sentices for sentiment analysis.

In [8]:
comments_df['body'] = comments_df['body'].str.replace('\\n',' ')
comments_df['body'] = comments_df['body'].str.replace('&amp',' ')
comments_df['body'] = comments_df['body'].str.replace('&#x200b',' ')
posts_df['selftext'] = posts_df['selftext'].str.replace('\\n',' ')
posts_df['selftext'] = posts_df['selftext'].str.replace('&amp',' ')
posts_df['selftext'] = posts_df['selftext'].str.replace('&#x200b',' ')
posts_df['title'] = posts_df['title'].str.replace('\\n',' ')
posts_df['title'] = posts_df['title'].str.replace('&amp',' ')
posts_df['title'] = posts_df['title'].str.replace('&#x200b',' ')

##### Tokenization and Lemmization

In [9]:
wordnet_lemmatizer = WordNetLemmatizer()
stopwords_english = stopwords.words('english')

In [10]:
def clean_text(text): 
    #make string lowercase 
    text = str(text)
    text = text.lower()

    #tokenize
    tokens = nltk.word_tokenize(text) 
    clean_text = []
    
    #remove stopwords, puncuation, then lemmatize
    for word in tokens:
        if (word not in stopwords_english and word not in string.punctuation): 
            token = wordnet_lemmatizer.lemmatize(word)
            clean_text.append(token)
            
    #remove words of length 3 or smaller        
    clean_text = [token for token in clean_text if len(token) > 3] 
            
    return clean_text      

In [36]:
comments_df['body_clean'] = comments_df['body'].apply(lambda x : clean_text(x))
posts_df['selftext_clean'] = posts_df['selftext'].apply(lambda x : clean_text(x))
posts_df['title_clean'] = posts_df['title'].apply(lambda x : clean_text(x))

##### Creating Bi and Tri - grams
Inspired by https://www.kaggle.com/ykhorramz/lda-and-t-sne-interactive-visualization

In [37]:
docs = posts_df['title_clean'].append(posts_df['selftext_clean'].append(comments_df['body_clean']))
bigram = Phrases(docs, min_count=10)
trigram = Phrases(bigram[docs])

In [38]:
#doc is of type - list. Expecting the tokenized sentences 
def add_ngram(doc): 
    for token in bigram[doc]:
        if '_' in token:
            # Token is a bigram, add to document.
            doc.append(token)
    for token in trigram[doc]:
        if '_' in token:
            # Token is a bigram, add to document.
            doc.append(token)
    return doc 

In [39]:
comments_df['body_clean'] = comments_df['body_clean'].apply(lambda x : add_ngram(x))
posts_df['selftext_clean'] = posts_df['selftext_clean'].apply(lambda x : add_ngram(x))
posts_df['title_clean'] = posts_df['title_clean'].apply(lambda x : add_ngram(x))

In [40]:
posts_df['title_clean']

0        [fortnite, unpopular, opinion, unpopular_opini...
1                                       [glad, pump, back]
2        [please, epic, remove, combat, shotgun, combat...
3        [entitled, section, player, base, play, team, ...
4                         [spamming, console, take, skill]
5        [made, nothing, much, kinda, proud, made, anot...
6                                    [zapatron, unvaulted]
7                                         [search, helmet]
8        [please, epic, ahead, need, variety, team, mod...
9              [clickbait, live, stream, already, rolling]
10       [dark, vertex, dark_vertex, dark_vertex, dark_...
11       [else, loving, care, think, yesterday, today, ...
12                                                  [part]
13       [pump, shotgun, available, team, rumble, seem,...
14                    [accesories, custom, function, like]
15       [release, duel, pickax, nunchucks, shop, spend...
16       [weapon, concept, bouncy, sniper, rifle, snipe.