In [None]:
import pandas as pd
import spacy
from spacy.lang.en import English
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
import nltk
import re
import gensim
from gensim import corpora
import pickle
from collections import OrderedDict
import pyLDAvis.gensim
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

In [None]:
# Forward slashes keep this relative path valid on Windows, macOS and Linux;
# the backslash form is read as one literal file name outside Windows.
stop_words = pd.read_csv('../Data/stop_words.csv')
print(stop_words.shape)
stop_words.head()

In [None]:
# Forward slashes keep this relative path valid on Windows, macOS and Linux;
# the backslash form is read as one literal file name outside Windows.
station_names = pd.read_csv('../Data/station_names.csv')
print(station_names.shape)
station_names.head()

In [None]:
# Forward slashes keep this relative path valid on Windows, macOS and Linux;
# the backslash form is read as one literal file name outside Windows.
reddit_data_raw = pd.read_csv('../Data/reddit_data_raw.csv')
print(reddit_data_raw.shape)
reddit_data_raw.head()

In [None]:
# Forward slashes keep this relative path valid on Windows, macOS and Linux;
# the backslash form is read as one literal file name outside Windows.
twitter_data_raw = pd.read_csv('../Data/twitter_data_raw.csv')
print(twitter_data_raw.shape)
twitter_data_raw.head()

In [None]:
# NLTK WordNet lemmatizer, instantiated once for the notebook
lemmatizer = WordNetLemmatizer()
# spaCy pipeline that tokenize() calls on each cleaned comment
nlp = spacy.load('en_core_web_sm')

In [None]:
def _build_mask(terms):
    """
    Compile a whole-word regex matching any of the given terms

    Parameters:
    ----------
    terms : iterable of literal strings to match

    Return:
    ----------
    compiled pattern; it never matches when no non-empty term is given
    """
    # Longest term first: `|` takes the first alternative that matches, not the
    # longest, so a short term listed early would shadow a longer one.
    ordered = sorted({t for t in terms if t}, key=lambda t: (-len(t), t))
    if not ordered:
        # An empty alternation would match at every position
        return re.compile(r'(?!)')
    # \b on both sides stops a term from firing inside a longer word
    return re.compile(r'\b(?:' + '|'.join(map(re.escape, ordered)) + r')\b')


### Load updated stop words list
# Forward slashes keep the relative paths valid on every operating system
stop_words = pd.read_csv('../Data/stop_words.csv')
stop_words = set(stop_words['stop_words'])

### Load station names list
station_names = pd.read_csv('../Data/station_names.csv')
# Normalise names the same way tokenize() normalises comments (lower-case,
# every run of non-letters collapsed to one space); otherwise a name holding
# punctuation or digits could never match the cleaned text. Missing names are
# dropped so re.escape never receives NaN.
station_terms = [
    re.sub('[^a-z]+', ' ', name).strip()
    for name in station_names['Station'].dropna().astype(str).str.lower()
]
station = _build_mask(station_terms)

photo_names = ['svg','png','jpeg','jpg', 'photo','pictures','picture','photos']
photo = _build_mask(photo_names)

In [None]:
def flatten(x):
    """
    Collapse an arbitrarily nested list into one flat list
    
    Parameters:
    ----------
    x : nested list
    
    Return:
    ----------
    [all leaf elements, in their original left-to-right order]
    """
    flat = []
    for item in x:
        # Strings are iterable but count as leaves, not as containers
        is_nested = hasattr(item, "__iter__") and not isinstance(item, str)
        if not is_nested:
            flat.append(item)
            continue
        flat += flatten(item)
    return flat

In [None]:
def get_keywords(text):
    """
    Collect unigram keywords from the key noun chunks and the verbs of a comment
    
    Parameters:
    ----------
    text : parsed comment exposing `noun_chunks` and tokens with `pos_` / `lemma_`
           (tokenize passes the result of nlp(...), not a raw string)
    
    Return:
    ----------
    [list of unigram keywords ]
    """
    wanted_deps = ('nsubj', 'dobj', 'pobj')
    # Noun chunks whose root is a subject or object come first, verbs after
    phrases = [span.lemma_ for span in text.noun_chunks if span.root.dep_ in wanted_deps]
    phrases += [tok.lemma_ for tok in text if tok.pos_ == 'VERB']
    keywords = []
    for phrase in phrases:
        # A chunk lemma can hold several words; break it into unigrams
        for piece in phrase.split(' '):
            if piece in stop_words or '-PRON-' in piece:
                continue
            keywords.append(piece)
    return keywords

In [None]:
def tokenize(text):
    """
    Function to pre-process string 
    
    Parameters:
    ----------
    text : comment string; any non-string value (e.g. the NaN pandas uses for
           an empty cell) is processed as an empty comment
    Return:
    ----------
    [processed string, [list of keywords]]
    processed string is the space-joined lemmas that are not stop words
    """
    # A missing body is not a str and would make the regex calls below raise
    # TypeError, so treat it as an empty comment instead
    if not isinstance(text, str):
        text = ''
    ### 1. Masking common strings
    # Mask links unconditionally: the former `'https://' in text` guard let
    # plain http:// links through, which were then split into stray tokens
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', 'urllink', text, flags=re.MULTILINE)
    # Keep letters only; each run of other characters becomes a single space
    processed_text = re.sub('[^A-Za-z]+', ' ', text).lower()
    processed_text = station.sub("ttcstation", processed_text)
    processed_text = photo.sub("photo", processed_text)
    ### 2. Get Lemma and conduct POS tagging
    input_str = nlp(processed_text)
    lemma_str = [token.lemma_ for token in input_str]
    filtered_str = ' '.join(w for w in lemma_str if w not in stop_words)
    return [filtered_str, get_keywords(input_str)]

In [None]:
# Forward slashes keep this relative path valid on Windows, macOS and Linux;
# the backslash form is read as one literal file name outside Windows.
reddit_df = pd.read_csv('../Data/reddit_data_raw.csv')
reddit_df.columns

In [None]:
# reddit_df['body'][0:100]

In [None]:
import multiprocessing as mp

In [None]:
# Trial run: tokenize the first 50 comment bodies across 8 worker processes.
# NOTE(review): under the "spawn" start method (the Windows default) workers
# must be able to import `tokenize`; a function defined in a notebook cell may
# not be importable there — confirm this cell completes on the target setup.
p = mp.Pool(processes=8)
try:
    results = p.map(tokenize, reddit_df['body'][0:50])
finally:
    # Release the worker processes even when tokenize raises inside map
    p.close()
    p.join()
whatever = list(results)

In [None]:
# Show the tokenize() results gathered from the pool for the first 50 comment bodies
whatever

In [None]:
# Pre-process every comment body, one tokenize() call per row
processed_list = reddit_df['body'].apply(tokenize)

# Keep only the keyword list (second element of each tokenize result)
text_data = [result[1] for result in processed_list]

In [None]:
# Build the gensim dictionary from all keyword lists
dictionary = corpora.Dictionary(text_data)
# Convert each keyword list to its bag-of-words form with that dictionary
corpus = list(map(dictionary.doc2bow, text_data))
# pickle.dump(corpus, open('corpus.pkl', 'wb'))

In [None]:
# Keyword list extracted for the first comment
text_data[0]

In [None]:
# dictionary.doc2bow output for the first comment's keyword list
corpus[0]

In [None]:
# Keyword list extracted for the comment at index 3
text_data[3]

In [None]:
# dictionary.doc2bow output for the keyword list at index 3
corpus[3]