In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *

# Reuse the active Spark session if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

from nltk.corpus import stopwords

# English stop words as a set, so membership tests during preprocessing are O(1).
en_stop = set(stopwords.words('english'))
sc = spark.sparkContext
# NOTE(review): the Broadcast handle returned here is discarded, and
# reddit_comment_preprocessing reads the plain `en_stop` set directly, so this
# broadcast is never actually consumed. Keep the handle and read `.value` in the
# worker-side code if broadcasting is really intended.
sc.broadcast(en_stop)

from nltk.corpus import wordnet as wn

import datetime
import re as re

from pyspark.ml.feature import CountVectorizer , IDF
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.mllib.clustering import LDA

In [2]:
# Read the comment dump; no format is given, so the session's default data
# source format is used. The preview below shows the columns
# link_id, body, created and subreddit.
data = spark.read.load('../data/oct_2016_news_comment.parquet')
# Print a preview of the first rows as a quick sanity check.
data.show()

+---------+--------------------+----------+---------+
|  link_id|                body|   created|subreddit|
+---------+--------------------+----------+---------+
|t3_57m54q|&gt; You also don...|2016-10-15|     news|
|t3_57gfbu|I don't know if t...|2016-10-15|     news|
|t3_57m54q| How can she slap!?!|2016-10-15|     news|
|t3_57ljf8|It's almost impos...|2016-10-15|     news|
|t3_57i95x|           [deleted]|2016-10-15|     news|
|t3_57m54q|i want to sue god...|2016-10-15|     news|
|t3_57i5zw|Are you an accoun...|2016-10-15|     news|
|t3_57m54q|Good luck to you ...|2016-10-15|     news|
|t3_57m54q|Except Jack Danie...|2016-10-15|     news|
|t3_57m54q|Well let me know ...|2016-10-15|     news|
|t3_57m54q|Ok.. I think he a...|2016-10-15|     news|
|t3_57fn69|           [deleted]|2016-10-15|     news|
|t3_57m54q|Because he's not ...|2016-10-15|     news|
|t3_57it6q|It precludes lice...|2016-10-15|     news|
|t3_57i5zw|The problem is so...|2016-10-15|     news|
|t3_57m54q|Not exactly. One 

In [11]:
def reddit_comment_preprocessing(txt):
    '''
    Take care of doing all the text preprocessing for LDA at the comment level.
    Only works on english ASCII content (accented or non-latin characters are
    simply dropped).

    Steps, in order: normalise whitespace, keep only letters/spaces/apostrophes,
    drop the genitive mark "'s", lower-case and tokenize, strip quote marks
    around tokens, drop tokens of two letters or less, drop stop words, and
    finally replace each token by its WordNet lemma when one exists.

    Relies on the module-level `en_stop` (stop word set) and `wn` (WordNet corpus).

    :param txt: raw comment body (str)
    :return: list of cleaned, lemmatized tokens (possibly empty)
    '''
    #TODO : maybe remove http links.

    #line breaks and tabs are word separators: turn every whitespace run into a
    #single space *before* the character filter below, otherwise the filter
    #deletes them and glues the neighbouring words together
    spaced = re.sub(r'\s+', ' ', txt)

    #keeping only elements relevant to written speech.
    letters_only = re.sub('[^a-zA-Z \']+', '', spaced)

    #remove the mark of the genitive from speech (i.e. "'s" associated to nouns)
    no_genitive = re.sub('(\'s)+', '', letters_only)

    #tokenizing the text (lower-cased); apostrophes left around a token are
    #quotation marks (or a plural possessive), not part of the word
    tokens = [tok.strip("'") for tok in no_genitive.lower().split()]

    #removing all words of two letters or less, then the stop words
    kept = [tok for tok in tokens if len(tok) > 2 and tok not in en_stop]

    def get_lemma(word):
        #morphy returns None when WordNet knows no base form: keep the word as is
        lemma = wn.morphy(word)
        return word if lemma is None else lemma

    #get lemma of each word, then return result
    return [get_lemma(tok) for tok in kept]


def dataset_cleaning_and_preprocessing(data):
    '''
    Take a pyspark dataframe as input, transform it into a RDD, filter out all
    empty comments ([removed], [deleted], or just empty text) and apply the
    preprocessing for LDA.

    :param data: dataframe with at least the columns 'link_id', 'body' and 'created'
    :return: RDD of (link_id, (tokens, created)) pairs, restricted to comments
             that still have at least one token after preprocessing
    '''
    #keeping only what matters
    dataset = data.select('link_id', 'body', 'created')

    #removing post that have been deleted/removed (thus hold no more information)
    filtered = (dataset
                .filter(dataset.body != '[removed]')
                .filter(dataset.body != '[deleted]')
                .filter(dataset.body != ''))

    #applying comment preprocessing for LDA
    preprocessed = filtered.rdd.map(lambda r: (r[0], reddit_comment_preprocessing(r[1]), r[2]))

    #we keep only posts which have actual textual comments: an empty token list is falsy
    preprocessed_filtered = preprocessed.filter(lambda r: r[1])

    #the filtered RDD must be the one returned, otherwise comments that were
    #reduced to nothing by the preprocessing would still reach LDA
    return preprocessed_filtered.map(lambda r: (r[0], (r[1], r[2])))


def perform_lda(documents, n_topics, n_words, seed=None):
    '''
    Perform LDA on a collection of documents (a document == a list of tokens).
    Assumes that documents is a RDD.

    The tokens are turned into term counts (CountVectorizer), reweighted with
    IDF, and the resulting vectors are fed to the mllib LDA trainer.

    :param documents: RDD whose elements are lists of tokens
    :param n_topics: number of topics to fit (k)
    :param n_words: maximum number of terms to report per topic
    :param seed: optional random seed forwarded to LDA.train; leave to None to
                 keep the previous (non reproducible) behaviour
    :return: one list per topic, each made of (word, weight) pairs
    '''
    #each document gets a unique id, which the mllib LDA needs to identify it
    documents_df = documents.zipWithUniqueId().toDF().selectExpr("_1 as tokens", "_2 as uid")

    #TODO : fine tune parameters
    cv = CountVectorizer(inputCol="tokens", outputCol="raw_features")
    cvmodel = cv.fit(documents_df)
    result_cv = cvmodel.transform(documents_df)

    #we perform an tf-idf (term frequency inverse document frequency), to avoid comments with a lot of words to pollute the topics.
    idf = IDF(inputCol="raw_features", outputCol="features")
    idfModel = idf.fit(result_cv)
    result_tfidf = idfModel.transform(result_cv)

    #the ml feature stages produce ml vectors, while mllib's LDA wants mllib ones
    corpus = result_tfidf.select("uid", "features").rdd.map(lambda r: [r[0], Vectors.fromML(r[1])])
    model = LDA.train(corpus, k=n_topics, seed=seed)

    topics = model.describeTopics(maxTermsPerTopic=n_words)

    vocab = cvmodel.vocabulary

    #topic[0] holds the term indices (into vocab), topic[1] their weights
    return [[(vocab[term_id], weight) for term_id, weight in zip(topic[0], topic[1])]
            for topic in topics]


def lda_and_min_date(in_rdd, n_topics, n_words):
    '''
    Run LDA over the comments of in_rdd and pair the result with the
    smallest date found among them.

    :param in_rdd: RDD of (key, (tokens, date)) pairs
    :param n_topics: number of topics, forwarded to perform_lda
    :param n_words: number of words per topic, forwarded to perform_lda
    :return: (topics, min_date) tuple
    '''
    #drop the key, keeping the (tokens, date) pairs
    tokens_and_dates = in_rdd.values()
    earliest = tokens_and_dates.values().min()
    found_topics = perform_lda(tokens_and_dates.keys(), n_topics, n_words)
    return (found_topics, earliest)

In [7]:
cleaned_preprocessed = dataset_cleaning_and_preprocessing(data)
# Small sample (without replacement) to try the pipeline out quickly.
# The seed is fixed so that a fresh "Restart & Run All" picks the same comments,
# and the sample is cached because it is filtered once per post id afterwards:
# without the cache every one of those filters would re-read and re-preprocess
# the full dataset.
test_sample = cleaned_preprocessed.sample(False, 0.0001, seed=42).cache()
individual_keys = test_sample.keys().distinct().collect()

In [None]:
# One LDA run (2 topics, 3 words per topic) for each distinct post of the sample;
# every entry of the result is a (topics, min_date) tuple.
lda_res_sample = [
    lda_and_min_date(test_sample.filter(lambda row, wanted=post_id: row[0] == wanted), 2, 3)
    for post_id in individual_keys
]

In [None]:
sqlContext = SQLContext(sc)
# Each element of lda_res_sample is a (topics, min_date) tuple where topics is a
# list (one entry per topic) of lists of (word, weight) pairs. ArrayType needs
# its element type spelled out: a bare ArrayType() raises a TypeError.
schema = StructType([
    StructField("topics", ArrayType(ArrayType(StructType([
        StructField("word", StringType(), False),
        StructField("weight", DoubleType(), False),
    ]))), False),
    # NOTE(review): assumes the 'created' column of the input holds dates and not
    # strings - confirm against the parquet schema.
    StructField("created", DateType(), False),
])

res_df = sqlContext.createDataFrame(sc.parallelize(lda_res_sample), schema)
res_df.write.mode('overwrite').parquet('2016_news_lda_sample.parquet')