In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *

spark = SparkSession.builder.getOrCreate()

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

en_stop = set(stopwords.words('english'))
en_lemmatizer = WordNetLemmatizer()
sc = spark.sparkContext
sc.broadcast(en_stop)
sc.broadcast(en_lemmatizer)

import datetime
import re as re

from pyspark.ml.feature import CountVectorizer , IDF
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.mllib.clustering import LDA

In [2]:
data = spark.read.load('../data/oct_2016_news_comment.parquet')
data = data.sample(False, 0.0001, 35).cache()
print(data.count())
data.show()

54
+---------+--------------------+----------+---------+
|  link_id|                body|   created|subreddit|
+---------+--------------------+----------+---------+
|t3_57m54q|I've lost people ...|2016-10-15|     news|
|t3_57m54q|We need better a ...|2016-10-15|     news|
|t3_5aiyva|          Same in US|2016-11-01|     news|
|t3_5ah1kd|&gt; they just ta...|2016-11-01|     news|
|t3_57m54q|The lawsuit of a ...|2016-10-15|     news|
|t3_57xeeg|In my state at le...|2016-10-18|     news|
|t3_57zmi3|Thanks for saying...|2016-10-17|     news|
|t3_5aiyva|The fuck up here ...|2016-11-01|     news|
|t3_5bg6hy|           [removed]|2016-11-06|     news|
|t3_59tp8q|The people who bu...|2016-10-28|     news|
|t3_59tp8q|           [removed]|2016-10-28|     news|
|t3_57wi3j|Are you for real?...|2016-10-17|     news|
|t3_572621|You'd rather kill...|2016-10-12|     news|
|t3_56choj|&gt;If no, then y...|2016-10-08|     news|
|t3_56f95x|I don't remember ...|2016-10-08|     news|
|t3_57tzgl|           [re

In [3]:
def reddit_comment_preprocessing(txt, stop_words, lemmatizer):
    '''
    Take care of doing all the text preprocessing for LDA at the comment level
    Only works on english ASCII content. (works on content with accent or such, but filter them out)
    '''
    #necessary apparently to avoid this error "pickle.PicklingError: args[0] from newobj args has the wrong class"
    from nltk.corpus import wordnet
    
    #keeping only elements relevant to written speech.
    keep_only_letters = lambda s: re.sub('[^a-zA-Z \']+', '', s)
    
    #remove mark of genitif from speech (i.e. "'s" associated to nouns)
    remove_genitive = lambda s: re.sub('(\'s)+', '', s)
    
    #tokenizing the texts (removing line break, space and capitalization)
    token_comm = re.split(" ", remove_genitive(keep_only_letters(txt)).strip().lower())
    
    #removing all words of three letters or less
    bigger_w = [x for x in token_comm if len(x) > 3] 
    
    #removing stop_words
    wout_sw_w = [x for x in bigger_w if x not in stop_words]
    
    #get lemma of each word, then return result
    return [lemmatizer.lemmatize(word) for word in wout_sw_w]


def dataset_cleaning_and_preprocessing(data, stop_words, lemmatizer):
    '''
    take a pyspark dataframe as input,
    transform it into a RDD, and filter all 
    empty comments ([removed], [deleted], or just empty text)
    and apply the preprocessing for lda.
    '''
    #keeping only what matters
    dataset = data.select('link_id', 'body', 'created')
    
    #removing post that have been deleted/removed (thus hold no more information)
    filtered = dataset.filter(dataset.body != '[removed]').filter(dataset.body != '[deleted]').filter(dataset.body != '')
    
    #applying comment preprocessing for LDA
    preprocessed = filtered.rdd.map(lambda r: (r[0], reddit_comment_preprocessing(r[1], stop_words, lemmatizer), r[2]))
    
    #we return only posts which have actual textual comments. (hence the filter with the second row element (pythonic way of testing non emptiness of list))
    preprocessed_filtered = preprocessed.filter(lambda r : r[1])
    
    return preprocessed.map(lambda r: (r[0], (r[1], r[2])))


def perform_lda(documents, n_topics, n_words):
    '''
    will perform LDA on a list of documents (== list of token)
    assume that documents is a RDD.
    '''
    documents_df = documents.zipWithUniqueId().toDF().selectExpr("_1 as tokens", "_2 as uid")
    
    #TODO : fine tune parameters
    cv = CountVectorizer(inputCol="tokens", outputCol="raw_features")
    cvmodel = cv.fit(documents_df)
    result_cv = cvmodel.transform(documents_df)
    
    #we perform an tf-idf (term frequency inverse document frequency), to avoid comments with a lot of words to pollute the topics.
    idf = IDF(inputCol="raw_features", outputCol="features")
    idfModel = idf.fit(result_cv)
    result_tfidf = idfModel.transform(result_cv) 
    
    corpus = result_tfidf.select("uid", "features").rdd.map(lambda r: [r[0],Vectors.fromML(r[1])])
    model = LDA.train(corpus, k=n_topics, maxIterations=10)

    topics = model.describeTopics(maxTermsPerTopic=n_words)
    
    vocab = cvmodel.vocabulary
    
    terms = topics[0]
    
    #topic[0] represents the words, topic[1] their weights
    return [[(vocab[topic[0][n]], topic[1][n]) for n in range(len(topic[0]))] for _, topic in enumerate(topics)]


def lda_and_min_date(in_rdd, n_topics, n_words):
    tok_date = in_rdd.values()
    min_date = tok_date.values().min()
    topics = perform_lda(tok_date.keys(), n_topics, n_words)
    return (topics, min_date)

In [4]:
cleaned_preprocessed = dataset_cleaning_and_preprocessing(data, en_stop, en_lemmatizer)
individual_keys = cleaned_preprocessed.keys().distinct().collect()

In [5]:
lda_res_sample = [lda_and_min_date(cleaned_preprocessed.filter(lambda r: r[0] == post_id), 1, 3) for post_id in individual_keys]

In [6]:
sqlContext = SQLContext(sc)
res_df = sc.parallelize(lda_res_sample).toDF().selectExpr("_1 as topic", "_2 as date")
res_df.write.mode('overwrite').parquet('2016_news_lda_sample_3_WornetStem.parquet')

In [7]:
res_data = spark.read.load('2016_news_lda_sample_3.parquet')
res_data.show()

+--------------------+----------+
|               topic|      date|
+--------------------+----------+
|[[[america, 0.133...|2016-10-23|
|                [[]]|2016-10-08|
|                [[]]|2016-10-25|
|                [[]]|2016-10-24|
|                [[]]|2016-10-24|
|                [[]]|2016-10-11|
|                [[]]|2016-10-23|
|                [[]]|2016-10-26|
|                [[]]|2016-10-14|
|                [[]]|2016-10-14|
|                [[]]|2016-10-10|
|[[[eleven, 0.1666...|2016-11-01|
|                [[]]|2016-10-28|
|                [[]]|2016-10-12|
|                [[]]|2016-10-12|
|                [[]]|2016-10-12|
|                [[]]|2016-10-23|
|                [[]]|2016-10-25|
|                [[]]|2016-11-01|
|                [[]]|2016-10-12|
+--------------------+----------+
only showing top 20 rows



In [8]:
res_data_2 = spark.read.load('2016_news_lda_sample_3_WornetStem.parquet')
res_data_2.show()

+--------------------+----------+
|               topic|      date|
+--------------------+----------+
|[[[america, 0.133...|2016-10-23|
|                [[]]|2016-10-08|
|                [[]]|2016-10-25|
|                [[]]|2016-10-24|
|                [[]]|2016-10-24|
|                [[]]|2016-10-11|
|                [[]]|2016-10-23|
|                [[]]|2016-10-26|
|                [[]]|2016-10-14|
|                [[]]|2016-10-14|
|                [[]]|2016-10-10|
|[[[eleven, 0.1666...|2016-11-01|
|                [[]]|2016-10-28|
|                [[]]|2016-10-12|
|                [[]]|2016-10-12|
|                [[]]|2016-10-12|
|                [[]]|2016-10-23|
|                [[]]|2016-10-25|
|                [[]]|2016-11-01|
|                [[]]|2016-10-12|
+--------------------+----------+
only showing top 20 rows

