In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import *
import pyspark.sql.functions as func
from pyspark.sql.types import *

# Start (or reuse) the Spark session and load the sample of reddit comments.
spark = SparkSession.builder.getOrCreate()
comments_raw = spark.read.load('../data/sample.parquet')

# Derive a DateType `created` column from the unix timestamp in `created_utc`.
comments = comments_raw.withColumn(
    'created',
    func.from_unixtime(comments_raw['created_utc'], 'yyyy-MM-dd HH:mm:ss.SS').cast(DateType()),
)

# Expose the frame to Spark SQL under the name "comments".
# createOrReplaceTempView replaces registerTempTable, which is deprecated since Spark 2.0.
comments.createOrReplaceTempView("comments")
sc = spark.sparkContext

##### Fetching post content from Reddit using PRAW.

In [2]:
import praw
from praw.models import Submission

def retrieve_post_content(reddit_instance, post_id):
    '''
    Fetch the text of a reddit submission: its title followed by its body.

    Parameters
    ----------
    reddit_instance: object exposing ``submission(id=...)`` (e.g. a praw.Reddit).

    post_id: str, the submission id, either bare ('25xw') or still carrying a
             three-character type prefix such as 't3_' ('t3_25xw').

    Returns
    -------
    str: ``title + '\\n\\n' + selftext`` of the submission.
    '''
    # Slicing (instead of indexing post_id[2]) keeps ids shorter than three
    # characters from raising IndexError.
    if post_id[2:3] == '_':
        # the type identifier of the content has not been removed
        post_id = post_id[3:]

    # Build the submission through the client's own factory method.
    post = reddit_instance.submission(id=post_id)
    # the title is concatenated to the potential post content.
    return post.title + '\n\n' + post.selftext

import os

test_ids = ['t3_25xw','t3_cszs','t3_glz0']

# SECURITY: API credentials must not live in the notebook. They are read from the
# REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET environment variables; a missing variable
# raises KeyError so the misconfiguration is visible immediately.
# Any client id/secret that was ever committed with this notebook should be revoked.
reddit = praw.Reddit(client_id=os.environ['REDDIT_CLIENT_ID'],
                     client_secret=os.environ['REDDIT_CLIENT_SECRET'],
                     user_agent='Fetch post content bot 1.0 by /u/raindust')

# Smoke test: print title + body for a few known submissions.
for t in test_ids:
    print(retrieve_post_content(reddit, t))

kullan-at posta hesabı


SAP ABAP Tutorials and Examples


Why the story with 165 diggs will never see the light of day (editor control at Digg)




### LDA with 1 topic of 2 words on a sample of comments from the month preceding the 2016 US election

In [None]:
import spacy
import gensim
from spacy.lang.en import English
import nltk
import re
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet as wn
import datetime
# Shared NLP resources: NLTK's English stop-word list and a spaCy English pipeline.
en_stop = set(nltk.corpus.stopwords.words('english'))
parser = English()

# Date window of interest; both bounds are exclusive in the filter below.
start_date = datetime.date(2016, 10, 7)
end_date = datetime.date(2016, 11, 7)

# Comments posted in the 'news' subreddit strictly between the two dates.
oct_2016_news_comments = (
    comments
    .select('link_id', 'body', 'created', 'subreddit')
    .filter(
        (comments.created > start_date)
        & (comments.created < end_date)
        & (comments.subreddit == 'news')
    )
)

In [None]:
def lda_on_posts(dataset, words_per_topic, n_topics, parser=English(), stop_words=en_stop, random_state=None):
    '''
    Fit one LDA (Latent Dirichlet Allocation) model per reddit post on its comments.
    Useful for topic modelling/extraction from a reddit post.

    Parameters
    ----------
    dataset: pyspark RDD or DataFrame. Only the first two fields of each row are
             read: the post id (link_id) and the body of the comment, in this order.

    words_per_topic: number of words that should constitute a topic per post.

    n_topics: number of topics to extract by post.

    parser: the natural language parser used to tokenize comment bodies,
            by default the spaCy English pipeline.

    stop_words: collection of words that are removed from the tokens.

    random_state: optional seed forwarded to gensim's LdaModel so that runs can
                  be reproduced. None (the default) leaves gensim's own
                  behaviour unchanged.

    Returns
    -------
    A RDD with the following pair of data as rows:
    (<post_id>, <topic (as a list of words)>), one row per extracted topic.

    Raises
    ------
    ValueError: if dataset is neither a pyspark RDD nor a pyspark DataFrame.
    '''
    # Helper functions are kept nested so they are shipped to the Spark workers
    # together with the parameters they close over.
    def tokenize(text):
        # Lower-case every token; URLs and @mentions become placeholder tokens.
        lda_tokens = []
        for token in parser(text):
            if token.orth_.isspace():
                continue
            if token.like_url:
                lda_tokens.append('URL')
            elif token.orth_.startswith('@'):
                lda_tokens.append('SCREEN_NAME')
            else:
                lda_tokens.append(token.lower_)
        return lda_tokens

    def get_lemma(word):
        # Fall back to the word itself when WordNet knows no base form.
        lemma = wn.morphy(word)
        return word if lemma is None else lemma

    def prepare_text_for_lda(text):
        # Keep tokens longer than 4 characters that are not stop words, then lemmatize.
        # Uses the `stop_words` argument (not the module-level `en_stop`) so that a
        # caller-supplied stop-word set is honoured.
        return [
            get_lemma(token)
            for token in tokenize(text)
            if len(token) > 4 and token not in stop_words
        ]

    def get_n_topics(text_data):
        # text_data: list of token lists, one per comment of a single post.
        dictionary = gensim.corpora.Dictionary(text_data)
        corpus = [dictionary.doc2bow(text) for text in text_data]
        ldamodel = gensim.models.ldamodel.LdaModel(
            corpus,
            num_topics=n_topics,
            id2word=dictionary,
            passes=15,
            random_state=random_state,
        )
        return ldamodel.print_topics(num_words=words_per_topic)

    def extract_key_words(lda_result):
        # Pull the quoted words out of the topic string produced by print_topics.
        return re.findall(r'\"(.*?)\"', lda_result)

    # detecting type of the input given.
    if isinstance(dataset, pyspark.sql.dataframe.DataFrame):
        dataset = dataset.rdd
    elif not isinstance(dataset, pyspark.rdd.RDD):
        raise ValueError('Wrong type of dataset, must be either a pyspark RDD or pyspark DataFrame')

    #TODO : keep the minimum timestamp (r[2] of the dataset) during computations.

    # Drop missing bodies (they would crash the tokenizer) and comments that were
    # removed/deleted, so they do not pollute the extracted topics.
    filter_absent_comm = dataset.filter(
        lambda r: r[1] is not None and r[1] != '[removed]' and r[1] != '[deleted]'
    )

    # Apply the LDA text preprocessing and drop comments left without any token.
    LDA_preprocessed = (
        filter_absent_comm
        .map(lambda r: (r[0], prepare_text_for_lda(r[1])))
        .filter(lambda r: r[1])
    )

    # Group the token lists of all comments by post/thread id.
    post_and_list_token = LDA_preprocessed.groupByKey().map(lambda x: (x[0], list(x[1])))

    # Generate n topics per post/thread, one output row per (post, topic).
    res_lda = (
        post_and_list_token
        .map(lambda r: (r[0], get_n_topics(r[1])))
        .flatMap(lambda r: [(r[0], t) for t in r[1]])
    )

    # Each topic is a (topic_id, description) pair; keep only the words of the description.
    return res_lda.map(lambda r: (r[0], extract_key_words(r[1][1])))

# Extract 1 topic of 2 words per thread. The DataFrame is passed directly:
# lda_on_posts accepts DataFrames, so no identity map over its RDD is needed.
res = lda_on_posts(oct_2016_news_comments, 2, 1)

# Keep the threads whose topic mentions one of the presidential candidates.
ELECTION_KEYWORDS = {'trump', 'hillary', 'donald', 'clinton'}
hillary_and_trump = res.filter(lambda r: not ELECTION_KEYWORDS.isdisjoint(r[1]))

# An explicit schema gives readable column names and lets toDF() succeed even
# when no thread matches (schema inference fails on an empty RDD).
# Assumes link_id is a string column -- TODO confirm against the parquet schema.
topic_schema = StructType([
    StructField('link_id', StringType()),
    StructField('topic', ArrayType(StringType())),
])
hillary_and_trump.toDF(topic_schema).show()