## Wegetarianizm -- Polska

In [None]:
## Import modules
import praw
from praw.models import MoreComments
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os
from datetime import datetime
import spacy
nlp = spacy.load("pl_core_news_sm")
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import json

## Polish NLTK stop words are currently switched off:
## stop_words = stopwords.words('polish')
## Read the Reddit API credentials from the environment; a variable that
## is not set yields None.
client_id, client_secret, password, user_agent, username = (
    os.environ.get(variable)
    for variable in (
        "Reddit_Client_Id",
        'Reddit_Client_Secret',
        'Reddit_password',
        'Reddit_User_Agent',
        'Reddit_Username',
    )
)


In [None]:
## Open a session with the Reddit API using the credentials that were
## read from the environment.
reddit = praw.Reddit(
    username=username,
    password=password,
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent,
)

In [None]:
## Generator that streams the token lists used to build the dictionary
def read_tokens(path='comments.jl'):
    """Yield the ``'tokens'`` list of every record in a JSON-lines file.

    Args:
        path: JSON-lines file to read; each non-blank line must be a JSON
            object with a ``'tokens'`` key. Defaults to ``'comments.jl'``.

    Yields:
        The value stored under ``'tokens'`` for each record, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record has no ``'tokens'`` key.
    """
    ## Explicit encoding so the result does not depend on the platform locale.
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            ## Tolerate blank lines (e.g. a trailing newline at the end).
            if not line.strip():
                continue
            yield json.loads(line)['tokens']

## Corpus class. It streams the documents as bag-of-words vectors or, if a
## model is provided, it returns the records enriched with the LDA results.
class MyCorpus:
    """Re-iterable stream over a JSON-lines file of tokenised documents.

    Every non-blank line of the file must be a JSON object with a
    ``'tokens'`` key.

    Without a model, iteration yields ``dictionary.doc2bow(tokens)`` for
    each record. With a model, iteration yields the record itself with
    three extra keys:

    * ``'topic'`` -- id of the most probable topic of the document,
    * ``'prob'`` -- probability of that topic,
    * ``'top_tokens'`` -- ``{token: weight}`` for the 100 top terms of
      that topic.

    Args:
        path: JSON-lines file to read.
        model: optional topic model providing ``get_document_topics`` and
            ``get_topic_terms``.
        dictionary: token/id mapping providing ``doc2bow`` and item lookup
            by id. When omitted, the module-level name ``dictionary`` is
            used, which is what the class did unconditionally before.
    """

    def __init__(self, path='comments.jl', model=None, dictionary=None):
        self.path = path
        self.model = model
        ## Previously this argument was accepted but silently ignored.
        self.dictionary = dictionary

    def _resolve_dictionary(self):
        """Return the dictionary passed in, else the module-level one."""
        if self.dictionary is not None:
            return self.dictionary
        ## Backward compatibility: fall back to the global ``dictionary``
        ## (raises NameError if it has not been created yet).
        return dictionary

    def __iter__(self):
        vocab = self._resolve_dictionary()
        ## Explicit encoding so the result does not depend on the locale.
        with open(self.path, 'r', encoding='utf-8') as file:
            for line in file:
                ## Tolerate blank lines instead of failing in json.loads.
                if not line.strip():
                    continue
                record = json.loads(line)
                bow = vocab.doc2bow(record['tokens'])
                if self.model is None:
                    yield bow
                    continue
                topics = self.model.get_document_topics(bow)
                ## Most probable topic; on ties the first one wins, as
                ## with the former stable descending sort.
                topic, prob = max(topics, key=lambda pair: pair[1])
                record['topic'] = topic
                record['prob'] = prob
                ## Item lookup instead of ``id2token[...]`` so the result
                ## does not rely on the id -> token map being preloaded.
                record['top_tokens'] = {
                    vocab[term_id]: weight
                    for term_id, weight in self.model.get_topic_terms(topic, 100)
                }
                yield record

In [None]:
## Search the subreddit Polska for submissions matching the word
## "wegetarianizm".
subreddit = reddit.subreddit('Polska').search('wegetarianizm')
## Keep a handful of attributes of every hit as a plain dictionary.
## Each pair below is (key in our record, attribute of the submission).
submissions = [
    {
        key: getattr(post, attribute)
        for key, attribute in (
            ('title', 'title'),
            ('id', 'id'),
            ('upvote_ratio', 'upvote_ratio'),
            ('selftext', 'selftext'),
            ('score', 'score'),
            ('flair', 'link_flair_text'),
            ('num_comments', 'num_comments'),
            ('is_self', 'is_self'),
        )
    }
    for post in subreddit
]

There is a lot of information we can get from a submission. The following fields are available, but we probably don't need all of them. I list them here just in case.

* author -- provides an instance of Redditor.
* author_flair_text -- the text content of the author’s flair, or None if not flaired. In simple terms, a flair on reddit is a kind of tag added to either post or username. They are meant to categorize posts or users.
* clicked -- whether or not the submission has been clicked by the client.
* comments -- provides an instance of CommentForest.
* created_utc -- time the submission was created, represented in Unix Time.
* distinguished -- whether or not the submission is distinguished.
* edited -- Whether or not the submission has been edited.
* id -- ID of the submission.
* is_original_content -- whether or not the submission has been set as original content.
* is_self -- whether or not the submission is a selfpost (text-only).
* link_flair_template_id -- the link flair’s ID.
* link_flair_text -- The link flair’s text content, or None if not flaired.
* locked -- whether or not the submission has been locked.
* name -- Fullname of the submission.
* num_comments -- the number of comments on the submission.
* over_18 -- whether or not the submission has been marked as NSFW.
* permalink -- a permalink for the submission.
* poll_data -- a PollData object representing the data of this submission, if it is a poll submission.
* saved -- whether or not the submission is saved.
* score -- the number of upvotes for the submission.
* selftext -- the submissions’ selftext - an empty string if a link post.
* spoiler -- whether or not the submission has been marked as a spoiler.
* stickied -- whether or not the submission is stickied.
* subreddit -- provides an instance of Subreddit.
* title -- the title of the submission.
* upvote_ratio -- the percentage of upvotes from all votes on the submission.
* url -- the URL the submission links to, or the permalink if a selfpost.

In [None]:
## Show a compact overview (id, title, number of comments) of each submission.
for sub in submissions:
    print({key: sub[key] for key in ('id', 'title', 'num_comments')})

There is a lot of information on a single comment. The following fields are available, but we probably don't need all of them. I list them here just in case.

* author -- provides an instance of Redditor.
* body -- the body of the comment, as Markdown.
* body_html -- the body of the comment, as HTML.
* created_utc -- time the comment was created, represented in Unix Time.
* distinguished -- whether or not the comment is distinguished.
* edited -- whether or not the comment has been edited.
* id -- the ID of the comment.
* is_submitter -- whether or not the comment author is also the author of the submission.
* link_id -- the submission ID that the comment belongs to.
* parent_id -- the ID of the parent comment (prefixed with t1_). If it is a top-level comment, this returns the submission ID instead (prefixed with t3_).
* permalink -- a permalink for the comment. Comment objects from the inbox have a context attribute instead.
* replies -- provides an instance of CommentForest.
* saved -- whether or not the comment is saved.
* score -- the number of upvotes for the comment.
* stickied -- whether or not the comment is stickied.
* submission -- provides an instance of Submission. The submission that the comment belongs to.
* subreddit -- provides an instance of Subreddit. The subreddit that the comment belongs to.
* subreddit_id -- the subreddit ID that the comment belongs to.

And for the Redditor

* comment_karma -- the comment karma for the Redditor.
* comments -- provide an instance of SubListing for comment access.
* submissions -- provide an instance of SubListing for submission access.
* created_utc -- time the account was created, represented in Unix Time.
* has_verified_email -- whether or not the Redditor has verified their email.
* icon_img -- the url of the Redditors’ avatar.
* id -- the ID of the Redditor.
* is_employee -- whether or not the Redditor is a Reddit employee.
* is_friend -- whether or not the Redditor is friends with the authenticated user.
* is_mod -- whether or not the Redditor mods any subreddits.
* is_gold -- whether or not the Redditor has active Reddit Premium status.
* is_suspended -- whether or not the Redditor is currently suspended.
* link_karma -- the link karma for the Redditor.
* name -- the Redditor’s username.
* subreddit -- if the Redditor has created a user-subreddit, provides a dictionary of additional attributes. See below.
* subreddit["banner_img"] -- the URL of the user-subreddit banner.
* subreddit["name"]-- the fullname of the user-subreddit.
* subreddit["over_18"] -- whether or not the user-subreddit is NSFW.
* subreddit["public_description"] -- the public description of the user-subreddit.
* subreddit["subscribers"] -- the number of users subscribed to the user-subreddit.
* subreddit["title"] -- the title of the user-subreddit.

In [None]:
## Select a submission by id -- this one is about vegetarianism and veganism
## on the Reddit Polska.
from datetime import timezone  ## cell-local import, needed by _utc_string

submission = reddit.submission("vnapm6")

## Create an empty list to store data about comments (the same records
## are also written to comments.jl below).
comments = []

## Set the option to get all the comments
submission.comments.replace_more(limit=None)


def _utc_string(timestamp):
    """Format a Unix timestamp as 'dd-mm-YYYY HH:MM:SS' in UTC.

    Without ``tz`` the conversion would use the machine's local time zone,
    which contradicts the ``created_utc`` keys the value is stored under.
    """
    return datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime('%d-%m-%Y %H:%M:%S')


## Iterate over all the comments. Ignore the comments
## tree. Write the comments to the JSON line file.
with open('comments.jl', 'w', encoding='utf-8') as file:
    for comment in submission.comments.list():
        temp_dict = {}
        temp_dict['body'] = comment.body
        temp_dict['score'] = comment.score
        author = comment.author
        ## presumably the author is missing for deleted accounts -- in that
        ## case the record simply has no 'author' key (unchanged behaviour).
        if author is not None:
            try:
                temp_dict['author'] = {
                    'name': author.name,
                    'karma': author.comment_karma,
                    'created_utc': _utc_string(author.created_utc),
                    'has_verified_email': author.has_verified_email,
                    #'is_suspended' : author.is_suspended,
                    'is_gold': author.is_gold,
                }
            except Exception:
                ## Best effort: author details may be unavailable (missing
                ## attributes or a failed API request). Unlike a bare
                ## ``except`` this lets KeyboardInterrupt/SystemExit through.
                pass
        temp_dict['created_utc'] = _utc_string(comment.created_utc)
        temp_dict['edited'] = comment.edited
        temp_dict['is_submitter'] = comment.is_submitter
        ## Lemmatise the lower-cased body; drop one-character tokens and
        ## the listed part-of-speech tags.
        text_nlp = nlp(temp_dict['body'].lower())
        temp_dict['tokens'] = [
            token.lemma_
            for token in text_nlp
            if len(token) > 1 and token.pos_ not in ['PUNCT', 'ADP', 'CCONJ', 'X']
        ]

        comments.append(temp_dict)
        file.write(json.dumps(temp_dict) + '\n')

In [None]:
## Build the gensim dictionary from the streamed token lists.
dictionary = Dictionary(documents=read_tokens())

In [None]:
## Training parameters of the LDA run.
num_topics, passes, iterations = 5, 100, 400
## No perplexity evaluation during training -- it takes too much time.
eval_every = None

## Index-to-word mapping. The lookup on the first line is there only to
## "load" the dictionary before id2token is read.
temp = dictionary[0]
id2word = dictionary.id2token

## Train the model on the bag-of-words stream of the comments file.
model = LdaModel(
    MyCorpus(dictionary=dictionary),
    num_topics=num_topics,
    id2word=id2word,
    ## chunksize=chunksize,
    passes=passes,
    iterations=iterations,
    alpha='auto',
    eta='auto',
    eval_every=eval_every,
)

In [None]:
## Coherence of every topic on the training corpus, averaged over the
## number of topics.
corpus = MyCorpus(dictionary=dictionary)
top_topics = model.top_topics(corpus=corpus)
avg_topic_coherence = sum(coherence for _, coherence in top_topics) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

In [None]:
## Alphabetically sorted top tokens of topic 1. They are taken from the
## first document assigned to that topic; next() stops the stream there
## instead of running the model over the whole corpus and building a list.
## Raises StopIteration if no document is assigned to topic 1.
sorted(
    next(
        item['top_tokens']
        for item in MyCorpus(model=model, dictionary=dictionary)
        if item['topic'] == 1
    )
)