## Wegetarianizm -- Polska

In [None]:
## Import modules
import praw
from praw.models import MoreComments
import os
from datetime import datetime
import spacy
nlp = spacy.load('pt_core_news_lg')
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import json
import os
import pandas as pd
import matplotlib.pyplot as plt
import sys

## stop_words = stopwords.words('polish')
## Read the Reddit API credentials from environment variables, so that
## no secret has to be written into the notebook itself.
client_id, client_secret, password, user_agent, username = (
    os.getenv(variable_name)
    for variable_name in (
        'Reddit_Client_Id',
        'Reddit_Client_Secret',
        'Reddit_password',
        'Reddit_User_Agent',
        'Reddit_Username',
    )
)


In [None]:
## Create the Reddit API client from the credentials read from the
## environment.
reddit = praw.Reddit(
    username=username,
    password=password,
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent,
)

In [None]:
class MyCorpus:
    """
    A streamed corpus backed by a JSON line file.

    On construction every record of the source file is tokenized and
    lemmatized with the module-level spaCy pipeline ``nlp`` and written to
    a sibling file named ``<name>_NLP<extension>`` with an additional
    ``tokens`` field. All other methods stream that processed file line by
    line, so the corpus is never held in memory as a whole.
    """

    def __init__(self, path, key='content'):
        """
        Reads from a JSON line file. Tokenizes and lemmatizes
        the text under key. It writes out the new JSON line
        file with a new field -- tokens.

        Records whose text is exactly '[deleted]' and blank lines are
        skipped.

        Args:
            path (str): a path to a JSON line file.
            key (str): a key with the content to lemmatize.
        """
        self._path_original = path
        self._key = key
        self._dictionary = None
        self._path = self._derive_output_path(path)
        # The source is opened first so that a missing input file fails
        # before the output file is created or truncated.
        with open(self._path_original, 'r', encoding='utf-8') as source, \
                open(self._path, 'w', encoding='utf-8') as target:
            n = 0
            for line in source:
                if not line.strip():
                    continue
                record = json.loads(line)
                if record[self._key] == '[deleted]':
                    continue
                record['tokens'] = self._lemmatize(record[self._key])
                target.write(json.dumps(record) + '\n')
                n += 1
                # '\r' keeps the progress counter on a single line.
                sys.stdout.write(f'\rLine {n} processed')
                sys.stdout.flush()

    @staticmethod
    def _derive_output_path(path):
        """
        Returns the path of the processed file: '_NLP' is inserted in front
        of the file extension only, so dots elsewhere in the path (e.g.
        './data/x.jl') are left alone and a path without an extension never
        maps onto itself.
        """
        root, extension = os.path.splitext(path)
        return root + '_NLP' + extension

    @staticmethod
    def _lemmatize(text):
        """
        Runs the module-level spaCy pipeline on text and returns the
        lower-cased lemmas of the tokens worth keeping.
        """
        lemmas = []
        for token in nlp(text):
            skip = token.is_stop or token.is_punct or token.is_space \
                or token.is_bracket or token.is_currency or token.is_digit \
                or token.is_quote or token.like_url or token.like_email \
                or len(token) < 2 or len(token) > 20 \
                or (not token.lemma_.isalpha())
            if not skip:
                lemmas.append(token.lemma_.lower())
        return lemmas

    def set_dictionary(self, dictionary):
        """
        Assigns a gensim.corpora.dictionary.Dictionary object
        to self._dictionary.

        Args:
            dictionary (gensim.corpora.dictionary.Dictionary): a dictionary
            that stores the frequencies of unique tokens in the corpus.
        """
        self._dictionary = dictionary

    def get_tokens(self):
        """
        Streams the processed JSON line file and yields the tokens of each
        record.

        Yields:
            list : list of tokens for a comment from Reddit.
        """
        with open(self._path, 'r', encoding='utf-8') as file:
            for doc in file:
                yield json.loads(doc)['tokens']

    def get_bow(self):
        """
        Converts the tokens of each record into a bag-of-words vector using
        the assigned dictionary.

        Because this is a generator, the check for a missing dictionary is
        performed when iteration starts, not when the method is called.

        Raises:
            ValueError: if the dictionary was not assigned to self._dictionary.

        Yields:
            list : a list of tuples that denote the id of a given token and its
            frequency in a given document.
        """
        # Compare against None explicitly: a dictionary with no entries is
        # falsy but is still an assigned dictionary.
        if self._dictionary is None:
            raise ValueError('Dictionary has the value of None')
        for doc in self.get_tokens():
            yield self._dictionary.doc2bow(doc)

    def __iter__(self):
        """
        Yields:
            list : a list of tuples that denote the id of a given token and
            its frequency in a given document.
        """
        yield from self.get_bow()

    def get_topics(self, model):
        """
        Yields every record of the processed file extended with its most
        probable topic according to the model.

        Args:
            model (gensim.models.ldamodel.LdaModel): Latent Dirichlet
            Allocation model.

        Raises:
            ValueError: if the dictionary was not assigned to self._dictionary.

        Yields:
            dict : the record with two extra keys: 'topic' (the 1-based id
            of the most probable topic) and 'topic_prob' (its probability).
            Both are None when the model returns no topic for the record.
        """
        if self._dictionary is None:
            raise ValueError('Dictionary has the value of None')
        with open(self._path, 'r', encoding='utf-8') as file:
            for doc in file:
                temp = json.loads(doc)
                bow = self._dictionary.doc2bow(temp['tokens'])
                topics = model.get_document_topics(bow)
                if topics:
                    topic, prob = max(topics, key=lambda pair: pair[1])
                    # Topic ids are 0-based; they are reported 1-based.
                    temp['topic'] = topic + 1
                    temp['topic_prob'] = prob
                else:
                    temp['topic'] = None
                    temp['topic_prob'] = None
                yield temp

                
class MyModel(LdaModel):
    """
    Subclass of gensim.models.LdaModel with two reporting helpers: the
    average topic coherence and the most probable tokens of every topic.
    """

    def get_coherence(self, corpus):
        """
        Returns the average coherence measure for the given model.

        Args:
            corpus (MyCorpus): A corpus on which the model is computed.

        Returns:
            float: the average coherence measure for the given model.
        """
        top_topics = self.top_topics(corpus)
        return sum(coherence for _, coherence in top_topics) / len(top_topics)

    def get_top_tokens(self, corpus=None, topn=20):
        """
        Returns a list of dictionaries that depict the most probable
        tokens for each topic.

        The list is ordered by topic id, i.e. element ``i`` describes topic
        id ``i``. It is deliberately not built from ``top_topics``, which
        orders topics by coherence, so that positions here line up with the
        topic ids returned by ``get_document_topics``.

        Args:
            corpus (MyCorpus, optional): kept for backward compatibility;
            it is not needed to read the token probabilities.
            topn (int, optional): number of tokens per topic. Defaults to
            20, the default of ``top_topics``.

        Returns:
            list: list of dictionaries mapping each of the most probable
            tokens of a topic to its probability.
        """
        return [
            dict(self.show_topic(topic_id, topn=topn))
            for topic_id in range(self.num_topics)
        ]

    
    
        
def run_lda_models(corpus, dictionary, min_topics, max_topics, step=1, **kwargs):
    """
    Computes a sequence of lda models for a given corpus and dictionary. It
    prints the name, number of topics and coherence measure of each model to
    the screen and writes each model to 'models/<name>-<num_topics>'. The
    name is asked for interactively.

    Args:
        corpus (MyCorpus): A stream of document vectors or sparse matrix of shape (num_documents, num_terms).
        dictionary (gensim.corpora.dictionary.Dictionary): a mapping that assigns id to unique tokens from the corpus.
        min_topics (int): the smallest number of topics to compute.
        max_topics (int): the highest number of topics to compute (inclusive).
        step (int, optional): the size of the break inbetween computed models. Defaults to 1.
        **kwargs: additional keyword arguments forwarded to MyModel; they
            override the defaults below. 'corpus', 'id2word' and
            'num_topics' are always set by this function.
    """
    name = input("Please provide the name of the model\n")
    # Looking up any item makes the dictionary build its id2token mapping,
    # which is otherwise still empty at this point.
    dictionary[0]
    id2word = dictionary.id2token
    # 'png' is not used here; the plotting cell saves its figures there.
    for directory in ('models', 'png'):
        os.makedirs(directory, exist_ok=True)
    model_options = {
        'alpha': 'asymmetric',
        'eta': 'auto',
        'iterations': 500,
        'passes': 20,
        'eval_every': None,
        'random_state': 1044,
        'per_word_topics': True,
    }
    model_options.update(kwargs)
    for num_topic in range(min_topics, max_topics + 1, step):
        # The function-controlled arguments are written last so that they
        # always win over anything passed through kwargs.
        model_options.update(corpus=corpus, id2word=id2word, num_topics=num_topic)
        model = MyModel(**model_options)
        summary = {
            'name': name,
            'num_topics': num_topic,
            'coherence': model.get_coherence(corpus=corpus),
        }
        model.save(os.path.join('models', name + '-' + str(num_topic)))
        print(summary)

In [None]:
## Search the Polska subreddit for submissions matching the word
## "wegetarianizm".
subreddit = reddit.subreddit('Polska').search('wegetarianizm')
## Keep a plain dictionary per submission; each pair below is
## (key in the dictionary, attribute of the submission).
submissions = [
    {
        key: getattr(line, attribute)
        for key, attribute in (
            ('title', 'title'),
            ('id', 'id'),
            ('upvote_ratio', 'upvote_ratio'),
            ('selftext', 'selftext'),
            ('score', 'score'),
            ('flair', 'link_flair_text'),
            ('num_comments', 'num_comments'),
            ('is_self', 'is_self'),
        )
    }
    for line in subreddit
]

There is a lot of information we can get from a submission. The following fields are available, but we probably don't need all of them. I list them here just in case.

* author -- provides an instance of Redditor.
* author_flair_text -- the text content of the author’s flair, or None if not flaired. In simple terms, a flair on reddit is a kind of tag added to either post or username. They are meant to categorize posts or users.
* clicked -- whether or not the submission has been clicked by the client.
* comments -- provides an instance of CommentForest.
* created_utc -- time the submission was created, represented in Unix Time.
* distinguished -- whether or not the submission is distinguished.
* edited -- Whether or not the submission has been edited.
* id -- ID of the submission.
* is_original_content -- whether or not the submission has been set as original content.
* is_self -- whether or not the submission is a selfpost (text-only).
* link_flair_template_id -- the link flair’s ID.
* link_flair_text -- The link flair’s text content, or None if not flaired.
* locked -- whether or not the submission has been locked.
* name -- Fullname of the submission.
* num_comments -- the number of comments on the submission.
* over_18 -- whether or not the submission has been marked as NSFW.
* permalink -- a permalink for the submission.
* poll_data -- a PollData object representing the data of this submission, if it is a poll submission.
* saved -- whether or not the submission is saved.
* score -- the number of upvotes for the submission.
* selftext -- the submissions’ selftext - an empty string if a link post.
* spoiler -- whether or not the submission has been marked as a spoiler.
* stickied -- whether or not the submission is stickied.
* subreddit -- provides an instance of Subreddit.
* title -- the title of the submission.
* upvote_ratio -- the percentage of upvotes from all votes on the submission.
* url -- the URL the submission links to, or the permalink if a selfpost.

In [None]:
## Show a compact overview (id, title, number of comments) of every
## submission found.
for sub in submissions:
    print({field: sub[field] for field in ('id', 'title', 'num_comments')})

There is a lot of information on a single comment. The following fields are available, but we probably don't need all of them. I list them here just in case.

* author -- provides an instance of Redditor.
* body -- the body of the comment, as Markdown.
* body_html -- the body of the comment, as HTML.
* created_utc -- time the comment was created, represented in Unix Time.
* distinguished -- whether or not the comment is distinguished.
* edited -- whether or not the comment has been edited.
* id -- the ID of the comment.
* is_submitter -- whether or not the comment author is also the author of the submission.
* link_id -- the submission ID that the comment belongs to.
* parent_id -- the ID of the parent comment (prefixed with t1_). If it is a top-level comment, this returns the submission ID instead (prefixed with t3_).
* permalink -- a permalink for the comment. Comment objects from the inbox have a context attribute instead.
* replies -- provides an instance of CommentForest.
* saved -- whether or not the comment is saved.
* score -- the number of upvotes for the comment.
* stickied -- whether or not the comment is stickied.
* submission -- provides an instance of Submission. The submission that the comment belongs to.
* subreddit -- provides an instance of Subreddit. The subreddit that the comment belongs to.
* subreddit_id -- the subreddit ID that the comment belongs to.

And for the Redditor

* comment_karma -- the comment karma for the Redditor.
* comments -- provide an instance of SubListing for comment access.
* submissions -- provide an instance of SubListing for submission access.
* created_utc -- time the account was created, represented in Unix Time.
* has_verified_email -- whether or not the Redditor has verified their email.
* icon_img -- the url of the Redditors’ avatar.
* id -- the ID of the Redditor.
* is_employee -- whether or not the Redditor is a Reddit employee.
* is_friend -- whether or not the Redditor is friends with the authenticated user.
* is_mod -- whether or not the Redditor mods any subreddits.
* is_gold -- whether or not the Redditor has active Reddit Premium status.
* is_suspended -- whether or not the Redditor is currently suspended.
* link_karma -- the link karma for the Redditor.
* name -- the Redditor’s username.
* subreddit -- if the Redditor has created a user-subreddit, provides a dictionary of additional attributes. See below.
* subreddit["banner_img"] -- the URL of the user-subreddit banner.
* subreddit["name"]-- the fullname of the user-subreddit.
* subreddit["over_18"] -- whether or not the user-subreddit is NSFW.
* subreddit["public_description"] -- the public description of the user-subreddit.
* subreddit["subscribers"] -- the number of users subscribed to the user-subreddit.
* subreddit["title"] -- the title of the user-subreddit.

https://www.reddit.com/r/portugal/comments/uap723/somos_o_pa%C3%ADs_com_menos_vegetarianos_per_capita/

https://www.reddit.com/r/PORTUGALCARALHO/comments/xhuqbq/consumo_de_carne_na_europa/

Other links:

https://www.reddit.com/r/portugal/comments/rprwkj/é_poss%C3%ADvel_ter_uma_dieta_vegan_saudável_serio/?utm_source=share&utm_medium=web2x&context=3

https://www.reddit.com/r/portugal/comments/4oj1mg/transição_para_veganvegetariano/?utm_source=share&utm_medium=web2x&context=3

https://www.reddit.com/r/portugal/comments/lrcixp/qual_a_vossa_opinião_em_relação_ao_veganismo/

In [None]:
## Select a submission by id -- this is the r/portugal thread asking for
## opinions on veganism (the last link listed above).
from datetime import timezone


def _format_utc(timestamp):
    """Formats a Unix timestamp as 'DD-MM-YYYY HH:MM:SS' in UTC."""
    moment = datetime.fromtimestamp(timestamp, tz=timezone.utc)
    return moment.strftime('%d-%m-%Y %H:%M:%S')


submission = reddit.submission("lrcixp")

## Set the option to get all the comments
submission.comments.replace_more(limit=None)

## Iterate over all the comments. Ignore the comments
## tree. Write the comments to the JSON line file.
os.makedirs('data', exist_ok=True)
with open('data/comments_portugal_healthy_diet.jl', 'w') as file:
    for comment in submission.comments.list():
        temp_dict = {}
        temp_dict['body'] = comment.body
        temp_dict['score'] = comment.score
        temp_dict['link'] = comment.permalink
        temp_dict['parent_id'] = comment.parent_id
        author = comment.author
        ## The author details are best effort: the 'author' key is simply
        ## left out when there is no author or the profile cannot be read.
        if author is not None:
            try:
                temp_dict['author'] = {
                    'name': author.name,
                    'karma': author.comment_karma,
                    'created_utc': _format_utc(author.created_utc),
                    'has_verified_email': author.has_verified_email,
                    'is_gold': author.is_gold,
                }
            except Exception:
                ## NOTE(review): presumably suspended or removed accounts,
                ## whose profile attributes are unavailable -- confirm.
                ## Exception (not a bare except) keeps Ctrl-C working.
                pass
        temp_dict['created_utc'] = _format_utc(comment.created_utc)
        temp_dict['edited'] = comment.edited
        temp_dict['is_submitter'] = comment.is_submitter

        file.write(json.dumps(temp_dict) + '\n')

In [None]:
## Build the corpus: lemmatize the 'body' of every downloaded comment.
corpus = MyCorpus(
    path='data/comments_portugal_healthy_diet.jl',
    key='body',
)

In [None]:
## Build the gensim dictionary from the token lists of the corpus.
dictionary = Dictionary(corpus.get_tokens())

In [None]:
## Prune the vocabulary by document frequency.
## NOTE(review): gensim documents no_below as an absolute number of
## documents (no_above is the fraction), so 0.9 presumably removes nothing
## at the low end -- confirm whether an integer threshold was intended.
dictionary.filter_extremes(no_above=0.1, no_below=0.9)

In [None]:
## Attach the pruned dictionary to the corpus so that it can produce
## bag-of-words vectors.
corpus.set_dictionary(dictionary=dictionary)

In [None]:
## Compute the models for 2 to 15 topics and write them out to the files.
run_lda_models(
    corpus=corpus,
    dictionary=dictionary,
    min_topics=2,
    max_topics=15,
)

In [None]:
## Read in a saved model. It requires providing the name of the model we
## want to load, i.e. the file name written by run_lda_models:
## '<name>-<number of topics>'.
model_name = input('Provide the name of the model you would like to load:\n')
model_path = os.path.join('models', model_name)
## Loaded through MyModel because the next cells call get_top_tokens.
model = MyModel.load(model_path)

In [None]:
## Draw one horizontal bar chart per topic with its most probable
## tokens, save it under png/ and display it.
list_top_tokens = model.get_top_tokens(corpus)
for topic_number, token_probabilities in enumerate(list_top_tokens, start=1):
    tokens = list(token_probabilities.keys())
    probabilities = list(token_probabilities.values())
    plt.barh(tokens, probabilities, align='center')
    plt.xlim(0, .02)
    ## Most probable token at the top.
    plt.gca().invert_yaxis()
    plt.title(f'Topic {topic_number}')
    plt.xlabel('Probability')
    plt.savefig(f'png/topic{topic_number}')
    plt.show()

In [None]:
## Assign the most probable topic to every comment and write the
## results out into an Excel file.
pd.DataFrame.from_records(
    corpus.get_topics(model=model)
).to_excel(f'data/{model_name}topics.xlsx')