# PRÁCTICA 1: Extracción de datos de una red social

In [None]:
%%bash

# Reddit API credentials for the [tgine] site section of praw.ini.
# They are taken from the CLIENT_ID / CLIENT_SECRET environment variables when
# set, so real secrets do not have to be typed into (and saved with) the notebook.
CLIENT_ID="${CLIENT_ID:-}"
CLIENT_SECRET="${CLIENT_SECRET:-}"

# Append the section only if praw.ini does not already contain it: re-running
# this cell must not write a second [tgine] section, because duplicated section
# headers make the INI file unreadable for a strict parser.
# To change credentials that were already written, edit praw.ini directly.
if ! grep -qs '^\[tgine\]' praw.ini; then
    printf '\n\n[tgine]\nclient_id=%s\nclient_secret=%s\n\n' "$CLIENT_ID" "$CLIENT_SECRET" >> praw.ini
fi

In [None]:
import os

import praw
import json
import datetime

In [None]:
# Display name of the subreddit to harvest.
subreddit_display_name = "science"
# Directory where each retrieved collection is cached as a JSON file.
collections_dir = "collections"
# Listings to retrieve; each name is looked up as a method on the subreddit object.
collection_names = ["new", "hot", "rising"]
# User-agent string handed to the Reddit client when it is created.
reddit_client_user_agent = "python:com.example.gonzalocl1024.tgine:v1.0 (by /u/gonzalocl1024)"

In [None]:
# Keys copied from a subreddit's attribute dict into the saved collection.
subreddit_attributes = [
    "display_name",
    "title",
    "active_user_count",
    "subscribers",
    "id",
    "description",
    "created_utc",
    "name"
]

# Keys copied from each submission's attribute dict.
submission_attributes = [
    "title",
    "name",
    "upvote_ratio",
    "ups",
    "score",
    "id",
    "created_utc",
    "selftext",
    "downs",
    "url"
]

# Keys copied from each comment's attribute dict.
comment_attributes = [
    "ups",
    "id",
    "score",
    "body",
    "downs"
]

def copy_attributes(dest, src, attributes):
    """Copy the listed keys from ``src`` into ``dest``.

    Args:
        dest: Mapping that receives the copied entries (modified in place).
        src: Mapping to read from.
        attributes: Iterable of keys to copy.

    A key missing from ``src`` is skipped; a diagnostic line with the missing
    key and the source's ``"id"`` and ``"name"`` entries (``None`` when those
    are missing too) is printed instead.
    """

    def _lookup(key):
        # Best-effort read used only to build the diagnostic line.
        try:
            return src[key]
        except KeyError:
            return None

    for wanted in attributes:
        try:
            dest[wanted] = src[wanted]
        except KeyError:
            print("\n{}, {}, {}".format(wanted, _lookup("id"), _lookup("name")))

def copy_subreddit(subreddit):
    """Return a new dict with the entries of ``subreddit`` named in ``subreddit_attributes``."""
    selected = {}
    copy_attributes(selected, subreddit, subreddit_attributes)
    return selected

def copy_submission(submission):
    """Return a new dict with the entries of ``submission`` named in ``submission_attributes``."""
    selected = {}
    copy_attributes(selected, submission, submission_attributes)
    return selected

def copy_comment(comment):
    """Return a new dict with the entries of ``comment`` named in ``comment_attributes``."""
    selected = {}
    copy_attributes(selected, comment, comment_attributes)
    return selected

def copy_author(dest, src):
    """Store the author's name of ``src`` under ``dest["author"]``.

    Args:
        dest: Mapping that receives the ``"author"`` entry (modified in place).
        src: Object exposing an ``author`` attribute.

    ``dest["author"]`` is set to ``src.author.name`` when ``src.author`` is
    truthy, and to ``None`` otherwise.
    """
    author = src.author
    dest["author"] = author.name if author else None

Defino dos funciones: una para obtener todos los _submissions_ en una lista y otra para obtener todos los comentarios de cada _submission_.

In [None]:
def retrieve_submissions(submissions):
    """Drain an iterable of submissions into a list, printing a running count.

    Args:
        submissions: Iterable yielding submissions.

    Returns:
        A list with every item yielded by ``submissions``, in order.
    """
    fetched = []

    for count, item in enumerate(submissions, start=1):
        fetched.append(item)
        # "\r" rewrites the same console line on each update.
        print("\rRetrieved submissions: {}".format(count), end="")

    # Terminate the progress line.
    print()
    return fetched

def retrieve_comments(collection, submissions):
    """Attach every submission, together with all of its comments, to ``collection``.

    Args:
        collection: Dict to fill in; its ``"submissions"`` entry is replaced by a
            list of ``{"submission": ..., "comments": [...]}`` dicts.
        submissions: Sized sequence of submission objects.

    Progress is printed to standard output while the comments are fetched.
    """

    def _snapshot(comment):
        # Read an attribute before vars() is taken
        # (presumably forces a lazily-loaded object to populate -- TODO confirm).
        _ = comment.body
        snapshot = copy_comment(vars(comment))
        copy_author(snapshot, comment)
        return snapshot

    total_submissions = len(submissions)
    comment_count = 0
    collection["submissions"] = []

    for done, submission in enumerate(submissions, start=1):
        # Same attribute-touch as for comments, before the attribute dict is copied.
        _ = submission.title
        submission_copy = copy_submission(vars(submission))
        copy_author(submission_copy, submission)

        print(" Next comment batch: {:<35}".format(submission.num_comments), end="")

        # Expand every "more comments" placeholder, then flatten the whole tree.
        submission.comments.replace_more(limit=None)
        comments = [_snapshot(comment) for comment in submission.comments.list()]

        collection["submissions"].append({
            "submission": submission_copy,
            "comments": comments
        })

        comment_count += len(comments)
        print(
            "\rRetrieved comments: {} (Submissions: {}/{})".format(comment_count, done, total_submissions),
            end=""
        )

    # Terminate the progress line.
    print()


def get_collection(subreddit, name):
    """Build the full collection dict for one listing of a subreddit.

    Args:
        subreddit: Subreddit object; ``name`` is looked up on it as a method.
        name: Name of the listing method to call (called with ``limit=None``).

    Returns:
        A dict with the keys ``"collection_name"``, ``"subreddit"``, ``"date"``
        (local time of retrieval in ISO format) and ``"submissions"``.
    """
    # Read an attribute before vars() is taken
    # (presumably forces a lazily-loaded object to populate -- TODO confirm).
    _ = subreddit.title

    listing = getattr(subreddit, name)(limit=None)

    # Subreddit info and retrieval date.
    collection = {
        "collection_name": name,
        "subreddit": copy_subreddit(vars(subreddit)),
        "date": datetime.datetime.now().isoformat(),
    }

    # First materialise the submission list, then fetch the comments of each one.
    retrieve_comments(collection, retrieve_submissions(listing))

    return collection

In [None]:
def get_collection_path(collection_dir, display_name, collection_name):
    """Return the path ``<collection_dir>/<display_name>_<collection_name>.json``."""
    filename = "{}_{}.json".format(display_name, collection_name)
    return os.path.join(collection_dir, filename)

def get_collections(display_name, names):
    """Load every requested collection, using the on-disk JSON cache when present.

    Args:
        display_name: Display name of the subreddit.
        names: Iterable of collection (listing) names to load.

    Returns:
        A dict mapping each collection name to its collection dict.

    A collection whose cache file exists under ``collections_dir`` is read from
    disk. Otherwise it is retrieved from Reddit and saved. The Reddit client is
    created only when at least one collection actually has to be retrieved.
    """
    reddit_client = None
    subreddit = None

    os.makedirs(collections_dir, exist_ok=True)

    collections = {}

    for collection_name in names:

        collection_filename = get_collection_path(collections_dir, display_name, collection_name)

        if os.path.isfile(collection_filename):
            # Read the cached collection.
            print("Using saved collection: {}".format(collection_filename))
            with open(collection_filename, encoding="utf-8") as collection_file:
                collection = json.load(collection_file)

        else:
            # Retrieve the collection from Reddit.
            if reddit_client is None:
                reddit_client = praw.Reddit("tgine", user_agent=reddit_client_user_agent)
                subreddit = reddit_client.subreddit(display_name)

            collection = get_collection(subreddit, collection_name)

            print("Saving collection: {}".format(collection_filename))
            # Write to a temporary file and rename it into place, so an
            # interrupted or failed dump never leaves a truncated file that a
            # later run would mistake for a valid cache.
            partial_filename = collection_filename + ".part"
            try:
                with open(partial_filename, "w", encoding="utf-8") as collection_file:
                    json.dump(collection, collection_file)
                os.replace(partial_filename, collection_filename)
            finally:
                # Only present here if the dump or the rename failed.
                if os.path.exists(partial_filename):
                    os.remove(partial_filename)

        collections[collection_name] = collection

    return collections

In [None]:
# Load every configured collection: read from its cached JSON file when present, otherwise retrieved and saved.
collections = get_collections(subreddit_display_name, collection_names)