# PRÁCTICA 1: Extracción de datos de una red social

In [None]:
%%bash

# Reddit API credentials for the "tgine" site section of praw.ini, which is
# the site name later passed to praw.Reddit("tgine", ...).
# Fill these in before running the cell.
CLIENT_ID=""
CLIENT_SECRET=""

# Append the section only once. Re-running the cell used to add a second
# [tgine] section, and Python's configparser (strict mode) rejects an ini
# file that declares the same section twice.
if ! grep -qs '^\[tgine\]' praw.ini; then
echo "

[tgine]
client_id=$CLIENT_ID
client_secret=$CLIENT_SECRET
" >> praw.ini
else
echo "praw.ini already contains a [tgine] section; edit the file to change the credentials."
fi

In [None]:
import datetime
import json
import os
from pprint import pprint

import numpy as np
import praw
from sklearn.feature_extraction.text import TfidfVectorizer
from whoosh.fields import Schema, TEXT, ID, BOOLEAN
from whoosh.index import create_in, exists_in, open_dir
from whoosh.qparser import QueryParser

In [None]:
# Subreddit whose collections are downloaded (passed to get_collections).
subreddit_display_name = "science"
# Directory where each downloaded collection is cached as a JSON file.
collections_dir = "collections"
# Listing names; each one is looked up as a method on the subreddit object
# via getattr(subreddit, name) in get_collection.
collection_names = ["new", "hot", "rising"]
# User agent string handed to praw.Reddit.
reddit_client_user_agent = "python:com.example.gonzalocl1024.tgine:v1.0 (by /u/gonzalocl1024)"

# Passed to TfidfVectorizer as min_df.
min_document_frequency = 10
# Number of top-scoring ("central") terms printed per collection.
most_central_terms = 50
# Passed to TfidfVectorizer as max_features (size of the retained vocabulary).
most_repeated = 100

In [None]:
# Keys copied from a subreddit's attribute dict by copy_subreddit.
subreddit_attributes = [
    "display_name",
    "title",
    "active_user_count",
    "subscribers",
    "id",
    "description",
    "created_utc",
    "name"
]

# Keys copied from a submission's attribute dict by copy_submission.
submission_attributes = [
    "title",
    "name",
    "upvote_ratio",
    "ups",
    "score",
    "id",
    "created_utc",
    "selftext",
    "downs",
    "url"
]

# Keys copied from a comment's attribute dict by copy_comment.
comment_attributes = [
    "ups",
    "id",
    "score",
    "body",
    "downs"
]

def copy_attributes(dest, src, attributes):
    """Copy the listed keys from ``src`` into ``dest``.

    A key missing from ``src`` is skipped; a diagnostic line with the missing
    key and the source's ``"id"`` and ``"name"`` (``None`` when absent) is
    printed instead.

    Args:
        dest: Mapping that receives the copied values (mutated in place).
        src: Mapping the values are read from.
        attributes: Iterable of keys to copy.
    """

    def _lookup_or_none(key):
        # Best-effort lookup, used only to build the diagnostic message.
        try:
            return src[key]
        except KeyError:
            return None

    for attribute in attributes:
        try:
            dest[attribute] = src[attribute]
        except KeyError:
            print("\n{}, {}, {}".format(attribute,
                                        _lookup_or_none("id"),
                                        _lookup_or_none("name")))

def copy_subreddit(subreddit):
    """Return a new dict holding the ``subreddit_attributes`` keys of ``subreddit``.

    Args:
        subreddit: Mapping of subreddit attributes (the caller passes ``vars(...)``).
    """
    subreddit_copy = {}
    copy_attributes(subreddit_copy, subreddit, subreddit_attributes)
    return subreddit_copy

def copy_submission(submission):
    """Return a new dict holding the ``submission_attributes`` keys of ``submission``.

    Args:
        submission: Mapping of submission attributes (the caller passes ``vars(...)``).
    """
    submission_copy = {}
    copy_attributes(submission_copy, submission, submission_attributes)
    return submission_copy

def copy_comment(comment):
    """Return a new dict holding the ``comment_attributes`` keys of ``comment``.

    Args:
        comment: Mapping of comment attributes (the caller passes ``vars(...)``).
    """
    comment_copy = {}
    copy_attributes(comment_copy, comment, comment_attributes)
    return comment_copy

def copy_author(dest, src):
    """Store the author's name of ``src`` under ``dest["author"]``.

    ``None`` is stored when ``src.author`` is falsy.

    Args:
        dest: Dict that receives the ``"author"`` key (mutated in place).
        src: Object exposing an ``author`` attribute whose value, when truthy,
            has a ``name`` attribute.
    """
    dest["author"] = src.author.name if src.author else None

Defino dos funciones auxiliares: una para obtener todos los _submissions_ en una lista y otra para obtener todos los comentarios de cada _submission_. La función `get_collection` las combina para construir una colección completa.

In [None]:
def retrieve_submissions(submissions):
    """Consume ``submissions`` into a list, reporting progress on stdout.

    The running count is rewritten on a single line (carriage return, no
    newline) after each item; a final newline is printed at the end.

    Args:
        submissions: Iterable of submissions.

    Returns:
        List with every item yielded by ``submissions``, in order.
    """
    fetched = []

    for count, submission in enumerate(submissions, start=1):
        fetched.append(submission)
        print("\rRetrieved submissions: {}".format(count), end="")

    print()
    return fetched

def retrieve_comments(collection, submissions):
    """Collect the comments of every submission and store copies in ``collection``.

    ``collection["submissions"]`` is reset to a list with one entry per
    submission. Each entry is ``{"submission": <dict>, "comments": [<dict>, ...]}``,
    built with ``copy_submission`` / ``copy_comment`` plus the ``"author"``
    key added by ``copy_author``. Progress is written to stdout on one
    carriage-return-refreshed line, followed by a final newline.

    Args:
        collection: Dict that receives the ``"submissions"`` list (mutated in place).
        submissions: Sized sequence of submission objects; presumably PRAW
            ``Submission`` instances -- TODO confirm against the caller.
    """
    submission_total = len(submissions)
    comment_total = 0

    collection["submissions"] = []

    for done, submission in enumerate(submissions, start=1):

        # Touch an attribute before vars(): presumably this makes the lazily
        # loaded object fetch its data so the snapshot below is complete.
        _ = submission.title
        submission_copy = copy_submission(vars(submission))
        copy_author(submission_copy, submission)

        print(" Next comment batch: {:<35}".format(submission.num_comments), end="")

        # Presumably expands every collapsed "more comments" placeholder so
        # that .list() below yields the full comment forest.
        submission.comments.replace_more(limit=None)

        comment_copies = []
        for comment in submission.comments.list():
            _ = comment.body
            comment_copy = copy_comment(vars(comment))
            copy_author(comment_copy, comment)
            comment_copies.append(comment_copy)

        collection["submissions"].append({
            "submission": submission_copy,
            "comments": comment_copies
        })

        comment_total += len(comment_copies)
        print("\rRetrieved comments: {} (Submissions: {}/{})".format(comment_total, done, submission_total), end="")

    print()

def get_collection(subreddit, name):
    """Build the full collection dict for one listing of a subreddit.

    Args:
        subreddit: Subreddit object; ``name`` must be the name of a callable
            attribute on it that accepts ``limit=None``.
        name: Listing name, also stored under ``"collection_name"``.

    Returns:
        Dict with keys ``"collection_name"``, ``"subreddit"`` (copy of the
        subreddit attributes), ``"date"`` (ISO timestamp of the local time at
        retrieval) and ``"submissions"`` (filled in by ``retrieve_comments``).
    """
    # Touch an attribute before vars(): presumably forces the lazily loaded
    # subreddit object to fetch its data.
    _ = subreddit.title

    listing = getattr(subreddit, name)
    submissions = listing(limit=None)

    # Subreddit info and retrieval date.
    collection = {
        "collection_name": name,
        "subreddit": copy_subreddit(vars(subreddit)),
        "date": datetime.datetime.now().isoformat(),
    }

    # Materialise the listing first, then fetch the comments of each submission.
    retrieve_comments(collection, retrieve_submissions(submissions))

    return collection

In [None]:
def get_collection_path(collection_dir, display_name, collection_name):
    """Return the JSON cache path ``<collection_dir>/<display_name>_<collection_name>.json``."""
    filename = "{}_{}.json".format(display_name, collection_name)
    return os.path.join(collection_dir, filename)

def get_collections(display_name, names):
    """Load each named collection, downloading and caching the missing ones.

    A collection whose JSON file already exists under ``collections_dir`` is
    read from disk; otherwise it is retrieved with ``get_collection`` and
    written to that file.

    Args:
        display_name: Subreddit display name.
        names: Iterable of collection (listing) names.

    Returns:
        Dict mapping each collection name to its collection dict.
    """
    os.makedirs(collections_dir, exist_ok=True)

    # The Reddit client is only created once a collection actually has to be
    # downloaded.
    client = None
    subreddit = None

    loaded = {}

    for collection_name in names:

        path = get_collection_path(collections_dir, display_name, collection_name)

        if not os.path.isfile(path):
            # Not cached: retrieve the collection and save it.
            if not client:
                client = praw.Reddit("tgine", user_agent=reddit_client_user_agent)
                subreddit = client.subreddit(display_name)

            collection = get_collection(subreddit, collection_name)

            print("Saving collection: {}".format(path))
            with open(path, "w") as collection_file:
                json.dump(collection, collection_file)

        else:
            # Cached: read it back and print a short summary.
            print("Using saved collection: {}".format(path))
            with open(path) as collection_file:
                collection = json.load(collection_file)

            comment_count = sum(len(entry["comments"]) for entry in collection["submissions"])
            print("  Submissions: {}, Comments: {}".format(len(collection["submissions"]), comment_count))

        loaded[collection_name] = collection

    return loaded

In [None]:
def extract_corpus(collections):
    """Flatten every collection into a list of text documents.

    Each submission contributes one document (``"<title> <selftext>"``),
    followed by one document per comment body. Comments whose author is
    ``"AutoModerator"`` are left out.

    Args:
        collections: Dict mapping collection name to collection dict.

    Returns:
        Dict mapping each collection name to its list of documents.
    """
    corpus = {}

    for name, collection in collections.items():
        documents = []

        for entry in collection["submissions"]:
            post = entry["submission"]
            documents.append("{} {}".format(post["title"], post["selftext"]))

            # AutoModerator is a bot: its messages are discarded.
            documents.extend(
                comment["body"]
                for comment in entry["comments"]
                if comment["author"] != "AutoModerator"
            )

        corpus[name] = documents

    return corpus

In [None]:
# Load every configured collection (read from its JSON file when present,
# retrieved otherwise) and flatten each one into a list of text documents.
subreddit_collections = get_collections(subreddit_display_name, collection_names)
collections_corpus = extract_corpus(subreddit_collections)

In [None]:
def print_central_repeated(corpus, min_doc_freq, most_central, most_rep):
    """Print the most central and the most repeated terms of a corpus.

    A TF-IDF model is fitted on ``corpus`` with English stop words removed.
    The vocabulary kept by the vectorizer (``max_features=most_rep``) is
    reported as the "most repeated" terms. The "most central" terms are the
    ``most_central`` vocabulary entries with the highest TF-IDF score summed
    over all documents, highest first.

    Args:
        corpus: Iterable of text documents.
        min_doc_freq: Passed to the vectorizer as ``min_df``.
        most_central: Number of central terms to print; 0 prints none.
        most_rep: Passed to the vectorizer as ``max_features``.
    """
    vectorizer = TfidfVectorizer(stop_words="english", min_df=min_doc_freq, max_features=most_rep)
    tfidf = vectorizer.fit_transform(corpus)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    feature_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    # list() turns the ndarray returned by newer releases back into a plain
    # list so pprint renders it as before.
    most_repeated_terms = list(feature_names())

    # Per-term score: column sums of the TF-IDF matrix, computed without
    # converting the sparse matrix to a dense array first.
    central_terms_score = np.asarray(tfidf.sum(axis=0)).ravel()

    # Highest score first. Slicing the reversed order gives an empty result
    # for most_central == 0, whereas a [-0:] slice would return every term.
    central_terms_indexes = np.argsort(central_terms_score)[::-1][:most_central]
    central_terms = [most_repeated_terms[i] for i in central_terms_indexes]

    print("{} most central terms".format(most_central))
    pprint(central_terms, width=100, compact=True)
    print()

    print("{} most repeated terms".format(most_rep))
    pprint(most_repeated_terms, width=100, compact=True)
    print("\n")

In [None]:
# Report the central / repeated terms of each collection's corpus separately.
for collection_corpus in collections_corpus:

    # collection_corpus is the collection name (the dict key); the documents
    # themselves are looked up below.
    print("Collection: {}".format(collection_corpus))
    print_central_repeated(collections_corpus[collection_corpus], min_document_frequency, most_central_terms, most_repeated)

In [None]:
def add_collections(writer, collections):
    """Add every submission and comment of ``collections`` to an index writer.

    One document is added per submission (content is ``"<title>\\n<selftext>"``,
    ``is_submission=True``) and one per comment (content is the comment body,
    ``is_submission=False``). Every document carries the collection name, the
    id of the submission it belongs to, and the author. Progress is written to
    stdout on a single carriage-return-refreshed line, followed by a newline.

    Args:
        writer: Object exposing ``add_document(**fields)``. The writer is not
            committed here.
        collections: Dict mapping collection name to collection dict.
    """
    total_submissions = sum(len(collection["submissions"]) for collection in collections.values())

    def _report(submissions_done, comments_done, comments_total):
        # Rewrites the progress line in place; the trailing spaces wipe the
        # leftovers of a previously longer line.
        print("\rIndexed: Submissions {}/{}; Comments {}/{}               ".format(submissions_done,
                                                                                   total_submissions,
                                                                                   comments_done,
                                                                                   comments_total), end="")

    submissions = 0

    for collection_name, collection in collections.items():
        for entry in collection["submissions"]:
            submission = entry["submission"]
            comments = entry["comments"]
            total_comments = len(comments)

            writer.add_document(collection=collection_name,
                                submission_id=submission["id"],
                                author=submission["author"],
                                content="{}\n{}".format(submission["title"],
                                                        submission["selftext"]),
                                is_submission=True)
            submissions += 1

            # Refresh the line for the submission itself as well; otherwise a
            # submission without comments never shows up in the progress.
            _report(submissions, 0, total_comments)

            for comments_done, comment in enumerate(comments, start=1):
                writer.add_document(collection=collection_name,
                                    submission_id=submission["id"],
                                    author=comment["author"],
                                    content=comment["body"],
                                    is_submission=False)
                _report(submissions, comments_done, total_comments)

    print()

def get_index(index_path, collections):
    """Open the Whoosh index stored at ``index_path``, building it if needed.

    When the directory already contains an index it is opened and returned
    unchanged. Otherwise the directory is created if missing, an index with
    the fields ``collection``, ``submission_id``, ``author``, ``content`` and
    ``is_submission`` (all stored) is created, and ``collections`` is indexed
    with ``add_collections``.

    Args:
        index_path: Directory holding (or that will hold) the index.
        collections: Dict mapping collection name to collection dict; only
            read when the index has to be built.

    Returns:
        The opened or newly created index object.
    """
    # Check for an actual index, not just the directory: a directory left
    # behind without an index used to make open_dir() fail.
    if exists_in(index_path):
        print("Using saved index: {}".format(index_path))
        return open_dir(index_path)

    print("Creating index: {}".format(index_path))
    os.makedirs(index_path, exist_ok=True)

    schema = Schema(collection=TEXT(stored=True),
                    submission_id=ID(stored=True),
                    author=TEXT(stored=True),
                    content=TEXT(stored=True),
                    is_submission=BOOLEAN(stored=True))

    index = create_in(index_path, schema)

    # As a context manager the writer commits on success and cancels on an
    # exception, so a failed run does not keep the index write lock held in
    # the still-running kernel.
    with index.writer() as writer:
        add_collections(writer, collections)

    return index

def print_results(results):
    """Print a summary of a search followed by one line per hit.

    The summary shows the search runtime, the estimated total number of
    results and the number of scored results. Each hit is printed as the
    author, left-aligned in a 20-character column, followed by the stripped
    content.

    Hits without an author are shown as ``[deleted]``. ``copy_author`` stores
    ``None`` for such documents, and indexing the raw value into the
    ``"{:<20}"`` format would otherwise abort the listing with ``KeyError``
    or ``TypeError``.

    Args:
        results: Search results object exposing ``runtime``,
            ``estimated_length()``, ``scored_length()`` and iteration over
            hits that support ``get(field)``.
    """
    print("Search runtime: {}".format(results.runtime))
    print("Total results: {} (showing {})\n".format(results.estimated_length(),
                                                    results.scored_length()))

    for result in results:
        # .get() tolerates a stored field that is absent from the hit.
        author = result.get("author") or "[deleted]"
        content = result.get("content") or ""
        print("{:<20}{}".format(author, content.strip()))

In [None]:
# Open (or build) the index in the "index" directory, then create a searcher
# and a query parser whose default field is "content".
collections_index = get_index("index", subreddit_collections)
index_searcher = collections_index.searcher()
query_parser = QueryParser("content", collections_index.schema)

In [None]:
# Example query: "scientists" is matched against the parser's default field
# ("content"); the is_submission:true clause presumably restricts the hits to
# submission documents -- confirm against the Whoosh query language.
query_str = "scientists is_submission:true"

query = query_parser.parse(query_str)
search_results = index_searcher.search(query)
print_results(search_results)

In [None]:
# Close the searcher and the index once querying is finished.
index_searcher.close()
collections_index.close()