# Data Fetching

In [2]:
import os
import xml.etree.ElementTree as et
import json
import ijson
from dateutil import parser

In [3]:
# Locations of the input dump and of every artefact derived from it.
# All of them are relative, so they resolve against the current working
# directory of the notebook.
#
# Earlier absolute layout, kept for reference:
#posts_path = os.path.join("E:", "Uni", "DataViz", "DataSet", "Posts.xml")
#questions_path = os.path.join("E:", "Uni", "DataViz", "DataSet", "Questions.json")
#answers_path = os.path.join("E:", "Uni", "DataViz", "DataSet", "Answers.json")
#edge_list_path = os.path.join("E:", "Uni", "DataViz", "DataSet", "Edges.json")
#edge_list_tag_path = os.path.join("E:", "Uni", "DataViz", "DataSet", "Tags")

posts_path = "Posts.xml"            # XML input read by the parsing step
questions_path = "Questions.json"   # written by the parsing step
answers_path = "Answers.json"       # written by the parsing step
edge_list_path = "Edges.json"       # written by the edge-list step
edge_list_tag_path = "Tags"         # directory holding the per-tag files

In [3]:
# Bare expression: echoes the configured posts path as the cell's output.
posts_path

'Posts.xml'

In [4]:
# Stream the posts XML once and split its <row> elements into two JSON
# objects, written incrementally so the whole dump never sits in memory:
#   * questions_path: {"<Id>": record}        for rows with PostTypeId == "1"
#   * answers_path:   {"<ParentId>": record}  for every other row
# A record holds id, owner_id, time, votes, tags and ref, where ref is the
# AcceptedAnswerId for a question and the ParentId otherwise.
#
# NOTE(review): answers are keyed by ParentId, so several answers to the same
# question end up as duplicate keys in the answers file, and json.load()
# keeps only the last of them - confirm this is intended.
with open(questions_path, 'w') as q, open(answers_path, 'w') as a:
    q.write("{")
    a.write("{")
    firstQ = True
    firstA = True
    # "start" events are requested only to get hold of the root element:
    # elem.clear() empties a row but leaves it attached to the root, so
    # without clearing the root too memory grows with the number of rows.
    posts = et.iterparse(posts_path, events=("start", "end"))
    root = next(posts)[1]
    for event, elem in posts:
        if event != "end":
            continue
        if elem.tag == "row":
            try:
                parsed = {
                    "id": elem.attrib["Id"],
                    "owner_id": elem.attrib["OwnerUserId"],
                    "time": elem.attrib["CreationDate"],
                    "votes": elem.attrib["Score"],
                    # Tags is optional; rows without it get an empty string.
                    "tags": elem.attrib.get("Tags", ""),
                }
                if elem.attrib["PostTypeId"] == "1":
                    parsed["ref"] = elem.attrib["AcceptedAnswerId"]
                    if not firstQ:
                        q.write(",\n")
                    firstQ = False
                    # json.dumps quotes (and, if ever needed, escapes) the key.
                    q.write(json.dumps(parsed["id"]) + ":" + json.dumps(parsed))
                else:
                    parsed["ref"] = elem.attrib["ParentId"]
                    if not firstA:
                        a.write(",\n")
                    firstA = False
                    a.write(json.dumps(parsed["ref"]) + ":" + json.dumps(parsed))
            except KeyError:
                # Deliberately skipped, nothing has been written for the row:
                # ignore posts without answer for now
                # ignore posts without user id
                pass
        elem.clear()
        if elem is not root:
            # Drop the now-empty element from the tree as well.
            root.clear()
    q.write("}")
    a.write("}")

In [4]:
# Build the edge list: one edge per question whose id is also a key of the
# answers object. The result is written as { "edges":[ ... ]} with one edge
# per line.
with open(questions_path, 'r') as q_in, open(answers_path, 'r') as a_in, \
        open(edge_list_path, 'w') as edges_out:
    questions = json.load(q_in)
    answers = json.load(a_in)
    edges_out.write("{ \"edges\":[")
    separator = ""  # becomes ",\n" once the first edge has been written
    for question_id, asked in questions.items():
        if question_id not in answers:
            continue
        answer = answers[question_id]
        edge = {
            'q_id': asked["owner_id"],
            'a_id': answer["owner_id"],
            'time': answer["time"],
            'tags': asked["tags"],
            "votes": answer["votes"],
        }
        edges_out.write(separator)
        edges_out.write(json.dumps(edge))
        separator = ",\n"
    edges_out.write("]}")

In [5]:
# Split the edge list into one file per tag: every edge is appended, as one
# JSON object per line and without its "tags" field, to <tag>.json inside
# edge_list_tag_path for each tag of the edge.

# The output directory may not exist yet on a fresh run.
os.makedirs(edge_list_tag_path, exist_ok=True)

# Start from an empty directory: the files below are opened in append mode,
# so leftovers from a previous run would otherwise be duplicated.
for the_file in os.listdir(edge_list_tag_path):
    file_path = os.path.join(edge_list_tag_path, the_file)
    try:
        if os.path.isfile(file_path):
            os.unlink(file_path)
    except OSError as e:
        # Best effort: report the file that could not be removed and go on.
        print(e)


# ijson parses bytes, so the edge list is opened in binary mode; handing it
# a text-mode file is deprecated.
with open(edge_list_path, 'rb') as e:
    edges = ijson.items(e, "edges.item")
    for edge in edges:
        # The tag field looks like "<a><b>": split it into bare tag names.
        raw_tags = edge.pop('tags')
        tags = [s.replace("<", "").replace(">", "") for s in raw_tags.split("><")]
        # The serialised edge is the same for every tag, so build it once.
        line = json.dumps(edge) + "\n"
        for tag in tags:
            if not tag:
                # An empty tag string would otherwise create a file named
                # ".json".
                continue
            with open(os.path.join(edge_list_tag_path, "{}.json".format(tag)), 'a') as e_tag:
                e_tag.write(line)

In [6]:
# Turn every per-tag file from newline-separated JSON objects into a single
# JSON document of the form { "edges":[ ... ]}.
wrapper_head = "{ \"edges\":["
for the_file in os.listdir(edge_list_tag_path):
    # Derived artefacts are not raw per-tag files; skip them with the same
    # name filter the ordering step uses.
    if "_ordered" in the_file or "_list" in the_file:
        continue
    file_path = os.path.join(edge_list_tag_path, the_file)
    if not os.path.isfile(file_path):
        continue
    with open(file_path, 'r+') as e:
        content = e.read()
        if content.startswith(wrapper_head):
            # Already wrapped by an earlier run; wrapping it again would nest
            # the document inside itself.
            continue
        # Objects are separated by a bare newline; a JSON array needs commas.
        content = content.replace("}\n{", "},\n{")
        # Rewrite in place from the start. The new text is always longer than
        # the old one, so nothing of the old content is left behind.
        e.seek(0, 0)
        e.write(wrapper_head + '\n' + content)
        e.write("]}")

In [8]:
# For every per-tag edge file, write a copy named <name>_ordered.json whose
# edges are sorted by their parsed "time" value.
for the_file in os.listdir(edge_list_tag_path):
    is_derived = "_ordered" in the_file or "_list" in the_file
    if not is_derived:
        file_path = os.path.join(edge_list_tag_path, the_file)
        ordered_name = the_file.replace(".json", "_ordered.json")
        file_ordered_path = os.path.join(edge_list_tag_path, ordered_name)
        with open(file_path, 'r') as source, open(file_ordered_path, 'w') as target:
            edges = json.load(source)
            edges["edges"] = sorted(
                edges["edges"],
                key=lambda edge: parser.parse(edge["time"]),
            )
            target.write(json.dumps(edges))

In [13]:
# Flatten every *_ordered.json file into a plain-text edge list named
# *_ordered_list.txt, one edge per line: "<q_id> <a_id> <time> <votes>".
for the_file in os.listdir(edge_list_tag_path):
    if "_ordered.json" not in the_file:
        continue
    file_path = os.path.join(edge_list_tag_path, the_file)
    list_path = os.path.join(edge_list_tag_path, the_file.replace(".json", "_list.txt"))
    # ijson parses bytes, so the input is opened in binary mode; handing it a
    # text-mode file is deprecated.
    with open(file_path, 'rb') as e, open(list_path, 'w') as out:
        edges = ijson.items(e, "edges.item")
        for edge in edges:
            fields = (edge["q_id"], edge["a_id"], edge["time"], edge["votes"])
            out.write(" ".join(fields) + "\n")