# Conversion to JSON

In [24]:
import os
import json
def convert_txt_to_json(file_name):
    """Convert a line-delimited JSON text file into one JSON array file.

    Reads ``<file_name>.txt`` as UTF-8, parses every non-blank line as JSON,
    and writes the parsed objects as an indented JSON list to
    ``<file_name>.json``. If ``<file_name>.json`` already exists, a message is
    printed and nothing is read or written.

    Lines that are not valid JSON, or that parse to something other than a
    JSON object, are reported with an ``ERROR:`` message and left out of the
    output instead of aborting the conversion.

    Args:
        file_name: Path of the dataset without its file extension.

    Returns:
        None.
    """
    if os.path.exists(file_name + '.json'):
        print(f"{file_name}.json already exists")
        return

    items = []
    with open(file_name + '.txt', 'r', encoding="utf-8") as f:
        # Stream the file line by line instead of materialising it, and
        # collect parsed records in `items` only (never in the sequence
        # being iterated).
        for line in f:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            try:
                record = json.loads(line)
            except ValueError as e:  # json.JSONDecodeError is a ValueError
                print("ERROR: ", e)
                continue
            if isinstance(record, dict):
                items.append(record)
            else:
                print("ERROR: ", f"expected a JSON object, got {type(record).__name__}")

    with open(file_name + '.json', 'w', encoding="utf-8") as f:
        json.dump(items, f, indent=4)

# Directory holding the datasets; later cells build their file paths from it.
base_path = "./Bevo-Bud-The-GPT/data"

# Convert each dataset in turn (a call only prints a notice when the
# corresponding .json file is already present).
for dataset_name in ("UTAustin_submissions", "UTAustin_comments"):
    convert_txt_to_json(base_path + "/" + dataset_name)

./Bevo-Bud-The-GPT/data/UTAustin_submissions.json already exists
./Bevo-Bud-The-GPT/data/UTAustin_comments.json already exists


# Data Previewing

In [None]:
# Load the converted datasets into memory. The encoding is pinned to UTF-8 so
# decoding does not depend on the platform's default locale encoding.
with open(f'{base_path}/UTAustin_submissions.json', 'r', encoding='utf-8') as f:
    submissions = json.load(f)

with open(f'{base_path}/UTAustin_comments.json', 'r', encoding='utf-8') as f:
    comments = json.load(f)

In [None]:
# Report how many records each dataset holds.
print(f"Submissions: {len(submissions)}")
print(f"Comments: {len(comments)}")

# Pretty-print the first record of each dataset under a banner line.
# Single quotes are used inside the f-strings because reusing the enclosing
# double quote is a SyntaxError on Python versions before 3.12.
print(f"{'='*10} SUBMISSION {'='*10}")
print(json.dumps(submissions[0], indent=4))
print(f"{'='*10} COMMENT {'='*10}")
print(json.dumps(comments[0], indent=4))


Submissions: 69261
Comments: 397743
{
    "archived": true,
    "author": "KeyboardHero",
    "author_flair_background_color": null,
    "author_flair_css_class": null,
    "author_flair_richtext": [],
    "author_flair_text": null,
    "author_flair_text_color": null,
    "author_flair_type": "text",
    "brand_safe": true,
    "can_gild": true,
    "contest_mode": false,
    "created_utc": 1254489696,
    "distinguished": null,
    "domain": "self.UTAustin",
    "edited": false,
    "gilded": 0,
    "hidden": false,
    "hide_score": false,
    "id": "9q6hs",
    "is_crosspostable": true,
    "is_reddit_media_domain": false,
    "is_self": true,
    "is_video": false,
    "link_flair_css_class": null,
    "link_flair_richtext": [],
    "link_flair_text": null,
    "link_flair_text_color": "dark",
    "link_flair_type": "text",
    "locked": false,
    "media": null,
    "media_embed": {},
    "no_follow": true,
    "num_comments": 1,
    "num_crossposts": 0,
    "over_18": false,
   

In [None]:
# Index submissions whose title contains "?" by their id, and report every
# repeated submission id encountered along the way.
sub_ids = {}
# Every id seen so far, whether or not the submission was kept. Checking
# against sub_ids alone would miss repeats of submissions that were filtered
# out because their title has no "?".
seen_sub_ids = set()
for index, sub in enumerate(submissions):
    if sub['id'] in seen_sub_ids:
        print("Duplicate submission id: ", sub['id'], f" at index {index}")
    seen_sub_ids.add(sub['id'])
    # The first stored occurrence of an id is never overwritten.
    if sub['id'] not in sub_ids and "?" in sub["title"]:
        sub_ids[sub['id']] = sub

# Note: this total only counts the submissions kept by the "?" title filter.
print(f"Total unique submissions: {len(sub_ids)}")
sub_ids_list = list(sub_ids.keys())
print(f'sub ids: {sub_ids_list[:10]}')

Total unique submissions: 34317
sub ids: ['9q6hs', 'b3gte', 'g9y46', 'gaplv', 'gaoi9', 'gab6t', 'ga5i5', 'ga4ec', 'gbbsq', 'gbx8g']


In [None]:
# Map each link_id to a comment record. When several comments share a
# link_id, the one appearing last in `comments` is the one kept.
com_ids = {com['link_id']: com for com in comments}

print(f"Total comment_ids: {len(com_ids)}")
com_ids_list = list(com_ids)
print(com_ids_list[:10])

# Count the link_ids that start with the 't3_' prefix.
count = sum(1 for com_id in com_ids_list if com_id.startswith('t3_'))
print(f"Total comments with t3_ prefix: {count}")

Total comment_ids: 59586
['t3_b3gte', 't3_c72rq', 't3_9q6hs', 't3_g9wn1', 't3_g9y46', 't3_ga1te', 't3_ga4ec', 't3_ga5i5', 't3_ga8ky', 't3_gab6t']
Total comments with t3_ prefix: 59586


In [None]:
# Gather every link_id whose submission id (the text after the first "_")
# has no entry in sub_ids, printing a line for each one found.
missing_coms_ids = []
for com_id in com_ids_list:
    com = com_ids[com_id]  # looked up as in the original; not used below
    sub_id = com_id.split("_")[1]
    if sub_id in sub_ids:
        continue
    missing_coms_ids.append(com_id)
    print(f"sub id {sub_id} not found in submissions")

count = len(missing_coms_ids)
print("Total comments with missing submission: ", count)

# Show the body of the comment stored under each unmatched link_id.
for com_id in missing_coms_ids:
    body = com_ids[com_id]["body"]
    print(json.dumps(body, indent=4))


sub id v0risg not found in submissions
sub id v0rk8h not found in submissions
sub id v0o1bj not found in submissions
sub id v254eh not found in submissions
sub id v24h7e not found in submissions
sub id v276fl not found in submissions
sub id v276cu not found in submissions
sub id v29kbn not found in submissions
sub id v2cvlu not found in submissions
sub id v2acec not found in submissions
sub id v2cacw not found in submissions
Total comments with missing submission:  11
"Hello! I'm sorry, but your post has been removed because the title is too short and thus probably breaks rule #3 in our sidebar.\n\nInstead of \"final exam for m408d,\" post your full question such as \"How is the final exam for M408D?\" or \"Where can I find an example final exam for M408D?\" or even \"Here's some advice for the M408D final exam.\" You have 300 characters for your title on Reddit, and the more that Redditors can understand from your title alone, the more likely you are to get useful responses.\n\nIf you

In [None]:
def get_comment_text_and_score(com_id, com_ids):
    """Return the score and body text of the comment stored under ``com_id``.

    Args:
        com_id: Key to look up in ``com_ids``.
        com_ids: Mapping from keys to comment records; each record must
            provide ``"score"`` and ``"body"`` entries.

    Returns:
        A dict with the record's ``"score"`` under ``"score"`` and its
        ``"body"`` under ``"text"``.

    Raises:
        KeyError: If ``com_id`` is not in ``com_ids`` or the record lacks
            ``"score"`` or ``"body"``.
    """
    record = com_ids[com_id]
    score = record["score"]
    text = record["body"]
    return {"score": score, "text": text}

# Pretty-print the stored comment for every link_id ending in 'gc9wp'.
for com_id in com_ids_list:
    if not com_id.endswith('gc9wp'):
        continue
    print(json.dumps(com_ids[com_id], indent=4))

{
    "subreddit": "UTAustin",
    "distinguished": null,
    "author_flair_text": null,
    "retrieved_on": 1427018987,
    "parent_id": "t1_c1mn5sf",
    "ups": 2,
    "controversiality": 0,
    "edited": false,
    "subreddit_id": "t5_2qy08",
    "downs": 0,
    "author_flair_css_class": null,
    "id": "c1mnlf6",
    "body": "I'm in the Greek system and sincerely appreciate this viewpoint.",
    "score_hidden": false,
    "name": "t1_c1mnlf6",
    "created_utc": "1301289785",
    "gilded": 0,
    "author": "ickjui",
    "link_id": "t3_gc9wp",
    "score": 2,
    "archived": true
}


In [None]:
# Add up the num_comments value reported by every indexed submission.
comments_count = sum(sub_ids[sub_id]["num_comments"] for sub_id in sub_ids_list)

# NOTE(review): com_ids keeps a single entry per link_id rather than one per
# comment, so this difference may not equal the number of individual
# comments that are absent - confirm the intended comparison.
print(f"{comments_count - len(com_ids)} number of comments not in coms ids")

305435 number of comments not in coms ids


In [None]:
# Compare, per submission, the number of collected comments against the
# submission's own num_comments value and report every mismatch.
# NOTE(review): subs_coms_map and sub_ids_map are not defined in any cell
# visible above this one - confirm where they are created before relying on
# a fresh-kernel run.
count = 0
comments_missing = 0
for sub_id in sub_ids_list:
    coms_length = len(subs_coms_map[sub_id]["comments"])
    expected_length = sub_ids_map[sub_id]["num_comments"]
    if coms_length == expected_length:
        continue
    count += 1
    comments_missing += expected_length - coms_length
    print(f"sub_id {sub_id} has {coms_length} comments but expected {expected_length}")

print(f"Total submissions with mismatched comments: {count}")
print(f"Total comments missing: {comments_missing}")

sub_id b3gte has 1 comments but expected 7
sub_id g9y46 has 1 comments but expected 27
sub_id gaplv has 1 comments but expected 11
sub_id gaoi9 has 1 comments but expected 109
sub_id ga5i5 has 1 comments but expected 21
sub_id ga4ec has 1 comments but expected 52
sub_id gbbsq has 1 comments but expected 27
sub_id gbx8g has 1 comments but expected 21
sub_id gbue7 has 1 comments but expected 3
sub_id gcncx has 1 comments but expected 10
sub_id gc9wp has 1 comments but expected 10
sub_id gc9jd has 1 comments but expected 16
sub_id gcseo has 1 comments but expected 14
sub_id geb2q has 1 comments but expected 5
sub_id gdqj0 has 1 comments but expected 5
sub_id gfx76 has 1 comments but expected 2
sub_id gfosa has 1 comments but expected 5
sub_id ghbqx has 1 comments but expected 12
sub_id ghb36 has 1 comments but expected 2
sub_id ghz9g has 1 comments but expected 17
sub_id gjcse has 1 comments but expected 3
sub_id giukd has 1 comments but expected 9
sub_id gjohs has 1 comments but expected