In [3]:
import pandas as pd

# Read the extracted-comments file into a DataFrame. The file is parsed as
# JSON Lines (one JSON object per line), which is what lines=True selects.
df = pd.read_json(
    '../data/processed/comments_extracted_2008-01.json',
    lines=True,
)

# Sanity check: report the frame's dimensions, then preview its leading rows
# (the bare last expression is rendered as the cell's output).
print("DataFrame shape:", df.shape)
df.head(5)


DataFrame shape: (145036, 16)


Unnamed: 0,author,body,body_cleaned,controversiality,created_utc,distinguished,edited,gilded,id,language,link_id,parent_id,retrieved_on,score,subreddit,subreddit_id
0,[deleted],[deleted],,0,1199145643,,0,0,c02s9sb,un,t3_648iy,t3_648iy,1425820157,1,politics,t5_2cneq
1,nOD1S,"And if you sincerely believe that headline, co...","and if you sincerely believe that headline , c...",0,1199145650,,1,0,c02s9se,en,t3_6483n,t3_6483n,1425820157,4,politics,t5_2cneq
2,fR8On,What organization? AQ is a bunch of wingnuts w...,what organization ? aq is a bunch of wingnuts ...,0,1199145728,,0,0,c02s9so,en,t3_6483n,t1_c02s9e2,1425820157,1,politics,t5_2cneq
3,MtZfs,NO HIS PROBLEM IS THAT WE HAVE NOT GOT HIM ON ...,no his problem is that we have not got him on ...,0,1199145817,,0,0,c02s9t4,en,t3_648os,t3_648os,1425820157,1,politics,t5_2cneq
4,MtZfs,**HILL BABY IN 08**,**hill baby in 08**,0,1199145837,,0,0,c02s9t6,un,t3_648lz,t3_648lz,1425820157,0,politics,t5_2cneq


In [4]:
# Preview the first ten distinct (link_id, parent_id) combinations to see how
# the two identifier columns relate to each other.
sample_df = df.loc[:, ['link_id', 'parent_id']].drop_duplicates().iloc[:10]

print("Sample of unique link_id and parent_id pairs:")
print(sample_df)

# Tally rows whose parent_id equals their link_id versus rows where the two
# differ. Summing the boolean comparison counts the matching rows directly.
thread_replies = int((df['link_id'] == df['parent_id']).sum())
comment_replies = int((df['link_id'] != df['parent_id']).sum())

print("\nAnalysis of reply types:")
print(f"Direct replies to thread (link_id == parent_id): {thread_replies}")
print(f"Replies to other comments (link_id != parent_id): {comment_replies}")

# Take the link_id of the very first row and list the first five rows that
# share it, showing only each row's parent_id and its own id.
print("\nExamining a single thread structure:")
thread_id = df['link_id'].iloc[0]
thread_comments = df.loc[df['link_id'] == thread_id, ['parent_id', 'id']].head(5)
print(thread_comments)


Sample of unique link_id and parent_id pairs:
     link_id   parent_id
0   t3_648iy    t3_648iy
1   t3_6483n    t3_6483n
2   t3_6483n  t1_c02s9e2
3   t3_648os    t3_648os
4   t3_648lz    t3_648lz
6   t3_6481i  t1_c02s830
7   t3_645rf  t1_c02s9e4
8   t3_648f2  t1_c02s8nw
9   t3_647nr    t3_647nr
11  t3_648kc  t1_c02s99e

Analysis of reply types:
Direct replies to thread (link_id == parent_id): 50774
Replies to other comments (link_id != parent_id): 94262

Examining a single thread structure:
    parent_id       id
0    t3_648iy  c02s9sb
16   t3_648iy  c02s9u7
47   t3_648iy  c02s9y9
91   t3_648iy  c02sa3q
124  t3_648iy  c02sa8m


In [5]:
# For every link_id, count the distinct parent_id values that occur under it,
# then keep only the link_ids associated with more than one parent_id.
link_parent_counts = df.groupby('link_id')['parent_id'].nunique()
multiple_parent_links = link_parent_counts.loc[link_parent_counts.gt(1)]

print("Link IDs with multiple parent IDs:")
print(f"Number of links with multiple parents: {len(multiple_parent_links)}")
if not multiple_parent_links.empty:
    print("\nSample of links with multiple parents:")
    print(multiple_parent_links.head())

# The reverse direction: for every parent_id, count the distinct link_id
# values it appears with, and keep the parent_ids seen under more than one.
parent_link_counts = df.groupby('parent_id')['link_id'].nunique()
multiple_link_parents = parent_link_counts.loc[parent_link_counts.gt(1)]

print("\nParent IDs that appear in multiple link IDs:")
print(f"Number of parents in multiple links: {len(multiple_link_parents)}")
if not multiple_link_parents.empty:
    print("\nSample of parents in multiple links:")
    print(multiple_link_parents.head())


Link IDs with multiple parent IDs:
Number of links with multiple parents: 4397

Sample of links with multiple parents:
link_id
t3_1lwd6    4
t3_2ztbv    3
t3_5z3om    2
t3_5z71n    2
t3_60mss    2
Name: parent_id, dtype: int64

Parent IDs that appear in multiple link IDs:
Number of parents in multiple links: 0


In [6]:
# Number of distinct values in each of the two identifier columns.
unique_links = df['link_id'].nunique()
unique_parents = df['parent_id'].nunique()

print(f"Number of unique link_ids: {unique_links}")
print(f"Number of unique parent_ids: {unique_parents}")

# Row count of the whole frame (first component of its shape).
total_rows = df.shape[0]
print(f"\nTotal number of rows in dataset: {total_rows}")

# Mean rows per distinct link_id and per distinct parent_id: the total row
# count divided by the respective number of distinct values.
avg_comments_per_link = total_rows / unique_links
avg_comments_per_parent = total_rows / unique_parents

print(f"\nAverage comments per link: {avg_comments_per_link:.2f}")
print(f"Average comments per parent: {avg_comments_per_parent:.2f}")


Number of unique link_ids: 10164
Number of unique parent_ids: 76884

Total number of rows in dataset: 145036

Average comments per link: 14.27
Average comments per parent: 1.89
