### Part 1

### Part 2

In [6]:
# Scrape data from Reddit API
import praw
import pandas as pd

def setup_reddit_api(client_id, client_secret, user_agent):
    """Set up Reddit API client."""
    reddit = praw.Reddit(
        client_id=client_id,
        client_secret=client_secret,
        user_agent=user_agent
    )
    return reddit

def get_subreddit_posts(reddit, subreddit_name, limit=100):
    """Get posts from a subreddit."""
    subreddit = reddit.subreddit(subreddit_name)
    posts = []

    for post in subreddit.hot(limit=limit):
        posts.append({
            'post_id': post.id,
            'title': post.title,
            'text': post.selftext,
            'author': str(post.author),
            'created_utc': post.created_utc,
            'score': post.score,
            'num_comments': post.num_comments,
            'upvote_ratio': post.upvote_ratio
        })

    return pd.DataFrame(posts)

def get_post_comments(reddit, post_id, limit=100):
    """Get comments for a specific post."""
    post = reddit.submission(id=post_id)
    post.comments.replace_more(limit=0)  # Only get top-level comments
    comments = []

    for comment in post.comments[:limit]:
        comments.append({
            'comment_id': comment.id,
            'post_id': post_id,
            'author': str(comment.author),
            'text': comment.body,
            'score': comment.score,
            'created_utc': comment.created_utc
        })

    return pd.DataFrame(comments)

In [7]:
client_id = "YI3yiSbD9yctHZ2NMIQoZA"
client_secret = "vZ1MFPXuDEXCe-NaEwEyKnDrKyaPyg"
user_agent = "script : data_collection :v1 .0 (by/u/data_collection)"
reddit = setup_reddit_api( client_id , client_secret , user_agent )
posts_df_reddit = get_subreddit_posts(reddit , "datascience" , limit =50)
posts_df_reddit.to_csv('posts_df_reddit.csv')

In [8]:
posts_df_reddit.head()

Unnamed: 0,post_id,title,text,author,created_utc,score,num_comments,upvote_ratio
0,1jyq1tk,Weekly Entering & Transitioning - Thread 14 Ap...,\n\nWelcome to this week's entering & transit...,AutoModerator,1744603000.0,9,43,0.91
1,1i5inrb,Weekly Entering & Transitioning - Thread 20 Ja...,\n\nWelcome to this week's entering & transit...,AutoModerator,1737349000.0,13,46,0.94
2,1k26920,How do you go about memorizing all the ML algo...,"I’ve been preparing for interviews lately, but...",Lamp_Shade_Head,1744986000.0,65,36,0.93
3,1k26kp3,What’s your 2025 data science coding stack + A...,Curious how others are working these days. Wha...,Zuricho,1744987000.0,41,16,0.88
4,1k2a8t6,"Forecasting: Principles and Practice, the Pyth...",,Sampo,1744997000.0,10,1,0.92


In [9]:
# Fetch comments for all posts in posts_df_reddit
all_comments = []

for post_id in posts_df_reddit['post_id']:
    try:
        comments_df = get_post_comments(reddit, post_id, limit=100)
        all_comments.append(comments_df)
        print(f"✅ Retrieved {len(comments_df)} comments for post: {post_id}")
    except Exception as e:
        print(f"❌ Failed to fetch comments for {post_id}: {e}")

# Combine all comments into a single DataFrame
all_comments_df = pd.concat(all_comments, ignore_index=True)

# Save to CSV
all_comments_df.to_csv("reddit_post_comments.csv", index=False)
print("💾 All comments saved to reddit_post_comments.csv")


✅ Retrieved 18 comments for post: 1jyq1tk
✅ Retrieved 24 comments for post: 1i5inrb
✅ Retrieved 23 comments for post: 1k26920
✅ Retrieved 12 comments for post: 1k26kp3
✅ Retrieved 1 comments for post: 1k2a8t6
✅ Retrieved 3 comments for post: 1k2ax74
✅ Retrieved 9 comments for post: 1k1wu9o
✅ Retrieved 17 comments for post: 1k22cd4
✅ Retrieved 11 comments for post: 1k1mjok
✅ Retrieved 3 comments for post: 1k1x464
✅ Retrieved 1 comments for post: 1k1vo23
✅ Retrieved 1 comments for post: 1k1lh3r
✅ Retrieved 5 comments for post: 1k20azb
✅ Retrieved 0 comments for post: 1k1ohsp
✅ Retrieved 20 comments for post: 1k0zcye
✅ Retrieved 11 comments for post: 1k0v0dc
✅ Retrieved 68 comments for post: 1k0c459
✅ Retrieved 21 comments for post: 1k0mdr3
✅ Retrieved 1 comments for post: 1k0vdku
✅ Retrieved 9 comments for post: 1k082ij
✅ Retrieved 26 comments for post: 1jzml32
✅ Retrieved 5 comments for post: 1jz0h1y
✅ Retrieved 28 comments for post: 1jz4teg
✅ Retrieved 26 comments for post: 1jyu503
✅ R

In [10]:
all_comments_df.head()

Unnamed: 0,comment_id,post_id,author,text,score,created_utc
0,mn0k755,1jyq1tk,Norse_af,Here is the roadmap I am starting to prep for ...,2,1744607000.0
1,mn2syxi,1jyq1tk,Formal-Degree-1578,"Hi everyone, I’m working on a project to forec...",2,1744645000.0
2,mndey84,1jyq1tk,tejjm9,"Hi guys, I have work experience in operations ...",2,1744788000.0
3,mn0jtkc,1jyq1tk,Norse_af,Starting a Master's Program soon.\n\nI applied...,1,1744607000.0
4,mn0vcwx,1jyq1tk,Complete-Sandwich564,"\nNew here, this may be long winded but any gu...",1,1744614000.0


In [11]:
from neo4j import GraphDatabase
import pandas as pd
import random
from datetime import datetime, timedelta

# --- CONFIG --- #
uri = "bolt://localhost:7687"
user = "neo4j"
password = "12345678"
driver = GraphDatabase.driver(uri, auth=(user, password))

# --- Generate Fake User Info --- #
def generate_users(df, posts_df):
    usernames = pd.concat([df['author'], posts_df['author']]).dropna().unique()
    users = []
    for name in usernames:
        users.append({
            'user_id': f"U_{name}",
            'username': name,
            'join_date': (datetime.today() - timedelta(days=random.randint(100, 3000))).strftime('%Y-%m-%d'),
            'follower_count': random.randint(0, 10000),
            'verified': random.choice([True, False])
        })
    return pd.DataFrame(users)

# --- Assign Topics Based on Post Title --- #
def assign_topics(posts_df):
    keywords = {
        "book": "Books",
        "transition": "Career Advice",
        "engineer": "Engineering",
        "forecast": "Forecasting",
        "data": "Data Science"
    }
    topic_map = {}
    posts_df['topic'] = "Other"
    for idx, row in posts_df.iterrows():
        for k, v in keywords.items():
            if k in row['title'].lower():
                posts_df.at[idx, 'topic'] = v
                if v not in topic_map:
                    topic_map[v] = {
                        'topic_id': f"T_{len(topic_map)+1}",
                        'name': v,
                        'popularity_score': random.randint(1, 100)
                    }
                break
    return posts_df, pd.DataFrame(topic_map.values())

# --- Recreate ENGAGED_WITH relationships --- #
def recreate_engagements(tx, comments_df):
    for _, row in comments_df.iterrows():
        tx.run("""
            MATCH (u:User {username: $author})
            MATCH (p:Post {post_id: $post_id})
            MERGE (u)-[:ENGAGED_WITH {
                engagement_type: 'comment',
                engagement_date: date(datetime({epochSeconds: toInteger($created_utc)}))
            }]->(p)
        """, {
            "author": row["author"],
            "post_id": row["post_id"],
            "created_utc": int(row["created_utc"])
        })

# --- Generate Final DataFrames --- #
users_df = generate_users(all_comments_df, posts_df_reddit)
posts_df_reddit, topics_df = assign_topics(posts_df_reddit)

# --- Neo4j Insertion Function --- #
def insert_to_neo4j(tx):
    # Platform node
    tx.run("""
        MERGE (p:Platform {platform_id: 'reddit'})
        SET p.name = 'Reddit',
            p.monthly_active_users = 430000000
    """)

    # Users
    for _, row in users_df.iterrows():
        tx.run("""
            MERGE (u:User {user_id: $user_id})
            SET u.username = $username,
                u.join_date = date($join_date),
                u.follower_count = $follower_count,
                u.verified = $verified
        """, **row)

    # Topics
    for _, row in topics_df.iterrows():
        tx.run("""
            MERGE (t:Topic {topic_id: $topic_id})
            SET t.name = $name,
                t.popularity_score = $popularity_score
        """, **row)

    # Posts and Relationships
    for _, row in posts_df_reddit.iterrows():
        tx.run("""
            MERGE (post:Post {post_id: $post_id})
            SET post.timestamp = datetime({epochSeconds: toInteger($timestamp)}),
                post.content_type = 'text',
                post.like_count = $score,
                post.share_count = $num_comments

            WITH post
            MATCH (u:User {username: $author})
            MERGE (u)-[:CREATED {creation_date: date(datetime({epochSeconds: toInteger($timestamp)}))}]->(post)

            WITH post
            MATCH (t:Topic {name: $topic})
            MERGE (post)-[:TAGGED_WITH {relevance_score: 0.9}]->(t)

            WITH post
            MATCH (p:Platform {platform_id: 'reddit'})
            MERGE (post)-[:POSTED_ON]->(p)
        """, {
            'post_id': row['post_id'],
            'timestamp': int(row['created_utc']),
            'score': int(row['score']),
            'num_comments': int(row['num_comments']),
            'author': row['author'],
            'topic': row['topic']
        })

    # Comments (CREATED and COMMENTED_ON)
    for _, row in all_comments_df.iterrows():
        tx.run("""
            MERGE (c:Comment {comment_id: $comment_id})
            SET c.text = $text,
                c.score = $score,
                c.timestamp = datetime({epochSeconds: toInteger($created_utc)})

            WITH c
            MATCH (u:User {username: $author})
            MERGE (u)-[:CREATED]->(c)

            WITH c
            MATCH (p:Post {post_id: $post_id})
            MERGE (c)-[:COMMENTED_ON]->(p)
        """, {
            'comment_id': row['comment_id'],
            'text': row['text'],
            'score': int(row['score']),
            'created_utc': int(row['created_utc']),
            'post_id': row['post_id'],
            'author': row['author']
        })

    # LIKES_SIMILAR_TOPICS
    for topic in topics_df['name']:
        authors = posts_df_reddit[posts_df_reddit['topic'] == topic]['author'].unique()
        for i in range(len(authors)):
            for j in range(i + 1, len(authors)):
                tx.run("""
                    MATCH (a:User {username: $u1}), (b:User {username: $u2})
                    MERGE (a)-[:LIKES_SIMILAR_TOPICS]->(b)
                """, {'u1': authors[i], 'u2': authors[j]})

# --- Run All Insertions --- #
with driver.session() as session:
    session.write_transaction(insert_to_neo4j)
    session.write_transaction(recreate_engagements, all_comments_df)

print("✅ Full Reddit knowledge graph recreated in Neo4j, including ENGAGED_WITH links.")

  session.write_transaction(insert_to_neo4j)
  session.write_transaction(recreate_engagements, all_comments_df)


✅ Full Reddit knowledge graph recreated in Neo4j, including ENGAGED_WITH links.


#### 📊 CYTHER ANALYSIS QUERIES

1. Who are the most influential users?

In [12]:
def most_influential_users(tx):
    query = """
    MATCH (u:User)
    OPTIONAL MATCH (u)-[:CREATED]->(p:Post)
    OPTIONAL MATCH (p)<-[:ENGAGED_WITH]-(e:User)
    WITH u.username AS user, u.follower_count AS followers, COUNT(DISTINCT e) AS total_engagement
    RETURN user, followers, total_engagement, (followers + total_engagement) AS influence_score
    ORDER BY influence_score DESC
    LIMIT 10
    """
    return list(tx.run(query))

with driver.session() as session:
    results = session.read_transaction(most_influential_users)
    for r in results:
        print(r)

  results = session.read_transaction(most_influential_users)


<Record user='Trungyaphets' followers=9994 total_engagement=0 influence_score=9994>
<Record user='crazyeddie_farker' followers=9963 total_engagement=0 influence_score=9963>
<Record user='Beneficial_Phase2366' followers=9963 total_engagement=0 influence_score=9963>
<Record user='Vampy04' followers=9961 total_engagement=0 influence_score=9961>
<Record user='forbiscuit' followers=9926 total_engagement=0 influence_score=9926>
<Record user='wang-bang' followers=9919 total_engagement=5 influence_score=9924>
<Record user='PhitPhil' followers=9917 total_engagement=0 influence_score=9917>
<Record user='Aromatic-Box683' followers=9902 total_engagement=0 influence_score=9902>
<Record user='MyKo101' followers=9883 total_engagement=4 influence_score=9887>
<Record user='next-choken' followers=9842 total_engagement=0 influence_score=9842>


📌 2. Topics with Highest Engagement

In [13]:
def top_engaging_topics(tx):
    query = """
    MATCH (t:Topic)<-[:TAGGED_WITH]-(p:Post)<-[:ENGAGED_WITH]-(u:User)
    WITH t.name AS topic, COUNT(u) AS engagement_count
    RETURN topic, engagement_count
    ORDER BY engagement_count DESC
    LIMIT 10
    """
    return list(tx.run(query))

with driver.session() as session:
    results = session.read_transaction(top_engaging_topics)
    for r in results:
        print(r)

  results = session.read_transaction(top_engaging_topics)


<Record topic='Data Science' engagement_count=243>
<Record topic='Engineering' engagement_count=232>
<Record topic='Career Advice' engagement_count=55>
<Record topic='Books' engagement_count=32>
<Record topic='Forecasting' engagement_count=21>


📌 3. Best Content Types by Platform

In [14]:
def best_content_types(tx):
    query = """
    MATCH (p:Post)-[:POSTED_ON]->(platform:Platform)
    WITH platform.name AS platform, p.content_type AS content_type, p.like_count + p.share_count AS total_engagement
    RETURN platform, content_type, AVG(total_engagement) AS avg_engagement
    ORDER BY avg_engagement DESC
    """
    return list(tx.run(query))

with driver.session() as session:
    results = session.read_transaction(best_content_types)
    for r in results:
        print(r)

<Record platform='Reddit' content_type='text' avg_engagement=118.16666666666667>


  results = session.read_transaction(best_content_types)


📌 4. User Communities via Shared Topics

In [15]:
def user_communities(tx):
    query = """
    MATCH (u1:User)-[:LIKES_SIMILAR_TOPICS]->(u2:User)
    RETURN u1.username AS user1, u2.username AS user2
    ORDER BY user1, user2
    LIMIT 20
    """
    return list(tx.run(query))

with driver.session() as session:
    results = session.read_transaction(user_communities)
    for r in results:
        print(r)

<Record user1='Emuthusiast' user2='khaili109'>
<Record user1='FilmIsForever' user2='Particular_Reality12'>
<Record user1='FilmIsForever' user2='SingerEast1469'>
<Record user1='FilmIsForever' user2='Starktony11'>
<Record user1='FilmIsForever' user2='Suspicious_Jacket463'>
<Record user1='FilmIsForever' user2='chrisgarzon19'>
<Record user1='FilmIsForever' user2='etherealcabbage72'>
<Record user1='FilmIsForever' user2='guna1o0'>
<Record user1='FilmIsForever' user2='vintagefiretruk'>
<Record user1='Particular_Reality12' user2='chrisgarzon19'>
<Record user1='Particular_Reality12' user2='guna1o0'>
<Record user1='Particular_Reality12' user2='vintagefiretruk'>
<Record user1='Sampo' user2='Admirable_Creme1276'>
<Record user1='SingerEast1469' user2='Particular_Reality12'>
<Record user1='SingerEast1469' user2='chrisgarzon19'>
<Record user1='SingerEast1469' user2='guna1o0'>
<Record user1='SingerEast1469' user2='vintagefiretruk'>
<Record user1='Starktony11' user2='Particular_Reality12'>
<Record user

  results = session.read_transaction(user_communities)
