---
## **Detecting Subtle Fraudulent Communities in Dynamic Social Network using Spectral Clustering**
---


## **ST457 GROUP PROJECT**

## **Candidate Numbers: 50714, 49775, 45375, 50098**



---
### **DATA EXTRACTION USING REDDIT API**
---

#### **NOTE FOR EVALUATOR:**

--> We used Reddit API keys obtained from the Reddit platform. We scraped recent data from **April 1, 2025 - April 30, 2025** with `50,000` entries.

--> As we are utilizing a real-time, live dataset through the Reddit API, **the data is constantly updating based on the latest posts, comments, and interactions on Reddit**. This means that if you run the code at different times, you may receive different results due to the dynamic nature of the dataset.

--> This dataset is used in the 2 uploaded notebooks. Refer to the preprocessing and EDA in the main solution file uploaded.



---

### **IMPORTING LIBRARIES**

----

In [None]:
! pip install praw
! pip install prawcore

In [None]:
import praw
import csv
import time
from datetime import datetime
import itertools
import os
import prawcore
import numpy as np

---

### **REDDIT API CREDENTIALS**

---

In [None]:
# Reddit API credentials.
# SECURITY: the literal values below were committed in plain text and should be
# treated as compromised -- rotate them in the Reddit app settings and supply
# the new ones through the environment instead of editing this cell.
# Each value is read from an environment variable first; the original literal
# is kept only as a fallback so the notebook behaves exactly as before when no
# variable is set.
reddit = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID", "edd2QKtWSLOo0UV-QQDgpQ"),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET", "mIv1Yc48rIKS7KU5XHid0JzIZJevLQ"),
    user_agent=os.environ.get("REDDIT_USER_AGENT", "Automatic_Travel_519"),
)

---

### **SCRAPING LIVE DATA**

---

In [None]:
def _parent_comment_author_id(comment):
    """Return the author id of the comment that ``comment`` replies to.

    The lookup goes through the module-level ``reddit`` client. Returns
    ``None`` when the parent comment has no author or when the lookup fails;
    a failure is printed rather than raised.
    """
    try:
        parent = reddit.comment(id=comment.parent_id[3:])  # drop the "t1_" prefix
        return parent.author.id if parent.author else None
    except Exception as e:
        print(f"Error fetching parent for comment {comment.id}: {e}")
        return None


def _as_april_2025(created_utc):
    """Format a Unix timestamp as ``YYYY-MM-DD HH:MM:SS`` moved into April 2025.

    The time of day is kept; the year and month are overwritten, and day 31 is
    clamped to 30 because April has 30 days.
    """
    # Imported here because the file only imports ``datetime`` from the module.
    from datetime import timezone

    # Interpret the epoch value as UTC so the "created_utc" column does not
    # depend on the local timezone of the machine running the scrape.
    created_at = datetime.fromtimestamp(created_utc, tz=timezone.utc)
    day = min(created_at.day, 30)
    return created_at.replace(year=2025, month=4, day=day).strftime('%Y-%m-%d %H:%M:%S')


def scrape_reddit_comments(subreddit_name, max_comments, output_file, append=False):
    """
    Scrape keyword-matching comments from a subreddit and save them to a CSV file.

    Submissions are drawn from the subreddit's hot, new, top and controversial
    listings (up to 1000 each). Only comments whose body contains at least one
    of the fraud keywords (case-insensitive substring match) are written.
    Comments without an author, deleted/removed comments, comments whose author
    cannot be fetched, and replies whose parent comment has no resolvable
    author are skipped.

    Args:
        subreddit_name: Name of the subreddit, without the ``r/`` prefix.
        max_comments: Maximum number of rows to write during this call.
        output_file: Path of the CSV file to write.
        append: When True and ``output_file`` already exists, rows are appended
            instead of overwriting the file.

    Returns:
        The number of comment rows written by this call.

    CSV columns (the header names do not all match their content):
        created_utc       comment time shifted into April 2025 (UTC)
        ups               comment.ups
        subreddit_id      submission.subreddit_id
        link_id           submission.id
        name              comment.id
        subreddit         subreddit display name
        id                id of the comment's author
        author            name of the comment's author
        score             comment.score
        body              comment text
        parent_id         comment.parent_id (prefixed "t1_" or "t3_")
        length            len(comment.body)
        account_age_days  random integer, see the NOTE in the body

    Duplicates are suppressed only within a single call; rows already present
    in an appended-to file are not checked.
    """
    mode = "a" if append and os.path.exists(output_file) else "w"
    # An existing but empty file still needs its header row.
    needs_header = mode == "w" or os.path.getsize(output_file) == 0
    count = 0
    fraud_keywords = ["TRUMP", "gala", "bot", "spam", "scam", "pump", "dump", "shill"]  # Define fraud keywords
    lowered_keywords = tuple(keyword.lower() for keyword in fraud_keywords)

    # The four listings overlap heavily; remember what was already handled so
    # the same comment is not written (and re-fetched) several times.
    finished_submissions = set()
    written_comment_ids = set()

    with open(output_file, mode=mode, newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        if needs_header:
            writer.writerow([
                "created_utc", "ups", "subreddit_id", "link_id", "name", "subreddit", "id", "author",
                "score", "body", "parent_id", "length", "account_age_days"
            ])

        subreddit = reddit.subreddit(subreddit_name)

        post_sources = itertools.chain(
            subreddit.hot(limit=1000),
            subreddit.new(limit=1000),
            subreddit.top(limit=1000),
            subreddit.controversial(limit=1000)
        )

        for submission in post_sources:
            if submission.id in finished_submissions:
                continue
            try:
                # limit=0 drops the "load more comments" stubs without fetching them.
                submission.comments.replace_more(limit=0)
                for comment in submission.comments.list():
                    if count >= max_comments:
                        return count

                    if comment.id in written_comment_ids:
                        continue

                    if not comment.author or comment.body in ["[deleted]", "[removed]"]:
                        continue

                    # Keep only comments that mention at least one fraud keyword.
                    body_lower = comment.body.lower()
                    if not any(keyword in body_lower for keyword in lowered_keywords):
                        continue

                    try:
                        user = reddit.redditor(comment.author.name)
                        user_id = user.id
                        user_name = user.name
                    except Exception as e:
                        print(f"Error fetching user for comment {comment.id}: {e}")
                        continue

                    if not user_id:
                        print(f"Skipping comment {comment.id}: Missing user data")
                        continue

                    # Only replies to other comments ("t1_") require a resolvable
                    # parent author. The parent author id is never written to
                    # the CSV, so top-level comments ("t3_") need no lookup.
                    if comment.parent_id.startswith("t1_") and not _parent_comment_author_id(comment):
                        print(f"Skipping comment {comment.id}: Missing parent user ID")
                        continue

                    # NOTE(review): the account age is a random placeholder in
                    # [30, 4000), not the real age of the account -- confirm
                    # that downstream analysis treats it as synthetic.
                    account_age_days = np.random.randint(30, 4000)

                    # NOTE(review): the timestamp is forced into April 2025
                    # regardless of when the comment was actually posted.
                    writer.writerow([
                        _as_april_2025(comment.created_utc),  # formatted datetime
                        comment.ups,  # ups (upvotes)
                        submission.subreddit_id,  # subreddit_id
                        submission.id,  # link_id
                        comment.id,  # name
                        submission.subreddit.display_name,  # subreddit name
                        user_id,  # user_id
                        user_name,  # author
                        comment.score,  # score (upvotes - downvotes)
                        comment.body,  # body of the comment
                        comment.parent_id,  # parent_id
                        len(comment.body),  # length of the comment
                        account_age_days  # account age (randomly generated)
                    ])
                    written_comment_ids.add(comment.id)
                    count += 1
                    time.sleep(1)  # pause one second after every written row
                # Reached only when every comment of the submission was handled
                # without an exception, so a failed submission can be retried.
                finished_submissions.add(submission.id)
            except Exception as e:
                print(f"Error processing submission {submission.id}: {e}")
                continue
        return count


In [None]:
def collect_multiple_subreddits(subreddits, total_target, output_file):
    """Scrape several subreddits in turn until ``total_target`` rows are collected.

    If ``output_file`` already exists, the rows it holds count towards the
    target and new rows are appended, so an interrupted run can be resumed.

    Args:
        subreddits: Iterable of subreddit names, visited in order.
        total_target: Total number of comment rows wanted in ``output_file``.
        output_file: Path of the CSV file shared by all subreddits.

    Returns:
        None. Progress is reported with ``print``.
    """
    total_collected = 0
    if os.path.exists(output_file):
        # Count CSV records, not physical lines: comment bodies may contain
        # newlines inside quoted fields, which would inflate a plain line count.
        with open(output_file, "r", newline="", encoding="utf-8") as f:
            record_count = sum(1 for _ in csv.reader(f))
        # Subtract the header; an empty file must not yield -1.
        total_collected = max(record_count - 1, 0)
        print(f"Resuming with {total_collected} comments already collected")

    for subreddit in subreddits:
        remaining = total_target - total_collected
        if remaining <= 0:
            break
        print(f"\nScraping r/{subreddit} (targeting {remaining} more comments)...")
        scraped = scrape_reddit_comments(subreddit, remaining, output_file, append=True)
        total_collected += scraped
        print(f"Collected {scraped} comments from r/{subreddit}, total so far: {total_collected}")

    print(f"Finished! Total comments collected: {total_collected}")

---

### **SAVING ALL THE COLLECTED DATA INTO A FINAL CSV FILE**

---

In [None]:
# Crypto-related subreddits, visited in this order, with a combined target of
# 50,000 comment rows written to a single CSV file.
subreddits_to_scrape = ["Bitcoin", "ethereum", "CryptoCurrency", "CryptoMarkets", "altcoin"]
collect_multiple_subreddits(
    subreddits=subreddits_to_scrape,
    total_target=50000,
    output_file="reddit_comments_spectral.csv",
)
