In [1]:
!pip install beautifulsoup4 requests pandas


Defaulting to user installation because normal site-packages is not writeable


In [2]:
import requests
import pandas as pd

def scrape_reddit(keyword="crime", subreddit="artificial", max_posts=200):
    posts = []
    after = None

    while len(posts) < max_posts:
        url = f"https://www.reddit.com/r/{subreddit}/search.json?q={keyword}&restrict_sr=1&sort=new&limit=100"
        if after:
            url += f"&after={after}"
        
        headers = {"User-Agent": "Mozilla/5.0"}
        res = requests.get(url, headers=headers)
        data = res.json()

        children = data["data"]["children"]
        if not children:
            break

        for child in children:
            post = child["data"]
            title = post.get("title", "")
            selftext = post.get("selftext", "")
            url = "https://reddit.com" + post.get("permalink", "")
            content = selftext if selftext else title

            posts.append({"title": title, "url": url, "content": content})

        after = data["data"].get("after")
        if not after:
            break

    return pd.DataFrame(posts[:max_posts])


In [3]:
# Scrape 67+ articles on "crime"
df_crime = scrape_reddit(keyword="crime")
df_crime.drop_duplicates(subset="url", inplace=True)
df_crime = df_crime[df_crime["content"].str.len() > 30]

# Scrape more with "police"
df_police = scrape_reddit(keyword="police")
df_police.drop_duplicates(subset="url", inplace=True)
df_police = df_police[df_police["content"].str.len() > 30]

# Combine
df_combined = pd.concat([df_crime, df_police]).drop_duplicates(subset="url").reset_index(drop=True)

print(f"Total articles after combining and filtering: {len(df_combined)}")
df_combined.head(101)


Total articles after combining and filtering: 139


Unnamed: 0,title,url,content
0,One-Minute Daily AI News 2/22/2025,https://reddit.com/r/artificial/comments/1iw2y...,1. **Google’s** ‘Career Dreamer’ uses AI to he...
1,The Dawn of The New Age (Antichrist Unveiled),https://reddit.com/r/artificial/comments/1ioaj...,**These short stories are about Biblically rel...
2,One-Minute Daily AI News 2/1/2025,https://reddit.com/r/artificial/comments/1ifq2...,1. UK makes use of AI tools to create child ab...
3,Teen victim of graphic deepfake pornography wo...,https://reddit.com/r/artificial/comments/1hd0m...,Teen victim of graphic deepfake pornography wo...
4,China Unveils Sci-Fi Inspired Spherical AI Pol...,https://reddit.com/r/artificial/comments/1hcg2...,China Unveils Sci-Fi Inspired Spherical AI Pol...
...,...,...,...
96,i’ve done 10+ hours of searching but can’t fin...,https://reddit.com/r/artificial/comments/121os...,i want to make a children’s book for my nieces...
97,Question: is it possible to create such illust...,https://reddit.com/r/artificial/comments/11uy7...,Question: is it possible to create such illust...
98,Discord will use AI functions. Yes or Ney?,https://reddit.com/r/artificial/comments/11nqx...,Reference: https://www.tech360.tv/discord-roll...
99,"Do you think AI purposefully ""nerfs"" some ques...",https://reddit.com/r/artificial/comments/11hjq...,"Do you think AI purposefully ""nerfs"" some ques..."


In [9]:
def query_ollama(prompt, model="mistral"):  # changed llama3 → mistral
    url = "http://localhost:11434/api/generate"
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False
    }
    response = requests.post(url, json=payload)
    return response.json()["response"]


In [13]:
import time

results = []

for i in range(0, len(df_combined), 10):
    batch = df_combined.iloc[i:i+10].copy()
    print(f"\n⚙️ Running batch {i+1} to {i+len(batch)}")

    batch_responses = []
    for j, row in batch.iterrows():
        print(f"🔍 Analyzing post {i+j+1}/{len(df_combined)}...")
        start = time.time()
        response = analyze_article(row["content"])
        duration = round(time.time() - start, 2)
        print(f"✅ Done in {duration} sec")
        batch_responses.append(response)

    batch["analysis"] = batch_responses
    results.append(batch)

    # Save after each batch
    pd.concat(results).to_csv("ai_crime_analysis_progress.csv", index=False)



⚙️ Running batch 1 to 10
🔍 Analyzing post 1/139...
✅ Done in 57.53 sec
🔍 Analyzing post 2/139...
✅ Done in 42.97 sec
🔍 Analyzing post 3/139...
✅ Done in 35.83 sec
🔍 Analyzing post 4/139...
✅ Done in 34.33 sec
🔍 Analyzing post 5/139...
✅ Done in 28.16 sec
🔍 Analyzing post 6/139...
✅ Done in 21.67 sec
🔍 Analyzing post 7/139...
✅ Done in 33.39 sec
🔍 Analyzing post 8/139...
✅ Done in 34.8 sec
🔍 Analyzing post 9/139...
✅ Done in 27.59 sec
🔍 Analyzing post 10/139...
✅ Done in 45.64 sec

⚙️ Running batch 11 to 20
🔍 Analyzing post 21/139...
✅ Done in 24.91 sec
🔍 Analyzing post 22/139...
✅ Done in 43.38 sec
🔍 Analyzing post 23/139...
✅ Done in 30.32 sec
🔍 Analyzing post 24/139...
✅ Done in 38.55 sec
🔍 Analyzing post 25/139...
✅ Done in 40.15 sec
🔍 Analyzing post 26/139...
✅ Done in 44.36 sec
🔍 Analyzing post 27/139...
✅ Done in 62.84 sec
🔍 Analyzing post 28/139...
✅ Done in 35.21 sec
🔍 Analyzing post 29/139...
✅ Done in 33.4 sec
🔍 Analyzing post 30/139...
✅ Done in 31.54 sec

⚙️ Running batch 

In [15]:
final_df = pd.concat(results).reset_index(drop=True)

def parse_analysis(response):
    try:
        lines = response.strip().split("\n")
        summary = lines[0].replace("Summary: ", "").strip()
        score = lines[1].replace("Score: ", "").strip()
        tone = lines[2].replace("Tone: ", "").strip()
        return pd.Series([summary, score, tone])
    except:
        return pd.Series(["", "", ""])

final_df[["summary", "score", "tone"]] = final_df["analysis"].apply(parse_analysis)
final_df.to_csv("final_ai_crime_analysis.csv", index=False)


In [16]:
final_df = pd.concat(results).reset_index(drop=True)

def parse_analysis(response):
    try:
        lines = response.strip().split("\n")
        summary = lines[0].replace("Summary: ", "").strip()
        score = lines[1].replace("Score: ", "").strip()
        tone = lines[2].replace("Tone: ", "").strip()
        return pd.Series([summary, score, tone])
    except:
        return pd.Series(["", "", ""])

final_df[["summary", "score", "tone"]] = final_df["analysis"].apply(parse_analysis)
final_df.to_csv("final_ai_crime_analysis.csv", index=False)
final_df[["title", "summary", "score", "tone", "url"]].head()


Unnamed: 0,title,summary,score,tone,url
0,One-Minute Daily AI News 2/22/2025,The post discusses recent developments in the ...,,Score: +3 (Moderately impactful) - The post hi...,https://reddit.com/r/artificial/comments/1iw2y...
1,The Dawn of The New Age (Antichrist Unveiled),The post discusses a fictional story about the...,,Score: 2 (This post is not directly related to...,https://reddit.com/r/artificial/comments/1ioaj...
2,One-Minute Daily AI News 2/1/2025,This post discusses recent developments and in...,,Score: 6 (Neutral),https://reddit.com/r/artificial/comments/1ifq2...
3,Teen victim of graphic deepfake pornography wo...,The post discusses a teen who was victimized b...,,Score: +8 (Highly impactful) - This bill has t...,https://reddit.com/r/artificial/comments/1hd0m...
4,China Unveils Sci-Fi Inspired Spherical AI Pol...,The post discusses China unveiling a spherical...,,Score: +7 (Highly impactful) - This developmen...,https://reddit.com/r/artificial/comments/1hcg2...
