# Reddit Data Processing

In [1]:
import pandas as pd
import re
import gender_detector as gd
from transformers import pipeline

def load_data(file_path):
    """
    Read the CSV located at ``file_path`` and return its rows as a DataFrame.

    All parsing options are left at the pandas defaults.
    """
    frame = pd.read_csv(file_path)
    return frame

In [2]:
# Read both subreddit exports in one pass: r/family first, then r/FamilyIssues.
family_df, fi_df = (
    load_data(csv_path)
    for csv_path in (
        'r_family_data/r_family_posts.csv',
        'r_FamilyIssues_data/r_familyissue_posts.csv',
    )
)

# Only the final expression of a cell is rendered, so just fi_df shows up below.
family_df
fi_df

Unnamed: 0,id,title,author,selftext,score,num_comments,created_utc,url
0,1kw6yw0,Family and politics,OstrichCorrect9498,My daughters were very close and are no longer...,1,0,1.748297e+09,https://www.reddit.com/r/FamilyIssues/comments...
1,1kw5d7w,Dad laughing at rape joke,Spiritual-Wonder7536,Im not sure if I am over reacting but it’s Mem...,2,0,1.748293e+09,https://www.reddit.com/r/FamilyIssues/comments...
2,1kw4l5f,Adult children of emotionally immature parents.,Flat-Matter-3314,Myself and mother are not close. She tries to ...,1,0,1.748291e+09,https://www.reddit.com/r/FamilyIssues/comments...
3,1kvufk3,Im scared to interact with my dad's side.,Short_Audience7308,"One day my dad was sick, he had a stomach flu ...",1,0,1.748266e+09,https://www.reddit.com/r/FamilyIssues/comments...
4,1kvtoq9,Brother given a house,Own-Needleworker4869,I just found out yesterday that my brother was...,1,0,1.748264e+09,https://www.reddit.com/r/FamilyIssues/comments...
...,...,...,...,...,...,...,...,...
951,1j7s932,IN LAWS,International-Air472,I’m gonna try to explain the whole story to th...,1,1,1.741588e+09,https://www.reddit.com/r/FamilyIssues/comments...
952,1j7m8ri,My stepmom and her family (especially her mom)...,_aidischirris_,My relationship with my stepmom is complicated...,2,0,1.741567e+09,https://www.reddit.com/r/FamilyIssues/comments...
953,1j7lvhr,"My sister is going off to college, and my pare...",RingPrestigious9389,My older sister is very accomplished. Student ...,1,1,1.741566e+09,https://www.reddit.com/r/FamilyIssues/comments...
954,1j7idd9,I don’t know if I’ll ever recover from growing...,RushAmazing1419,\nI've never had a perfect relationship with m...,2,0,1.741556e+09,https://www.reddit.com/r/FamilyIssues/comments...


In [3]:
# Stack the two subreddit frames into one table and renumber the rows from 0.
df = pd.concat(objs=[family_df, fi_df], ignore_index=True)

def clean_text(text):
    """
    Clean the text by removing URLs, special characters, and extra spaces.

    Parameters
    ----------
    text : str or None
        Raw post text. ``None`` and NaN are treated as empty text; any other
        non-string value is converted with ``str()``.

    Returns
    -------
    str
        The text with URLs removed, every run of non-word characters replaced
        by a single space, and leading/trailing whitespace stripped.
    """
    # Missing values (None, or float NaN which is the only value != itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    # Only strip real URLs: an explicit http(s):// scheme or a "www." host,
    # starting at a word boundary. The looser "www\S+" form also deleted
    # ordinary words such as "awwww".
    text = re.sub(r'\b(?:https?://|www\.)\S+', '', text, flags=re.IGNORECASE)
    # Punctuation and symbols become spaces (note: this also splits
    # contractions, e.g. "I'm" -> "I m").
    text = re.sub(r'\W+', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Normalise the post bodies: missing values become '', everything is coerced
# to str, and each body is then passed through clean_text.
raw_bodies = df['selftext'].fillna('').astype(str)
df['selftext'] = raw_bodies.map(clean_text)

print(df['selftext'].head())

0    To start things off I ll explain my current li...
1    my grandma treats me like i m worthless and in...
2    i am the only daughter and i have 3 brothers h...
3    how do I cut off my family members without hur...
4    So it was a normal outing and we were on the c...
Name: selftext, dtype: object


In [4]:
def _labelled(prefix, column):
    """Return ``df[column]`` with missing values blanked and ``prefix`` prepended."""
    return prefix + df[column].fillna('')


# One text blob per post: title, author and cleaned body on separate lines.
df['clean_text'] = (
    _labelled('title: ', 'title')
    + _labelled('\nauthor: ', 'author')
    + _labelled('\ntext body: ', 'selftext')
)

preview_columns = ['title', 'author', 'selftext', 'clean_text']
print(df[preview_columns].head())

                                               title              author  \
0   Severely ignored by my family and being excluded  Tall-Jackfruit7375   
1                                    Hateful grandma       999_szn_lvsss   
2  I (20F) feel like I have to parent my little b...  Affectionate_Use53   
3                             how to cut off family?     Middle-Scene534   
4             Car argument that tore my family apart  Worried-Offer-7153   

                                            selftext  \
0  To start things off I ll explain my current li...   
1  my grandma treats me like i m worthless and in...   
2  i am the only daughter and i have 3 brothers h...   
3  how do I cut off my family members without hur...   
4  So it was a normal outing and we were on the c...   

                                          clean_text  
0  title: Severely ignored by my family and being...  
1  title: Hateful grandma\nauthor: 999_szn_lvsss\...  
2  title: I (20F) feel like I have to par

In [None]:
import os
import re
import json
import csv

# (1) Set up the LLM API (OpenAI GPT-4 as example; replace or augment with HF Mistral if desired)
# The key is read from the OPENAI_API_KEY environment variable. os.getenv takes
# the *name* of the variable, never the secret itself, and the secret must not
# be written into the notebook.
# NOTE(review): a literal key was previously pasted here; treat it as leaked and revoke it.
try:
    import openai
    openai.api_key = os.getenv("OPENAI_API_KEY")
except ImportError:
    openai = None  # If OpenAI not available, one could use Hugging Face instead

# Expose the row index as 'post_id' and the combined text as 'body', then
# flatten the three needed columns into a list of per-post dicts.
df2 = df.reset_index(drop=False).rename(
    columns={'index': 'post_id', 'clean_text': 'body'}
)
posts = df2.loc[:, ['post_id', 'title', 'body']].to_dict('records')

# Prepare regex patterns for fallback
# Age: 1-3 digits followed by an age unit. The unit must end on a word boundary
# so that "3 young kids" is not read as "3 yo", and the digits may touch the
# unit directly ("20yo") or be hyphenated ("20-year-old"). Group 1 is the number.
age_pattern = re.compile(
    r"\b(\d{1,3})[\s-]*(?:years?[\s-]*old|yrs?[\s-]*old|y/?o)\b",
    re.IGNORECASE,
)
male_roles = ["father","dad","son","brother","uncle","boy","husband","grandfather","grandpa","pa","father-in-law"]
female_roles = ["mother","mom","daughter","sister","aunt","girl","wife","grandmother","grandma","ma","mother-in-law"]
# Longest roles first: regex alternation takes the first alternative that fits,
# so "father" listed before "father-in-law" would always win.
_role_alternatives = "|".join(
    re.escape(name) for name in sorted(male_roles + female_roles, key=len, reverse=True)
)
# Self-identification phrases. "I m" and "I’m" are accepted because clean_text
# replaces apostrophes with spaces. Group 1 is the lead-in, group 2 the role.
role_pattern = re.compile(
    r"\b(I am|I'm|I’m|I m|As a|As an|As the)\s+(?:(?:an?|the)\s+)?(%s)\b" % _role_alternatives,
    re.IGNORECASE,
)

import warnings

# Lower-cased role lookups, built once instead of once per post.
_female_role_set = {name.lower() for name in female_roles}
_male_role_set = {name.lower() for name in male_roles}

# Spellings of gender the model may return. The prompt itself suggests "M"/"F",
# so those must map to Male/Female rather than falling through to "Other".
_GENDER_ALIASES = {
    "m": "Male", "male": "Male", "man": "Male", "boy": "Male",
    "f": "Female", "female": "Female", "woman": "Female", "girl": "Female",
}


def _parse_json_object(raw):
    """Return the JSON object found in ``raw`` model output, or {} if there is none."""
    if not isinstance(raw, str) or not raw.strip():
        return {}
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        # The model may wrap the JSON in prose; take the outermost {...} span.
        match = re.search(r"\{.*\}", raw, re.DOTALL)
        if not match:
            return {}
        try:
            parsed = json.loads(match.group())
        except json.JSONDecodeError:
            return {}
    return parsed if isinstance(parsed, dict) else {}


def _coerce_age(value):
    """Return ``value`` as an int, or None when it is missing or not numeric."""
    if value is None or isinstance(value, bool):
        return None
    try:
        return int(value)
    except (TypeError, ValueError):
        return None


def _normalize_gender(value):
    """Map a raw gender value to 'Male', 'Female' or 'Other'; None when empty."""
    if value is None:
        return None
    token = str(value).strip().lower()
    if not token:
        return None
    return _GENDER_ALIASES.get(token, "Other")


# (Alternate) HuggingFace Mistral model: build the pipeline once, up front,
# rather than reloading the model for every post.
generator = None
if not openai:
    from transformers import pipeline
    hf_model = "mistralai/Mistral-7B-Instruct-v0.3"
    generator = pipeline("text-generation", model=hf_model, device=0)

results = []
for post in posts:
    text = f"Title: {post['title']}\nPost: {post['body']}"

    # (2) Call the LLM to extract fields
    if openai:
        prompt = (
            "You are an assistant that extracts a narrator's details from a text. "
            "Identify the narrator's age (years), gender (might be expressed as M for male and F for female), and family role (e.g. mother, father, daughter, son, brother, sister) and output a JSON object with keys `age`, `gender`, and `role`. "
            "Use null for any field that is not mentioned. "
            f"Text: '''{text}'''"
        )
        try:
            # Same client entry point as the function-calling cell further down.
            response = openai.chat.completions.create(
                model="gpt-4o",  # or "gpt-4o-mini", "gpt-4", etc.
                messages=[{"role": "system", "content": "Extract narrator age, gender, role."},
                          {"role": "user", "content": prompt}],
                temperature=0.0,
                max_tokens=100,
                response_format={"type": "json_object"},
            )
            # (3) Parse JSON from LLM output, with regex fallback if needed
            data = _parse_json_object(response.choices[0].message.content)
        except Exception as exc:
            # Best effort: keep going with the regex fallback, but say so
            # instead of silently producing regex-only labels.
            warnings.warn(
                f"LLM extraction failed ({type(exc).__name__}: {exc}); using regex fallback"
            )
            data = {}
    else:
        prompt = (
            "Extract the narrator's age, gender, and family role from the following text. "
            "Return a JSON with keys age, gender, role (null for unknown).\n\n" + text
        )
        # Greedy decoding; return only the completion so braces inside the
        # echoed prompt cannot confuse the JSON search.
        output = generator(
            prompt, max_new_tokens=100, do_sample=False, return_full_text=False
        )[0]['generated_text']
        data = _parse_json_object(output)

    # (4) Extract values from JSON, then use regex only for what is missing
    age = _coerce_age(data.get("age"))
    gender = _normalize_gender(data.get("gender"))
    role = data.get("role")
    if isinstance(role, str):
        role = role.strip() or None

    # Regex fallback for age
    if age is None:
        m = age_pattern.search(text)
        if m:
            age = int(m.group(1))

    # Regex fallback for role (and infer gender from role); values the LLM
    # already supplied are kept.
    if role is None or gender is None:
        m = role_pattern.search(text)
        if m:
            found = m.group(2).lower()
            if role is None:
                role = found
            if gender is None:
                if found in _female_role_set:
                    gender = "Female"
                elif found in _male_role_set:
                    gender = "Male"

    # Record None as blank in CSV
    results.append({
        "post_id": post["post_id"],
        "clean_text": post["body"],
        "age": age if age is not None else "",
        "gender": gender if gender else "",
        "role": role if role else ""
    })

# (5) Dump the collected records to output.csv: a header row, then one row per
# post with the columns in the same order as the header.
output_columns = ["post_id", "clean_text", "age", "gender", "role"]
with open("output.csv", "w", newline="", encoding="utf-8") as out_file:
    csv_out = csv.writer(out_file)
    csv_out.writerow(output_columns)
    csv_out.writerows(
        [record[column] for column in output_columns] for record in results
    )


In [13]:
import os
import json
import csv
import pandas as pd
import openai


# Give every post an integer id taken from the row index. Guarded so that
# re-running this cell does not add a second 'post_id' column.
if 'post_id' not in df.columns:
    df = df.reset_index().rename(columns={'index':'post_id'})

# 1) Configure OpenAI
# The key comes only from the environment; a literal fallback would ship the
# secret with the notebook.
# NOTE(review): a literal key was previously used as the default here; treat it as leaked and revoke it.
_api_key = os.getenv("OPENAI_API_KEY")
if not _api_key:
    raise RuntimeError("OPENAI_API_KEY is not set; export it before running this cell.")
openai.api_key = _api_key

# 2) Function-calling schema: a single function, extract_narrator, whose
# arguments are the three narrator fields. Each field may be null.
_narrator_fields = dict(
    age=dict(type=["integer", "null"], description="The narrator's age in years"),
    gender=dict(type=["string", "null"], enum=["Male", "Female", "Other", None]),
    role=dict(type=["string", "null"], description="Family role, e.g., mother, father, daughter, son, sibling, etc."),
)

_narrator_schema = dict(
    type="object",
    properties=_narrator_fields,
    required=list(_narrator_fields),  # every field is required
)

functions = [
    dict(
        name="extract_narrator",
        description="Extract the narrator's age (integer), gender (Male/Female/Other), and family role from a Reddit post. Return null for unknown fields.",
        parameters=_narrator_schema,
    )
]

results = []

for _, row in df.iterrows():
    # A missing title would otherwise be rendered as the literal text "nan".
    title = row['title'] if pd.notna(row['title']) else ''
    prompt = (
        f"Title: {title}\n\n"
        f"Post body:\n{row['clean_text']}\n\n"
        "Please extract the narrator's age, gender, and family role."
    )

    resp = openai.chat.completions.create(
        model="gpt-4o",
        messages=[{"role":"user","content":prompt}],
        functions=functions,
        function_call={"name":"extract_narrator"},
        temperature=0.0,
    )

    # The model is asked to return a 'function_call' with JSON arguments. If it
    # is absent or the arguments are not a valid JSON object, record the post
    # with empty labels instead of aborting the run and losing earlier results.
    fn_call = resp.choices[0].message.function_call
    try:
        data = json.loads(fn_call.arguments) if fn_call is not None else {}
    except (json.JSONDecodeError, TypeError):
        data = {}
    if not isinstance(data, dict):
        data = {}

    results.append({
        "post_id": row['post_id'],
        "title":   row['title'],
        "clean_text": row['clean_text'],
        "age":     data.get("age"),
        "gender":  data.get("gender"),
        "role":    data.get("role"),
    })

# 3) Save the labelled records to output.csv (header first, then one row per post)
output_fields = ["post_id","title","clean_text","age","gender","role"]
with open("output.csv", "w", newline="", encoding="utf-8") as out_file:
    csv_out = csv.DictWriter(out_file, fieldnames=output_fields)
    csv_out.writeheader()
    for record in results:
        csv_out.writerow(record)

print("Done — output.csv written")


Done — output.csv written


In [9]:
# Smoke test of the label extractor on one post, given in the same
# "title: ... author: ... text body: ..." layout as the clean_text column.
# NOTE(review): extract_labels is not defined in any cell visible here — confirm
# it is defined (or imported) before this cell runs on a fresh kernel.
sample_text = "title: I (20F) feel like I have to parent my little brother (17M) author: Affectionate_Use53 text body: i am the only daughter and i have 3 brothers haha two of which are out of the house but i still live with my parents and little brother honestly my parents have had an awful marriage cheating divorce threats walking out of the house fighting etc which also led to their focus being far away from the kids id that makes sense so us siblings have been very close my little brother is 17 now and kind of in a very important time in his life he s struggling in school grades are low acting out angry sad all of the time it feels like my parents aren t really there for him and or don t really care so i ve been helping him with school trying to be an outlet for his feelings making sure he s getting to events school practice making dinners hanging out paying for his outings private practices food and what not but it s just so exhausting and i constantly feel like i m failing him parents aren t even super involved with his or my life and they re constantly yelling at us about something i m really trying my best to be there for him and be the person he needs but i can t help but feel like i m not doing enough can anyone give any advice i m trying so hard"

# extract_labels is unpacked into three values: (age, gender, role).
age, gender, role = extract_labels(sample_text)
print(f"Extracted -> Age: {age}, Gender: {gender}, Role: {role}")


Extracted -> Age: 20, Gender: female, Role: daughter


In [13]:
label_columns = ['age', 'gender', 'role']


def _labels_as_series(text):
    """Run extract_labels on ``text`` and label the three results by column name."""
    return pd.Series(extract_labels(text), index=label_columns)


# Blank out missing texts first so the extractor always receives a string.
df['clean_text'] = df['clean_text'].fillna("")

# One extractor call per row; the returned Series become the three new columns.
df[label_columns] = df['clean_text'].apply(_labels_as_series)

print(df[['clean_text'] + label_columns].head())

                                          clean_text   age  gender  \
0  title: Severely ignored by my family and being...   NaN  female   
1  title: Hateful grandma\nauthor: 999_szn_lvsss\...   NaN  female   
2  title: I (20F) feel like I have to parent my l...  20.0  female   
3  title: how to cut off family?\nauthor: Middle-...   NaN    None   
4  title: Car argument that tore my family apart\...   NaN  female   

          role  
0  undisclosed  
1  undisclosed  
2     daughter  
3  undisclosed  
4  undisclosed  


In [20]:
# Reload the labelled posts written by the extraction step.
labeled_df = load_data('output.csv')

# A role counts as known only if it is present AND is not the "undisclosed"
# placeholder (compared case-insensitively, ignoring surrounding spaces).
role_known = (
    labeled_df['role'].notna()
    & labeled_df['role'].astype(str).str.strip().str.lower().ne('undisclosed')
)

mask = (
    labeled_df['age'].notna() &       # age is present
    labeled_df['gender'].notna() &    # gender is present
    role_known                        # role is present and disclosed
)

# Keep only fully labelled posts, renumbered from 0.
df_filtered = labeled_df[mask].reset_index(drop=True)


count = len(df_filtered)
print(f"Number of posts with valid age, gender, and role: {count}")

Number of posts with valid age, gender, and role: 387


In [23]:
# Persist the fully labelled posts, leaving the row index out of the file.
df_filtered.to_csv(path_or_buf='labeled_reddit_post.csv', index=False)