In [1]:
import random
import pandas as pd
from datetime import datetime, timedelta

In [5]:
def generate_entry(sentiment, emotion, session_type="journal", rng=None):
    """Build one synthetic journal/chat entry as a single string.

    The result has the form ``"[EMOTION] <base phrase> <filler> <filler> ..."``:
    the upper-cased emotion tag, one base phrase picked from the pool matching
    ``sentiment``, then 2-5 filler sentences drawn with replacement (so the
    same filler may appear more than once).

    Args:
        sentiment: ``"positive"`` and ``"neutral"`` select their own phrase
            pools; any other value falls back to the negative pool.
        emotion: Label whose upper-cased form becomes the leading ``[TAG]``.
        session_type: Accepted so callers can pass it through, but it does
            not currently influence the generated text.
        rng: Optional randomness source exposing ``choice``, ``randint`` and
            ``choices`` (for example ``random.Random(seed)``). When omitted,
            the global ``random`` module is used, exactly as before.

    Returns:
        The generated entry text.
    """
    # Fall back to the module-level generator so existing callers are unaffected.
    if rng is None:
        rng = random

    positive_phrases = [
        "I feel hopeful today", "Things are improving",
        "I managed my stress well", "I feel supported",
        "I’m proud of myself"
    ]
    neutral_phrases = [
        "Today was an average day", "I went about my routine",
        "Nothing unusual happened", "I had my meals on time",
        "It was just a normal day"
    ]
    negative_phrases = [
        "I feel anxious", "It was hard to focus",
        "I feel lonely", "I had a breakdown",
        "I feel hopeless"
    ]

    fillers = [
        "I kept thinking about my future.",
        "Sometimes I wonder if things will change.",
        "I had a conversation with a friend.",
        "I tried to relax with music.",
        "Writing this helps me reflect.",
        "I don’t know how to explain it fully."
    ]

    # Anything that is not explicitly positive/neutral is treated as negative.
    if sentiment == "positive":
        pool = positive_phrases
    elif sentiment == "neutral":
        pool = neutral_phrases
    else:
        pool = negative_phrases

    # Draw order (base phrase, filler count, fillers) is kept stable so a
    # seeded generator always reproduces the same entry.
    base = rng.choice(pool)
    filler_count = rng.randint(2, 5)
    picked_fillers = rng.choices(fillers, k=filler_count)

    text = f"[{emotion.upper()}] {base} " + " ".join(picked_fillers)
    return text

In [7]:
# Fixed seed so that "Restart Kernel -> Run All" regenerates the identical
# dataset; change SEED to obtain a different (but still repeatable) sample.
SEED = 42
random.seed(SEED)

# Number of rows (set to 15000 for your project)
n_samples = 15000

# Each sentiment maps to the emotion labels that may accompany it.
sentiments = ["positive", "neutral", "negative"]
emotions = {
    "positive": ["happy", "calm", "relieved", "grateful"],
    "neutral": ["indifferent", "calm", "tired"],
    "negative": ["anxious", "sad", "angry", "lonely"]
}
session_types = ["journal", "chat"]

# Rows are collected as dicts and turned into a DataFrame once at the end.
data = []
start_date = datetime(2020, 1, 1)

for i in range(n_samples):
    sentiment = random.choice(sentiments)
    emotion = random.choice(emotions[sentiment])
    session_type = random.choice(session_types)

    text = generate_entry(sentiment, emotion, session_type)
    word_count = len(text.split())
    # Roughly one minute per five words plus jitter, clamped to 5..60 minutes.
    duration = max(5, min(60, word_count // 5 + random.randint(-2, 5)))
    # Offset of 0..1825 days (5 * 365): dates from 2020-01-01 to 2024-12-30.
    date = start_date + timedelta(days=random.randint(0, 1825))

    data.append({
        "id": i+1,  # 1-based row identifier
        "date": date.strftime("%Y-%m-%d"),
        "session_type": session_type,
        "duration_minutes": duration,
        "word_count": word_count,
        "sentiment": sentiment,
        "emotion_label": emotion,
        "text": text
    })

df = pd.DataFrame(data)
df.head()

Unnamed: 0,id,date,session_type,duration_minutes,word_count,sentiment,emotion_label,text
0,1,2020-04-01,chat,9,30,neutral,tired,[TIRED] Nothing unusual happened I don’t know ...
1,2,2024-11-27,chat,7,30,neutral,indifferent,[INDIFFERENT] Today was an average day Sometim...
2,3,2024-03-11,chat,5,23,negative,sad,[SAD] I feel anxious I kept thinking about my ...
3,4,2024-07-20,chat,5,15,positive,calm,[CALM] Things are improving I tried to relax w...
4,5,2024-10-15,chat,5,34,positive,happy,[HAPPY] I’m proud of myself Sometimes I wonder...


In [9]:
# Destination file for the generated dataset (relative to the working directory).
output_path = "mental_health_sentiment_dataset.csv"

# "utf-8-sig" writes a BOM so Excel recognises the file as UTF-8; the
# DataFrame index is left out of the CSV.
df.to_csv(
    output_path,
    index=False,
    encoding="utf-8-sig",
)

# Report where the file went and the size of what was written.
print(f"Dataset saved as {output_path} with {len(df)} rows and {df.shape[1]} columns")

Dataset saved as mental_health_sentiment_dataset.csv with 15000 rows and 8 columns
