# Psychosis Detection Dataset Exploration

This notebook explores the synthetic psychosis dataset used for training.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

# Make the project's src/ directory importable.
# Path('.').parent is still '.', so the previous expression pointed at ./src.
# The data path in this notebook is reached via '..', so src/ is taken to be a
# sibling of the notebook directory. The path is resolved to an absolute one so
# the entry stays valid if the working directory changes later.
src_dir = (Path('..') / 'src').resolve()
if str(src_dir) not in sys.path:  # avoid duplicate entries when the cell is re-run
    sys.path.append(str(src_dir))

from data_loader import load_dataset
from preprocess import preprocess_batch
from features import FeatureExtractor

# Plot appearance used by every figure below: seaborn's white grid theme and a
# default figure size of 12x6 inches.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})


## Load Dataset


In [None]:
# Locate the CSV relative to the notebook directory and load it
data_path = Path('..', 'data', 'synthetic_psychosis_data.csv')
df = load_dataset(str(data_path))

# Overall size first
print(f"Dataset shape: {df.shape}")

# Class balance as absolute counts
print(f"\nLabel distribution:")
label_counts = df['label'].value_counts()
print(label_counts)

# Class balance as percentages of all rows
print(f"\nLabel distribution (%):")
label_percentages = df['label'].value_counts(normalize=True) * 100
print(label_percentages)


## Dataset Statistics


In [None]:
# Run the project's preprocessing over the raw text column
df['text_processed'] = preprocess_batch(df['text'].values)

# Size metrics on the processed text: character count and number of
# whitespace-separated tokens
processed = df['text_processed']
df['text_length'] = processed.str.len()
df['word_count'] = processed.str.split().str.len()

# Same four summary statistics for both metrics, computed per label
summary_funcs = ['mean', 'std', 'min', 'max']
stats = df.groupby('label')[['text_length', 'word_count']].agg(summary_funcs)

print("Text Statistics by Label:")
print(stats)


## Visualizations


In [None]:
# Label distribution (left) and text length per label (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Colours are keyed by label name. value_counts() orders its result by
# frequency, so a positional colour list would swap green/red whenever the
# class balance changes.
label_colors = {'normal': '#10b981', 'psychotic-like': '#ef4444'}
fallback_color = '#6b7280'  # neutral grey for any label not listed above

# Count plot
label_counts = df['label'].value_counts()
bar_colors = [label_colors.get(name, fallback_color) for name in label_counts.index]
label_counts.plot(kind='bar', ax=axes[0], color=bar_colors)
axes[0].set_title('Label Distribution')
axes[0].set_xlabel('Label')
axes[0].set_ylabel('Count')
axes[0].tick_params(axis='x', rotation=0)

# Text length distribution
df.boxplot(column='text_length', by='label', ax=axes[1])
axes[1].set_title('Text Length by Label')
axes[1].set_xlabel('Label')
axes[1].set_ylabel('Text Length (characters)')
# A grouped boxplot adds its own figure-level title; clear it so only the
# per-axes titles remain.
plt.suptitle('')

plt.tight_layout()
plt.show()


In [None]:
# Word count distribution, one semi-transparent histogram per label
fig, ax = plt.subplots(figsize=(10, 6))

# Compute the bin edges once over the whole dataset. With bins=20 passed per
# class, every class gets its own range and bin width, so the overlaid bar
# heights are not comparable.
bin_edges = np.histogram_bin_edges(df['word_count'].dropna(), bins=20)

for label in df['label'].unique():
    subset = df[df['label'] == label]
    # Drop missing counts (rows whose processed text is NaN) so histogramming
    # does not fail on a non-finite range.
    ax.hist(subset['word_count'].dropna(), alpha=0.6, label=label, bins=bin_edges)

ax.set_title('Word Count Distribution by Label')
ax.set_xlabel('Word Count')
ax.set_ylabel('Frequency')
ax.legend()
plt.show()


## Feature Analysis


In [None]:
# Build the extractor and run it over the preprocessed texts
extractor = FeatureExtractor()

print("Extracting features (this may take a moment)...")
processed_texts = df['text_processed'].values
features_list = extractor.extract_batch_features(processed_texts)

# Tabulate the extracted features and attach the labels by position
# (.values drops the index, so rows are matched by order)
features_df = pd.DataFrame(features_list).assign(label=df['label'].values)

print("\nFeature Statistics:")
print(features_df.describe())


In [None]:
# Per-label boxplots for a fixed selection of features, laid out on a 2x2 grid
feature_cols = ['negation_count', 'pronoun_count', 'negative_emotions', 'positive_emotions']

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for i, col in enumerate(feature_cols):
    # Features missing from the extractor output keep an empty panel
    if col not in features_df.columns:
        continue

    panel = axes[i]
    display_name = col.replace('_', ' ').title()

    features_df.boxplot(column=col, by='label', ax=panel)
    panel.set_title(f'{display_name} by Label')
    panel.set_xlabel('Label')
    panel.set_ylabel(display_name)

# Clear the figure-level title that the grouped boxplots add
plt.suptitle('')
plt.tight_layout()
plt.show()


## Sample Texts


In [None]:
# Show sample texts from each class
def _preview(text, limit=200):
    """Return *text* cut to *limit* characters, adding '...' only if it was cut."""
    text = str(text)  # tolerate non-string cells (e.g. NaN) instead of failing on slicing
    if len(text) <= limit:
        return text
    return f"{text[:limit]}..."


# (heading, label) pairs; headings carry the same spacing as before
sample_sections = [
    ("Sample Psychotic-like Texts:", 'psychotic-like'),
    ("\n\nSample Normal Texts:", 'normal'),
]

for heading, label in sample_sections:
    print(heading)
    print("=" * 80)
    # First three raw texts for this label, in dataset order
    for text in df.loc[df['label'] == label, 'text'].head(3):
        print(f"\n{_preview(text)}")


## Summary

This notebook provides a basic exploration of the dataset. It covers:
- Dataset size and label distribution
- Text length and word count statistics
- Feature distributions
- Sample texts from each class
