In [15]:
import json
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

def load_all_timelines(data_dir):
    """Load all JSON files from a directory into a list of timelines"""
    timelines = []
    for filename in os.listdir(data_dir):
        if filename.endswith('.json'):
            with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as f:
                timeline = json.load(f)
                timelines.append(timeline)
    return timelines

def create_training_dataset(timelines):
    """Extract posts with their evidence annotations into a DataFrame"""
    data = []
    for timeline in timelines:
        timeline_id = timeline["timeline_id"]
        for post in timeline["posts"]:
            post_id = post["post_id"]
            post_text = post["post"]
            
            # Skip posts without evidence annotations
            if "evidence" not in post:
                continue
                
            # Extract adaptive evidence spans
            adaptive_evidence = []
            if "adaptive-state" in post["evidence"]:
                for component, details in post["evidence"]["adaptive-state"].items():
                    if "highlighted_evidence" in details:
                        adaptive_evidence.append(details["highlighted_evidence"])
            
            # Extract maladaptive evidence spans
            maladaptive_evidence = []
            if "maladaptive-state" in post["evidence"]:
                for component, details in post["evidence"]["maladaptive-state"].items():
                    if "highlighted_evidence" in details:
                        maladaptive_evidence.append(details["highlighted_evidence"])
            
            data.append({
                "timeline_id": timeline_id,
                "post_id": post_id,
                "text": post_text,
                "adaptive_evidence": adaptive_evidence,
                "maladaptive_evidence": maladaptive_evidence
            })
    
    return pd.DataFrame(data)

# Load all training timelines
train_timelines = load_all_timelines("/kaggle/input/train-dataset-1")
train_df = create_training_dataset(train_timelines)


In [16]:
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

def analyze_dataset(df):
    """Analyze the training dataset to understand its characteristics"""
    print(f"Total posts: {len(df)}")
    print(f"Posts with adaptive evidence: {sum(df['adaptive_evidence'].apply(len) > 0)}")
    print(f"Posts with maladaptive evidence: {sum(df['maladaptive_evidence'].apply(len) > 0)}")
    
    # Create binary labels for classification
    df['has_adaptive'] = df['adaptive_evidence'].apply(lambda x: 1 if len(x) > 0 else 0)
    df['has_maladaptive'] = df['maladaptive_evidence'].apply(lambda x: 1 if len(x) > 0 else 0)
    
    # Tokenize posts into sentences for later use
    df['sentences'] = df['text'].apply(sent_tokenize)
    
    return df

# Analyze and preprocess the dataset
train_df = analyze_dataset(train_df)


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Total posts: 343
Posts with adaptive evidence: 169
Posts with maladaptive evidence: 179


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

def engineer_features(df, feature_type="tfidf"):
    """Create features for binary classification"""
    if feature_type == "tfidf":
        # TF-IDF features
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
        X = vectorizer.fit_transform(df['text'])
        feature_names = vectorizer.get_feature_names_out()
    else:
        # Add other feature types if needed (e.g., BERT embeddings)
        pass
    
    return X, vectorizer, feature_names

# Split data into train and validation sets
train_data, val_data = train_test_split(train_df, test_size=0.2, random_state=42)

# Create feature matrices
X_train, vectorizer, feature_names = engineer_features(train_data)
X_val = vectorizer.transform(val_data['text'])


In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

def train_binary_classifiers(X_train, y_train_adaptive, y_train_maladaptive, 
                             X_val, y_val_adaptive, y_val_maladaptive):
    """Train separate classifiers for adaptive and maladaptive states"""
    # Adaptive classifier
    adaptive_clf = RandomForestClassifier(n_estimators=100, random_state=42)
    adaptive_clf.fit(X_train, y_train_adaptive)
    adaptive_preds = adaptive_clf.predict(X_val)
    
    # Maladaptive classifier
    maladaptive_clf = RandomForestClassifier(n_estimators=100, random_state=42)
    maladaptive_clf.fit(X_train, y_train_maladaptive)
    maladaptive_preds = maladaptive_clf.predict(X_val)
    
    # Evaluate classifiers
    print("Adaptive Classifier Performance:")
    print(classification_report(y_val_adaptive, adaptive_preds))
    
    print("Maladaptive Classifier Performance:")
    print(classification_report(y_val_maladaptive, maladaptive_preds))
    
    return adaptive_clf, maladaptive_clf

# Train classifiers
adaptive_clf, maladaptive_clf = train_binary_classifiers(
    X_train, train_data['has_adaptive'], train_data['has_maladaptive'],
    X_val, val_data['has_adaptive'], val_data['has_maladaptive']
)


Adaptive Classifier Performance:
              precision    recall  f1-score   support

           0       0.79      0.81      0.80        37
           1       0.77      0.75      0.76        32

    accuracy                           0.78        69
   macro avg       0.78      0.78      0.78        69
weighted avg       0.78      0.78      0.78        69

Maladaptive Classifier Performance:
              precision    recall  f1-score   support

           0       0.94      0.83      0.88        41
           1       0.79      0.93      0.85        28

    accuracy                           0.87        69
   macro avg       0.87      0.88      0.87        69
weighted avg       0.88      0.87      0.87        69



In [19]:
def calculate_feature_importance_simple(clf, feature_names):
    """Calculate feature importance using the classifier's built-in feature_importances_"""
    if not hasattr(clf, 'feature_importances_'):
        raise ValueError("Classifier does not have feature_importances_ attribute")
    
    importance_scores = clf.feature_importances_
    
    # Create a mapping of features to importance scores
    feature_importance = {feature_names[i]: importance_scores[i] 
                         for i in range(len(feature_names))}
    
    return feature_importance

# Calculate feature importance for adaptive classifier
adaptive_importance = calculate_feature_importance_simple(adaptive_clf, feature_names)
print(f"Top 5 important features for adaptive states: {sorted(adaptive_importance.items(), key=lambda x: x[1], reverse=True)[:5]}")

# Calculate feature importance for maladaptive classifier
maladaptive_importance = calculate_feature_importance_simple(maladaptive_clf, feature_names)
print(f"Top 5 important features for maladaptive states: {sorted(maladaptive_importance.items(), key=lambda x: x[1], reverse=True)[:5]}")


Top 5 important features for adaptive states: [('and', 0.022689654908040886), ('but', 0.021444647353222553), ('it', 0.018397119002538157), ('to', 0.016057036870414175), ('for', 0.014616723314670446)]
Top 5 important features for maladaptive states: [('and', 0.02563150227239303), ('to', 0.02274935575225972), ('it', 0.018220202868338312), ('be', 0.016732570023576965), ('of', 0.01626467195205366)]


In [20]:
import numpy as np
from scipy import sparse

def extract_evidence_spans(post_text, clf, vectorizer, feature_names, 
                           importance_threshold=0.01, top_n=3, feature_importance=None):
    """
    Extract evidence spans for a post using feature importance
    
    Parameters:
    -----------
    post_text : str
        The text of the post to analyze
    clf : classifier
        Trained classifier model
    vectorizer : TfidfVectorizer or similar
        The vectorizer used to convert text to features
    feature_names : list
        Names of features
    importance_threshold : float
        Threshold for considering a feature important
    top_n : int
        Number of top sentences to return
    feature_importance : dict, optional
        Pre-calculated feature importance dictionary
    """
    # Vectorize the post
    post_vector = vectorizer.transform([post_text])
    
    # Predict if post contains evidence
    try:
        has_evidence = clf.predict(post_vector)[0]
    except Exception as e:
        print(f"Prediction error: {str(e)}")
        return []
    
    if not has_evidence:
        return []
    
    # Use pre-calculated feature importance if provided
    if feature_importance is not None:
        post_importance = feature_importance
    else:
        try:
            # Use classifier's built-in feature importance instead of SHAP
            # This avoids the sparse matrix issues with SHAP
            if hasattr(clf, 'feature_importances_'):
                importance_scores = clf.feature_importances_
                post_importance = {feature_names[i]: importance_scores[i] 
                                  for i in range(len(feature_names))}
            else:
                # If no feature importance available, use coefficient values for linear models
                if hasattr(clf, 'coef_'):
                    coef = clf.coef_[0] if len(clf.coef_.shape) > 1 else clf.coef_
                    post_importance = {feature_names[i]: abs(coef[i]) 
                                      for i in range(len(feature_names))}
                else:
                    # Last resort: give equal importance to all features present in this post
                    post_importance = {}
                    post_vector_array = post_vector.toarray()[0]
                    for i, val in enumerate(post_vector_array):
                        if val > 0:
                            post_importance[feature_names[i]] = val
        except Exception as e:
            print(f"Feature importance calculation error: {str(e)}")
            # Create a simple importance score based on TF-IDF values
            post_importance = {}
            for i, val in enumerate(post_vector.toarray()[0]):
                if val > 0:
                    post_importance[feature_names[i]] = val
    
    # Find important sentences
    important_sentences = find_important_sentences(
        post_text, post_importance, importance_threshold
    )
    
    # Return top N sentences as evidence spans
    spans = [s['sentence'] for s in important_sentences[:top_n] if s['score'] > 0]
    
    # Ensure spans are actually in the original text
    validated_spans = [span for span in spans if span in post_text]
    
    return validated_spans


In [21]:
def consolidate_spans(spans, post_text):
    """Merge overlapping spans and ensure they are continuous in the original text"""
    if not spans:
        return []
    
    # Sort spans by their position in the original text
    sorted_spans = sorted(spans, key=lambda span: post_text.find(span))
    
    # Merge spans that are adjacent or overlapping in the original text
    consolidated = []
    current_start = post_text.find(sorted_spans[0])
    current_end = current_start + len(sorted_spans[0])
    current_span = sorted_spans[0]
    
    for span in sorted_spans[1:]:
        span_start = post_text.find(span)
        span_end = span_start + len(span)
        
        # If spans overlap or are adjacent, merge them
        if span_start <= current_end + 5:  # Allow small gaps (5 chars)
            merged_end = max(current_end, span_end)
            current_span = post_text[current_start:merged_end]
            current_end = merged_end
        else:
            consolidated.append(current_span)
            current_start = span_start
            current_end = span_end
            current_span = span
    
    consolidated.append(current_span)
    return consolidated


In [22]:
def process_test_timeline(timeline, adaptive_clf, maladaptive_clf, vectorizer, feature_names, 
                         adaptive_importance=None, maladaptive_importance=None):
    """Process a test timeline to extract evidence spans for each post"""
    timeline_id = timeline["timeline_id"]
    result = {
        "timeline_level": {"summary": ""},  # Will be filled by Task C
        "post_level": {}
    }
    
    for post in timeline["posts"]:
        post_id = post["post_id"]
        post_text = post["post"] if "post" in post else ""
        
        if not post_text:
            # Handle empty posts
            result["post_level"][post_id] = {
                "adaptive_evidence": [],
                "maladaptive_evidence": [],
                "summary": "",
                "wellbeing_score": None
            }
            continue
            
        # Extract adaptive evidence
        adaptive_spans = extract_evidence_spans(
            post_text, adaptive_clf, vectorizer, feature_names,
            feature_importance=adaptive_importance
        )
        adaptive_spans = consolidate_spans(adaptive_spans, post_text)
        
        # Extract maladaptive evidence
        maladaptive_spans = extract_evidence_spans(
            post_text, maladaptive_clf, vectorizer, feature_names,
            feature_importance=maladaptive_importance
        )
        maladaptive_spans = consolidate_spans(maladaptive_spans, post_text)
        
        # Add to results
        result["post_level"][post_id] = {
            "adaptive_evidence": adaptive_spans,
            "maladaptive_evidence": maladaptive_spans,
            "summary": "",  # Will be filled by Task B
            "wellbeing_score": None  # Will be filled by Task A.2
        }
    
    return timeline_id, result


In [23]:
def calculate_feature_importance_simple(clf, feature_names):
    """Calculate feature importance using the classifier's built-in feature_importances_"""
    if hasattr(clf, 'feature_importances_'):
        importance_scores = clf.feature_importances_
    elif hasattr(clf, 'coef_'):
        coef = clf.coef_[0] if len(clf.coef_.shape) > 1 else clf.coef_
        importance_scores = np.abs(coef)
    else:
        raise ValueError("Classifier does not have feature_importances_ or coef_ attribute")
    
    # Create a mapping of features to importance scores
    feature_importance = {feature_names[i]: float(importance_scores[i]) 
                         for i in range(len(feature_names))}
    
    return feature_importance

def run_full_pipeline(train_dir, test_dir, output_path, team_name="MyTeam"):
    """Run the complete Task A.1 pipeline from training to submission generation"""
    print("Loading training data...")
    train_timelines = load_all_timelines(train_dir)
    train_df = create_training_dataset(train_timelines)
    train_df = analyze_dataset(train_df)
    
    print("Training classifiers...")
    train_data, val_data = train_test_split(train_df, test_size=0.2, random_state=42)
    X_train, vectorizer, feature_names = engineer_features(train_data)
    X_val = vectorizer.transform(val_data['text'])
    
    adaptive_clf, maladaptive_clf = train_binary_classifiers(
        X_train, train_data['has_adaptive'], train_data['has_maladaptive'],
        X_val, val_data['has_adaptive'], val_data['has_maladaptive']
    )
    
    print("Calculating feature importance...")
    # Calculate feature importance using the simplified method
    try:
        adaptive_importance = calculate_feature_importance_simple(adaptive_clf, feature_names)
        print("Feature importance for adaptive classifier calculated successfully")
    except Exception as e:
        print(f"Error calculating importance for adaptive classifier: {str(e)}")
        adaptive_importance = None
        
    try:
        maladaptive_importance = calculate_feature_importance_simple(maladaptive_clf, feature_names)
        print("Feature importance for maladaptive classifier calculated successfully")
    except Exception as e:
        print(f"Error calculating importance for maladaptive classifier: {str(e)}")
        maladaptive_importance = None
    
    print("Processing test data...")
    test_timelines = load_all_timelines(test_dir)
    
    submission = {}
    for timeline in tqdm(test_timelines, desc="Processing test timelines"):
        timeline_id, result = process_test_timeline(
            timeline, adaptive_clf, maladaptive_clf, vectorizer, feature_names,
            adaptive_importance, maladaptive_importance
        )
        submission[timeline_id] = result
    
    print("Saving submission...")
    os.makedirs(output_path, exist_ok=True)
    output_file = os.path.join(output_path, f"{team_name}_1.json")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(submission, f, ensure_ascii=False, indent=2)
    
    print(f"Task A.1 processing complete! Submission saved to {output_file}")
    return submission


In [24]:
def evaluate_evidence_extraction(val_data, predictions, metric="bertscore"):
    """Evaluate evidence extraction performance using BERTScore"""
    from bert_score import score
    
    adaptive_scores = []
    maladaptive_scores = []
    
    for i, row in val_data.iterrows():
        post_id = row['post_id']
        if post_id in predictions:
            # Evaluate adaptive evidence
            gold_adaptive = row['adaptive_evidence']
            pred_adaptive = predictions[post_id]['adaptive_evidence']
            
            if gold_adaptive and pred_adaptive:
                P, R, F1 = score(pred_adaptive, gold_adaptive, lang="en")
                adaptive_scores.append(R.mean().item())  # Use recall as per CLPsych evaluation
            
            # Evaluate maladaptive evidence
            gold_maladaptive = row['maladaptive_evidence']
            pred_maladaptive = predictions[post_id]['maladaptive_evidence']
            
            if gold_maladaptive and pred_maladaptive:
                P, R, F1 = score(pred_maladaptive, gold_maladaptive, lang="en")
                maladaptive_scores.append(R.mean().item())
    
    return {
        "adaptive_recall": np.mean(adaptive_scores) if adaptive_scores else 0,
        "maladaptive_recall": np.mean(maladaptive_scores) if maladaptive_scores else 0,
        "overall_recall": np.mean(adaptive_scores + maladaptive_scores) if adaptive_scores + maladaptive_scores else 0
    }


In [25]:
from sklearn.model_selection import GridSearchCV

def tune_classifiers(X_train, y_train):
    """Find optimal hyperparameters for the classifiers"""
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    }
    
    grid_search = GridSearchCV(
        RandomForestClassifier(random_state=42),
        param_grid,
        cv=5,
        scoring='f1',
        n_jobs=-1
    )
    
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_


In [26]:
def extract_bert_features(texts, model_name="emilyalsentzer/Bio_ClinicalBERT"):
    """Extract BERT embeddings as features"""
    from transformers import AutoTokenizer, AutoModel
    import torch
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    
    embeddings = []
    for text in tqdm(texts, desc="Extracting BERT embeddings"):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Use CLS token embedding as document representation
        embeddings.append(outputs.last_hidden_state[:, 0, :].numpy().flatten())
    
    return np.array(embeddings)


In [27]:
def generate_final_submission(test_dir, output_dir, team_name, model_params):
    """Generate the final submission file for Task A.1"""
    # Load trained models and vectorizers
    adaptive_clf = model_params['adaptive_clf']
    maladaptive_clf = model_params['maladaptive_clf']
    vectorizer = model_params['vectorizer']
    feature_names = model_params['feature_names']
    
    # Process test timelines
    test_timelines = load_all_timelines(test_dir)
    submission = {}
    
    for timeline in tqdm(test_timelines, desc="Generating final predictions"):
        timeline_id, result = process_test_timeline(
            timeline, adaptive_clf, maladaptive_clf, vectorizer, feature_names
        )
        submission[timeline_id] = result
    
    # Save submission file
    output_file = os.path.join(output_dir, f"{team_name}_TaskA1.json")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(submission, f, ensure_ascii=False, indent=2)
    
    print(f"Final submission saved to {output_file}")
    return submission


In [28]:
if __name__ == "__main__":
    # Configuration
    TRAIN_DIR = "/kaggle/input/train-dataset-1"
    TEST_DIR = "/kaggle/input/test-dataset-1"
    OUTPUT_DIR = "/kaggle/working/"
    TEAM_NAME = "CIOL"
    
    # Run the complete pipeline
    submission = run_full_pipeline(TRAIN_DIR, TEST_DIR, OUTPUT_DIR, TEAM_NAME)
    
    print("Task A.1 completed successfully!")


Loading training data...
Total posts: 343
Posts with adaptive evidence: 169
Posts with maladaptive evidence: 179
Training classifiers...
Adaptive Classifier Performance:
              precision    recall  f1-score   support

           0       0.79      0.81      0.80        37
           1       0.77      0.75      0.76        32

    accuracy                           0.78        69
   macro avg       0.78      0.78      0.78        69
weighted avg       0.78      0.78      0.78        69

Maladaptive Classifier Performance:
              precision    recall  f1-score   support

           0       0.94      0.83      0.88        41
           1       0.79      0.93      0.85        28

    accuracy                           0.87        69
   macro avg       0.87      0.88      0.87        69
weighted avg       0.88      0.87      0.87        69

Calculating feature importance...
Feature importance for adaptive classifier calculated successfully
Feature importance for maladaptive clas

Processing test timelines:   0%|          | 0/10 [00:00<?, ?it/s]


NameError: name 'find_important_sentences' is not defined

In [None]:
import json
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def extract_wellbeing_data(timelines):
    """Extract posts with well-being scores from training timelines"""
    data = []
    for timeline in timelines:
        timeline_id = timeline["timeline_id"]
        prev_posts = []  # To store previous posts for contextual features
        
        for i, post in enumerate(timeline["posts"]):
            post_id = post["post_id"]
            post_text = post["post"]
            
            # Skip posts without well-being annotations
            if "Well-being" not in post or post["Well-being"] is None:
                prev_posts.append(post_text)
                continue
                
            wellbeing_score = post["Well-being"]
            
            # Extract adaptive and maladaptive evidence
            adaptive_evidence = []
            maladaptive_evidence = []
            if "evidence" in post:
                if "adaptive-state" in post["evidence"]:
                    for component, details in post["evidence"]["adaptive-state"].items():
                        if "highlighted_evidence" in details:
                            adaptive_evidence.append(details["highlighted_evidence"])
                
                if "maladaptive-state" in post["evidence"]:
                    for component, details in post["evidence"]["maladaptive-state"].items():
                        if "highlighted_evidence" in details:
                            maladaptive_evidence.append(details["highlighted_evidence"])
            
            # Get previous posts context (last 3 posts)
            context = prev_posts[-3:] if prev_posts else []
            
            data.append({
                "timeline_id": timeline_id,
                "post_id": post_id,
                "post_index": i,
                "text": post_text,
                "adaptive_evidence": adaptive_evidence,
                "maladaptive_evidence": maladaptive_evidence,
                "previous_posts": context,
                "wellbeing_score": wellbeing_score
            })
            
            # Update previous posts
            prev_posts.append(post_text)
    
    return pd.DataFrame(data)

# Load all training timelines
def load_all_timelines(data_dir):
    """Load all JSON files from a directory into a list of timelines"""
    timelines = []
    for filename in os.listdir(data_dir):
        if filename.endswith('.json'):
            with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as f:
                timeline = json.load(f)
                timelines.append(timeline)
    return timelines

# Load and prepare data
train_timelines = load_all_timelines("/kaggle/input/train-dataset-1")
wellbeing_df = extract_wellbeing_data(train_timelines)

# Analyze well-being score distribution
score_distribution = wellbeing_df['wellbeing_score'].value_counts().sort_index()
print("Well-being score distribution:")
print(score_distribution)

# Split data into training and validation sets
train_data, val_data = train_test_split(wellbeing_df, test_size=0.2, random_state=42)


In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Download required NLTK resources
nltk.download('vader_lexicon')
nltk.download('punkt')

def extract_wellbeing_features(df):
    """
    Extract features relevant to well-being prediction
    
    Features include:
    - Text features (length, sentiment)
    - Evidence-based features (presence, count, ratio)
    - Previous post context features
    - Content-based features (mentions of specific topics)
    """
    # Initialize sentiment analyzer
    sid = SentimentIntensityAnalyzer()
    
    # Create feature DataFrame
    features = pd.DataFrame()
    
    # 1. Basic text features
    features['text_length'] = df['text'].apply(len)
    features['word_count'] = df['text'].apply(lambda x: len(x.split()))
    features['sent_count'] = df['text'].apply(lambda x: len(nltk.sent_tokenize(x)))
    
    # 2. Sentiment features
    features['sentiment_neg'] = df['text'].apply(lambda x: sid.polarity_scores(x)['neg'])
    features['sentiment_neu'] = df['text'].apply(lambda x: sid.polarity_scores(x)['neu'])
    features['sentiment_pos'] = df['text'].apply(lambda x: sid.polarity_scores(x)['pos'])
    features['sentiment_compound'] = df['text'].apply(lambda x: sid.polarity_scores(x)['compound'])
    
    # 3. Evidence-based features
    features['has_adaptive'] = df['adaptive_evidence'].apply(lambda x: 1 if len(x) > 0 else 0)
    features['has_maladaptive'] = df['maladaptive_evidence'].apply(lambda x: 1 if len(x) > 0 else 0)
    features['adaptive_count'] = df['adaptive_evidence'].apply(len)
    features['maladaptive_count'] = df['maladaptive_evidence'].apply(len)
    
    # Calculate ratio of adaptive to total evidence spans
    features['adaptive_ratio'] = features.apply(
        lambda row: row['adaptive_count'] / (row['adaptive_count'] + row['maladaptive_count']) 
        if (row['adaptive_count'] + row['maladaptive_count']) > 0 else 0.5,
        axis=1
    )
    
    # 4. Context features (if previous posts exist)
    features['has_prev_posts'] = df['previous_posts'].apply(lambda x: 1 if len(x) > 0 else 0)
    features['prev_posts_count'] = df['previous_posts'].apply(len)
    
    # 5. Content-based features
    # Detect mentions of specific topics related to well-being
    # Social functioning
    features['mentions_friends'] = df['text'].apply(
        lambda x: 1 if re.search(r'\b(friend|friends|social|relationship|relationships)\b', x.lower()) else 0
    )
    features['mentions_family'] = df['text'].apply(
        lambda x: 1 if re.search(r'\b(family|parent|parents|mom|dad|sister|brother|sibling)\b', x.lower()) else 0
    )
    
    # Occupational functioning
    features['mentions_work'] = df['text'].apply(
        lambda x: 1 if re.search(r'\b(work|job|career|school|college|university|study|studies)\b', x.lower()) else 0
    )
    
    # Psychological functioning
    features['mentions_mental_health'] = df['text'].apply(
        lambda x: 1 if re.search(r'\b(depress|anxiety|stress|mental|therapy|therapist|psychologist|psychiatrist)\b', 
                                x.lower()) else 0
    )
    features['mentions_suicide'] = df['text'].apply(
        lambda x: 1 if re.search(r'\b(suicid|kill myself|end my life|die|death)\b', x.lower()) else 0
    )
    features['mentions_self_harm'] = df['text'].apply(
        lambda x: 1 if re.search(r'\b(cut|cutting|self-harm|hurt myself|harm myself)\b', x.lower()) else 0
    )
    
    return features

# Extract features from training and validation data
train_features = extract_wellbeing_features(train_data)
val_features = extract_wellbeing_features(val_data)

# Add text vectorization features
vectorizer = TfidfVectorizer(max_features=500, stop_words='english')
train_tfidf = vectorizer.fit_transform(train_data['text'])
val_tfidf = vectorizer.transform(val_data['text'])

# Convert sparse matrices to DataFrames
train_tfidf_df = pd.DataFrame(
    train_tfidf.toarray(), 
    columns=[f'tfidf_{i}' for i in range(train_tfidf.shape[1])]
)
val_tfidf_df = pd.DataFrame(
    val_tfidf.toarray(), 
    columns=[f'tfidf_{i}' for i in range(val_tfidf.shape[1])]
)

# Combine all features
train_features_full = pd.concat([train_features, train_tfidf_df], axis=1)
val_features_full = pd.concat([val_features, val_tfidf_df], axis=1)

# Prepare target values
train_target = train_data['wellbeing_score']
val_target = val_data['wellbeing_score']


In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd
import xgboost as xgb

def align_features_and_targets(features_df, targets_series):
    """
    Ensure features and targets are properly aligned with the same number of samples
    
    Parameters:
    -----------
    features_df : DataFrame
        Feature dataframe
    targets_series : Series
        Target values series
    
    Returns:
    --------
    tuple: (aligned_features, aligned_targets)
    """
    print(f"Original features shape: {features_df.shape}")
    print(f"Original targets shape: {targets_series.shape}")
    
    # Get the intersection of indices
    common_indices = features_df.index.intersection(targets_series.index)
    print(f"Number of common indices: {len(common_indices)}")
    
    # Filter both dataframes to only include common indices
    aligned_features = features_df.loc[common_indices]
    aligned_targets = targets_series.loc[common_indices]
    
    print(f"Aligned features shape: {aligned_features.shape}")
    print(f"Aligned targets shape: {aligned_targets.shape}")
    
    # Check for NaN values
    print(f"NaN values in aligned features: {aligned_features.isna().sum().sum()}")
    print(f"NaN values in aligned targets: {aligned_targets.isna().sum()}")
    
    return aligned_features, aligned_targets

def train_wellbeing_models(X_train, y_train, X_val, y_val):
    """Train and evaluate multiple regression models for well-being scoring"""
    
    # First ensure data alignment
    X_train, y_train = align_features_and_targets(X_train, y_train)
    X_val, y_val = align_features_and_targets(X_val, y_val)
    
    # Handle any remaining NaN values in features
    imputer = SimpleImputer(strategy='mean')
    X_train_imputed = imputer.fit_transform(X_train)
    X_val_imputed = imputer.transform(X_val)
    
    # Check for NaN values after imputation
    print(f"NaN values in training data after imputation: {np.isnan(X_train_imputed).sum()}")
    print(f"NaN values in validation data after imputation: {np.isnan(X_val_imputed).sum()}")
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_imputed)
    X_val_scaled = scaler.transform(X_val_imputed)
    
    # Model 1: Ridge Regression
    print("Training Ridge Regression model...")
    ridge = Ridge(alpha=1.0)
    ridge.fit(X_train_scaled, y_train)
    ridge_preds = ridge.predict(X_val_scaled)
    ridge_mse = mean_squared_error(y_val, ridge_preds)
    
    # Round predictions to nearest integer and clip to 1-10 range
    ridge_preds_rounded = np.round(np.clip(ridge_preds, 1, 10)).astype(int)
    ridge_mse_rounded = mean_squared_error(y_val, ridge_preds_rounded)
    
    print(f"Ridge Regression MSE: {ridge_mse:.4f}, Rounded MSE: {ridge_mse_rounded:.4f}")
    
    # Model 2: Random Forest
    print("Training Random Forest model...")
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train_imputed, y_train)
    rf_preds = rf.predict(X_val_imputed)
    rf_mse = mean_squared_error(y_val, rf_preds)
    
    # Round predictions to nearest integer and clip to 1-10 range
    rf_preds_rounded = np.round(np.clip(rf_preds, 1, 10)).astype(int)
    rf_mse_rounded = mean_squared_error(y_val, rf_preds_rounded)
    
    print(f"Random Forest MSE: {rf_mse:.4f}, Rounded MSE: {rf_mse_rounded:.4f}")
    
    # Model 3: Gradient Boosting
    print("Training Gradient Boosting model...")
    gb = GradientBoostingRegressor(n_estimators=100, random_state=42)
    gb.fit(X_train_imputed, y_train)
    gb_preds = gb.predict(X_val_imputed)
    gb_mse = mean_squared_error(y_val, gb_preds)
    
    # Round predictions to nearest integer and clip to 1-10 range
    gb_preds_rounded = np.round(np.clip(gb_preds, 1, 10)).astype(int)
    gb_mse_rounded = mean_squared_error(y_val, gb_preds_rounded)
    
    print(f"Gradient Boosting MSE: {gb_mse:.4f}, Rounded MSE: {gb_mse_rounded:.4f}")
    
    # Model 4: XGBoost
    print("Training XGBoost model...")
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
    xgb_model.fit(X_train_imputed, y_train)
    xgb_preds = xgb_model.predict(X_val_imputed)
    xgb_mse = mean_squared_error(y_val, xgb_preds)
    
    # Round predictions to nearest integer and clip to 1-10 range
    xgb_preds_rounded = np.round(np.clip(xgb_preds, 1, 10)).astype(int)
    xgb_mse_rounded = mean_squared_error(y_val, xgb_preds_rounded)
    
    print(f"XGBoost MSE: {xgb_mse:.4f}, Rounded MSE: {xgb_mse_rounded:.4f}")
    
    # Determine best model based on MSE
    model_mse = {
        'ridge': ridge_mse_rounded,
        'rf': rf_mse_rounded,
        'gb': gb_mse_rounded,
        'xgb': xgb_mse_rounded
    }
    
    best_model_name = min(model_mse, key=model_mse.get)
    best_model_dict = {
        'ridge': ridge,
        'rf': rf,
        'gb': gb,
        'xgb': xgb_model
    }
    best_preds_dict = {
        'ridge': ridge_preds_rounded,
        'rf': rf_preds_rounded,
        'gb': gb_preds_rounded,
        'xgb': xgb_preds_rounded
    }
    
    best_model = best_model_dict[best_model_name]
    best_preds = best_preds_dict[best_model_name]
    
    print(f"\nBest model: {best_model_name.upper()} with MSE: {model_mse[best_model_name]:.4f}")
    
    # Evaluate on score ranges
    def eval_score_range(y_true, y_pred, range_min, range_max):
        mask = (y_true >= range_min) & (y_true <= range_max)
        if sum(mask) > 0:
            return mean_squared_error(y_true[mask], y_pred[mask])
        return 0
    
    low_range_mse = eval_score_range(y_val, best_preds, 1, 4)
    mid_range_mse = eval_score_range(y_val, best_preds, 5, 6)
    high_range_mse = eval_score_range(y_val, best_preds, 7, 10)
    
    print(f"MSE for scores 1-4: {low_range_mse:.4f}")
    print(f"MSE for scores 5-6: {mid_range_mse:.4f}")
    print(f"MSE for scores 7-10: {high_range_mse:.4f}")
    
    # Analyze feature importance
    feature_importance = None
    if hasattr(best_model, 'feature_importances_'):
        feature_importance = {
            column: float(importance) 
            for column, importance in zip(X_train.columns, best_model.feature_importances_)
        }
        # Print top 10 important features
        sorted_importance = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)
        print("\nTop 10 important features:")
        for feature, importance in sorted_importance[:10]:
            print(f"{feature}: {importance:.4f}")
    
    return {
        'ridge': ridge,
        'rf': rf,
        'gb': gb,
        'xgb': xgb_model,
        'best_model': best_model,
        'scaler': scaler,
        'imputer': imputer,
        'feature_importance': feature_importance,
        'X_train_columns': X_train.columns.tolist()  # Store column names for prediction
    }

# Attempt to train the models with error handling
try:
    print("=== Starting well-being model training ===")
    # Make sure train_target and val_target are series with indices
    if not isinstance(train_target, pd.Series):
        train_target = pd.Series(train_target, index=train_features_full.index)
    if not isinstance(val_target, pd.Series):
        val_target = pd.Series(val_target, index=val_features_full.index)
    
    # Train models using properly aligned data
    models = train_wellbeing_models(
        train_features_full, train_target,
        val_features_full, val_target
    )
    print("Model training completed successfully!")
except Exception as e:
    print(f"Error during model training: {str(e)}")
    
    # Create a more basic set of features if the full feature set fails
    print("\nTrying with a simpler feature set...")
    
    # Extract basic features directly
    def create_basic_features(texts, wellbeing_scores):
        data = []
        for text, score in zip(texts, wellbeing_scores):
            # Basic text features
            length = len(text)
            word_count = len(text.split())
            
            # Simple sentiment features (without external libraries)
            positive_words = ['good', 'great', 'happy', 'joy', 'excellent', 'love', 'positive', 'wonderful']
            negative_words = ['bad', 'sad', 'angry', 'depressed', 'awful', 'hate', 'negative', 'terrible']
            
            pos_count = sum(1 for word in text.lower().split() if word in positive_words)
            neg_count = sum(1 for word in text.lower().split() if word in negative_words)
            
            data.append({
                'text_length': length,
                'word_count': word_count,
                'positive_word_count': pos_count,
                'negative_word_count': neg_count,
                'pos_neg_ratio': pos_count / (neg_count + 1),  # +1 to avoid division by zero
                'wellbeing_score': score
            })
        
        return pd.DataFrame(data)
    
    # Create simple features from text
    train_texts = train_data['text'].tolist()
    train_scores = train_data['wellbeing_score'].tolist()
    val_texts = val_data['text'].tolist()
    val_scores = val_data['wellbeing_score'].tolist()
    
    simple_train_df = create_basic_features(train_texts, train_scores)
    simple_val_df = create_basic_features(val_texts, val_scores)
    
    # Split features and target
    simple_train_features = simple_train_df.drop('wellbeing_score', axis=1)
    simple_train_target = simple_train_df['wellbeing_score']
    simple_val_features = simple_val_df.drop('wellbeing_score', axis=1)
    simple_val_target = simple_val_df['wellbeing_score']
    
    # Try training with simple features
    models = train_wellbeing_models(
        simple_train_features, simple_train_target,
        simple_val_features, simple_val_target
    )
    print("Model training completed with simplified feature set!")


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.neural_network import MLPRegressor

def extract_bert_embeddings(texts, model_name="emilyalsentzer/Bio_ClinicalBERT"):
    """
    Extract BERT embeddings for texts
    
    Parameters:
    texts (list): List of text strings
    model_name (str): Pretrained model name
    
    Returns:
    numpy.ndarray: BERT embeddings
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    
    embeddings = []
    batch_size = 8  # Process texts in batches
    
    for i in tqdm(range(0, len(texts), batch_size), desc="Extracting BERT embeddings"):
        batch_texts = texts[i:i+batch_size]
        
        # Tokenize and get attention masks
        encoded = tokenizer(batch_texts, padding=True, truncation=True, 
                           return_tensors="pt", max_length=512)
        
        # Extract embeddings
        with torch.no_grad():
            outputs = model(**encoded)
            
        # Use CLS token embedding as document representation
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.extend(batch_embeddings)
    
    return np.array(embeddings)

def train_neural_wellbeing_model(X_train, y_train, X_val, y_val):
    """Train neural network model for well-being prediction"""
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    
    # Neural Network Regressor
    nn_model = MLPRegressor(
        hidden_layer_sizes=(100, 50),
        activation='relu',
        solver='adam',
        alpha=0.001,
        max_iter=500,
        random_state=42
    )
    
    # Train the model
    nn_model.fit(X_train_scaled, y_train)
    
    # Predict and evaluate
    nn_preds = nn_model.predict(X_val_scaled)
    nn_preds_rounded = np.round(np.clip(nn_preds, 1, 10)).astype(int)
    nn_mse = mean_squared_error(y_val, nn_preds_rounded)
    
    print(f"Neural Network MSE: {nn_mse:.4f}")
    
    return nn_model, scaler

# Extract BERT embeddings for NLP-based well-being prediction
# Note: This step is optional and can be resource-intensive
try:
    print("Extracting BERT embeddings for training data...")
    train_bert_embeddings = extract_bert_embeddings(train_data['text'].tolist())
    val_bert_embeddings = extract_bert_embeddings(val_data['text'].tolist())
    
    # Train a neural network using BERT embeddings
    bert_nn_model, bert_scaler = train_neural_wellbeing_model(
        train_bert_embeddings, train_target,
        val_bert_embeddings, val_target
    )
    
    # Add BERT model to the models dictionary
    models['bert_nn'] = bert_nn_model
    models['bert_scaler'] = bert_scaler
    
except Exception as e:
    print(f"Skipping BERT embeddings due to error: {str(e)}")


In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

def create_ensemble_predictor(models, X_val, y_val):
    """
    Create an ensemble predictor that combines multiple models with proper handling of NaN values
    and feature name consistency
    
    Parameters:
    models (dict): Dictionary of trained models
    X_val (DataFrame): Validation features
    y_val (Series): Validation targets
    
    Returns:
    function: Ensemble prediction function
    dict: Model weights
    """
    print(f"Creating ensemble predictor with {len(models)} models")
    print(f"X_val shape: {X_val.shape}, NaN values: {X_val.isna().sum().sum()}")
    
    # Ensure we have the imputer and scaler
    imputer = models.get('imputer')
    scaler = models.get('scaler')
    
    if imputer is None:
        print("Warning: Imputer not found in models dictionary")
        # Create a simple imputer if none exists
        from sklearn.impute import SimpleImputer
        imputer = SimpleImputer(strategy='mean')
        imputer.fit(X_val)
        models['imputer'] = imputer
    
    # Get predictions from all available models
    predictions = {}
    
    # Handle NaN values in validation data
    X_val_imputed = pd.DataFrame(
        imputer.transform(X_val), 
        columns=X_val.columns,
        index=X_val.index
    )
    
    print(f"Imputed X_val shape: {X_val_imputed.shape}, NaN values: {X_val_imputed.isna().sum().sum()}")
    
    # Make predictions with each model
    if 'ridge' in models and models['ridge'] is not None:
        try:
            # Convert to numpy array to avoid feature name issues
            X_scaled = scaler.transform(X_val_imputed.values)
            ridge_preds = models['ridge'].predict(X_scaled)
            predictions['ridge'] = np.round(np.clip(ridge_preds, 1, 10)).astype(int)
            print("Successfully made Ridge predictions")
        except Exception as e:
            print(f"Error with Ridge model: {str(e)}")
    
    if 'rf' in models and models['rf'] is not None:
        try:
            rf_preds = models['rf'].predict(X_val_imputed)
            predictions['rf'] = np.round(np.clip(rf_preds, 1, 10)).astype(int)
            print("Successfully made Random Forest predictions")
        except Exception as e:
            print(f"Error with Random Forest model: {str(e)}")
    
    if 'gb' in models and models['gb'] is not None:
        try:
            gb_preds = models['gb'].predict(X_val_imputed)
            predictions['gb'] = np.round(np.clip(gb_preds, 1, 10)).astype(int)
            print("Successfully made Gradient Boosting predictions")
        except Exception as e:
            print(f"Error with Gradient Boosting model: {str(e)}")
    
    if 'xgb' in models and models['xgb'] is not None:
        try:
            xgb_preds = models['xgb'].predict(X_val_imputed)
            predictions['xgb'] = np.round(np.clip(xgb_preds, 1, 10)).astype(int)
            print("Successfully made XGBoost predictions")
        except Exception as e:
            print(f"Error with XGBoost model: {str(e)}")
    
    # If no models could make predictions, use a fallback
    if not predictions:
        print("No models could make predictions. Using default predictor.")
        # Return a function that always predicts middle score (5)
        return (lambda x, **kwargs: 5), {'default': 1.0}
    
    # Optimize model weights based on MSE
    weights = {}
    for model_name, preds in predictions.items():
        mse = mean_squared_error(y_val, preds)
        print(f"{model_name.upper()} MSE: {mse:.4f}")
        # Use inverse MSE as weight (better models get higher weights)
        weights[model_name] = 1 / mse if mse > 0 else 1
    
    # Normalize weights
    total_weight = sum(weights.values())
    for model_name in weights:
        weights[model_name] /= total_weight
    
    print("Ensemble model weights:")
    for model_name, weight in sorted(weights.items(), key=lambda x: x[1], reverse=True):
        print(f"{model_name}: {weight:.4f}")
    
    # Create ensemble prediction function
    def predict_ensemble(features, **kwargs):
        """
        Make well-being score predictions using ensemble
        
        Parameters:
        features (DataFrame): Features for traditional models
        **kwargs: Additional arguments (ignored)
        
        Returns:
        int: Predicted well-being score (1-10)
        """
        # Ensure we're predicting for a single sample
        if len(features.shape) > 1 and features.shape[0] > 1:
            print(f"Warning: Predicting for first sample only. Received {features.shape[0]} samples.")
            if isinstance(features, pd.DataFrame):
                features = features.iloc[[0]]
            else:
                features = features[[0], :]
        
        # Preprocess: handle NaN values
        try:
            if isinstance(features, pd.DataFrame):
                features_imputed = pd.DataFrame(
                    imputer.transform(features),
                    columns=features.columns,
                    index=features.index
                )
            else:
                features_imputed = imputer.transform(features)
        except Exception as e:
            print(f"Error in imputation: {str(e)}")
            return 5  # Default middle score if preprocessing fails
        
        # Collect predictions from each model
        model_predictions = []
        model_weights = []
        
        # Ridge model (needs scaling)
        if 'ridge' in weights:
            try:
                X_scaled = scaler.transform(features_imputed.values if isinstance(features_imputed, pd.DataFrame) 
                                          else features_imputed)
                ridge_pred = models['ridge'].predict(X_scaled)[0]
                model_predictions.append(int(np.round(np.clip(ridge_pred, 1, 10))))
                model_weights.append(weights['ridge'])
            except Exception as e:
                print(f"Ridge prediction error: {str(e)}")
        
        # Random Forest model
        if 'rf' in weights:
            try:
                rf_pred = models['rf'].predict(features_imputed)[0]
                model_predictions.append(int(np.round(np.clip(rf_pred, 1, 10))))
                model_weights.append(weights['rf'])
            except Exception as e:
                print(f"Random Forest prediction error: {str(e)}")
        
        # Gradient Boosting model
        if 'gb' in weights:
            try:
                gb_pred = models['gb'].predict(features_imputed)[0]
                model_predictions.append(int(np.round(np.clip(gb_pred, 1, 10))))
                model_weights.append(weights['gb'])
            except Exception as e:
                print(f"Gradient Boosting prediction error: {str(e)}")
        
        # XGBoost model
        if 'xgb' in weights:
            try:
                xgb_pred = models['xgb'].predict(features_imputed)[0]
                model_predictions.append(int(np.round(np.clip(xgb_pred, 1, 10))))
                model_weights.append(weights['xgb'])
            except Exception as e:
                print(f"XGBoost prediction error: {str(e)}")
        
        # If we have predictions, compute weighted average
        if model_predictions:
            # Weighted average
            weighted_pred = sum(p * w for p, w in zip(model_predictions, model_weights)) / sum(model_weights)
            final_pred = int(np.round(np.clip(weighted_pred, 1, 10)))
            return final_pred
        else:
            # If all models failed, return middle score
            print("All models failed to predict, returning default score")
            return 5
    
    return predict_ensemble, weights

# Create ensemble predictor with error handling
try:
    print("\n=== Creating ensemble predictor ===")
    # First ensure that validation data is aligned with targets
    if isinstance(val_target, pd.Series):
        X_val_aligned = val_features_full.loc[val_target.index]
    else:
        X_val_aligned = val_features_full
        
    # Now create the ensemble predictor
    ensemble_predictor, model_weights = create_ensemble_predictor(
        models, X_val_aligned, val_target
    )
    print("Successfully created ensemble predictor")
except Exception as e:
    print(f"Failed to create ensemble predictor: {str(e)}")
    # Fallback to best model
    if 'best_model' in models and models['best_model'] is not None:
        print("Using best model as fallback")
        best_model = models['best_model']
        imputer = models.get('imputer')
        scaler = models.get('scaler')
        
        def simple_predictor(features, **kwargs):
            try:
                # Basic preprocessing
                if imputer is not None:
                    features_imputed = imputer.transform(features)
                else:
                    features_imputed = features
                
                # Check if best model is Ridge (needs scaling)
                if hasattr(best_model, 'intercept_') and scaler is not None:
                    features_scaled = scaler.transform(features_imputed)
                    pred = best_model.predict(features_scaled)[0]
                else:
                    pred = best_model.predict(features_imputed)[0]
                
                return int(np.round(np.clip(pred, 1, 10)))
            except Exception as e:
                print(f"Error in simple predictor: {str(e)}")
                return 5  # Default middle score
        
        ensemble_predictor = simple_predictor
        model_weights = {'best_model': 1.0}
    else:
        print("No best model available, using default predictor")
        # Just return middle score
        ensemble_predictor = lambda x, **kwargs: 5
        model_weights = {'default': 1.0}


In [None]:
def predict_wellbeing_for_post(post_text, previous_posts, adaptive_evidence, maladaptive_evidence, 
                              models, vectorizer, ensemble_predictor=None):
    """
    Predict well-being score for a new post
    
    Parameters:
    post_text (str): Text content of the post
    previous_posts (list): List of previous post texts
    adaptive_evidence (list): Extracted adaptive evidence spans
    maladaptive_evidence (list): Extracted maladaptive evidence spans
    models (dict): Trained models
    vectorizer (TfidfVectorizer): Fitted vectorizer
    ensemble_predictor (function, optional): Ensemble prediction function
    
    Returns:
    int: Predicted well-being score (1-10)
    """
    # Create a DataFrame with the post
    post_df = pd.DataFrame([{
        'text': post_text,
        'adaptive_evidence': adaptive_evidence,
        'maladaptive_evidence': maladaptive_evidence,
        'previous_posts': previous_posts
    }])
    
    # Extract features
    features = extract_wellbeing_features(post_df)
    
    # Add text vectorization features
    tfidf = vectorizer.transform([post_text])
    tfidf_df = pd.DataFrame(
        tfidf.toarray(), 
        columns=[f'tfidf_{i}' for i in range(tfidf.shape[1])]
    )
    
    # Combine all features
    features_full = pd.concat([features, tfidf_df], axis=1)
    
    # If ensemble predictor is available, use it
    if ensemble_predictor is not None:
        return ensemble_predictor(features_full)
    
    # Otherwise, use the best model
    best_model = models['best_model']
    prediction = best_model.predict(features_full)[0]
    
    # Round to nearest integer and clip to 1-10 range
    final_prediction = int(np.round(np.clip(prediction, 1, 10)))
    
    return final_prediction


In [None]:
def process_timeline_for_taskA(timeline, adaptive_clf, maladaptive_clf, vectorizer, feature_names,
                              wellbeing_models, adaptive_importance=None, maladaptive_importance=None,
                              ensemble_predictor=None):
    """
    Process a test timeline for Task A (both A1 and A2)
    
    Parameters:
    timeline (dict): Timeline data
    adaptive_clf, maladaptive_clf: Classifiers from Task A1
    vectorizer: TF-IDF vectorizer
    feature_names: Feature names for the vectorizer
    wellbeing_models (dict): Trained well-being prediction models
    adaptive_importance, maladaptive_importance: Feature importance dictionaries
    ensemble_predictor: Ensemble prediction function
    
    Returns:
    tuple: (timeline_id, result dictionary)
    """
    timeline_id = timeline["timeline_id"]
    result = {
        "timeline_level": {"summary": ""},  # Will be filled by Task C
        "post_level": {}
    }
    
    # Keep track of previous posts for context
    previous_posts = []
    
    for post in timeline["posts"]:
        post_id = post["post_id"]
        post_text = post["post"] if "post" in post else ""
        
        if not post_text:
            # Handle empty posts
            result["post_level"][post_id] = {
                "adaptive_evidence": [],
                "maladaptive_evidence": [],
                "summary": "",
                "wellbeing_score": None
            }
            continue
        
        # Task A.1: Extract adaptive and maladaptive evidence
        adaptive_spans = extract_evidence_spans(
            post_text, adaptive_clf, vectorizer, feature_names,
            feature_importance=adaptive_importance
        )
        adaptive_spans = consolidate_spans(adaptive_spans, post_text)
        
        maladaptive_spans = extract_evidence_spans(
            post_text, maladaptive_clf, vectorizer, feature_names,
            feature_importance=maladaptive_importance
        )
        maladaptive_spans = consolidate_spans(maladaptive_spans, post_text)
        
        # Task A.2: Predict well-being score
        wellbeing_score = predict_wellbeing_for_post(
            post_text, previous_posts, adaptive_spans, maladaptive_spans,
            wellbeing_models, vectorizer, ensemble_predictor
        )
        
        # Add to results
        result["post_level"][post_id] = {
            "adaptive_evidence": adaptive_spans,
            "maladaptive_evidence": maladaptive_spans,
            "summary": "",  # Will be filled by Task B
            "wellbeing_score": wellbeing_score
        }
        
        # Update previous posts for context
        previous_posts.append(post_text)
        if len(previous_posts) > 5:  # Keep only last 5 posts for context
            previous_posts = previous_posts[-5:]
    
    return timeline_id, result


In [None]:
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

def run_full_taskA_pipeline(train_dir, test_dir, output_path, team_name="MyTeam"):
    """Run the complete Task A pipeline (A1 and A2) from training to submission generation"""
    print("Loading training data...")
    train_timelines = load_all_timelines(train_dir)
    
    # Task A.1 data preparation
    train_df_a1 = create_training_dataset(train_timelines)
    train_df_a1 = analyze_dataset(train_df_a1)
    
    # Task A.2 data preparation
    wellbeing_df = extract_wellbeing_data(train_timelines)
    
    print("Splitting data...")
    # Split data for both tasks
    train_data_a1, val_data_a1 = train_test_split(train_df_a1, test_size=0.2, random_state=42)
    train_data_a2, val_data_a2 = train_test_split(wellbeing_df, test_size=0.2, random_state=42)
    
    print("Training Task A.1 classifiers...")
    # Task A.1: Feature engineering and model training
    X_train_a1, vectorizer, feature_names = engineer_features(train_data_a1)
    X_val_a1 = vectorizer.transform(val_data_a1['text'])
    
    adaptive_clf, maladaptive_clf = train_binary_classifiers(
        X_train_a1, train_data_a1['has_adaptive'], train_data_a1['has_maladaptive'],
        X_val_a1, val_data_a1['has_adaptive'], val_data_a1['has_maladaptive']
    )
    
    # Calculate feature importance for Task A.1
    adaptive_importance = calculate_feature_importance_simple(adaptive_clf, feature_names)
    maladaptive_importance = calculate_feature_importance_simple(maladaptive_clf, feature_names)
    
    print("Training Task A.2 models...")
    # Task A.2: Feature engineering and model training
    train_features_a2 = extract_wellbeing_features(train_data_a2)
    val_features_a2 = extract_wellbeing_features(val_data_a2)
    
    # Add text vectorization features
    train_tfidf_a2 = vectorizer.transform(train_data_a2['text'])
    val_tfidf_a2 = vectorizer.transform(val_data_a2['text'])
    
    # Ensure indices are preserved during DataFrame creation
    train_tfidf_df_a2 = pd.DataFrame(
        train_tfidf_a2.toarray(), 
        columns=[f'tfidf_{i}' for i in range(train_tfidf_a2.shape[1])],
        index=train_data_a2.index  # Preserve index alignment
    )
    val_tfidf_df_a2 = pd.DataFrame(
        val_tfidf_a2.toarray(), 
        columns=[f'tfidf_{i}' for i in range(val_tfidf_a2.shape[1])],
        index=val_data_a2.index  # Preserve index alignment
    )
    
    # Ensure train_features_a2 and val_features_a2 have proper indices
    train_features_a2.index = train_data_a2.index
    val_features_a2.index = val_data_a2.index
    
    # Concatenate features with preserved indices
    train_features_full_a2 = pd.concat([train_features_a2, train_tfidf_df_a2], axis=1)
    val_features_full_a2 = pd.concat([val_features_a2, val_tfidf_df_a2], axis=1)
    
    # Ensure targets are Series with preserved indices
    train_target_a2 = pd.Series(train_data_a2['wellbeing_score'].values, index=train_data_a2.index)
    val_target_a2 = pd.Series(val_data_a2['wellbeing_score'].values, index=val_data_a2.index)
    
    # Train well-being models with properly aligned data
    wellbeing_models = train_wellbeing_models(
        train_features_full_a2, train_target_a2,
        val_features_full_a2, val_target_a2
    )
    
    print("Creating ensemble predictor...")
    # Create ensemble predictor using aligned data
    ensemble_predictor, model_weights = create_aligned_ensemble_predictor(
        wellbeing_models, val_features_full_a2, val_target_a2
    )
    
    print("Processing test data...")
    test_timelines = load_all_timelines(test_dir)
    
    submission = {}
    for timeline in tqdm(test_timelines, desc="Processing test timelines"):
        timeline_id, result = process_timeline_for_taskA(
            timeline, adaptive_clf, maladaptive_clf, vectorizer, feature_names,
            wellbeing_models, adaptive_importance, maladaptive_importance,
            ensemble_predictor
        )
        submission[timeline_id] = result
    
    print("Saving submission...")
    os.makedirs(output_path, exist_ok=True)
    output_file = os.path.join(output_path, f"{team_name}_TaskA.json")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(submission, f, ensure_ascii=False, indent=2)
    
    print(f"Task A processing complete! Submission saved to {output_file}")
    return submission

def create_aligned_ensemble_predictor(models, X_val, y_val):
    """
    Create an ensemble predictor that combines multiple models with proper data alignment
    
    Parameters:
    models (dict): Dictionary of trained models
    X_val (DataFrame): Validation features
    y_val (Series): Validation targets
    
    Returns:
    function: Ensemble prediction function
    dict: Model weights
    """
    print(f"Creating ensemble predictor with {len(models)} models")
    
    # Ensure X_val and y_val are properly aligned by index
    common_indices = X_val.index.intersection(y_val.index)
    print(f"Original X_val shape: {X_val.shape}, y_val shape: {len(y_val)}")
    print(f"Common indices: {len(common_indices)}")
    
    # Filter both dataframes to only include common indices
    X_val_aligned = X_val.loc[common_indices]
    y_val_aligned = y_val.loc[common_indices]
    
    print(f"Aligned X_val shape: {X_val_aligned.shape}, y_val shape: {len(y_val_aligned)}")
    
    # Ensure we have the imputer and scaler
    imputer = models.get('imputer')
    scaler = models.get('scaler')
    
    if imputer is None:
        print("Warning: Imputer not found in models dictionary")
        from sklearn.impute import SimpleImputer
        imputer = SimpleImputer(strategy='mean')
        imputer.fit(X_val_aligned)
        models['imputer'] = imputer
    
    # Handle NaN values in validation data
    X_val_imputed = pd.DataFrame(
        imputer.transform(X_val_aligned), 
        columns=X_val_aligned.columns,
        index=X_val_aligned.index
    )
    
    print(f"Imputed X_val shape: {X_val_imputed.shape}, NaN values: {X_val_imputed.isna().sum().sum()}")
    
    # Get predictions from all available models
    predictions = {}
    
    # Make predictions with each model
    if 'ridge' in models and models['ridge'] is not None:
        try:
            # Convert to numpy array to avoid feature name issues
            X_scaled = scaler.transform(X_val_imputed.values)
            ridge_preds = models['ridge'].predict(X_scaled)
            predictions['ridge'] = np.round(np.clip(ridge_preds, 1, 10)).astype(int)
            print("Successfully made Ridge predictions")
        except Exception as e:
            print(f"Error with Ridge model: {str(e)}")
    
    if 'rf' in models and models['rf'] is not None:
        try:
            # Convert to numpy array to avoid feature name issues
            rf_preds = models['rf'].predict(X_val_imputed.values)
            predictions['rf'] = np.round(np.clip(rf_preds, 1, 10)).astype(int)
            print("Successfully made Random Forest predictions")
        except Exception as e:
            print(f"Error with Random Forest model: {str(e)}")
    
    if 'gb' in models and models['gb'] is not None:
        try:
            # Convert to numpy array to avoid feature name issues
            gb_preds = models['gb'].predict(X_val_imputed.values)
            predictions['gb'] = np.round(np.clip(gb_preds, 1, 10)).astype(int)
            print("Successfully made Gradient Boosting predictions")
        except Exception as e:
            print(f"Error with Gradient Boosting model: {str(e)}")
    
    if 'xgb' in models and models['xgb'] is not None:
        try:
            # Convert to numpy array to avoid feature name issues
            xgb_preds = models['xgb'].predict(X_val_imputed.values)
            predictions['xgb'] = np.round(np.clip(xgb_preds, 1, 10)).astype(int)
            print("Successfully made XGBoost predictions")
        except Exception as e:
            print(f"Error with XGBoost model: {str(e)}")
    
    # If no models could make predictions, use a fallback
    if not predictions:
        print("No models could make predictions. Using default predictor.")
        return (lambda x, **kwargs: 5), {'default': 1.0}
    
    # Optimize model weights based on MSE
    weights = {}
    for model_name, preds in predictions.items():
        # Now y_val_aligned and preds should have the same length
        mse = mean_squared_error(y_val_aligned, preds)
        print(f"{model_name.upper()} MSE: {mse:.4f}")
        # Use inverse MSE as weight (better models get higher weights)
        weights[model_name] = 1 / mse if mse > 0 else 1
    
    # Normalize weights
    total_weight = sum(weights.values())
    for model_name in weights:
        weights[model_name] /= total_weight
    
    print("Ensemble model weights:")
    for model_name, weight in sorted(weights.items(), key=lambda x: x[1], reverse=True):
        print(f"{model_name}: {weight:.4f}")
    
    # Create ensemble prediction function
    def predict_ensemble(features, **kwargs):
        """
        Make well-being score predictions using ensemble
        
        Parameters:
        features (DataFrame): Features for traditional models
        **kwargs: Additional arguments (ignored)
        
        Returns:
        int: Predicted well-being score (1-10)
        """
        # Ensure we're predicting for a single sample
        if len(features.shape) > 1 and features.shape[0] > 1:
            print(f"Warning: Predicting for first sample only. Received {features.shape[0]} samples.")
            if isinstance(features, pd.DataFrame):
                features = features.iloc[[0]]
            else:
                features = features[[0], :]
        
        # Preprocess: handle NaN values
        try:
            if isinstance(features, pd.DataFrame):
                features_imputed = pd.DataFrame(
                    imputer.transform(features),
                    columns=features.columns,
                    index=features.index
                )
            else:
                features_imputed = imputer.transform(features)
        except Exception as e:
            print(f"Error in imputation: {str(e)}")
            return 5  # Default middle score if preprocessing fails
        
        # Collect predictions from each model
        model_predictions = []
        model_weights = []
        
        # Ridge model (needs scaling)
        if 'ridge' in weights:
            try:
                # Convert to numpy array to avoid feature name issues
                X_scaled = scaler.transform(features_imputed.values if isinstance(features_imputed, pd.DataFrame) 
                                          else features_imputed)
                ridge_pred = models['ridge'].predict(X_scaled)[0]
                model_predictions.append(int(np.round(np.clip(ridge_pred, 1, 10))))
                model_weights.append(weights['ridge'])
            except Exception as e:
                print(f"Ridge prediction error: {str(e)}")
        
        # Random Forest model
        if 'rf' in weights:
            try:
                # Convert to numpy array to avoid feature name issues
                rf_pred = models['rf'].predict(
                    features_imputed.values if isinstance(features_imputed, pd.DataFrame) 
                    else features_imputed
                )[0]
                model_predictions.append(int(np.round(np.clip(rf_pred, 1, 10))))
                model_weights.append(weights['rf'])
            except Exception as e:
                print(f"Random Forest prediction error: {str(e)}")
        
        # Gradient Boosting model
        if 'gb' in weights:
            try:
                # Convert to numpy array to avoid feature name issues
                gb_pred = models['gb'].predict(
                    features_imputed.values if isinstance(features_imputed, pd.DataFrame) 
                    else features_imputed
                )[0]
                model_predictions.append(int(np.round(np.clip(gb_pred, 1, 10))))
                model_weights.append(weights['gb'])
            except Exception as e:
                print(f"Gradient Boosting prediction error: {str(e)}")
        
        # XGBoost model
        if 'xgb' in weights:
            try:
                # Convert to numpy array to avoid feature name issues
                xgb_pred = models['xgb'].predict(
                    features_imputed.values if isinstance(features_imputed, pd.DataFrame) 
                    else features_imputed
                )[0]
                model_predictions.append(int(np.round(np.clip(xgb_pred, 1, 10))))
                model_weights.append(weights['xgb'])
            except Exception as e:
                print(f"XGBoost prediction error: {str(e)}")
        
        # If we have predictions, compute weighted average
        if model_predictions:
            # Weighted average
            weighted_pred = sum(p * w for p, w in zip(model_predictions, model_weights)) / sum(model_weights)
            final_pred = int(np.round(np.clip(weighted_pred, 1, 10)))
            return final_pred
        else:
            # If all models failed, return middle score
            print("All models failed to predict, returning default score")
            return 5
    
    return predict_ensemble, weights

# Main execution
if __name__ == "__main__":
    # Configuration
    TRAIN_DIR = "/kaggle/input/train-dataset-1"
    TEST_DIR = "/kaggle/input/test-dataset-1"
    OUTPUT_DIR = "/kaggle/working/"
    TEAM_NAME = "CIOL"
    
    # Run the pipeline
    submission = run_full_taskA_pipeline(TRAIN_DIR, TEST_DIR, OUTPUT_DIR, TEAM_NAME)
    
    print("Task A completed successfully!")


In [None]:
import json
import os
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# For systems without GPU, we can use CPU but generation will be slower
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")


In [None]:
def load_examples_from_training_data(train_dir, num_examples=3):
    """
    Load example post summaries from training data to use as few-shot examples
    """
    examples = []
    files = [f for f in os.listdir(train_dir) if f.endswith('.json')]
    
    for filename in files:
        with open(os.path.join(train_dir, filename), 'r', encoding='utf-8') as f:
            timeline = json.load(f)
            
        for post in timeline["posts"]:
            # Skip posts without summaries or evidence
            if "Post Summary" not in post or not post["Post Summary"]:
                continue
                
            if "evidence" not in post:
                continue
                
            adaptive_evidence = []
            maladaptive_evidence = []
            
            # Extract evidence spans
            if "adaptive-state" in post["evidence"]:
                for component, details in post["evidence"]["adaptive-state"].items():
                    if "highlighted_evidence" in details:
                        adaptive_evidence.append(details["highlighted_evidence"])
            
            if "maladaptive-state" in post["evidence"]:
                for component, details in post["evidence"]["maladaptive-state"].items():
                    if "highlighted_evidence" in details:
                        maladaptive_evidence.append(details["highlighted_evidence"])
            
            if not adaptive_evidence and not maladaptive_evidence:
                continue
                
            wellbeing_score = post.get("Well-being")
            if wellbeing_score is None:
                continue
                
            examples.append({
                "post": post["post"],
                "adaptive_evidence": adaptive_evidence,
                "maladaptive_evidence": maladaptive_evidence,
                "wellbeing_score": wellbeing_score,
                "summary": post["Post Summary"]
            })
            
            if len(examples) >= num_examples:
                return examples
    
    return examples


In [None]:
def format_evidence_list(evidence_list):
    """Format evidence list for prompt"""
    if not evidence_list:
        return "None found."
    
    return "\n".join([f"- \"{item}\"" for item in evidence_list])

def determine_dominant_state(adaptive_evidence, maladaptive_evidence, wellbeing_score):
    """Determine which self-state is dominant based on evidence and wellbeing score"""
    has_adaptive = len(adaptive_evidence) > 0
    has_maladaptive = len(maladaptive_evidence) > 0
    
    # Determine dominant state based on wellbeing score and evidence presence
    if wellbeing_score >= 7 and has_adaptive:
        return "adaptive"
    elif wellbeing_score <= 5 and has_maladaptive:
        return "maladaptive"
    elif has_adaptive and not has_maladaptive:
        return "adaptive"
    elif has_maladaptive and not has_adaptive:
        return "maladaptive"
    elif len(adaptive_evidence) > len(maladaptive_evidence):
        return "adaptive"
    elif len(maladaptive_evidence) > len(adaptive_evidence):
        return "maladaptive"
    else:
        # Default to maladaptive if unclear
        return "maladaptive" if has_maladaptive else "adaptive"


In [None]:
def create_summary_prompt_with_examples(post_text, adaptive_evidence, maladaptive_evidence, wellbeing_score, examples):
    """Create a prompt for the language model with few-shot examples"""
    dominant_state = determine_dominant_state(adaptive_evidence, maladaptive_evidence, wellbeing_score)
    
    # Base prompt with instructions
    base_prompt = f"""As a clinical psychologist expert in mental health, I need to generate a summary of self-states from a social media post. This summary should capture the interplay between adaptive (positive, constructive) and maladaptive (negative, harmful) self-states.

Self-states are characterized by specific combinations of Affect (A), Behavior (B), Cognition (C), and Desire/Need (D) that are co-activated for periods of time:
- Affect (A): Emotions expressed (pride, anxiety, depression, etc.)
- Behavior (B): Actions toward self (self-care, self-harm) or others (relating, controlling)
- Cognition (C): Perceptions of self (self-acceptance, self-criticism) or others (related, detached)
- Desire/Need/Expectation (D): What the person wants or expects

The summary should:
1. Start with the dominant self-state and identify its central organizing aspect (A, B, C, or D)
2. Describe how this central aspect influences other components
3. Emphasize causal relationships between components
4. Address the second self-state if present
5. Follow the same structure for the second self-state

The summary should read naturally without explicitly labeling components as A, B, C, or D, but should clearly describe the psychological dynamics.

Here are some examples:"""
    
    # Add few-shot examples
    example_section = ""
    for example in examples:
        example_section += f"""

Example Post:
"{example['post']}"

Adaptive Evidence:
{format_evidence_list(example['adaptive_evidence'])}

Maladaptive Evidence:
{format_evidence_list(example['maladaptive_evidence'])}

Well-being Score: {example['wellbeing_score']}

Summary:
{example['summary']}
"""
    
    # Now add the current post information
    current_post_section = f"""

Now, please generate a summary for this post:

Post:
"{post_text}"

Adaptive Evidence:
{format_evidence_list(adaptive_evidence)}

Maladaptive Evidence:
{format_evidence_list(maladaptive_evidence)}

Well-being Score: {wellbeing_score}

Based on this information, the dominant self-state appears to be {dominant_state}.

Summary:"""
    
    return base_prompt + example_section + current_post_section


In [None]:
def post_process_summary(summary):
    """Clean up and improve the generated summary"""
    # Remove any instruction residue or repetitions
    summary = re.sub(r"Summary:|The summary is:", "", summary).strip()
    
    # Remove any "In conclusion" or similar phrases that might appear
    summary = re.sub(r"^In (summary|conclusion|this post),", "", summary).strip()
    
    # Ensure the summary has proper paragraph structure
    if "." in summary and not summary.endswith("."):
        # Get the last complete sentence
        last_period = summary.rindex(".")
        summary = summary[:last_period+1]
    
    # Ensure the summary isn't too short
    if len(summary.split()) < 30:
        return "The post does not contain sufficient information to generate a meaningful summary of self-states."
    
    return summary


In [None]:
def load_model_for_summarization(model_name="meta-llama/Llama-2-7b-chat-hf"):
    """Load the language model for summarization"""
    print(f"Loading language model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Add padding token if it doesn't exist
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    # Use 8-bit quantization to reduce memory requirements
    model = AutoModelForCausalLM.from_pretrained(
        model_name, 
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto",
        load_in_8bit=True  # Use 8-bit quantization if available
    )
    return model, tokenizer


In [None]:
def generate_post_summary(post_text, adaptive_evidence, maladaptive_evidence, wellbeing_score, model, tokenizer, examples=None, max_length=500):
    """Generate a post-level summary using a language model approach"""
    # Create a prompt for the language model
    prompt = create_summary_prompt_with_examples(
        post_text, adaptive_evidence, maladaptive_evidence, wellbeing_score, examples
    )
    
    # Generate the summary
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    # Check if inputs are too long for the model
    if inputs.input_ids.shape[1] > tokenizer.model_max_length:
        print(f"Warning: Input too long ({inputs.input_ids.shape[1]} tokens). Truncating.")
        inputs = tokenizer(
            prompt, 
            truncation=True, 
            max_length=tokenizer.model_max_length - 100,  # Leave room for generation
            return_tensors="pt"
        ).to(model.device)
    
    # Generate summary
    try:
        with torch.no_grad():
            output = model.generate(
                inputs.input_ids,
                max_new_tokens=max_length,
                num_return_sequences=1,
                temperature=0.7,
                top_p=0.9,
                no_repeat_ngram_size=3,
                do_sample=True
            )
        
        # Decode the generated text
        summary = tokenizer.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
        
        # Clean up the summary
        summary = post_process_summary(summary)
        
        return summary
    except Exception as e:
        print(f"Error generating summary: {str(e)}")
        return create_fallback_summary(post_text, adaptive_evidence, maladaptive_evidence, wellbeing_score)
def generate_post_summary(post_text, adaptive_evidence, maladaptive_evidence, wellbeing_score, model, tokenizer, examples=None, max_length=500):
    """Generate a post-level summary using a language model approach"""
    # Create a prompt for the language model
    prompt = create_summary_prompt_with_examples(
        post_text, adaptive_evidence, maladaptive_evidence, wellbeing_score, examples
    )
    
    # Generate the summary
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    # Check if inputs are too long for the model
    if inputs.input_ids.shape[1] > tokenizer.model_max_length:
        print(f"Warning: Input too long ({inputs.input_ids.shape[1]} tokens). Truncating.")
        inputs = tokenizer(
            prompt, 
            truncation=True, 
            max_length=tokenizer.model_max_length - 100,  # Leave room for generation
            return_tensors="pt"
        ).to(model.device)
    
    # Generate summary
    try:
        with torch.no_grad():
            output = model.generate(
                inputs.input_ids,
                max_new_tokens=max_length,
                num_return_sequences=1,
                temperature=0.7,
                top_p=0.9,
                no_repeat_ngram_size=3,
                do_sample=True
            )
        
        # Decode the generated text
        summary = tokenizer.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
        
        # Clean up the summary
        summary = post_process_summary(summary)
        
        return summary
    except Exception as e:
        print(f"Error generating summary: {str(e)}")
        return create_fallback_summary(post_text, adaptive_evidence, maladaptive_evidence, wellbeing_score)


In [None]:
def create_fallback_summary(post_text, adaptive_evidence, maladaptive_evidence, wellbeing_score):
    """Create a simple template-based summary as a fallback"""
    has_adaptive = len(adaptive_evidence) > 0
    has_maladaptive = len(maladaptive_evidence) > 0
    
    # Determine dominant state
    dominant_state = determine_dominant_state(adaptive_evidence, maladaptive_evidence, wellbeing_score)
    
    if dominant_state == "adaptive" and has_adaptive:
        summary = f"The dominant self-state in this post is adaptive. The writer shows evidence of positive emotional states and constructive behaviors as seen in '{adaptive_evidence[0]}'. "
        
        if has_maladaptive:
            summary += f"However, there is also evidence of a maladaptive self-state as shown by '{maladaptive_evidence[0]}'. "
    elif dominant_state == "maladaptive" and has_maladaptive:
        summary = f"The dominant self-state in this post is maladaptive. The writer shows evidence of negative emotional states and harmful patterns as seen in '{maladaptive_evidence[0]}'. "
        
        if has_adaptive:
            summary += f"However, there is also evidence of an adaptive self-state as shown by '{adaptive_evidence[0]}'. "
    else:
        summary = "The post shows insufficient evidence to determine a clear dominant self-state."
    
    return summary


In [None]:
def process_posts_in_batches(taskA_output, test_timelines, model, tokenizer, examples, batch_size=8):
    """Process posts in batches for better efficiency"""
    # Create a mapping of timeline_id, post_id to post_text
    post_text_map = {}
    for timeline in test_timelines:
        timeline_id = timeline["timeline_id"]
        for post in timeline["posts"]:
            post_id = post["post_id"]
            post_text = post["post"]
            post_text_map[(timeline_id, post_id)] = post_text
    
    # Create a flat list of all posts to process
    posts_to_process = []
    for timeline_id, timeline_data in taskA_output.items():
        for post_id, post_data in timeline_data["post_level"].items():
            # Get data from Task A
            adaptive_evidence = post_data.get("adaptive_evidence", [])
            maladaptive_evidence = post_data.get("maladaptive_evidence", [])
            wellbeing_score = post_data.get("wellbeing_score")
            
            # Skip posts without evidence or wellbeing score
            if (not adaptive_evidence and not maladaptive_evidence) or wellbeing_score is None:
                continue
            
            # Get the post text from our mapping
            post_text = post_text_map.get((timeline_id, post_id), "")
            
            if not post_text:
                print(f"Warning: Could not find text for post {post_id} in timeline {timeline_id}")
                continue
            
            posts_to_process.append({
                "timeline_id": timeline_id,
                "post_id": post_id,
                "post_text": post_text,
                "adaptive_evidence": adaptive_evidence,
                "maladaptive_evidence": maladaptive_evidence,
                "wellbeing_score": wellbeing_score
            })
    
    # Process posts in batches
    taskB_output = taskA_output.copy()
    total_posts = len(posts_to_process)
    print(f"Processing {total_posts} posts in batches of {batch_size}")
    
    for i in range(0, total_posts, batch_size):
        batch = posts_to_process[i:i+batch_size]
        print(f"Processing batch {i//batch_size + 1}/{(total_posts + batch_size - 1)//batch_size}")
        
        for post_data in tqdm(batch, desc="Generating summaries"):
            timeline_id = post_data["timeline_id"]
            post_id = post_data["post_id"]
            
            # Generate summary
            summary = generate_post_summary(
                post_data["post_text"],
                post_data["adaptive_evidence"],
                post_data["maladaptive_evidence"],
                post_data["wellbeing_score"],
                model,
                tokenizer,
                examples=examples
            )
            
            # Add summary to output
            taskB_output[timeline_id]["post_level"][post_id]["summary"] = summary
    
    # Ensure all posts have summaries (even empty ones for posts without evidence)
    for timeline_id, timeline_data in taskA_output.items():
        for post_id in timeline_data["post_level"]:
            if "summary" not in taskB_output[timeline_id]["post_level"][post_id]:
                taskB_output[timeline_id]["post_level"][post_id]["summary"] = ""
    
    return taskB_output


In [None]:
def run_task_b_pipeline(taskA_output, test_timelines, train_dir, model_name="meta-llama/Llama-2-7b-chat-hf"):
    """Run the full Task B pipeline"""
    # Load examples from training data
    print("Loading examples from training data...")
    examples = load_examples_from_training_data(train_dir, num_examples=3)
    
    # Load the language model
    model, tokenizer = load_model_for_summarization(model_name)
    
    # Process posts in batches
    print("Processing posts...")
    taskB_output = process_posts_in_batches(
        taskA_output, test_timelines, model, tokenizer, examples
    )
    
    return taskB_output


In [None]:
!pip install huggingface_hub
from huggingface_hub import notebook_login

# Log in to your Hugging Face account
notebook_login()


def main(train_dir, test_dir, taskA_output_path, output_path, model_name="meta-llama/Llama-2-7b-chat-hf"):
    """Main function to run Task B pipeline"""
    # Load Task A output
    print(f"Loading Task A output from {taskA_output_path}")
    with open(taskA_output_path, 'r', encoding='utf-8') as f:
        taskA_output = json.load(f)
    
    # Load test timelines
    print(f"Loading test timelines from {test_dir}")
    test_timelines = []
    for filename in os.listdir(test_dir):
        if filename.endswith('.json'):
            with open(os.path.join(test_dir, filename), 'r', encoding='utf-8') as f:
                test_timelines.append(json.load(f))
    
    # Run Task B pipeline
    print("Running Task B pipeline...")
    taskB_output = run_task_b_pipeline(taskA_output, test_timelines, train_dir, model_name)
    
    # Save output
    print(f"Saving output to {output_path}")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(taskB_output, f, ensure_ascii=False, indent=2)
    
    print("Task B processing complete!")
    return taskB_output

def load_model_for_summarization(model_name="meta-llama/Llama-2-7b-chat-hf"):
    """Load the language model for summarization"""
    print(f"Loading language model: {model_name}")
    
    # Use the access token to load the model
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True) 
    
    # ... (rest of the function) ...

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto",
        load_in_8bit=True,  # Use 8-bit quantization if available
        use_auth_token=True  # Use the access token to load the model
    )
    return model, tokenizer

if __name__ == "__main__":
    # Instead of using argparse, directly call main with your desired values:
    train_dir = "/kaggle/input/train-dataset-1" # Replace with your actual path
    test_dir = "/kaggle/input/test-dataset-1"     # Replace with your actual path
    taskA_output_path = "/kaggle/working/CIOL_TaskA.json" # Replace with your actual path
    output_path = "/kaggle/working/" # Replace with your actual path
    model_name = "meta-llama/Llama-2-7b-chat-hf"  # You can change this if needed
    
    main(train_dir, test_dir, taskA_output_path, output_path, model_name)