## Posibles Features

### Sentiment

In [12]:
# ANALISIS DE SENTIMIENTO DE LAS NOTAS MÉDICAS
# La única parte buena de que las notas sean escritas en inglés en vez de manera tabular 
# es que podemos extraer conclusiones sobre el estado del paciente a partir de cómo 
# las escribe el doctor.
from transformers import pipeline

# Initialize sentiment analysis pipeline
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# Example medical notes
medical_notes = [
    "The patient is a 17-year-old female with a BMI of 18.89, presenting with an above-normal random glucose level. She has no history of hypertension or heart disease and is a non-smoker. While her HbA1c is within normal limits at 5.0%, the elevated random glucose warrants further evaluation to rule out potential metabolic concerns. I recommend a comprehensive assessment, including detailed dietary and activity history, and consider referral to a specialist for further testing and management to ensure optimal health outcomes.",
    "The patient is a 38-year-old female with a BMI of 26.29, indicating a slightly overweight status. She has no history of hypertension or heart disease but reports a past smoking habit. Her recent blood work shows a normal HbA1c and a random glucose level of 85 mg/dL, suggesting good current metabolic health. It is advisable to maintain a balanced diet and regular physical activity to support cardiovascular health. Continued monitoring and lifestyle modifications are recommended to reduce future risk factors. Follow-up appointments should focus on cardiovascular risk assessment and lifestyle counseling.",
    """**Subjective:**  
The patient is an eighty-year-old female with a history of past smoking. She reports no current symptoms of chest pain, shortness of breath, or dizziness.

**Objective:**  
Vital signs are stable. BMI is 32.76, indicating obesity. Laboratory results show an HbA1c of 7.5% and a random glucose of 200 mg/dL, suggesting suboptimal glycemic control. No signs of hypertension or heart disease are noted.

**Assessment:**  
The patient’s metabolic parameters warrant ongoing monitoring. Her obesity and elevated glucose levels increase her risk for cardiovascular and metabolic complications. Her smoking history should be considered in her overall risk profile.

**Plan:**  
Recommend lifestyle modifications focusing on weight management and diet. Continue regular blood pressure and glucose monitoring. Consider referral to a dietitian and evaluate the need for medication adjustments. Encourage smoking cessation support if applicable. Follow-up in 3 months to reassess metabolic status."""
]

# Perform sentiment analysis
results = sentiment_analyzer(medical_notes)

# Display results
for note, result in zip(medical_notes, results):
    print(f"Review: {note[:100]}...")
    print(f"Sentiment: {result['label']}, Score: {result['score']:.4f}\n")


Device set to use cuda:0


Review: The patient is a 17-year-old female with a BMI of 18.89, presenting with an above-normal random gluc...
Sentiment: POSITIVE, Score: 0.9638

Review: The patient is a 38-year-old female with a BMI of 26.29, indicating a slightly overweight status. Sh...
Sentiment: POSITIVE, Score: 0.6516

Review: **Subjective:**  
The patient is an eighty-year-old female with a history of past smoking. She repor...
Sentiment: NEGATIVE, Score: 0.9725



In [16]:
import pandas as pd
train_df = pd.read_csv('raw_train_data.csv')
test_df = pd.read_csv('raw_test_data.csv')
train_df_sentiments = sentiment_analyzer(train_df['medical_note'].tolist())
test_df_sentiments = sentiment_analyzer(test_df['medical_note'].tolist())

In [21]:
train_df.drop(columns=['medical_note'], inplace=True)
test_df.drop(columns=['medical_note'], inplace=True)

In [23]:
train_df[['sentiment', 'score']] = pd.DataFrame(train_df_sentiments)
test_df[['sentiment', 'score']] = pd.DataFrame(test_df_sentiments)

In [25]:
train_df['sentiment'] = (train_df['sentiment'] == 'POSITIVE').astype(int)
test_df['sentiment'] = (test_df['sentiment'] == 'POSITIVE').astype(int)

In [26]:
train_df.to_csv('train_data_with_sentiment.csv', index=False)
test_df.to_csv('test_data_with_sentiment.csv', index=False)

### Embeddings

In [None]:
import torch
from sentence_transformers import SentenceTransformer

# Load the model
# model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B") # You can delete this first load

# We recommend enabling flash_attention_2 for better acceleration and memory saving,
# together with setting `padding_side` to "left":
model = SentenceTransformer(
    "Qwen/Qwen3-Embedding-0.6B",
    model_kwargs={
        "attn_implementation": "flash_attention_2",
        "device_map": "auto",
        "dtype": torch.bfloat16
    },
    tokenizer_kwargs={"padding_side": "left"},
)

# The queries and documents to embed
queries = [
    "What is the capital of China?",
    "Explain gravity",
]
documents = [
    "The capital of China is Beijing.",
    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
]

# ... your remaining code ...
query_embeddings = model.encode(queries, prompt_name="query")
document_embeddings = model.encode(documents)

similarity = model.similarity(query_embeddings, document_embeddings)
print(similarity)

tensor([[0.7640, 0.1404],
        [0.1349, 0.5974]])


In [10]:
import pandas as pd
train_df = pd.read_csv('raw_train_data.csv')
test_df = pd.read_csv('raw_test_data.csv')
train_df_embeddings = model.encode(train_df['medical_note'].tolist())
test_df_embeddings = model.encode(test_df['medical_note'].tolist())

# Remove medical_note column and add the embeddings as new columns
train_df = train_df.drop(columns=['medical_note'])
test_df = test_df.drop(columns=['medical_note'])
for i in range(train_df_embeddings.shape[1]):
    train_df[f'{i}'] = train_df_embeddings[:, i]
for i in range(test_df_embeddings.shape[1]):
    test_df[f'{i}'] = test_df_embeddings[:, i]

# Save it to a csv file
train_df.to_csv('train_embeddings.csv', index=False)
test_df.to_csv('test_embeddings.csv', index=False)


  train_df[f'{i}'] = train_df_embeddings[:, i]
  train_df[f'{i}'] = train_df_embeddings[:, i]
  train_df[f'{i}'] = train_df_embeddings[:, i]
  train_df[f'{i}'] = train_df_embeddings[:, i]
  train_df[f'{i}'] = train_df_embeddings[:, i]
  train_df[f'{i}'] = train_df_embeddings[:, i]
  train_df[f'{i}'] = train_df_embeddings[:, i]
  train_df[f'{i}'] = train_df_embeddings[:, i]
  train_df[f'{i}'] = train_df_embeddings[:, i]
  train_df[f'{i}'] = train_df_embeddings[:, i]
  train_df[f'{i}'] = train_df_embeddings[:, i]
  train_df[f'{i}'] = train_df_embeddings[:, i]
  train_df[f'{i}'] = train_df_embeddings[:, i]
  train_df[f'{i}'] = train_df_embeddings[:, i]
  train_df[f'{i}'] = train_df_embeddings[:, i]
  train_df[f'{i}'] = train_df_embeddings[:, i]
  train_df[f'{i}'] = train_df_embeddings[:, i]
  train_df[f'{i}'] = train_df_embeddings[:, i]
  train_df[f'{i}'] = train_df_embeddings[:, i]
  train_df[f'{i}'] = train_df_embeddings[:, i]
  train_df[f'{i}'] = train_df_embeddings[:, i]
  train_df[f'