In [9]:
# Mount Google Drive inside the Colab runtime; the dataset paths used further
# down all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# pip install pandas transformers nltk matplotlib seaborn plotly rouge-score


In [None]:
import pandas as pd

# CSV with the report data on the mounted Drive.
# The folder name "5c Networks " ends with a space, so the path has to be kept
# exactly as written.
dataset_path = '/content/drive/MyDrive/5c Networks /impression_300_llm.csv'
df = pd.read_csv(dataset_path)

In [None]:
# Preview the first rows to check the column layout
# (Report Name, History, Observation, Impression).
df.head()

Unnamed: 0,Report Name,History,Observation,Impression
0,MRI Brain Plain,LOC,Possible minimal volume loss of right hippocam...,Possible minimal volume loss of right hippocam...
1,MRI Orbits Plain and Contrast,document attached,The globe is normal shape. The uveoscleral thi...,Minimally increased perineural CSF signal alon...
2,MRI Brain Plain and Contrast,document attached,"No evidence of acute infarct, hemorrhage or sp...",No significant neuroparenchymal abnormality d...
3,MRI Brain Plain and Contrast,HEADACHE,Limited study due to phase encoded pulsation a...,Small vessel ischaemic disease (Fazekas grade ...
4,CT Abdomen & Pelvis Plain and Contrast - Female,PAIN IN RT SIDED UPPER ABDOMEN,The liver is normal sized. Focal fatty infiltr...,Prominently distended gallbladder. No calcifie...


In [None]:
# Combine the text columns into a single input for training.
# NOTE(review): pandas string concatenation yields NaN for a row as soon as one
# of the three columns is missing — confirm the data has no gaps here.
df['input_text'] = df['Report Name'] + " " + df['History'] + " " + df['Observation']

# Split the dataset into training and evaluation sets: 300 randomly sampled
# rows for training (fixed seed) and every remaining row for evaluation.
# These two frames are reassigned later by the train_test_split cell, which
# also reloads `df` from disk.
train_samples = df.sample(n=300, random_state=42)
eval_samples = df.drop(train_samples.index)

In [None]:
# Sanity-check the size of the two splits.
print(f'Training samples: {len(train_samples)}')
print(f'Evaluation samples: {len(eval_samples)}')

Training samples: 300
Evaluation samples: 30


In [None]:
# !pip install transformers

In [25]:
# !pip install huggingface-hub
from huggingface_hub import login

# Interactive authentication: this will prompt you to enter your Hugging Face
# token in a widget, so the token itself never appears in the notebook source.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [17]:
# !pip install transformers datasets torch rouge-score tqdm seaborn matplotlib


In [18]:
import gc
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from datasets import Dataset
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

In [19]:
import nltk
# Fetch the corpora required by the stop-word list and the WordNet lemmatizer
# that are instantiated in the following cell.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [20]:
# Shared text-normalisation helpers used by process_text further down.
stop_words = set(stopwords.words('english'))  # a set gives fast membership tests
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [21]:
# Reload the raw CSV. This replaces the earlier `df`, so the `input_text`
# column added to it above is no longer present on this frame.
df = pd.read_csv("/content/drive/MyDrive/5c Networks /impression_300_llm.csv")

In [22]:
def preprocess_text(df):
    """Build the model input text and keep only the columns used for training.

    Rows missing any of ``Report Name``, ``History``, ``Observation`` or
    ``Impression`` are dropped. The three descriptive columns are then joined
    with single spaces into ``input_text``.

    Args:
        df: DataFrame containing the four columns listed above.

    Returns:
        A new, independent DataFrame with exactly the columns ``input_text``
        and ``Impression``. The caller's frame is not modified and the index
        labels of the surviving rows are preserved.

    Raises:
        KeyError: If one of the required columns is absent.
    """
    required = ['Report Name', 'History', 'Observation', 'Impression']
    complete = df.dropna(subset=required)

    # Cast to str so a cell that was parsed as a number does not break the
    # string concatenation; for string cells this is a no-op.
    input_text = (
        complete['Report Name'].astype(str)
        + ' ' + complete['History'].astype(str)
        + ' ' + complete['Observation'].astype(str)
    )

    # ``assign`` creates the column on a fresh frame instead of writing into
    # the result of ``dropna`` (the chained-assignment pattern pandas warns
    # about). The final ``copy`` lets callers add columns to the result later
    # without touching a slice.
    return complete.assign(input_text=input_text)[['input_text', 'Impression']].copy()

In [23]:
# Select 300 samples for training and 30 samples for evaluation.
# `test_size=30` holds out exactly 30 rows; the training part is whatever
# remains. The fixed seed keeps the split reproducible.
train_samples, eval_samples = train_test_split(df, test_size=30, random_state=42)
# NOTE(review): preprocess_text drops rows with missing fields AFTER the split,
# so either set can end up smaller than the sizes named above.
train_samples = preprocess_text(train_samples)
eval_samples = preprocess_text(eval_samples)

In [None]:
# Load the tokenizer and model (Gemma 2B, instruction-tuned checkpoint).
# Hub repository ids are namespaced by organisation, so the bare name
# "gemma-2b-it" does not resolve; the checkpoint lives at "google/gemma-2b-it".
# The repository is gated, which is why the Hugging Face login cell has to be
# run first with an account that has accepted the Gemma terms.
model_name = "google/gemma-2b-it"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Gemma is a decoder-only architecture. AutoModelForSeq2SeqLM only maps
# encoder-decoder configurations and refuses a Gemma config, so the model has
# to be loaded through the causal-LM auto class.
model = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
# Tokenization function to process inputs and labels
def _tokenize_seq2seq(sources, targets, max_length):
    """Encode one batch as separate encoder inputs and decoder labels."""
    encoded = tokenizer(sources, max_length=max_length, truncation=True, padding="max_length", return_tensors='pt')
    target_encoded = tokenizer(targets, max_length=max_length, truncation=True, padding="max_length", return_tensors='pt')

    batch = {key: encoded[key] for key in encoded.keys()}
    # Padded label positions must not contribute to the loss; -100 is the
    # index the loss ignores.
    batch["labels"] = target_encoded["input_ids"].masked_fill(target_encoded["attention_mask"] == 0, -100)
    return batch


def _tokenize_causal(sources, targets, max_length):
    """Encode one batch as single right-padded sequences for a decoder-only model."""
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        # Some tokenizers define no pad token; the value is irrelevant because
        # padded positions are masked out of both attention and loss.
        pad_id = tokenizer.eos_token_id

    # The impression may use at most half of the window, so a long report can
    # never truncate the target away completely.
    target_budget = max_length // 2

    rows = {"input_ids": [], "attention_mask": [], "labels": []}
    for source, target in zip(sources, targets):
        target_ids = list(tokenizer(target, add_special_tokens=False)["input_ids"])
        # Terminate the target with EOS so the model learns where to stop.
        target_ids = target_ids[: target_budget - 1] + [tokenizer.eos_token_id]
        source_ids = list(
            tokenizer(source, truncation=True, max_length=max_length - len(target_ids))["input_ids"]
        )

        used = len(source_ids) + len(target_ids)
        padding = max_length - used
        rows["input_ids"].append(source_ids + target_ids + [pad_id] * padding)
        rows["attention_mask"].append([1] * used + [0] * padding)
        # Only the impression tokens are supervised: report tokens and padding
        # carry the ignore index.
        rows["labels"].append([-100] * len(source_ids) + target_ids + [-100] * padding)

    return {key: torch.tensor(value, dtype=torch.long) for key, value in rows.items()}


def tokenize_function(samples, batch_size=8, max_length=512, is_encoder_decoder=None):
    """Tokenize report/impression pairs into fixed-length tensors for ``Trainer``.

    Two layouts are supported, chosen by the model architecture:

    * encoder-decoder: ``input_ids`` hold the report text and ``labels`` hold
      the impression, each padded/truncated to ``max_length``.
    * decoder-only (causal LM): report and impression are concatenated into
      one sequence of ``max_length`` tokens, and ``labels`` mirror
      ``input_ids`` with the report part masked out.

    In both layouts padded label positions are set to -100 so they are
    excluded from the loss.

    Args:
        samples: DataFrame with the text columns ``input_text`` and
            ``Impression``.
        batch_size: Number of rows handled per progress-bar step.
        max_length: Length every returned sequence is padded/truncated to.
        is_encoder_decoder: Force a layout. When ``None`` the value is read
            from ``model.config.is_encoder_decoder`` of the loaded model.

    Returns:
        Dict of tensors with ``len(samples)`` rows each; it always contains
        ``input_ids``, ``attention_mask`` and ``labels``.

    Raises:
        ValueError: If ``samples`` is empty or ``max_length`` is below 2.
    """
    if len(samples) == 0:
        raise ValueError("tokenize_function needs at least one sample")
    if max_length < 2:
        raise ValueError("max_length must be at least 2")
    if is_encoder_decoder is None:
        is_encoder_decoder = bool(getattr(model.config, "is_encoder_decoder", False))

    sources = samples["input_text"].tolist()
    targets = samples["Impression"].tolist()
    encode_batch = _tokenize_seq2seq if is_encoder_decoder else _tokenize_causal

    batches = []
    for start in tqdm(range(0, len(sources), batch_size)):
        stop = start + batch_size
        batches.append(encode_batch(sources[start:stop], targets[start:stop], max_length))

    # Concatenate the per-batch tensors along the row dimension.
    return {key: torch.cat([batch[key] for batch in batches], dim=0) for key in batches[0]}

In [None]:
# Tokenize training and evaluation data.
# Each call returns a dict of tensors (model inputs plus a "labels" entry)
# covering every row of the respective split.
train_encodings = tokenize_function(train_samples)
eval_encodings = tokenize_function(eval_samples)

In [None]:
# Release unreferenced tokenization intermediates before training starts.
gc.collect()


In [None]:
# Define Hugging Face Trainer and Training Arguments.
# Checkpoints are written to ./results and logs to ./logs; evaluation and
# checkpointing both happen once per epoch.
training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): newer transformers releases call this argument
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # 2 sequences x 4 accumulation steps = 8 sequences per optimizer step on
    # each device.
    gradient_accumulation_steps=4,
    # Mixed-precision training; presumably needs a CUDA GPU runtime — TODO confirm.
    fp16=True,
)

In [None]:
# Define Trainer.
# The tensor dicts produced by tokenize_function are wrapped in `datasets.Dataset`
# objects so the Trainer can batch them.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=Dataset.from_dict(train_encodings),
    eval_dataset=Dataset.from_dict(eval_encodings),
    # NOTE(review): recent transformers versions prefer `processing_class=` over
    # `tokenizer=` — confirm against the installed version.
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning with the arguments defined above (3 epochs).
trainer.train()


In [None]:
def generate_impressions(samples, max_new_tokens=128):
    """Generate one impression per row of ``samples`` with the loaded model.

    Args:
        samples: DataFrame with a text column ``input_text``.
        max_new_tokens: Upper bound on the number of tokens generated for each
            report. Without an explicit bound the library's short default
            generation length applies, which cuts impressions off.

    Returns:
        List of decoded strings, one per input row and in the same order.
    """
    is_encoder_decoder = bool(getattr(model.config, "is_encoder_decoder", False))
    generated_impressions = []

    # Inference mode: disables dropout and, below, gradient tracking.
    model.eval()
    for input_text in tqdm(samples["input_text"].tolist()):
        # One text at a time needs no padding; padding the prompt out to 512
        # tokens would put pad tokens in front of the generated text. The
        # inputs are moved to the model's device, which is the GPU after
        # training on one.
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512).to(model.device)
        with torch.no_grad():
            output = model.generate(**inputs, max_new_tokens=max_new_tokens)

        sequence = output[0]
        if not is_encoder_decoder:
            # A decoder-only model returns prompt + continuation; keep only
            # the newly generated part.
            sequence = sequence[inputs["input_ids"].shape[1]:]

        decoded_output = tokenizer.decode(sequence, skip_special_tokens=True)
        generated_impressions.append(decoded_output)
    return generated_impressions

In [None]:
# Generate an impression for every evaluation row and store it next to the
# reference `Impression` column.
eval_samples['Generated_Impressions'] = generate_impressions(eval_samples)


In [None]:
# Persist inputs, references and generated impressions (written to the current
# working directory, without the index column).
eval_samples.to_csv('generated_impressions.csv', index=False)


In [None]:
# Evaluation Metrics: Perplexity and ROUGE Score
from transformers import pipeline
from rouge_score import rouge_scorer

In [None]:
# Compute Perplexity
def compute_perplexity(eval_samples):
    """Average per-text perplexity of the generated impressions under the model.

    Each text in ``eval_samples["Generated_Impressions"]`` is scored on its
    own: the model's loss with the text's own token ids as labels is
    exponentiated, and the resulting perplexities are averaged.

    Args:
        eval_samples: DataFrame with a text column ``Generated_Impressions``.

    Returns:
        Mean perplexity over all texts that produced a finite value, or
        ``nan`` when no text could be scored.
    """
    perplexities = []
    model.eval()
    with torch.no_grad():
        for text in eval_samples["Generated_Impressions"]:
            # No padding: with labels=input_ids, padding to 512 tokens would
            # score the pad tokens as well and distort the result. Inputs are
            # moved to the model's device.
            inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(model.device)
            loss = model(**inputs, labels=inputs["input_ids"]).loss
            perplexity = torch.exp(loss).item()
            # Texts too short to yield a loss (or an overflowing exponent)
            # give nan/inf; skip them so one bad row cannot poison the mean.
            if np.isfinite(perplexity):
                perplexities.append(perplexity)

    if not perplexities:
        return float("nan")
    return np.mean(perplexities)


In [None]:
# Score the generated impressions and report the average perplexity.
avg_perplexity = compute_perplexity(eval_samples)
print(f"Average Perplexity: {avg_perplexity}")

In [None]:
# Compute ROUGE Score
def compute_rouge(eval_samples):
    """Score every generated impression against its reference impression.

    Args:
        eval_samples: DataFrame with the columns ``Impression`` (reference)
            and ``Generated_Impressions`` (model output).

    Returns:
        List with one scorer result per row, in row order. Each result is what
        ``RougeScorer.score`` returns for the ``rouge1`` and ``rougeL`` metrics
        with stemming enabled.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    references = eval_samples['Impression']
    candidates = eval_samples['Generated_Impressions']
    # The reference is passed first and the generated text second.
    return [scorer.score(reference, candidate) for reference, candidate in zip(references, candidates)]

# One ROUGE result per evaluation row (reference vs. generated impression).
rouge_scores = compute_rouge(eval_samples)

In [None]:
# Display ROUGE scores: average the F-measure of each metric over all
# evaluation rows.
rouge_1_f1 = np.mean([score['rouge1'].fmeasure for score in rouge_scores])
rouge_l_f1 = np.mean([score['rougeL'].fmeasure for score in rouge_scores])
print(f"Average ROUGE-1 F1 Score: {rouge_1_f1}")
print(f"Average ROUGE-L F1 Score: {rouge_l_f1}")

In [None]:
def process_text(samples):
    """Normalise every text in ``samples['input_text']``.

    Each text is split on whitespace, tokens whose lower-cased form is in the
    module-level ``stop_words`` set are removed, and every remaining token is
    stemmed and then lemmatized. The surviving tokens are joined with single
    spaces.

    Args:
        samples: Mapping/DataFrame with an ``input_text`` column of strings.

    Returns:
        List of normalised strings, one per input text and in the same order.
    """
    def _normalise(text):
        kept = (token for token in text.split() if token.lower() not in stop_words)
        # Stem first, then lemmatize the stem — same order as applying the two
        # passes over the whole token list.
        return " ".join(lemmatizer.lemmatize(stemmer.stem(token)) for token in kept)

    return [_normalise(text) for text in samples['input_text']]

In [None]:
# `df` was reloaded from the CSV after the first `input_text` column was built,
# and preprocess_text only returned new frames, so the column process_text
# reads does not exist on `df` at this point. Build it here for every row.
# Missing cells become empty strings so no row is lost and no NaN reaches the
# tokenizer; re-running the cell gives the same result.
df['input_text'] = (
    df[['Report Name', 'History', 'Observation']]
    .fillna('')
    .astype(str)
    .agg(' '.join, axis=1)
)
df['Processed_Text'] = process_text(df)


In [None]:
def compute_embeddings(processed_text):
    """Embed each text as the mean of the model's final hidden states.

    Args:
        processed_text: Iterable of strings (for example a DataFrame column).

    Returns:
        ``numpy`` array with one row per text and one column per hidden
        dimension; an empty ``(0, 0)`` array when no text is given.
    """
    is_encoder_decoder = bool(getattr(model.config, "is_encoder_decoder", False))
    pooled = []

    model.eval()
    for text in tqdm(processed_text):
        # No padding: a single text needs none, and padded positions would
        # otherwise dilute the mean.
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(model.device)
        with torch.no_grad():
            if is_encoder_decoder:
                # Encoder-decoder models cannot run a full forward pass without
                # decoder inputs; the encoder alone yields the text states.
                hidden = model.get_encoder()(**inputs).last_hidden_state
            else:
                # Models with a language-modelling head return logits; the
                # hidden states have to be requested explicitly.
                hidden = model(**inputs, output_hidden_states=True).hidden_states[-1]

        # Average in float32 and weight by the attention mask so only real
        # tokens count.
        hidden = hidden.float()
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_count = mask.sum(dim=1).clamp(min=1)
        pooled.append(((hidden * mask).sum(dim=1) / token_count).cpu().numpy())

    if not pooled:
        return np.empty((0, 0), dtype=np.float32)
    return np.vstack(pooled)

# One embedding row per entry of df['Processed_Text'], in the same order.
embeddings = compute_embeddings(df['Processed_Text'])

In [None]:
def get_top_word_pairs(embeddings, df, num_pairs=100):
    """Return the most similar pairs of processed texts by cosine similarity.

    Every unordered pair of rows is considered exactly once, and the
    ``num_pairs`` pairs with the highest similarity overall are returned.

    Args:
        embeddings: 2-D array with one embedding per row of ``df``.
        df: DataFrame with a ``Processed_Text`` column, aligned row by row
            with ``embeddings``.
        num_pairs: Maximum number of pairs to return.

    Returns:
        List of ``(text_i, text_j, similarity)`` tuples sorted by descending
        similarity. Empty when fewer than two rows are given or ``num_pairs``
        is not positive.

    Raises:
        ValueError: If ``embeddings`` and ``df`` have different numbers of rows.
    """
    row_count = len(df)
    if len(embeddings) != row_count:
        raise ValueError("embeddings and df must have the same number of rows")
    if row_count < 2 or num_pairs <= 0:
        return []

    similarity_matrix = cosine_similarity(embeddings)

    # Strict upper triangle = each unordered pair (i < j) once, no self-pairs.
    first, second = np.triu_indices(row_count, k=1)
    scores = similarity_matrix[first, second]

    # Rank ALL candidate pairs before cutting to num_pairs; a stable sort keeps
    # ties in row order.
    best = np.argsort(-scores, kind="stable")[:num_pairs]

    texts = df['Processed_Text'].tolist()
    return [(texts[first[k]], texts[second[k]], scores[k]) for k in best]

# Up to 100 (text_i, text_j, similarity) tuples, using the default num_pairs.
top_word_pairs = get_top_word_pairs(embeddings, df)

In [None]:
# Visualization of top 100 word pairs
def plot_similarity_pairs(top_word_pairs):
    """Show a horizontal bar chart with one bar per similarity pair.

    Args:
        top_word_pairs: Sequence of ``(first, second, similarity)`` entries.
            The first two items form the bar label, the third the bar length.
    """
    labels = []
    scores = []
    for pair in top_word_pairs:
        labels.append(f"{pair[0]} <-> {pair[1]}")
        scores.append(pair[2])

    # Explicit figure/axes instead of the implicit pyplot state.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=scores, y=labels, ax=ax)
    ax.set_title("Top 100 Word Pairs Based on Embedding Similarity")
    ax.set_xlabel("Cosine Similarity")
    ax.set_ylabel("Word Pairs")
    plt.show()

In [None]:
# Render the bar chart for the pairs selected above.
plot_similarity_pairs(top_word_pairs)


In [None]:
# Free up memory after execution (collects unreferenced Python objects only;
# anything still bound to a notebook variable stays alive).
gc.collect()


# Key Features of This Code

- **Model Fine-tuning**: The code fine-tunes the Gemma 2B model on a specified dataset to generate impressions based on given report details.

- **Evaluation**: The model generates impressions for evaluation samples and computes metrics like Perplexity and ROUGE scores.

- **Text Analysis**: The code processes the entire dataset by removing stop words, applying stemming and lemmatization, and converting the text into embeddings.

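- **Similarity Detection**: Cosine similarity is computed between the embeddings of the processed report texts, the top 100 most similar pairs are identified, and a bar plot visualizing these pairs is generated. (The code and the plot call these "word pairs", but each item being compared is a whole processed report text, not a single word.)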

- **Visualization**: The similarity of the top word pairs is plotted for easy interpretation.

# Deliverables for GitHub Repository

- All source code used in the notebook.
- Documentation that explains:
  - Your approach to model fine-tuning and evaluation.
  - Text analysis methodologies and any assumptions made.
- Results of the model evaluation including perplexity and ROUGE scores.
- Visualization(s) of the top 100 word pairs.
- (Bonus) Interactive visualization code if implemented.
- A brief report summarizing findings, challenges, and areas for improvement.

# Submission

Make sure to upload your dataset to the specified path in the code, and modify the path accordingly. Once everything is complete, create a public GitHub repository and submit the link through the specified submission form by the deadline.

# Additional Notes

- Ensure all necessary libraries are installed and up-to-date.
- Depending on your environment (local vs. cloud), you may need to adjust paths and configurations.
- Thoroughly test your code before submission to ensure everything runs smoothly.


