**Task-1**

In [6]:
def min_edit_distance(s1, s2):
    """Return the Levenshtein edit distance between ``s1`` and ``s2``.

    Insertions, deletions and substitutions each cost 1; matching
    characters cost 0.

    Args:
        s1: Source sequence (typically a string).
        s2: Target sequence (typically a string).

    Returns:
        int: Minimum number of single-element edits turning ``s1`` into ``s2``.
    """
    rows, cols = len(s1) + 1, len(s2) + 1

    # table[i][j] holds the distance between s1[:i] and s2[:j].
    table = [[0] * cols for _ in range(rows)]

    # Against an empty prefix the only option is i deletions / j insertions.
    for i in range(rows):
        table[i][0] = i
    for j in range(cols):
        table[0][j] = j

    for i, ch1 in enumerate(s1, start=1):
        above, current = table[i - 1], table[i]
        for j, ch2 in enumerate(s2, start=1):
            deletion = above[j] + 1
            insertion = current[j - 1] + 1
            substitution = above[j - 1] + (0 if ch1 == ch2 else 1)
            current[j] = min(deletion, insertion, substitution)

    return table[rows - 1][cols - 1]

In [8]:
def similarity_score(str1, str2):
    """Return a normalised similarity for two strings.

    The score is ``1 - distance / max(len(str1), len(str2))`` where
    ``distance`` comes from ``min_edit_distance``. Two empty strings are
    defined to be identical and score ``1.0``.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        float: 1.0 for identical strings, lower as more edits are needed.
    """
    longest = max(len(str1), len(str2))
    if longest == 0:
        # Avoid dividing by zero: nothing differs between two empty strings.
        return 1.0
    return 1 - (min_edit_distance(str1, str2) / longest)

def main():
    """Prompt for two strings on stdin and print their similarity score."""
    print("String Similarity Calculator (with DP Table)")
    first = input("Enter first string: ")
    second = input("Enter second string: ")
    print(f"\nSimilarity Score: {similarity_score(first, second):.2f}")


if __name__ == "__main__":
    main()

String Similarity Calculator (with DP Table)
Enter first string: dwd
Enter second string: greg

Similarity Score: 0.00


**Task-2**

Setup and Load Corpus

In [None]:
import nltk
from nltk.corpus import brown
import os


# Fetch the corpus and the tokenizer resources used later in the notebook.
for resource in ('brown', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Lower-case every token so that n-gram counts are case-insensitive.
sentences = [[token.lower() for token in sent] for sent in brown.sents()]

print("Total sentences in Brown Corpus:", len(sentences))
print("Example sentence:", sentences[0])

Build unigram, bigram, trigram counts

In [None]:
from collections import Counter

unigram_counts = Counter()
bigram_counts = Counter()
trigram_counts = Counter()

for sent in sentences:
    # Pad each sentence with boundary markers so sentence starts and ends
    # are counted as n-gram context too.
    padded = ["<s>", *sent, "</s>"]
    unigram_counts.update(padded)
    bigram_counts.update(zip(padded, padded[1:]))
    trigram_counts.update(zip(padded, padded[1:], padded[2:]))

# Number of distinct unigram types, boundary markers included.
vocab = len(unigram_counts)

# Fallback prediction when a context has never been seen.
most_freq_word, _top_count = unigram_counts.most_common(1)[0]

print("Vocabulary size:", vocab)
print("Most frequent word:", most_freq_word)

Predict next word safely

In [None]:
def predict_next_word_bigram(word):
    """Return the word that most often follows ``word`` in ``bigram_counts``.

    Falls back to ``most_freq_word`` when ``word`` never appears as the
    first element of a counted bigram.
    """
    followers = {}
    for (first, second), freq in bigram_counts.items():
        if first == word:
            followers[second] = freq

    if not followers:
        return most_freq_word
    return max(followers, key=followers.get)

def predict_next_word_trigram(word1, word2):
    """Return the word that most often follows the pair ``(word1, word2)``.

    Looks the pair up as the first two elements of ``trigram_counts`` keys.
    When the pair was never seen, backs off to
    ``predict_next_word_bigram(word2)``.
    """
    context = (word1, word2)
    followers = {
        trigram[2]: freq
        for trigram, freq in trigram_counts.items()
        if trigram[:2] == context
    }

    if not followers:
        return predict_next_word_bigram(word2)
    return max(followers, key=followers.get)

Show n-grams

In [None]:


def show_ngrams(sentence):
    """Tokenize ``sentence`` and return its bigrams and trigrams.

    Args:
        sentence: Raw text; it is lower-cased before tokenization.

    Returns:
        tuple[list, list]: ``(bigrams, trigrams)`` as lists of token tuples.
    """
    # Register the per-user nltk_data directory only once. Appending on
    # every call would keep growing the global nltk.data.path list with
    # duplicate entries.
    user_data_dir = os.path.join(os.path.expanduser("~"), "nltk_data")
    if user_data_dir not in nltk.data.path:
        nltk.data.path.append(user_data_dir)

    tokens = nltk.word_tokenize(sentence.lower())
    bigrams = list(nltk.bigrams(tokens))
    trigrams = list(nltk.trigrams(tokens))
    return bigrams, trigrams


# Quick demonstration on a fixed example sentence.
sentence = "i am eating rice"
bigrams, trigrams = show_ngrams(sentence)
for label, grams in (("Bigrams:", bigrams), ("Trigrams:", trigrams)):
    print(label, grams)

Sentence probability using plain Laplace smoothing

In [None]:
def bigram_probability_laplace(sentence):
    """Return the add-one smoothed bigram probability of ``sentence``.

    The sentence is lower-cased, tokenized and wrapped in ``<s>`` / ``</s>``.
    Each adjacent pair contributes
    ``(count(w1, w2) + 1) / (count(w1) + vocab)`` to the product.

    Returns:
        float: Product of the smoothed conditional probabilities.
    """
    padded = ["<s>", *nltk.word_tokenize(sentence.lower()), "</s>"]
    prob = 1.0
    for pair in zip(padded, padded[1:]):
        numerator = bigram_counts.get(pair, 0) + 1
        denominator = unigram_counts.get(pair[0], 0) + vocab
        prob *= numerator / denominator
    return prob

def trigram_probability_laplace(sentence):
    """Return the add-one smoothed trigram probability of ``sentence``.

    The sentence is lower-cased, tokenized and wrapped in ``<s>`` / ``</s>``.
    Each consecutive triple contributes
    ``(count(w1, w2, w3) + 1) / (count(w1, w2) + vocab)`` to the product.

    Returns:
        float: Product of the smoothed conditional probabilities.
    """
    padded = ["<s>", *nltk.word_tokenize(sentence.lower()), "</s>"]
    prob = 1.0
    for w1, w2, w3 in zip(padded, padded[1:], padded[2:]):
        seen = trigram_counts.get((w1, w2, w3), 0)
        context = bigram_counts.get((w1, w2), 0)
        prob *= (seen + 1) / (context + vocab)
    return prob

User input loop and sentence probability using Laplace smoothing

In [None]:
# Interactive loop: read a sentence, predict the next word, list its n-grams
# and report its Laplace-smoothed bigram/trigram probabilities.
while True:
    print("\nEnter a sentence (or type 'exit' to quit):")
    user_input = input()
    # Strip so that stray whitespace around "exit" still ends the loop.
    if user_input.strip().lower() == "exit":
        break

    tokens = nltk.word_tokenize(user_input.lower())
    print("Tokens:", tokens)

    # An empty or whitespace-only line yields no tokens; indexing tokens[-1]
    # below would raise IndexError, so ask again instead.
    if not tokens:
        print("No tokens found; please enter at least one word.")
        continue

    # Use the two-word context when available, otherwise the single word.
    if len(tokens) >= 2:
        next_word = predict_next_word_trigram(tokens[-2], tokens[-1])
    else:
        next_word = predict_next_word_bigram(tokens[-1])
    print("Predicted next word:", next_word)

    bigrams, trigrams = show_ngrams(user_input)
    print("Bigrams:", bigrams)
    print("Trigrams:", trigrams)

    bigram_prob = bigram_probability_laplace(user_input)
    trigram_prob = trigram_probability_laplace(user_input)
    print(f"Bigram Probability (Laplace): {bigram_prob:.10e}")
    print(f"Trigram Probability (Laplace): {trigram_prob:.10e}")

In [None]:

print("--- Installing necessary libraries ---")
!pip install transformers torch pandas scikit-learn tqdm -q

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import re
from tqdm.auto import tqdm
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Prefer the GPU when torch reports one; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# The notes CSV is read from Google Drive, so the drive is mounted first.
print("\n--- Connecting to Google Drive ---")
from google.colab import drive
drive.mount('/content/drive')

# Location of the notes CSV inside the mounted drive.
file_path = '/content/drive/MyDrive/NOTEEVENTS_random.csv'


# Announce the path being read so a failed load is easy to diagnose.
print(f"\n--- Attempting to load data from: {file_path} ---")
if not os.path.exists(file_path):
    # Missing file: report it. Everything in the else-branch below
    # (cleaning, dataset preparation, training) is skipped in that case.
    print(f"Error: File not found at '{file_path}'")
else:
    # Read the whole CSV into a DataFrame; later code expects at least the
    # 'TEXT' and 'CATEGORY' columns to be present.
    df_notes = pd.read_csv(file_path, low_memory=False)
    print("Data loaded successfully!")

    def clean_text(text):
        """Normalise one note for tokenization.

        Non-string input (e.g. a missing value) becomes ``""``. Otherwise
        newlines are turned into spaces, ``[** ... **]`` spans are removed
        (presumably de-identification placeholders; confirm against the
        dataset), whitespace runs are collapsed and the result is
        stripped and lower-cased.
        """
        if not isinstance(text, str):
            return ""
        single_line = re.sub(r'\n', ' ', text)
        without_markers = re.sub(r'\[\*\*.*?\*\*\]', '', single_line)
        collapsed = re.sub(r'\s+', ' ', without_markers)
        return collapsed.strip().lower()

    # Clean every note in place before any labelling or sampling.
    df_notes['TEXT'] = df_notes['TEXT'].apply(clean_text)
    print("Text cleaning complete.")

    print("\n--- Preparing datasets for two tasks ---")

    # Task 1: physician notes (label 0) vs nursing notes (label 1).
    # The 'Physician ' category value is matched with its trailing space.
    df_physician = df_notes.loc[df_notes['CATEGORY'] == 'Physician '].assign(label=0)
    df_nursing = df_notes.loc[df_notes['CATEGORY'] == 'Nursing/other'].assign(label=1)

    # Balance the classes, capped at 1500 notes each.
    sample_size_task1 = min(len(df_physician), len(df_nursing), 1500)
    task1_balanced = pd.concat([
        df_physician.sample(sample_size_task1, random_state=42),
        df_nursing.sample(sample_size_task1, random_state=42),
    ])
    # Shuffle so the two classes are interleaved, then renumber the rows.
    df_task1 = task1_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
    print(f"Task 1 (Physician vs Nursing) Dataset Size: {len(df_task1)}")
    print(df_task1['label'].value_counts())


    def is_mortality_note(text):
        """Return 1 if ``text`` contains an end-of-life care phrase, else 0.

        Matching is a plain case-sensitive substring test against lower-case
        phrases, so callers should pass lower-cased text.

        NOTE(review): this label is a pure function of the note text, which
        is the same text the classifiers are later trained on. Confirm that
        this keyword proxy is the intended target.
        """
        mortality_keywords = (
            'comfort measures',
            'comfort care',
            'palliative care',
            'hospice',
            'withdrew care',
            'withdrawal of support',
            'terminal extubation',
        )
        for keyword in mortality_keywords:
            if keyword in text:
                return 1
        return 0

    # Task 2: weak "mortality" labels derived from keywords in the note text.
    df_notes['label'] = df_notes['TEXT'].apply(is_mortality_note)
    df_mortality_positive = df_notes[df_notes['label'] == 1].copy()
    df_mortality_negative = df_notes[df_notes['label'] == 0].copy()

    # Balance the classes at the size of the smaller one. Sampling more
    # negatives than exist would raise ValueError, so the positives are
    # down-sampled instead when they are the majority. When negatives are
    # the majority the positives are left untouched.
    n_per_class = min(len(df_mortality_positive), len(df_mortality_negative))
    if len(df_mortality_positive) > n_per_class:
        df_mortality_positive = df_mortality_positive.sample(n_per_class, random_state=42)
    df_mortality_negative_sampled = df_mortality_negative.sample(n_per_class, random_state=42)
    df_task2_full = pd.concat([df_mortality_positive, df_mortality_negative_sampled]).sample(frac=1, random_state=42).reset_index(drop=True)

    # Cap the working set at 3000 notes to keep fine-tuning time bounded.
    df_task2 = df_task2_full.sample(n=min(len(df_task2_full), 3000), random_state=42).reset_index(drop=True)
    print(f"\nTask 2 (Mortality Prediction) Dataset Size: {len(df_task2)} (Sampled from {len(df_task2_full)} total)")
    print(df_task2['label'].value_counts())


    class MedicalNotesDataset(Dataset):
        """Torch dataset that tokenizes one note per item.

        Args:
            texts: Indexable collection of note texts.
            labels: Indexable collection of integer labels, aligned with ``texts``.
            tokenizer: Object exposing ``encode_plus``.
            max_len: Length every encoded note is padded or truncated to.
        """

        def __init__(self, texts, labels, tokenizer, max_len=512):
            self.texts = texts
            self.labels = labels
            self.tokenizer = tokenizer
            self.max_len = max_len

        def __len__(self):
            return len(self.texts)

        def __getitem__(self, item):
            """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for one note."""
            text = str(self.texts[item])
            label = self.labels[item]
            encoding = self.tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=self.max_len,
                return_token_type_ids=False,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt',
            )
            # The tokenizer output is flattened so each item is 1-D.
            return {
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'labels': torch.tensor(label, dtype=torch.long),
            }

    def create_data_loader(df, tokenizer, max_len, batch_size):
        """Wrap the ``TEXT`` and ``label`` columns of ``df`` in a DataLoader.

        Rows are served in DataFrame order (no shuffling is requested) using
        two worker processes.
        """
        dataset = MedicalNotesDataset(
            texts=df.TEXT.to_numpy(),
            labels=df.label.to_numpy(),
            tokenizer=tokenizer,
            max_len=max_len,
        )
        return DataLoader(dataset, batch_size=batch_size, num_workers=2)

    def train_epoch(model, data_loader, optimizer, device, scheduler):
        """Run one training pass over ``data_loader``.

        For each batch: forward pass with labels, backward pass, gradient
        clipping to norm 1.0, optimizer step, scheduler step, gradient reset.

        Returns:
            tuple: ``(accuracy, mean_loss)`` where accuracy is a float64
            tensor (correct predictions / dataset size) and mean_loss is the
            mean of the per-batch losses.
        """
        model = model.train()
        batch_losses = []
        n_correct = 0

        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            preds = torch.max(outputs.logits, dim=1).indices

            n_correct = n_correct + (preds == labels).sum()
            batch_losses.append(loss.item())

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        accuracy = n_correct.double() / len(data_loader.dataset)
        return accuracy, np.mean(batch_losses)

    def eval_model(model, data_loader, device):
        """Evaluate ``model`` on ``data_loader`` without tracking gradients.

        Returns:
            tuple: ``(accuracy, mean_loss, precision, recall, f1)``. Accuracy
            is a float64 tensor (correct predictions / dataset size); the
            precision, recall and F1 values use binary averaging with
            ``zero_division=0``.
        """
        model = model.eval()
        batch_losses = []
        n_correct = 0
        y_true, y_pred = [], []

        with torch.no_grad():
            for batch in data_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                preds = torch.max(outputs.logits, dim=1).indices

                n_correct = n_correct + (preds == labels).sum()
                batch_losses.append(outputs.loss.item())
                # Move to CPU so the metric function receives numpy values.
                y_true.extend(labels.cpu().numpy())
                y_pred.extend(preds.cpu().numpy())

        accuracy = n_correct.double() / len(data_loader.dataset)
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average='binary', zero_division=0
        )
        return accuracy, np.mean(batch_losses), precision, recall, f1

    def run_experiment(df, model_name, task_name, max_len=256, batch_size=16,
                       epochs=4, learning_rate=2e-5, warmup_steps=50, seed=42):
        """Fine-tune ``model_name`` on ``df`` and report held-out metrics.

        Args:
            df: DataFrame with ``TEXT`` and ``label`` columns.
            model_name: Hugging Face model identifier to load.
            task_name: Label for this experiment, used in output and results.
            max_len: Token length each note is padded or truncated to.
            batch_size: Batch size for both training and evaluation.
            epochs: Number of passes over the training split.
            learning_rate: AdamW learning rate.
            warmup_steps: Warm-up steps for the linear schedule.
            seed: Seed passed to ``torch.manual_seed`` before the model is
                built, so repeated runs are comparable. ``None`` skips seeding.

        Returns:
            dict: ``task``, ``model`` (last path component of ``model_name``),
            ``f1_score`` and ``accuracy`` measured on the 20% test split.
        """
        print(f"\n{'='*25} RUNNING EXPERIMENT {'='*25}")
        print(f"TASK: {task_name} | MODEL: {model_name}")

        # Seed before model construction so that torch-driven randomness
        # (weight initialisation, dropout) is repeatable.
        if seed is not None:
            torch.manual_seed(seed)

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Stratify so both splits keep the same label balance.
        df_train, df_test = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
        train_loader = create_data_loader(df_train, tokenizer, max_len, batch_size)
        test_loader = create_data_loader(df_test, tokenizer, max_len, batch_size)

        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)
        optimizer = AdamW(model.parameters(), lr=learning_rate)
        # The scheduler is stepped once per batch, so the total is batches * epochs.
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=len(train_loader) * epochs,
        )

        for epoch in range(epochs):
            train_acc, train_loss = train_epoch(model, train_loader, optimizer, device, scheduler)
            print(f'Epoch {epoch + 1}/{epochs} -> Train loss {train_loss:.4f}, accuracy {train_acc:.4f}')

        test_acc, _, precision, recall, f1 = eval_model(model, test_loader, device)
        print(f"-> FINAL TEST RESULTS: Accuracy: {test_acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")
        return {'task': task_name, 'model': model_name.split('/')[-1], 'f1_score': f1, 'accuracy': test_acc.item()}


    models_to_compare = ['bert-base-uncased', 'emilyalsentzer/Bio_ClinicalBERT']
    experiments = [
        (df_task1, "Physician vs Nursing"),
        (df_task2, "Mortality Prediction"),
    ]

    # Run every model on every task: all models on task 1 first, then task 2.
    all_results = []
    for task_df, task_name in experiments:
        for model_name in models_to_compare:
            all_results.append(run_experiment(task_df, model_name, task_name))

    # Summarise as a task x model table of F1 and accuracy.
    banner = "=" * 20
    print("\n\n" + banner + " FINAL COMPARATIVE RESULTS " + banner)
    results_df = pd.DataFrame(all_results)
    print(results_df.pivot_table(index='task', columns='model', values=['f1_score', 'accuracy']))
    print("\n" + "=" * 65)


# Reported scores, entered by hand, in wide form: one column per
# "<model> <metric>" combination.
data = {
    'Task': ['Physician vs Nursing', 'Mortality Prediction'],
    'BERT-base Accuracy': [1.00, 0.91],
    'Bio_ClinicalBERT Accuracy': [1.00, 0.9217],
    'BERT-base F1': [1.00, 0.9135],
    'Bio_ClinicalBERT F1': [1.00, 0.9246]
}

df = pd.DataFrame(data)

# Long form: one row per (task, model, metric) for plotting.
df_melted = df.melt(id_vars='Task', var_name='Metric_Model', value_name='Score')

# Split "<model> <metric>" at the whitespace before the metric name. The
# model part must accept any characters: a letters-and-underscore class
# cannot match the hyphen in "BERT-base" and would capture only "base".
df_melted[['Model', 'Metric']] = df_melted['Metric_Model'].str.extract(r'^(.+?)\s+(Accuracy|F1)$')
df_melted = df_melted.drop(columns='Metric_Model')


# Grouped bars: one group per task, one bar per model. Rows sharing a task
# and model (accuracy and F1) are aggregated by seaborn into a single bar.
sns.set(style="whitegrid", font_scale=1.1)
fig, ax = plt.subplots(figsize=(10, 6))

sns.barplot(
    data=df_melted,
    x='Task',
    y='Score',
    hue='Model',
    palette=['#66b3ff', '#99ff99'],
    errorbar=None,
    ax=ax,
)

ax.set_title('Accuracy & F1 Comparison for BERT Models', fontsize=14, pad=15)
ax.set_ylabel('Score')
ax.set_xlabel('Task')
# Zoom in on the top of the range, where all the scores lie.
ax.set_ylim(0.85, 1.05)
ax.legend(title='Model', loc='lower right')
fig.tight_layout()
plt.show()

# One row per (task, metric), one column per model.
df_heatmap = pd.DataFrame({
    'Task': ['Physician vs Nursing', 'Physician vs Nursing', 'Mortality Prediction', 'Mortality Prediction'],
    'Metric': ['Accuracy', 'F1', 'Accuracy', 'F1'],
    'BERT-base': [1.00, 1.00, 0.91, 0.9135],
    'Bio_ClinicalBERT': [1.00, 1.00, 0.9217, 0.9246]
})

# Draw the same task x metric heatmap once per model, each in its own colour map.
for model_column, colour_map in (('BERT-base', 'Blues'), ('Bio_ClinicalBERT', 'Greens')):
    plt.figure(figsize=(8, 4))
    sns.heatmap(
        df_heatmap.pivot(index='Task', columns='Metric', values=model_column),
        annot=True, fmt='.3f', cmap=colour_map, cbar=False, linewidths=1
    )
    plt.title(f'{model_column} Performance Heatmap', fontsize=13, pad=10)
    plt.tight_layout()
    plt.show()


# Line plot across tasks: colour encodes the model, marker style the metric.
fig, ax = plt.subplots(figsize=(8, 5))
sns.lineplot(
    data=df_melted,
    x='Task',
    y='Score',
    hue='Model',
    style='Metric',
    markers=True,
    dashes=False,
    linewidth=2.5,
    palette=['#1f77b4', '#2ca02c'],
    ax=ax,
)

ax.set_title('Performance Trend: Accuracy vs F1 across Tasks', fontsize=14, pad=10)
ax.set_ylim(0.85, 1.05)
ax.set_ylabel('Score')
fig.tight_layout()
plt.show()
