## Question1

##### a) Implement FastText and BERT Embeddings:


In [5]:
import numpy as np

In [2]:
import fasttext
from huggingface_hub import hf_hub_download
import os

try:
    # Download the model and get the local path
    model_path = hf_hub_download(repo_id="facebook/fasttext-en-vectors", filename="model.bin")
    print(f"Model downloaded to: {model_path}")

    # Verify file exists and check its size
    if os.path.exists(model_path):
        file_size = os.path.getsize(model_path) / (1024 * 1024)  # Size in MB
        print(f"Model file size: {file_size:.2f} MB")
    else:
        raise FileNotFoundError(f"Model file not found at {model_path}")

    # Load the model
    model_ft = fasttext.load_model(model_path)
    print("Model loaded successfully!")

    
    print(model_ft.get_word_vector("hello"))
except MemoryError:
    print("MemoryError: Not enough RAM to load the model. Try:")
    print("- Closing other applications to free up memory")
    print("- Increasing virtual memory in Windows settings")
    print("- Using a smaller model or a machine with more RAM")
except Exception as e:
    print(f"An error occurred: {e}")

  from .autonotebook import tqdm as notebook_tqdm


Model downloaded to: C:\Users\NamraAbid\.cache\huggingface\hub\models--facebook--fasttext-en-vectors\snapshots\a80392390daaee1a91000da45c376d512e1dc555\model.bin
Model file size: 6901.91 MB
Model loaded successfully!
[ 1.57576188e-01  4.37820926e-02 -4.51271934e-03  6.65931404e-02
  7.70346820e-02  4.85855248e-03  8.19822028e-03  6.52402919e-03
  9.25899856e-03  3.53899002e-02 -2.31395271e-02 -4.91807126e-02
 -8.32642540e-02  1.56014524e-02  2.54856616e-01  3.45423706e-02
 -1.07451361e-02 -7.80188590e-02 -7.08099529e-02  7.62385577e-02
 -6.09613657e-02  4.48625796e-02 -7.29744136e-02  1.30583309e-02
  3.14881057e-02 -3.10055036e-02  1.66004002e-02  1.74405202e-02
 -7.35838860e-02  1.18252613e-01 -1.21330231e-01 -4.09253240e-02
  2.93969568e-02  4.84445989e-02 -1.33816330e-02 -1.74765270e-02
  7.51308873e-02  9.97046307e-02 -4.00476977e-02  4.05735290e-03
 -7.21896589e-02 -4.43356819e-02 -1.22628408e-03  7.56693557e-02
  3.98401320e-02  3.22643593e-02  1.95914153e-02  4.68016043e-02
 -1

In [6]:
texts = [
    "Cats are great pets.",
    "Dogs are good companions.",
    "I love programming in Python.",
    "Snakes are not very affectionate."
]
fasttext_embeddings = []
if model_ft:
    for text in texts:
        words = text.split()
        word_embeddings = [model_ft.get_word_vector(word) for word in words]
        sentence_embedding = np.mean(word_embeddings, axis=0) if word_embeddings else np.zeros(model_ft.get_dimension())
        fasttext_embeddings.append(sentence_embedding)
    print("FastText Embedding for first sentence:", fasttext_embeddings[0][:5], "...")

FastText Embedding for first sentence: [ 0.0445471   0.0264558  -0.0020836   0.04669745 -0.01345486] ...


In [9]:

import re
import torch
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer, util

In [66]:


# 1a. BERT Embeddings
try:
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertModel.from_pretrained('bert-base-uncased')
    print("BERT model and tokenizer loaded successfully!")
except Exception as e:
    print(f"BERT error: {e}")
    tokenizer, bert_model = None, None

bert_embeddings = []
if bert_model and tokenizer:
    for text in texts:
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
        with torch.no_grad():
            outputs = bert_model(**inputs)
        sentence_embedding = outputs.last_hidden_state[:, 0, :].numpy()
        bert_embeddings.append(sentence_embedding[0])
    print("BERT Embedding for first sentence:", bert_embeddings[0][:5], "...")

# 1b. SBERT Embeddings
try:
    sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
    print("SBERT model loaded successfully!")
except Exception as e:
    print(f"SBERT error: {e}")
    sbert_model = None

sbert_embeddings = []
if sbert_model:
    sbert_embeddings = sbert_model.encode(texts)
    print("SBERT Embedding for first sentence:", sbert_embeddings[0][:5], "...")

# 1b. Calculate Cosine Similarity for All Pairs
if len(fasttext_embeddings) > 0 and len(bert_embeddings) > 0 and len(sbert_embeddings) > 0:
    print("\nSentence-level Cosine Similarities:")
    for i in range(len(texts)):
        for j in range(i + 1, len(texts)):
            fasttext_sim = util.cos_sim(fasttext_embeddings[i], fasttext_embeddings[j]).item()
            bert_sim = util.cos_sim(bert_embeddings[i], bert_embeddings[j]).item()
            sbert_sim = util.cos_sim(sbert_embeddings[i], sbert_embeddings[j]).item()
            print(f"s{i+1} vs s{j+1}: FastText={fasttext_sim:.4f}, BERT={bert_sim:.4f}, SBERT={sbert_sim:.4f}")



BERT model and tokenizer loaded successfully!
BERT Embedding for first sentence: [ 0.20707242  0.14218685 -0.3380488  -0.14978017 -0.43513113] ...
SBERT model loaded successfully!
SBERT Embedding for first sentence: [ 0.07553703 -0.00508587  0.0994834   0.00859571 -0.13170868] ...

Sentence-level Cosine Similarities:
s1 vs s2: FastText=0.8387, BERT=0.9628, SBERT=0.5809
s1 vs s3: FastText=0.3417, BERT=0.9100, SBERT=0.2578
s1 vs s4: FastText=0.6859, BERT=0.8899, SBERT=0.3806
s2 vs s3: FastText=0.3474, BERT=0.8719, SBERT=0.1761
s2 vs s4: FastText=0.7787, BERT=0.8939, SBERT=0.2572
s3 vs s4: FastText=0.4551, BERT=0.8401, SBERT=0.1329


In [47]:
import spacy
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [53]:
# Load spaCy model 
try:
    nlp = spacy.load('en_core_web_sm')
    print("spaCy model loaded successfully!")
except Exception as e:
    print(f"spaCy error: {e}")
    nlp = None

def preprocess_text(doc):
    preprocessed = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    return ' '.join(preprocessed)

# 2a. Load newspaper text 
news_text = """
   In June 2022, Dr. Andrew Scott, a world-leading epidemiologist, delivered a keynote address at the World Health Organization (WHO) Headquarters in Geneva, Switzerland. 
   His presentation, titled “The Silent Wave: Climate Change and Emerging Health Crises,” emphasized the interconnectedness of environmental shifts and global health. 
   Dr. Scott outlined how rising temperatures exacerbate the spread of diseases like malaria and dengue fever while also affecting mental health due to increased natural disasters.

During the conference, he engaged in a collaborative workshop with Prof. Lin Mei, an environmental scientist, and Dr. Rajesh Kumar, a medical economist.
Together, they developed a framework to predict and mitigate future health crises using machine learning algorithms to analyze environmental and epidemiological data.
Later that evening, Andrew participated in a panel discussion at the Graduate Institute of International and Development Studies, addressing the need for policy reform to ensure equitable access to healthcare resources in climate-affected regions.

After a productive day, he spent time by Lake Geneva, reflecting on the urgency of integrating health and climate policy on a global scale.
"""
sentences = news_text.split('.')
sentences = [s.strip() for s in sentences if s.strip()]  # Split into sentences, remove empty
print("2a. Original sentences:")
for i, s in enumerate(sentences):
    print(f"s{i+1}: {s}")

spaCy model loaded successfully!
2a. Original sentences:
s1: In June 2022, Dr
s2: Andrew Scott, a world-leading epidemiologist, delivered a keynote address at the World Health Organization (WHO) Headquarters in Geneva, Switzerland
s3: His presentation, titled “The Silent Wave: Climate Change and Emerging Health Crises,” emphasized the interconnectedness of environmental shifts and global health
s4: Dr
s5: Scott outlined how rising temperatures exacerbate the spread of diseases like malaria and dengue fever while also affecting mental health due to increased natural disasters
s6: During the conference, he engaged in a collaborative workshop with Prof
s7: Lin Mei, an environmental scientist, and Dr
s8: Rajesh Kumar, a medical economist
s9: Together, they developed a framework to predict and mitigate future health crises using machine learning algorithms to analyze environmental and epidemiological data
s10: Later that evening, Andrew participated in a panel discussion at the Graduate Ins

In [54]:
# 2b. Apply SBERT 
try:
    sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
    print("SBERT model loaded successfully!")
except Exception as e:
    print(f"SBERT error: {e}")
    sbert_model = None

if sbert_model and nlp:
    preprocessed_sentences = [preprocess_text(nlp(s)) for s in sentences]
    sbert_embeddings_normal = sbert_model.encode(preprocessed_sentences)
    # Cluster with KMeans (2 clusters for simplicity)
    kmeans_normal = KMeans(n_clusters=2, random_state=42)
    normal_clusters = kmeans_normal.fit_predict(sbert_embeddings_normal)
    # Compute silhouette score to assess cluster quality
    if len(set(normal_clusters)) > 1:  # Silhouette requires 2+ clusters
        sil_score_normal = silhouette_score(sbert_embeddings_normal, normal_clusters)
    else:
        sil_score_normal = "N/A (too few clusters)"
    print("\n2b. Clustering with original text:")
    for i, s in enumerate(preprocessed_sentences):
        print(f"Sentence: {s} -> Cluster {normal_clusters[i]}")
    print(f"Silhouette Score (higher is better, range -1 to 1): {sil_score_normal}")

SBERT model loaded successfully!

2b. Clustering with original text:
Sentence: june 2022 dr -> Cluster 1
Sentence: andrew scott world lead epidemiologist deliver keynote address world health organization headquarter geneva switzerland -> Cluster 0
Sentence: presentation title silent wave climate change emerging health crises emphasize interconnectedness environmental shift global health -> Cluster 0
Sentence: dr -> Cluster 1
Sentence: scott outline rise temperature exacerbate spread disease like malaria dengue fever affect mental health increase natural disaster -> Cluster 0
Sentence: conference engage collaborative workshop prof -> Cluster 0
Sentence: lin mei environmental scientist dr -> Cluster 1
Sentence: rajesh kumar medical economist -> Cluster 1
Sentence: develop framework predict mitigate future health crisis machine learn algorithm analyze environmental epidemiological datum -> Cluster 0
Sentence: later evening andrew participate panel discussion graduate institute internation

In [55]:
# 2c. Remove named entities (time, places, people) using spaCy
if nlp and sbert_model:
    sentences_no_entities = []
    for s in sentences:
        doc = nlp(s)
        # Remove entities: PERSON, GPE (places), DATE, TIME
        filtered_tokens = [token.lemma_ for token in doc if token.ent_type_ not in ('PERSON', 'GPE', 'DATE', 'TIME')]
        filtered_text = ' '.join(filtered_tokens)
        sentences_no_entities.append(preprocess_text(nlp(filtered_text)))
    sbert_embeddings_no_entities = sbert_model.encode(sentences_no_entities)
    # Cluster
    kmeans_no_entities = KMeans(n_clusters=2, random_state=42)
    no_entities_clusters = kmeans_no_entities.fit_predict(sbert_embeddings_no_entities)
    # Silhouette score
    if len(set(no_entities_clusters)) > 1:
        sil_score_no_entities = silhouette_score(sbert_embeddings_no_entities, no_entities_clusters)
    else:
        sil_score_no_entities = "N/A (too few clusters)"
    print("\n2c. Clustering after removing named entities:")
    for i, s in enumerate(sentences_no_entities):
        print(f"Sentence: {s} : C:luster {no_entities_clusters[i]}")
    print(f"Silhouette Score (higher is better, range -1 to 1): {sil_score_no_entities}")
    # Visualize named entities in original text with displacy
    print("\n2c. Visualizing named entities in original text:")
    for s in sentences:
        doc = nlp(s)
        from spacy import displacy
        displacy.render(doc, style="ent")


2c. Clustering after removing named entities:
Sentence: dr : C:luster 1
Sentence: world lead epidemiologist deliver keynote address world health organization headquarter : C:luster 1
Sentence: presentation title silent wave climate change emerging health crises emphasize interconnectedness environmental shift global health : C:luster 0
Sentence: dr : C:luster 1
Sentence: scott outline rise temperature exacerbate spread disease like malaria dengue fever affect mental health increase natural disaster : C:luster 0
Sentence: conference engage collaborative workshop prof : C:luster 0
Sentence: environmental scientist dr : C:luster 1
Sentence: medical economist : C:luster 1
Sentence: develop framework predict mitigate future health crisis use machine learn algorithm analyze environmental epidemiological datum : C:luster 0
Sentence: later evening participate panel discussion graduate institute international development studies address need policy reform ensure equitable access healthcare res



In [57]:
# 2d. Remove all nouns using spaCy
if nlp and sbert_model:
    sentences_no_nouns = []
    for s in sentences:
        doc = nlp(s)
        # Remove nouns (NOUN, PROPN)
        filtered_tokens = [token.lemma_ for token in doc if token.pos_ not in ('NOUN', 'PROPN')]
        filtered_text = ' '.join(filtered_tokens)
        sentences_no_nouns.append(preprocess_text(nlp(filtered_text)))
    sbert_embeddings_no_nouns = sbert_model.encode(sentences_no_nouns)
    # Cluster
    kmeans_no_nouns = KMeans(n_clusters=2, random_state=42)
    no_nouns_clusters = kmeans_no_nouns.fit_predict(sbert_embeddings_no_nouns)
    # Silhouette score
    if len(set(no_nouns_clusters)) > 1:
        sil_score_no_nouns = silhouette_score(sbert_embeddings_no_nouns, no_nouns_clusters)
    else:
        sil_score_no_nouns = "N/A (too few clusters)"
    print("\n2d. Clustering after removing all nouns:")
    for i, s in enumerate(sentences_no_nouns):
        print(f"Sentence: {s} -> Cluster {no_nouns_clusters[i]}")
    print(f"Silhouette Score (higher is better, range -1 to 1): {sil_score_no_nouns}")


2d. Clustering after removing all nouns:
Sentence: 2022 -> Cluster 1
Sentence: lead deliver keynote -> Cluster 1
Sentence: title emphasize environmental global -> Cluster 1
Sentence:  -> Cluster 1
Sentence: outline rise exacerbate like dengue affect mental increase natural -> Cluster 0
Sentence: engage collaborative -> Cluster 1
Sentence: environmental -> Cluster 1
Sentence: medical -> Cluster 0
Sentence: develop predict mitigate future use learn analyze environmental epidemiological -> Cluster 0
Sentence: later participate address ensure equitable affect -> Cluster 1
Sentence: productive spend reflect integrate global -> Cluster 1
Silhouette Score (higher is better, range -1 to 1): 0.03165236860513687


In [64]:
import pandas as pd


In [65]:
# 2e. Compare results

if normal_clusters is not None and no_entities_clusters is not None and no_nouns_clusters is not None:
    # Create DataFrame for cluster distributions
    cluster_data = pd.DataFrame({
        'Sentence': range(len(sentences)),
        'Original': normal_clusters,
        'No Entities': no_entities_clusters,
        'No Nouns': no_nouns_clusters
    })
    print("\nCluster assignments:")
    print(cluster_data)

 
    cluster_melted = pd.melt(cluster_data, id_vars=['Sentence'], 
                             value_vars=['Original', 'No Entities', 'No Nouns'], 
                             var_name='Condition', value_name='Cluster')
    
    
   


Cluster assignments:
    Sentence  Original  No Entities  No Nouns
0          0         1            1         1
1          1         0            1         1
2          2         0            0         1
3          3         1            1         1
4          4         0            0         0
5          5         0            0         1
6          6         1            1         1
7          7         1            1         0
8          8         0            0         0
9          9         0            0         1
10        10         0            0         1
