<a href="https://colab.research.google.com/github/JemyHo/NLPCW1/blob/Lihang/cw1/F21NL_CW1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download the two zipped WikiText datasets used below
# (`wikitext-filtered-full` and `wikitext-filtered-10k`).
# `-O` sets the local filename each download is written to.
!wget -O wikitext-filtered-full.zip "https://www.dropbox.com/scl/fi/ibd4cmixckghx6hhb361c/wikitext-filtered-full.zip?rlkey=q71cebf0k5fvvwhmcntoswzhq&dl=1"
!wget -O wikitext-filtered-10k.zip "https://www.dropbox.com/scl/fi/ek174r3sg7qjx0aa9atop/wikitext-filtered-10k.zip?rlkey=zy6jqxv6qsc16lr9qm3ki9uhf&dl=1"

In [None]:
!unzip wikitext-filtered-full.zip
!unzip wikitext-filtered-10k.zip

In [None]:
!pip install datasets
import datasets

In [None]:
!pip install gensim

In [None]:
from datasets import load_dataset, Dataset

def load_dataset(small_path="wikitext-filtered-10k", large_path="wikitext-filtered-full"):
  """Load the two WikiText datasets that were saved to disk.

  NOTE(review): this function has the same name as `load_dataset` imported from
  `datasets` in this cell, so it replaces that import once defined.

  Args:
    small_path: directory of the smaller dataset (passed to `Dataset.load_from_disk`).
    large_path: directory of the larger dataset (passed to `Dataset.load_from_disk`).

  Returns:
    Tuple `(dataset_small, dataset_large)`.
  """
  dataset_small = Dataset.load_from_disk(small_path)
  dataset_large = Dataset.load_from_disk(large_path)
  print("wikitext_small: {} docs, wikitext_large: {} docs".format(len(dataset_small), len(dataset_large)))
  return dataset_small, dataset_large

# Load both datasets once; later cells tokenize them with prepare_corpus.
wikitext_small, wikitext_large = load_dataset()

In [None]:
def tokenize_by_space(text, stop_words=None):
    """
    Minimal preprocessing (as required in spec):
    - lowercase
    - split on whitespace (spaces, tabs, newlines)
    - strip leading/trailing punctuation
    - remove stopwords and empty tokens

    Args:
        text: raw document string.
        stop_words: optional collection of lowercase words to drop. Defaults to
            the notebook-level STOP set, which must be defined before the call.

    Returns:
        List of cleaned token strings in document order.
    """
    if stop_words is None:
        stop_words = STOP
    # str.split() with no argument splits on any run of whitespace, so newlines
    # and tabs are never glued inside a token or kept as whitespace-only tokens
    # (both of which split(" ") allows).
    stripped = (tok.strip(string.punctuation) for tok in text.lower().split())
    return [tok for tok in stripped if tok and tok not in stop_words]

def prepare_corpus(raw_ds):
    """Convert a HuggingFace Dataset (or a plain sequence of strings) to a list of token lists.

    If the first record is a dict, the first string-valued field of that record
    is used as the text column; otherwise the records themselves are treated as
    the documents. Non-string and whitespace-only documents are skipped.

    Args:
        raw_ds: indexable, iterable collection of records supporting len().

    Returns:
        List with one token list (from tokenize_by_space) per kept document;
        an empty list when `raw_ds` is empty.

    Raises:
        ValueError: if the records are dicts but contain no string-valued field.
    """
    # Guard the raw_ds[0] probe below: an empty dataset has no first record.
    if len(raw_ds) == 0:
        return []

    first = raw_ds[0]
    # find the right text field
    if isinstance(first, dict):
        key = next((k for k, v in first.items() if isinstance(v, str)), None)
        if key is None:
            raise ValueError("prepare_corpus: no string-valued field found in dataset records")
        docs = [ex[key] for ex in raw_ds]
    else:
        docs = list(raw_ds)
    return [
        tokenize_by_space(doc)
        for doc in docs
        if isinstance(doc, str) and doc.strip()
    ]


In [None]:
# Tokenize each dataset into a list of token lists.
tokens_small = prepare_corpus(wikitext_small)
tokens_large = prepare_corpus(wikitext_large)

# Report corpus sizes and preview the first 20 tokens of the first small document.
print(f"Docs (small): {len(tokens_small)}")
print(f"Docs (large): {len(tokens_large)}")
print(f"Example tokens: {tokens_small[0][:20]}")


In [33]:
# Seed handed to Word2Vec. Per the gensim docs, a fully reproducible run
# additionally needs a single worker thread (and a fixed PYTHONHASHSEED).
SEED = 42
# os.cpu_count() returns None when the CPU count cannot be determined;
# fall back to one worker so Word2Vec always receives an int.
WORKERS = os.cpu_count() or 1

def train_word2vec(sentences, name, vector_size=50, window=5, min_count=5, epochs=5, sg=0):
    """Train a gensim Word2Vec model on tokenized sentences and save it to disk.

    Args:
        sentences: iterable of token lists (one list per document).
        name: label used in the log lines and in the output filename.
        vector_size: dimensionality of the word vectors.
        window: context window size.
        min_count: words with a lower total frequency are ignored.
        epochs: number of training passes over `sentences`.
        sg: 0 = CBOW, 1 = Skip-gram.

    Returns:
        The trained Word2Vec model, also saved as `w2v_<name>.model`.
    """
    print(f"▶ Training {name} | vec={vector_size}, win={window}, min={min_count}, ep={epochs}, sg={sg}")
    # Passing `sentences` to the constructor builds the vocabulary AND trains,
    # so `epochs` must be given here. Do not call model.train() afterwards:
    # that would run a second full training pass over the same corpus.
    model = Word2Vec(
        sentences=sentences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        sg=sg,      # 0 = CBOW, 1 = Skip-gram
        epochs=epochs,
        workers=WORKERS,
        seed=SEED
    )
    print("Vocab size:", len(model.wv))
    model.save(f"w2v_{name}.model")
    print(f"✅ Saved model: w2v_{name}.model\n")
    return model

# Train one model per corpus with the default hyperparameters
# (CBOW, 50-dimensional vectors); each call also saves w2v_<name>.model.
w2v_small = train_word2vec(tokens_small, "small")
w2v_large = train_word2vec(tokens_large, "large")


▶ Training small | vec=50, win=5, min=5, ep=5, sg=0




Vocab size: 14554
✅ Saved model: w2v_small.model

▶ Training large | vec=50, win=5, min=5, ep=5, sg=0




KeyboardInterrupt: 

In [None]:
# NOTE(review): earlier cells already use `string`, `os`, `Word2Vec` and `STOP`
# (tokenize_by_space, WORKERS, train_word2vec). This cell has to be executed
# before those are called; otherwise they fail with NameError.
import nltk, string, os
from gensim.models import Word2Vec
nltk.download('stopwords')  # fetch the NLTK stopword corpus read below
from nltk.corpus import stopwords

# English stopword set consulted by tokenize_by_space.
STOP = set(stopwords.words('english'))
print(f"{len(STOP)} English stopwords loaded.")


In [None]:
# Mount Google Drive under /content/drive (Colab only); the WordSim-353 CSV
# is read from that location in a later cell.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec


# Load both trained Word2Vec models
# (the files written by train_word2vec: w2v_small.model / w2v_large.model).

w2v_small = Word2Vec.load("w2v_small.model")
w2v_large = Word2Vec.load("w2v_large.model")
print("✅ Models loaded")
# Both models are loaded above, so report both vocabulary sizes.
print("small vocab:", len(w2v_small.wv), "| large vocab:", len(w2v_large.wv))

In [None]:
# Load WordSim-353 CSV from Google Drive

file_path = '/content/drive/My Drive/wordsim353/combined.csv'

# Report the problem and then re-raise: every later cell needs `wordsim_df`,
# so continuing after a failed load would only surface as a confusing
# NameError (or silently reuse a stale frame from an earlier run).
try:
    wordsim_df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"❌ File not found at {file_path}")
    raise
except Exception as e:
    print(f"❌ Error loading file: {e}")
    raise

# The similarity cells below read these two columns; fail early if they are absent.
missing_columns = {'Word 1', 'Word 2'} - set(wordsim_df.columns)
assert not missing_columns, f"WordSim-353 CSV is missing columns: {sorted(missing_columns)}"

print("✅ WordSim-353 data loaded successfully.")
display(wordsim_df.head())
print("Total pairs:", len(wordsim_df))


In [None]:
# Helper: compute cosine safely

def compute_cosine(model, w1, w2):
    """Return the cosine similarity of two words under `model`, or NaN.

    Both words are lowercased before lookup. If `model.wv.similarity` raises
    KeyError (a word is not in the vocabulary), `np.nan` is returned instead
    so callers can keep the pair and filter it out later.
    """
    first, second = w1.lower(), w2.lower()
    try:
        score = model.wv.similarity(first, second)
    except KeyError:
        # out-of-vocabulary word: mark the pair as missing
        score = np.nan
    return score

# Compute cosine for the 4 required pairs

pairs = [
    ("plane", "car"),
    ("planet", "sun"),
    ("cup", "article"),
    ("sugar", "approach")
]

def print_pair_cosines(label, model):
    """Print the cosine similarity of every pair in `pairs` under `model`.

    Replaces two copy-pasted print loops (one per model); the printed text
    is the same as before. Pairs that compute_cosine cannot score print as nan.
    """
    print(f"\n---- {label} ----")
    for a, b in pairs:
        print(f"{a:8s}/{b:8s} → {compute_cosine(model, a, b):.4f}")

print_pair_cosines("wikitext_small", w2v_small)
print_pair_cosines("wikitext_large", w2v_large)


# Prepare for full 353-pair evaluation (Step 4)

def evaluate_all(model, df, column="cosine"):
    """Compute the cosine similarity of every word pair in `df` under `model`.

    Args:
        model: trained model, passed through to compute_cosine.
        df: DataFrame with 'Word 1' and 'Word 2' columns.
        column: name of the column in which the scores are stored on `df`
            (the frame is modified in place); pass None to leave `df` untouched.

    Returns:
        List of cosine similarities in row order, with np.nan for pairs that
        compute_cosine could not score.
    """
    # Iterating the two columns directly avoids the per-row Series that
    # DataFrame.iterrows() builds.
    cosines = [
        compute_cosine(model, w1, w2)
        for w1, w2 in zip(df['Word 1'], df['Word 2'])
    ]
    if column is not None:
        # Use an explicit column name: f'{model}' would be the model's string
        # representation, which is not a usable or stable column label.
        df[column] = cosines
    return cosines

# compute for both models (optional preview)
# Score every WordSim-353 pair with the small-corpus model ...
wordsim_df['cosine_small'] = [
    compute_cosine(w2v_small, left, right)
    for left, right in zip(wordsim_df['Word 1'], wordsim_df['Word 2'])
]
# ... and with the large-corpus model; unscored pairs come back as NaN.
wordsim_df['cosine_large'] = [
    compute_cosine(w2v_large, left, right)
    for left, right in zip(wordsim_df['Word 1'], wordsim_df['Word 2'])
]

# Preview the frame with the two new columns.
display(wordsim_df.head())
print("✅ Cosine columns added – ready for Step 4 (Spearman).")
