#### Group members

Mostafa Allahmoradi - 9087818
Jarius Bedward - 8841640

## Imports


In [5]:
import string
import nltk
from nltk.corpus.reader import documents
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import os
from gensim.models import Word2Vec
from tensorflow.python.types.doc_typealias import document


## Setup


In [6]:
# Keep the NLTK resources in a folder inside the current working directory.
nltk_data_path = os.path.join(os.getcwd(), "nltk_data")
print("Downloading tokenizer resources...")

# force=True asks the downloader to fetch each package even if it is already present.
for resource_name in ("punkt", "punkt_tab"):
    nltk.download(resource_name, download_dir=nltk_data_path, force=True)

# Register the folder with NLTK's search path, without adding it twice on re-runs.
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)

print("Active nltk paths:", nltk.data.path)
print("Contents of nltk_data:", os.listdir(nltk_data_path))

Downloading tokenizer resources...


[nltk_data] Downloading package punkt to C:\Users\jjbed\Downloads\ML
[nltk_data]     prog week 13\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\jjbed\Downloads\ML prog week 13\nltk_data...


Active nltk paths: ['C:\\Users\\jjbed/nltk_data', 'C:\\Users\\jjbed\\AppData\\Local\\Programs\\Python\\Python313\\nltk_data', 'C:\\Users\\jjbed\\AppData\\Local\\Programs\\Python\\Python313\\share\\nltk_data', 'C:\\Users\\jjbed\\AppData\\Local\\Programs\\Python\\Python313\\lib\\nltk_data', 'C:\\Users\\jjbed\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data', 'C:\\Users\\jjbed\\Downloads\\ML prog week 13\\nltk_data']
Contents of nltk_data: ['tokenizers']


[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


## Document Collection
-

##  Tokenizer, Normalization Pipeline

In [7]:
#Normalization

def normalize(text):
    """Return ``text`` lowercased with URLs, punctuation and digits removed.

    Steps, in order:
      1. lowercase the text
      2. drop URL-like tokens (from ``http`` or ``www`` up to the next whitespace)
      3. drop ASCII punctuation (``string.punctuation``)
      4. drop runs of digits
      5. collapse whitespace runs to one space and trim both ends

    URLs are removed before punctuation and digits. Stripping those first
    mangles a URL such as ``http://127.0.0.1:8000`` down to a bare ``http``,
    which the URL pattern no longer matches, so the fragment would leak into
    the normalized text.

    Args:
        text: The raw document string.

    Returns:
        The normalized string (possibly empty).
    """
    # lowercase text
    text = text.lower()
    # remove URLs while they are still intact
    text = re.sub(r"http\S+|www\S+", "", text)
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # remove numbers
    text = re.sub(r"\d+", "", text)
    # remove extra white space
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Iterating a dict yields its keys. `documents` prints as a mapping of
# corpus id -> title text, so normalize the values (the text) rather than the ids.
# Any other iterable of strings is consumed as-is.
doc_texts = list(documents.values()) if isinstance(documents, dict) else list(documents)
normalize_docs = [normalize(doc) for doc in doc_texts]

#Tokenization
nltk.download("punkt")
nltk.download("stopwords")

# Store the stop-word set under its own name. Assigning it to `stopwords`
# would replace the imported corpus reader with a set, and running this cell
# a second time would then fail with AttributeError (a set has no `.words`).
# Going through `nltk.corpus` keeps the lookup independent of that global name.
STOP_WORDS = frozenset(nltk.corpus.stopwords.words("english"))


def tokenize(text, stop_words=None):
    """Split ``text`` into word tokens and drop stop words.

    Args:
        text: A (normalized) document string.
        stop_words: Optional collection of tokens to discard. Defaults to
            the English stop-word set ``STOP_WORDS``.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    if stop_words is None:
        stop_words = STOP_WORDS
    # remove stopwords
    return [token for token in word_tokenize(text) if token not in stop_words]


tokenize_docs = [tokenize(doc) for doc in normalize_docs]

# Final output: print every pipeline stage under its own heading
stage_outputs = (
    ("Original:", documents),
    ("Normalized:", normalize_docs),
    ("Tokenized:", tokenize_docs),
)
for heading, stage_value in stage_outputs:
    print(heading)
    print(stage_value, "\n")


[nltk_data] Downloading package punkt to C:\Users\jjbed/nltk_data...


Original:
{'coadrian.o34': 'Adrian and Ritheus', 'coaelhom.o3': 'Ã†lfric, Supplemental Homilies', 'coaelive.o3': "Ã†lfric's Lives of Saints", 'coalcuin': 'Alcuin De virtutibus et vitiis', 'coalex.o23': "Alexander's Letter to Aristotle", 'coapollo.o3': 'Apollonius of Tyre', 'coaugust': 'Augustine', 'cobede.o2': "Bede's History of the English Church", 'cobenrul.o3': 'Benedictine Rule', 'coblick.o23': 'Blickling Homilies', 'coboeth.o2': "Boethius' Consolation of Philosophy", 'cobyrhtf.o3': "Byrhtferth's Manual", 'cocanedgD': 'Canons of Edgar (D)', 'cocanedgX': 'Canons of Edgar (X)', 'cocathom1.o3': "Ã†lfric's Catholic Homilies I", 'cocathom2.o3': "Ã†lfric's Catholic Homilies II", 'cochad.o24': 'Saint Chad', 'cochdrul': 'Chrodegang of Metz, Rule', 'cochristoph': 'Saint Christopher', 'cochronA.o23': 'Anglo-Saxon Chronicle A', 'cochronC': 'Anglo-Saxon Chronicle C', 'cochronD': 'Anglo-Saxon Chronicle D', 'cochronE.o34': 'Anglo-Saxon Chronicle E', 'cocura.o2': 'Cura Pastoralis', 'cocuraC': 'Cu

[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jjbed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Implement a Word2Vec predictive model using the knowledge corpus.

In [8]:


# Word2Vec hyper-parameters, gathered in one place so they are easy to tune.
w2v_params = dict(
    vector_size=100,  # dimensionality of each word embedding
    window=5,         # context window size
    min_count=1,      # keep every word, even if it occurs once (demo-sized corpus)
    workers=4,        # number of worker threads used for training
    sg=1,             # training algorithm: 1 = skip-gram, 0 = CBOW
)

# `sentences` must be a list of token lists (one inner list per document).
model_w2v = Word2Vec(sentences=tokenize_docs, **w2v_params)

#train model
# model_w2v.train(tokenize_docs, total_examples=len(tokenize_docs), epochs=20)

# ex: check similar words
# Example: check similar words
# model_w2v.wv.most_similar("nlp", topn=3)



## 🧠 Learning Objectives
- Teams of 2 (individual evaluation in class).
- Implement **Word2Vec**  and **GloVe** using real-world data during the NLP process.
- Build **Jupyter Notebooks** with well-structured code and clear Markdown documentation.
- Use **Git and GitHub** for collaborative version control and code sharing.
- Identify and articulate coding issues ("**talking points**") and insert them directly into markdown comments.


## 🧩 Workshop Structure (In Class)
1. **Set up teams of 2 people** – Read and understand the workshop, plus submission instructions. Seek assistance if needed.
2. **Jupyter Notebook Development** *(In class)* – NLP Pipeline (if needed) and Probabilistic Model method implementations + Markdown documentation (work as teams)
3. **Push to GitHub** – Teams commit and push the notebook. **Make sure to include your names so it is easy to identify the team that developed the code**.
4. **Instructor Review** - The instructor will go around in class, take notes, and provide coaching as needed, during the **Peer Review Round**


## 💻 Submission Checklist
- ✅ `EmbeddingClusteringVectorizationWorkshop.ipynb` with:
  - Demo code: Document Collection, Tokenizer, Normalization Pipeline on a relevant corpus.
  - Demo code: Implement a Word2Vec predictive model using the knowledge corpus.
  - Demo code: Implement a GloVe count-based model using the knowledge corpus.
  - Markdown explanations for each major step
  - A table that compares **Word2Vec** against **GloVe** in the context of the use case that makes use of the knowledge corpus.
- ✅ `README.md` with:
  - Dataset description
  - Team member names
  - Link to the dataset and license (if public)
- ✅ GitHub Repo:
  - Public repo named `EmbeddingClusteringVectorizationWorkshop`
  - **Markdowns and meaningful talking points**