In [None]:
from gensim.models import Word2Vec, FastText
from gensim.utils import simple_preprocess
from pathlib import Path
from typing import List
import json, unicodedata, re, html
import numpy as np
import sys

# Make the project root (the parent of the notebook's working directory)
# importable so that `config` can be found.
# sys.path holds plain strings, so the membership test must compare a str:
# a Path object never equals a str, which would make the guard always true
# and prepend a duplicate entry every time this cell is re-run.
PROJECT_ROOT = str(Path.cwd().parent)
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from config import settings

In [2]:
def read_file_to_docs(filepath: Path) -> List[str]:
    """Read a JSON Lines file and return one cleaned text string per record.

    Every non-blank line is parsed as a JSON object and its ``"text"`` field
    is cleaned in this order: HTML entities are decoded, the result is
    NFKC-normalized, runs of whitespace are collapsed to a single space, and
    the string is stripped and lower-cased.

    Args:
        filepath: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        The cleaned documents, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"text"`` key.
    """
    docs = []
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            # Skip blank lines (e.g. an extra empty line at the end of the
            # file) instead of letting json.loads fail on them.
            if not line.strip():
                continue
            record = json.loads(line)
            doc = record["text"]
            # Decode entities before normalizing: characters introduced by an
            # entity (ligatures, non-breaking spaces, ...) must go through
            # NFKC as well, otherwise they survive un-normalized.
            doc = html.unescape(doc)
            doc = unicodedata.normalize("NFKC", doc)
            doc = re.sub(r'\s+', ' ', doc)
            docs.append(doc.strip().lower())
    return docs

# Load and clean the training documents from the JSONL file that
# settings.OUTPUT_JSONL points to.
# (the original note says this file holds 1000 docs — TODO confirm)
train_docs = read_file_to_docs(settings.OUTPUT_JSONL)

# Display the first 2 cleaned documents as a quick sanity check
train_docs[:2]

["visceral fat more than subcutaneous fat has been associated with numerous age-related problems, such as insulin resistance, chronic inflammation, and cardiac diastolic dysfunction. computed tomography (ct) is a widely adopted technique for assessing abdominal fat. measurements of subcutaneous and visceral fat areas are typically computed from ct images using manual or semimanual analyses operated by clinical specialists. even though these techniques usually provide reliable results, they are time-consuming and affected by intra-and inter-operator variability. therefore, a regional fat quantification approach is highly desirable to increase the analysis throughput and reproducibility, especially in large epidemiological studies (3)(4)(5)(6). earlier abdomen fat quantification approaches (7,8) required user intervention to define a region of interest and detected the fat voxels using density thresholding techniques in the domain of hounsfield units (hus). more recent techniques make us

In [3]:
# Tokenize every cleaned document with gensim's simple_preprocess (default
# settings), giving one list of tokens per document.
sentences = list(map(simple_preprocess, train_docs))

### Custom FastText

In [4]:
def char_trigrams(word):
    """Return the character trigrams of ``word`` after wrapping it in ``<`` and ``>``.

    The markers are added first, then every window of three consecutive
    characters is emitted from left to right. An empty ``word`` yields an
    empty list because the wrapped form is only two characters long.
    """
    padded = "<{}>".format(word)
    grams = []
    for start in range(len(padded) - 2):
        grams.append(padded[start:start + 3])
    return grams

# Replace every tokenized document by the flat sequence of character trigrams
# of its tokens, keeping the tokens' original order.
tri_sentences = [
    [gram for token in tokens for gram in char_trigrams(token)]
    for tokens in sentences
]

# Fit a skip-gram (sg=1) Word2Vec model whose vocabulary entries are the
# trigrams themselves rather than whole words.
custom_model = Word2Vec(
    sentences=tri_sentences,
    sg=1,
    vector_size=100,
    window=5,
    min_count=1,
    epochs=10,
)

In [5]:
def get_custom_vector(word):
    """Build a vector for ``word`` by averaging its known trigram vectors.

    The word is split with ``char_trigrams``; trigrams that are missing from
    ``custom_model.wv`` are ignored. Returns the element-wise mean of the
    remaining trigram vectors, or ``None`` when none of the trigrams is known.
    """
    trigram_vectors = custom_model.wv
    known = []
    for gram in char_trigrams(word):
        if gram in trigram_vectors:
            known.append(trigram_vectors[gram])
    if len(known) == 0:
        return None
    return np.mean(known, axis=0)

### Gensim FastText Model

In [6]:
# Train gensim's FastText on the word-level token lists in `sentences`.
# min_n == max_n == 3 limits the subword n-grams to length 3, in line with the
# trigram-only custom model.
# NOTE(review): min_count (2 here vs 1) and epochs (5 here vs 10) differ from
# the custom Word2Vec model, so the two models are not trained under matching
# settings — confirm this is intended before comparing them.
ft_model = FastText(
    sentences,
    vector_size=100,
    window=5,
    min_count=2,
    sg=1,  # skip-gram, same as the custom model
    min_n=3,
    max_n=3,
    epochs=5
)

In [13]:
# Directory that receives the serialized models, relative to the notebook's
# working directory.
MODEL_DIR = Path("../outputs/mex5")
# save() writes to the given path but does not create missing parent
# directories, so make sure the target exists (no-op when it already does).
MODEL_DIR.mkdir(parents=True, exist_ok=True)

custom_model.save(str(MODEL_DIR / "custom_trigram.model"))
ft_model.save(str(MODEL_DIR / "fasttext_trigram.model"))

print("Models saved:")
print(" - custom_trigram.model")
print(" - fasttext_trigram.model")

Models saved:
 - custom_trigram.model
 - fasttext_trigram.model


### Comparison

In [10]:
# Word to probe; every lookup below uses this variable so that all three
# printed lines always describe the same word.
word = "running"

# get_custom_vector returns None when none of the word's trigrams is in the
# custom vocabulary, so guard the .shape access instead of crashing.
custom_vector = get_custom_vector(word)
print("Custom vector shape:", None if custom_vector is None else custom_vector.shape)
print("FastText vector shape:", ft_model.wv[word].shape)
print("FastText neighbors:", ft_model.wv.most_similar(word, topn=5))

Custom vector shape: (100,)
FastText vector shape: (100,)
FastText neighbors: [('dunning', 0.8167727589607239), ('rønning', 0.8116998672485352), ('gunning', 0.8071601390838623), ('recurving', 0.7977186441421509), ('logging', 0.7805308103561401)]


### Summary

