# Georgian Char2Vec model 

This model is trained on the kawiki (Georgian Wikipedia) Wikimedia dump from 21.03.2025.

In [None]:
# import regex

import re

In [4]:
def clean_text(line):
    """Reduce one line of text to Georgian letters separated by single spaces.

    Steps, in order:
      1. remove anything that looks like an HTML/XML tag (``<...>``);
      2. delete every character that is neither a Georgian letter in the
         range ა-ჰ nor whitespace;
      3. collapse each run of whitespace into one space and trim both ends.

    Whitespace is collapsed *after* the character filter on purpose: deleting
    a token that sits between two spaces (a number, a Latin word, punctuation)
    would otherwise leave a double space in the result.

    Args:
        line: a single raw line of text.

    Returns:
        The cleaned string; it is empty when the line contains no Georgian
        letters.
    """
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', line)
    # Remove any character that is not a Georgian letter (ა-ჰ) or whitespace
    text = re.sub(r'[^ა-ჰ\s]', '', text)
    # Collapse the remaining whitespace runs to a single space
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Location of the uncompressed kawiki XML dump on the local machine.
dump_path = r'C:\Users\Home\Desktop\Python Scripts\wikipedia Char2vec\kawiki-20250320-pages-articles-multistream.xml'

# Read the dump as UTF-8 text, one list entry per line of the file.
# NOTE: this keeps every line of the dump in memory at once.
with open(dump_path, mode='r', encoding='utf-8') as f:
    raw_lines = list(f)

# Clean each line independently; a line without Georgian letters becomes ''.
cleaned_lines = list(map(clean_text, raw_lines))

In [None]:
cleaned_lines # display the cleaned lines for a visual check; the list has one entry per line of the dump, so a viewer such as Data Wrangler is preferable

In [None]:
# One training "sentence" per non-empty cleaned line: the line split into its
# individual characters, with spaces kept as tokens of their own.
char_sequences = [list(text) for text in cleaned_lines if text]

In [None]:
char_sequences # display the per-line character lists for a visual check (again, preferably in a tool like Data Wrangler)

In [6]:
# Train the character-level embedding with gensim's Word2Vec, treating every
# character (spaces included) as a "word" and every cleaned line as a sentence.
from gensim.models import Word2Vec

w2v_params = dict(
    vector_size=70,  # dimensionality of each character embedding
    window=3,        # context reaches at most 3 characters to either side
    min_count=1,     # no frequency cutoff: every character gets a vector
    sg=1,            # 1 selects skip-gram; 0 would select CBOW
    workers=6,       # number of worker threads (adjust to the machine)
)

model = Word2Vec(sentences=char_sequences, **w2v_params)

In [10]:
model.wv['ა'] # inspect the learned embedding vector of the character 'ა'


array([ 0.05419026,  0.00563378, -0.19373715, -0.04249081,  0.08144802,
       -0.09836317,  0.04549257, -0.00763209, -0.01986822, -0.13298015,
       -0.10904527, -0.10848953, -0.13864368,  0.22707045,  0.08945944,
        0.05887409, -0.06608294, -0.03424963,  0.05651427, -0.07570385,
        0.01690504, -0.23031797,  0.04436377,  0.07457335,  0.04419934,
       -0.0659989 ,  0.04810683, -0.08581562, -0.07262397,  0.18199104,
       -0.11170873,  0.09225684,  0.06508425,  0.04657187, -0.13099499,
       -0.13851371,  0.10315397, -0.2060935 ,  0.17761321,  0.06892851,
        0.03047237, -0.0087992 , -0.03969027,  0.0455033 , -0.08634393,
        0.18564503,  0.0900536 ,  0.09447334, -0.04408414, -0.1128443 ,
        0.11260456, -0.04557788,  0.05660466, -0.00816   ,  0.01893308,
       -0.12012144, -0.04589818,  0.10544215,  0.07555658, -0.01709197,
        0.08597674, -0.11429593,  0.34148946, -0.15637003,  0.03406613,
       -0.33672574, -0.23496766,  0.10089198, -0.02178187, -0.05

In [11]:
model.wv['ა'].shape # sanity check: the shape should equal (vector_size,), i.e. (70,)

(70,)

In [12]:
import json
import torch

# Export the trained embedding matrix (a NumPy array) as a float32 PyTorch
# tensor and store it on disk as a .pt file.
weights = torch.FloatTensor(model.wv.vectors)
torch.save(weights, 'char_embeddings.pt')

# Store the character -> index mapping next to it, numbering the characters
# in the order of model.wv.index_to_key.
vocab = model.wv.index_to_key
char_to_idx = dict(zip(vocab, range(len(vocab))))
with open('char2idx.json', mode='w', encoding='utf-8') as f:
    # ensure_ascii=False keeps the Georgian characters readable in the file
    json.dump(char_to_idx, f, indent=2, ensure_ascii=False)