<a href="https://colab.research.google.com/github/IkuStudies/Genie_GPT/blob/main/IPA2VEC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install numpy
!pip install tqdm

In [2]:
import json
import numpy as np

# IPA mapping dictionary: each key is an IPA symbol and each value is an
# integer class id between 1 and 23; every source line below lists the symbols
# that share one id.
# NOTE: some keys are longer than a single code point (for example the digraph
# 'th', or a base letter followed by combining diacritics), so a lookup that
# walks a string one character at a time can never hit those keys.
# NOTE: 'n' (id 8) and 'ɕ' / 'ʑ' (id 15) are each listed twice with the same
# value; the repeats are redundant but do not change the resulting dict.
ipa_mapping = {
    'ʉ': 1, 'ɯ': 1, 'u': 1, 'ʏ': 1, 'ʊ': 1,
    'ɵ': 2, 'ɤ': 2, 'o': 2, 'ə': 2, 'ɤ̞': 2,
    'e̞': 3, 'ø̞': 3, 'ɛ': 3, 'œ': 3, 'ɶ': 3, 'ɜ': 3, 'ɞ': 3, 'e': 3, 'ø': 3, 'ɘ': 3, 'ɪ': 3, 'i': 3, 'y': 3, 'ɨ': 3,
    'ɐ': 4, 'ʌ': 4, 'ʡ': 4, 'ʔ': 4, 'ʔ̞': 4, 'ʡ̆': 4,
    'æ': 5, 'a': 5, 'ä': 5,
    'o̞': 6, 'ɔ': 6, 'ɑ': 6, 'ɒ': 6,
    'm̥': 7, 'm': 7, 'ɱ': 7, 'ɳ̊': 7,
    'n̼': 8, 'n̥': 8, 'n': 8, 'ɳ': 8, 'ɲ̊': 8, 'ɲ': 8, 'ŋ̊': 8, 'ŋ': 8, 'ɴ': 8, 'n': 8,
    'p': 9, 'b': 9, 'p̪': 9, 'b̪': 9, 'ʙ̥': 9, 'ʙ': 9, 'ɹ̥': 9,
    'd̼': 10, 'd': 10, 'ɖ': 10,
    't': 11, 'ʈ': 11, 't̼': 11,
    'j': 12, 'ɟ': 12,
    'k': 13, 'q': 13, 'x': 13, 'χ': 13, 'ħ': 13, 'c': 13,
    'ɣ': 14, 'ɡ': 14, 'ɢ': 14, 'ɢ̆': 14,
    'z': 15, 'ɮ': 15, 'ʃ': 15, 'ʒ': 15, 'ʂ': 15, 'ʐ': 15, 'ɕ': 15, 'ʑ': 15, 'ʝ': 15, 'ɕ': 15, 'ʑ': 15, 's': 15,
    'ɸ': 16, 'β': 16, 'f': 16, 'v': 16, 'ⱱ̟': 16, 'ⱱ': 16,
    'θ̼': 17, 'ð̼': 17, 'θ': 17, 'ð': 17, 'θ̠': 17, 'ð̠': 17, 'ɹ̠̊˔': 17, 'ɹ̠˔': 17, 'ɻ̊˔': 17, 'ɻ˔': 17, 'th': 17,
    'ɺ̥': 18, 'ɺ': 18, 'ɾ̼': 18, 'ɾ̥': 18, 'ɾ': 18, 'ɽ̊': 18, 'ɽ': 18, 'ɹ': 18, 'ɻ': 18, 'ʁ': 18,
    'ç': 19, 'ʕ': 19, 'h': 19, 'ɦ': 19,
    'ɰ': 20, 'w': 20,
    'r': 21, 'r̥': 21, 'ɽ̊r̥': 21, 'ɽr': 21, 'ʀ̥': 21, 'ʀ': 21,
    'ʜ': 22, 'ʢ': 22,
    'ɬ': 23, 'ꞎ': 23, 'ʎ̝': 23, 'ʟ̝': 23, 'l': 23, 'ɭ': 23, 'ʎ': 23, 'ʟ': 23, 'ʟ̠': 23, 'ʎ̆': 23, 'ʟ̆': 23
}



# JSON encoder that knows how to serialise NumPy arrays
class NumpyEncoder(json.JSONEncoder):
    """``json.JSONEncoder`` subclass that emits ``numpy.ndarray`` values as (nested) lists."""

    def default(self, obj):
        """Return ``obj.tolist()`` for arrays; hand every other object to the base encoder."""
        if not isinstance(obj, np.ndarray):
            # The base implementation raises TypeError for types it cannot serialise.
            return super().default(obj)
        return obj.tolist()

# Function to transcribe IPA string to numerical values using ipa_mapping
def transcribe_ipa(ipa_string, mapping=None):
    """Convert an IPA string into a NumPy array of numeric codes.

    The string is scanned left to right. At every position the longest
    substring that is a key of ``mapping`` is consumed and its code appended,
    so keys spanning several code points (a digraph such as ``'th'``, or a
    base letter plus combining diacritics) are matched as a unit instead of
    being split into their single characters. A character that starts no
    known key is skipped silently.

    Args:
        ipa_string: Text to transcribe.
        mapping: Optional ``{symbol: code}`` dict. Defaults to the
            module-level ``ipa_mapping`` (looked up at call time).

    Returns:
        ``np.ndarray`` holding one code per matched symbol, in order of
        appearance. For input with no matches the array is empty.
    """
    if mapping is None:
        mapping = ipa_mapping

    # Longest key bounds how far ahead a match attempt has to look.
    longest_key = max(map(len, mapping), default=0)

    codes = []
    pos = 0
    total = len(ipa_string)
    while pos < total:
        # Try the widest candidate first so 'th' wins over 't' followed by 'h'.
        for width in range(min(longest_key, total - pos), 0, -1):
            candidate = ipa_string[pos:pos + width]
            if candidate in mapping:
                codes.append(mapping[candidate])
                pos += width
                break
        else:
            # No key starts here: drop this character, as before.
            pos += 1
    return np.array(codes)

input_file = '/content/drive/MyDrive/Colab Notebooks/greeeeek.json'
output_file = '/content/drive/MyDrive/Colab Notebooks/greekfinal.json'
max_length = 18  # Maximum length for numerical representations

# Read the input completely before touching the output file, so a missing or
# malformed input cannot leave a truncated output behind. The encoding is
# given explicitly because the pronunciations are non-ASCII text and the
# platform default encoding is not guaranteed to be UTF-8.
with open(input_file, 'r', encoding='utf-8') as file_in:
    data = json.load(file_in)

rows = data['rows']
for row in rows:
    # Convert the pronunciation to numeric codes and cap it at max_length.
    transcription = transcribe_ipa(row['pronunciation'])[:max_length]

    # Right-pad with zeros so every embedding has exactly max_length entries.
    padded_transcription = np.pad(transcription, (0, max_length - len(transcription)))

    # Overwrite (or create) the row's embedding with the padded codes.
    row['embedding'] = padded_transcription.tolist()

# Write the updated data with formatted indentation and non-ASCII characters preserved
with open(output_file, 'w', encoding='utf-8') as file_out:
    json.dump(data, file_out, cls=NumpyEncoder, indent=4, ensure_ascii=False)


In [None]:
import json

input_file = '/content/drive/MyDrive/Colab Notebooks/updated_turkish.json'
output_file = '/content/drive/MyDrive/Colab Notebooks/tk.json'
code = 'tk_'  # Prefix placed in front of every row's existing "id"

# The output is opened (and therefore truncated) before the input is parsed,
# and both handles stay open until the dump has finished.
with open(input_file, 'r', encoding='utf-8') as file_in:
    with open(output_file, 'w', encoding='utf-8') as file_out:
        data = json.load(file_in)
        rows = data['rows']

        # Rewrite each id in place as "<code><old id>".
        for row in rows:
            row['id'] = f'{code}{row["id"]}'

        # Pretty-printed, with non-ASCII characters written as-is
        json.dump(data, file_out, indent=4, ensure_ascii=False)


In [None]:
import json
import math

def split_json(input_file, output_prefix, num_files):
    """Split the ``"rows"`` list of a JSON file into ``num_files`` JSON files.

    Rows are handed out in order, in consecutive chunks of
    ``ceil(total_rows / num_files)`` rows. Because the chunk size is rounded
    up, trailing files can hold fewer rows or an empty ``"rows"`` list.
    Exactly ``num_files`` files are always written, named
    ``f"{output_prefix}_{i}.json"`` for ``i`` in ``range(num_files)``.

    Args:
        input_file: Path of a UTF-8 JSON document with a top-level ``"rows"`` list.
        output_prefix: Path prefix used to build the output file names.
        num_files: Number of output files; must be at least 1.

    Raises:
        ValueError: If ``num_files`` is smaller than 1.
        KeyError: If the document has no ``"rows"`` key.
    """
    if num_files < 1:
        raise ValueError(f"num_files must be a positive integer, got {num_files!r}")

    # Use distinct names for the handles so the path arguments are not shadowed.
    with open(input_file, 'r', encoding='utf-8') as source:
        data = json.load(source)

    rows = data["rows"]
    rows_per_file = math.ceil(len(rows) / num_files)

    for i in range(num_files):
        # Slicing past the end of the list simply yields a shorter/empty chunk.
        chunk = rows[i * rows_per_file:(i + 1) * rows_per_file]
        # json.dump keeps its default ensure_ascii=True, so the output is pure ASCII.
        with open(f"{output_prefix}_{i}.json", 'w', encoding='utf-8') as target:
            json.dump({"rows": chunk}, target, indent=4)

# Example usage: split the formatted German export into three parts
input_file_path = '/content/drive/MyDrive/boyah/german_milvus_formatted.json'
output_prefix = 'german'
num_files = 3

# Output names are built from the prefix, relative to the working directory
split_json(input_file=input_file_path, output_prefix=output_prefix, num_files=num_files)




In [None]:
import json

def fix_json_field_names(json_data):
    """Strip leading/trailing whitespace from every field name of every row.

    ``json_data["rows"]`` is replaced by a new list of new dicts whose keys
    are the stripped names (values untouched, order preserved). The same
    ``json_data`` object is returned.
    """
    json_data["rows"] = [
        {field.strip(): content for field, content in record.items()}
        for record in json_data["rows"]
    ]
    return json_data

# Load the source document, decoding it as UTF-8
with open("/content/drive/MyDrive/boyah/arab2_2.json", encoding="utf-8") as file:
    json_content = json.loads(file.read())

# Normalise the field names of every row
fixed_json = fix_json_field_names(json_content)

# Save the result next to the notebook, UTF-8 with non-ASCII characters kept as-is
with open("arab6.json", "w", encoding="utf-8") as file:
    json.dump(fixed_json, file, ensure_ascii=False, indent=4)


KeyboardInterrupt: ignored

In [1]:
!pip install epitran

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting epitran
  Downloading epitran-1.24-py2.py3-none-any.whl (164 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m164.9/164.9 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting panphon>=0.20 (from epitran)
  Downloading panphon-0.20.0-py2.py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting marisa-trie (from epitran)
  Downloading marisa_trie-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
Collecting unicodecsv (from panphon>=0.20->epitran)
  Downloading unicodecsv-0.14.1.tar.gz (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting munkres (from panphon>=0.20->epitran)
  Downloading munkres-

In [52]:
import json
from g2p_en import G2p

def transcribe_greek_words(json_file):
    """Fill each row's ``pronunciation`` in ``json_file`` from a ``G2p`` transcriber.

    The file is loaded, every row's ``word`` is passed to a ``G2p()`` instance,
    the returned items are joined with single spaces and stored under
    ``pronunciation``, and the file is then overwritten in place (UTF-8,
    non-ASCII preserved, no indentation).

    NOTE(review): ``G2p`` comes from ``g2p_en`` -- presumably an English
    grapheme-to-phoneme model; confirm it is appropriate for Greek input.
    """
    # Built before the file is opened, matching the original evaluation order.
    g2p_transcriber = G2p()

    with open(json_file, 'r', encoding='utf-8') as source:
        data = json.load(source)

    for entry in data['rows']:
        entry['pronunciation'] = ' '.join(g2p_transcriber(entry['word']))

    with open(json_file, 'w', encoding='utf-8') as target:
        json.dump(data, target, ensure_ascii=False)

    print("Transcription completed and JSON file updated.")

# Usage example: annotate gr.json in place
json_file_path = '/content/drive/MyDrive/Colab Notebooks/gr.json'
transcribe_greek_words(json_file=json_file_path)



Transcription completed and JSON file updated.


In [44]:
import json
from greek_accentuation.characters import *

def transliterate_greek_words(json_file):
    """Rewrite ``json_file`` so each row's ``pronunciation`` is a per-character transliteration of ``word``.

    Every character of ``word`` is looked up in ``GREEK_TO_LATIN``; characters
    without an entry are kept unchanged. The file is read and then overwritten
    in place (UTF-8, non-ASCII preserved, no indentation).

    NOTE(review): ``GREEK_TO_LATIN`` is not defined anywhere in the visible
    notebook, and this cell's recorded output is a ``NameError``. Presumably
    the star import from ``greek_accentuation.characters`` was expected to
    provide it -- confirm where the table should come from before relying on
    this function.
    """
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    for entry in data['rows']:
        greek_word = entry['word']
        # dict.get(c, c) falls back to the original character when unmapped
        transliteration = ''.join([GREEK_TO_LATIN.get(c, c) for c in greek_word])
        entry['pronunciation'] = transliteration

    # Overwrites the same path that was just read
    with open(json_file, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False)

    print("Transliteration completed and JSON file updated.")

# Usage example: transliterate gr.json in place
json_file_path = '/content/drive/MyDrive/Colab Notebooks/gr.json'
transliterate_greek_words(json_file=json_file_path)




NameError: ignored

In [50]:
!pip install g2p_en



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting g2p_en
  Downloading g2p_en-2.1.0-py3-none-any.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
Collecting distance>=0.1.3 (from g2p_en)
  Downloading Distance-0.1.3.tar.gz (180 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.3/180.3 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: distance
  Building wheel for distance (setup.py) ... [?25l[?25hdone
  Created wheel for distance: filename=Distance-0.1.3-py3-none-any.whl size=16258 sha256=21d4ae2655388b2719b6ff74fef119a20eeb0c1afa152fe07e39e0fb1d825fc3
  Stored in directory: /root/.cache/pip/wheels/e8/bb/de/f71bf63559ea9a921059a5405806f7ff6ed612a9231c4a9309
Successfully built distance
Installing collected packages: distance, g2p_

In [20]:
import json
import pykakasi

def transliterate_japanese_words(word_list_file, output_file):
    """Build a ``{"rows": [...]}`` JSON file from a one-word-per-line text file.

    Each line of ``word_list_file`` becomes a row with id ``jp_<line index>``,
    the word itself, an 18-element all-zero ``embedding`` and a
    ``pronunciation`` produced by a pykakasi converter configured with
    ``setMode("J", "H")``.

    NOTE(review): the cell output echoes the ``setMode`` / ``getConverter`` /
    ``do`` lines -- presumably deprecation warnings from pykakasi; confirm
    against the installed version before changing the API usage.
    """
    kakasi = pykakasi.kakasi()
    kakasi.setMode("J", "H")
    converter = kakasi.getConverter()

    with open(word_list_file, 'r', encoding='utf-8') as source:
        word_list = source.read().splitlines()

    rows = [
        {
            "id": f"jp_{index}",
            "word": word,
            "embedding": [0] * 18,
            "pronunciation": converter.do(word),
        }
        for index, word in enumerate(word_list)
    ]

    with open(output_file, 'w', encoding='utf-8') as target:
        json.dump({"rows": rows}, target, ensure_ascii=False, indent=4)

    print("Transliteration completed and JSON file created.")

# Usage example: read the word list from Drive, write japanese.json to the working directory
word_list_file = '/content/drive/MyDrive/japanese.txt'
output_file = 'japanese.json'
transliterate_japanese_words(word_list_file=word_list_file, output_file=output_file)


  kakasi.setMode("J", "H")
  converter = kakasi.getConverter()
  "pronunciation": converter.do(word)


Transliteration completed and JSON file created.


In [22]:
!pip install romkan

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting romkan
  Downloading romkan-0.2.1.tar.gz (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: romkan
  Building wheel for romkan (setup.py) ... [?25l[?25hdone
  Created wheel for romkan: filename=romkan-0.2.1-py3-none-any.whl size=9252 sha256=9a58dbd946dce96a09bcd0ca7a1a22690f4d0633d9febf06ef2e5b615cb3e664
  Stored in directory: /root/.cache/pip/wheels/b7/2d/cc/074f978dbf64f2cf699bc32cc2e8f5a61a1e82f0d781984056
Successfully built romkan
Installing collected packages: romkan
Successfully installed romkan-0.2.1


In [23]:
import json
from romkan import to_roma

def transliterate_japanese_words(word_list_file, output_file):
    """Build a ``{"rows": [...]}`` JSON file from a one-word-per-line text file.

    Each line of ``word_list_file`` becomes a row with id ``jp_<line index>``,
    the word itself, an 18-element all-zero ``embedding`` and a
    ``pronunciation`` obtained by passing the word to ``to_roma``. The result
    is written to ``output_file`` as UTF-8 with 4-space indentation and
    non-ASCII characters preserved.
    """
    with open(word_list_file, 'r', encoding='utf-8') as source:
        word_list = source.read().splitlines()

    rows = [
        {
            "id": f"jp_{index}",
            "word": word,
            "embedding": [0] * 18,
            "pronunciation": to_roma(word),
        }
        for index, word in enumerate(word_list)
    ]

    with open(output_file, 'w', encoding='utf-8') as target:
        json.dump({"rows": rows}, target, ensure_ascii=False, indent=4)

    print("Transliteration completed and JSON file created.")

# Usage example: read the word list from Drive, write japanese.json to the working directory
word_list_file = '/content/drive/MyDrive/japanese.txt'
output_file = 'japanese.json'
transliterate_japanese_words(word_list_file=word_list_file, output_file=output_file)


Transliteration completed and JSON file created.


In [1]:
import json

# Per-character Greek -> IPA lookup table. Keys are single Greek characters
# (lower case first, then accented / diaeresis vowels and final sigma, then
# the upper-case counterparts); values are the IPA strings substituted for
# them. Most values are one symbol; xi and psi map to two-symbol strings.
GREEK_TO_IPA = {
    'α': 'a',
    'β': 'v',
    'γ': 'ɣ',
    'δ': 'ð',
    'ε': 'ɛ',
    'ζ': 'z',
    'η': 'i',
    'θ': 'θ',
    'ι': 'i',
    'κ': 'k',
    'λ': 'l',
    'μ': 'm',
    'ν': 'n',
    'ξ': 'ks',
    'ο': 'ɔ',
    'π': 'p',
    'ρ': 'r',
    'σ': 's',
    'τ': 't',
    'υ': 'u',
    'φ': 'f',
    'χ': 'x',
    'ψ': 'ps',
    'ω': 'ɔ',
    'ά': 'a',
    'έ': 'ɛ',
    'ή': 'i',
    'ί': 'i',
    'ό': 'ɔ',
    'ύ': 'u',
    'ώ': 'ɔ',
    'ς': 's',
    'ϊ': 'i',
    'ΐ': 'i',
    'ϋ': 'u',
    'ΰ': 'u',
    'Α': 'a',
    'Β': 'v',
    'Γ': 'ɣ',
    'Δ': 'ð',
    'Ε': 'ɛ',
    'Ζ': 'z',
    'Η': 'i',
    'Θ': 'θ',
    'Ι': 'i',
    'Κ': 'k',
    'Λ': 'l',
    'Μ': 'm',
    'Ν': 'n',
    'Ξ': 'ks',
    'Ο': 'ɔ',
    'Π': 'p',
    'Ρ': 'r',
    'Σ': 's',
    'Τ': 't',
    'Υ': 'u',
    'Φ': 'f',
    'Χ': 'x',
    'Ψ': 'ps',
    'Ω': 'ɔ',
    'Ά': 'a',
    'Έ': 'ɛ',
    'Ή': 'i',
    'Ί': 'i',
    'Ό': 'ɔ',
    'Ύ': 'u',
    'Ώ': 'ɔ',
    'Ϊ': 'i',
    'Ϋ': 'u'
}


def transcribe_greek_words(json_file):
    """Fill each row's ``pronunciation`` in ``json_file`` via the ``GREEK_TO_IPA`` table.

    Every character of a row's ``word`` is replaced by its ``GREEK_TO_IPA``
    entry (characters without an entry are kept unchanged) and the joined
    result is stored under ``pronunciation``. The file is then overwritten in
    place (UTF-8, non-ASCII preserved, no indentation).
    """
    with open(json_file, 'r', encoding='utf-8') as source:
        data = json.load(source)

    for entry in data['rows']:
        entry['pronunciation'] = ''.join(
            GREEK_TO_IPA.get(letter, letter) for letter in entry['word']
        )

    with open(json_file, 'w', encoding='utf-8') as target:
        json.dump(data, target, ensure_ascii=False)

    print("Transcription completed and JSON file updated.")

# Usage example: annotate greeeeek.json in place
json_file_path = '/content/drive/MyDrive/Colab Notebooks/greeeeek.json'
transcribe_greek_words(json_file=json_file_path)


Transcription completed and JSON file updated.


In [3]:
import json

# Per-character Hebrew -> IPA lookup table. Keys are single characters
# (letters first, then pointing and other marks); values are the IPA strings
# substituted for them. An empty-string value means the character contributes
# nothing to the joined transcription.
HEBREW_TO_IPA = {
    'א': '',
    'ב': 'v',
    'ג': 'ɡ',
    'ד': 'd',
    'ה': 'h',
    'ו': 'v',
    'ז': 'z',
    'ח': 'χ',
    'ט': 't',
    'י': 'j',
    'כ': 'k',
    'ך': 'k',
    'ל': 'l',
    'מ': 'm',
    'ם': 'm',
    'נ': 'n',
    'ן': 'n',
    'ס': 's',
    'ע': '',
    'פ': 'p',
    'ף': 'p',
    'צ': 'ts',
    'ץ': 'ts',
    'ק': 'k',
    'ר': 'ʁ',
    'ש': 'ʃ',
    'ת': 't',
    'װ': 'v',
    'ױ': 'ɔj',
    'ײ': 'ej',
    'ְ': '',
    'ֱ': 'ɛ',
    'ֲ': 'a',
    'ֳ': 'ɔ',
    'ִ': 'i',
    'ֵ': 'e',
    'ֶ': 'ɛ',
    'ַ': 'a',
    'ָ': 'a',
    'ֹ': 'o',
    'ֺ': 'o',
    'ֻ': 'u',
    'ּ': '',
    'ֽ': '',
    '־': '',
    'ֿ': '',
    '׀': '',
    'ׁ': '',
    'ׂ': '',
    '׃': '',
    'ׄ': '',
    'ׅ': '',
    'ׇ': ''
}


def convert_hebrew_to_ipa(hebrew_word):
    """Return ``hebrew_word`` with each character replaced via ``HEBREW_TO_IPA``.

    Characters that have no entry in the table are copied through unchanged.
    """
    pieces = []
    for symbol in hebrew_word:
        pieces.append(HEBREW_TO_IPA.get(symbol, symbol))
    return ''.join(pieces)

def process_hebrew_files(input_file1, input_file2, input_file3, output_file):
    """Merge three one-word-per-line files into a single ``{"rows": [...]}`` JSON file.

    The files are read in argument order. Each line is stripped of surrounding
    whitespace; blank lines are skipped, and a word that has already been seen
    (in this or an earlier file) is ignored. Every new word becomes a row with
    id ``he_<row index>``, the word itself, an 18-element ``0.0`` embedding and
    a ``pronunciation`` computed by ``convert_hebrew_to_ipa``.

    Args:
        input_file1, input_file2, input_file3: Paths of UTF-8 text files.
        output_file: Path of the JSON file to write (UTF-8, 4-space indent,
            non-ASCII preserved).
    """
    data = {'rows': []}
    seen_words = set()

    # One shared loop instead of three copy-pasted read blocks.
    for path in (input_file1, input_file2, input_file3):
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                word = line.strip()
                # Skip blank lines (they would otherwise create a row for '')
                # and words that were already added.
                if not word or word in seen_words:
                    continue
                seen_words.add(word)
                data['rows'].append({
                    'id': f'he_{len(data["rows"])}',
                    'word': word,
                    'embedding': [0.0] * 18,
                    'pronunciation': convert_hebrew_to_ipa(word)
                })

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

# Usage example: merge the three Drive word lists into he.json
input_file1 = '/content/drive/MyDrive/words.txt'
input_file2 = '/content/drive/MyDrive/bible.txt'
input_file3 = '/content/drive/MyDrive/all_with_fatverb.txt'
output_file = '/content/drive/MyDrive/Colab Notebooks/he.json'
process_hebrew_files(
    input_file1=input_file1,
    input_file2=input_file2,
    input_file3=input_file3,
    output_file=output_file,
)

