<a href="https://colab.research.google.com/github/IkuStudies/abraxalexicon/blob/main/IPA2VEC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install numpy
!pip install tqdm

In [23]:
import json
import numpy as np

# IPA mapping dictionary: IPA symbol -> integer class code (1..23).
# Each source line below lists the symbols that share one code.
# NOTE: many keys are sequences of more than one code point (a base letter plus
# combining diacritics, or digraphs such as 'th' and 'ɽr'). Code that walks an
# input string one code point at a time can never match those keys.
# Duplicate keys that appeared twice with the same value ('n', 'ɕ', 'ʑ') are
# listed only once here; the resulting dictionary is unchanged.
ipa_mapping = {
    'ʉ': 1, 'ɯ': 1, 'u': 1, 'ʏ': 1, 'ʊ': 1,
    'ɵ': 2, 'ɤ': 2, 'o': 2, 'ə': 2, 'ɤ̞': 2,
    'e̞': 3, 'ø̞': 3, 'ɛ': 3, 'œ': 3, 'ɶ': 3, 'ɜ': 3, 'ɞ': 3, 'e': 3, 'ø': 3, 'ɘ': 3, 'ɪ': 3, 'i': 3, 'y': 3, 'ɨ': 3,
    'ɐ': 4, 'ʌ': 4, 'ʡ': 4, 'ʔ': 4, 'ʔ̞': 4, 'ʡ̆': 4,
    'æ': 5, 'a': 5, 'ä': 5,
    'o̞': 6, 'ɔ': 6, 'ɑ': 6, 'ɒ': 6,
    'm̥': 7, 'm': 7, 'ɱ': 7, 'ɳ̊': 7,
    'n̼': 8, 'n̥': 8, 'n': 8, 'ɳ': 8, 'ɲ̊': 8, 'ɲ': 8, 'ŋ̊': 8, 'ŋ': 8, 'ɴ': 8,
    'p': 9, 'b': 9, 'p̪': 9, 'b̪': 9, 'ʙ̥': 9, 'ʙ': 9, 'ɹ̥': 9,
    'd̼': 10, 'd': 10, 'ɖ': 10,
    't': 11, 'ʈ': 11, 't̼': 11,
    'j': 12, 'ɟ': 12,
    'k': 13, 'q': 13, 'x': 13, 'χ': 13, 'ħ': 13, 'c': 13,
    'ɣ': 14, 'ɡ': 14, 'ɢ': 14, 'ɢ̆': 14,
    'z': 15, 'ɮ': 15, 'ʃ': 15, 'ʒ': 15, 'ʂ': 15, 'ʐ': 15, 'ɕ': 15, 'ʑ': 15, 'ʝ': 15, 's': 15,
    'ɸ': 16, 'β': 16, 'f': 16, 'v': 16, 'ⱱ̟': 16, 'ⱱ': 16,
    'θ̼': 17, 'ð̼': 17, 'θ': 17, 'ð': 17, 'θ̠': 17, 'ð̠': 17, 'ɹ̠̊˔': 17, 'ɹ̠˔': 17, 'ɻ̊˔': 17, 'ɻ˔': 17, 'th': 17,
    'ɺ̥': 18, 'ɺ': 18, 'ɾ̼': 18, 'ɾ̥': 18, 'ɾ': 18, 'ɽ̊': 18, 'ɽ': 18, 'ɹ': 18, 'ɻ': 18, 'ʁ': 18,
    'ç': 19, 'ʕ': 19, 'h': 19, 'ɦ': 19,
    'ɰ': 20, 'w': 20,
    'r': 21, 'r̥': 21, 'ɽ̊r̥': 21, 'ɽr': 21, 'ʀ̥': 21, 'ʀ': 21,
    'ʜ': 22, 'ʢ': 22,
    'ɬ': 23, 'ꞎ': 23, 'ʎ̝': 23, 'ʟ̝': 23, 'l': 23, 'ɭ': 23, 'ʎ': 23, 'ʟ': 23, 'ʟ̠': 23, 'ʎ̆': 23, 'ʟ̆': 23
}



# Custom JSON encoder to handle NumPy arrays and NumPy scalar values
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that knows how to serialise NumPy containers.

    * ``np.ndarray`` is emitted as a (possibly nested) JSON list.
    * NumPy scalars (``np.int64``, ``np.float32``, ``np.bool_``, ...) are
      emitted as the equivalent built-in Python value.

    Anything else is delegated to ``json.JSONEncoder.default``, which raises
    ``TypeError`` for unsupported objects.
    """

    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.generic):
            # np.generic is the base class of every NumPy scalar type;
            # .item() converts it to the matching built-in Python scalar.
            return obj.item()
        return super().default(obj)

# Function to transcribe IPA string to numerical values using ipa_mapping
def transcribe_ipa(ipa_string, mapping=None):
    """Convert an IPA string into an integer array of class codes.

    The string is scanned left to right. At each position the longest key of
    ``mapping`` that matches is consumed and its code is emitted, so keys made
    of several code points (a letter plus combining diacritics, or digraphs
    such as ``'th'``) are recognised as a unit instead of being split into
    their individual characters. Characters that start no key are skipped.

    Args:
        ipa_string: The IPA transcription to convert.
        mapping: Optional dict from IPA symbol to integer code. Defaults to
            the module-level ``ipa_mapping``.

    Returns:
        A 1-D ``np.ndarray`` with integer dtype (also when it is empty, so
        that later zero-padding yields integers rather than floats).
    """
    if mapping is None:
        mapping = ipa_mapping

    # Longest key length bounds how far ahead we need to look at each position.
    longest_key = max(map(len, mapping), default=0)

    codes = []
    pos = 0
    total = len(ipa_string)
    while pos < total:
        # Try the widest candidate first so 'th' wins over 't' followed by 'h'.
        for width in range(min(longest_key, total - pos), 0, -1):
            code = mapping.get(ipa_string[pos:pos + width])
            if code is not None:
                codes.append(code)
                pos += width
                break
        else:
            # No key starts here: ignore this character (stress marks, etc.).
            pos += 1

    # An explicit dtype keeps an empty result integer-typed (NumPy would
    # otherwise default an empty list to float64).
    return np.array(codes, dtype=int)

input_file = '/content/drive/MyDrive/boyah/turkish.json'
output_file = '/content/drive/MyDrive/Colab Notebooks/updated_turkish.json'
max_length = 18  # Maximum length for numerical representations

# Read the input explicitly as UTF-8: the pronunciations are IPA text, and the
# platform default encoding is not guaranteed to decode them correctly.
with open(input_file, 'r', encoding='utf-8') as file_in:
    data = json.load(file_in)

rows = data['rows']
for row in rows:
    # Convert pronunciation to numerical representation, keeping at most
    # max_length codes (slicing is a no-op for shorter transcriptions).
    transcription = transcribe_ipa(row['pronunciation'])[:max_length]

    # Pad the transcription values with zeros to match the maximum length
    padded_transcription = np.pad(transcription, (0, max_length - len(transcription)))

    # Replace the incorrect embedding with the correct embedding.
    # The int cast keeps the output integer-valued even when nothing in the
    # pronunciation was mapped (an empty NumPy array defaults to float64).
    row['embedding'] = padded_transcription.astype(int).tolist()

# Open the output only after all rows were processed, so a failure above does
# not leave a truncated file behind. Written with formatted indentation and
# non-ASCII characters preserved.
with open(output_file, 'w', encoding='utf-8') as file_out:
    json.dump(data, file_out, cls=NumpyEncoder, indent=4, ensure_ascii=False)


In [42]:
import json

input_file = '/content/drive/MyDrive/Colab Notebooks/updated_turkish.json'
output_file = '/content/drive/MyDrive/Colab Notebooks/tk.json'
code = 'tk_'  # Prefix placed in front of every row's "id"

# Both files are opened together: the input for reading, the output for writing.
with open(input_file, 'r', encoding='utf-8') as file_in, \
        open(output_file, 'w', encoding='utf-8') as file_out:
    data = json.load(file_in)

    # Rewrite each id in place as "<code><original id>".
    for row in data['rows']:
        row['id'] = f"{code}{row['id']}"

    # Pretty-print the result and keep non-ASCII characters unescaped.
    json.dump(data, file_out, indent=4, ensure_ascii=False)


In [None]:
import json
import math

def split_json(input_file, output_prefix, num_files):
    """Split the ``"rows"`` list of a JSON file into ``num_files`` smaller files.

    The rows are divided into consecutive chunks of
    ``ceil(total_rows / num_files)`` entries. Chunk ``i`` is written to
    ``f"{output_prefix}_{i}.json"`` as ``{"rows": [...]}``; trailing chunks may
    be shorter or empty. Only the ``"rows"`` key of the input is carried over.

    Files are read and written as UTF-8 and non-ASCII characters are written
    unescaped, matching the other JSON writers in this notebook.

    Args:
        input_file: Path of the JSON file to split.
        output_prefix: Path prefix for the generated files.
        num_files: Number of output files to produce.

    Raises:
        ZeroDivisionError: If ``num_files`` is 0.
        KeyError: If the input document has no ``"rows"`` key.
    """
    # Load everything first and close the input before any output is written.
    with open(input_file, 'r', encoding='utf-8') as source:
        rows = json.load(source)["rows"]

    total_rows = len(rows)
    rows_per_file = math.ceil(total_rows / num_files)

    for i in range(num_files):
        start_index = i * rows_per_file
        end_index = min(start_index + rows_per_file, total_rows)
        # Distinct names for path and handle: the handle must not shadow the path.
        output_path = f"{output_prefix}_{i}.json"
        with open(output_path, 'w', encoding='utf-8') as sink:
            json.dump({"rows": rows[start_index:end_index]}, sink,
                      indent=4, ensure_ascii=False)

# Example: cut the German export into three pieces. With the prefix below the
# pieces are named german_0.json, german_1.json and german_2.json.
input_file_path = '/content/drive/MyDrive/boyah/german_milvus_formatted.json'
output_prefix, num_files = 'german', 3

split_json(input_file_path, output_prefix, num_files)




In [None]:
import json

def fix_json_field_names(json_data):
    """Strip leading/trailing whitespace from every field name in the rows.

    Each entry of ``json_data["rows"]`` is rebuilt as a new dict whose keys are
    the stripped originals; values are carried over untouched. The rebuilt list
    replaces ``json_data["rows"]`` and the same ``json_data`` object is returned.
    """
    json_data["rows"] = [
        {name.strip(): content for name, content in record.items()}
        for record in json_data["rows"]
    ]
    return json_data

# Load the source document, decoding it as UTF-8
with open("/content/drive/MyDrive/boyah/arab2_2.json", mode="r", encoding="utf-8") as file:
    json_content = json.load(file)

# Clean up the field names of every row
fixed_json = fix_json_field_names(json_content)

# Save the cleaned document as UTF-8, indented and without ASCII escaping
with open("arab6.json", mode="w", encoding="utf-8") as file:
    json.dump(fixed_json, file, ensure_ascii=False, indent=4)


KeyboardInterrupt: ignored

In [None]:
git clone https://github.com/zilliztech/cloud-vectordb-examples.git
pip3 install pymilvus
cd cloud-vectordb-examples
cd python

Edit the config.ini file so that it contains the following settings:
uri = https://in01-daf1771fc27e45a.aws-us-west-2.vectordb.zillizcloud.com:19542
user = db_admin
password = ******
secure = True
python3 hello_zilliz_vectordb.py

The output should look similar to this:
Connecting to DB: https://in01-daf1771fc27e45a.aws-us-west-2.vectordb.zillizcloud.com:19542
Success!
Creating example collection: book
Schema: {...}
Success!
Inserting 100000 entities... 
Succeed in 6.0288 seconds!
Building AutoIndex...
Succeed in 18.9118 seconds!
Loading collection...
Succeed in 2.5229 seconds!
Searching vector:[[...][...]...]
search 0 latency: 0.0057 seconds!
Searching vector:[[...][...]...]
search 1 latency: 0.0049 seconds!
Searching vector:[[...][...]...]
search 2 latency: 0.0051 seconds!
...
...