In [5]:
import json

# The three JSON-lines splits to inspect, in reporting order.
files = ["hin_train.json", "hin_valid.json", "hin_test.json"]

for fname in files:
    # Parse every non-blank line of the split as one JSON record.
    with open(fname, 'r', encoding='utf-8') as handle:
        data = []
        for raw_line in handle:
            if raw_line.strip():
                data.append(json.loads(raw_line))

    total = len(data)
    # Records that carry a "score" key at the top level.
    with_score = list(filter(lambda rec: "score" in rec, data))

    # One report per split, closed by a 40-dash separator line.
    print(
        f"{fname}:",
        f"  Total instances: {total}",
        f"  Instances with 'score' field: {len(with_score)}",
        "-" * 40,
        sep="\n",
    )

hin_train.json:
  Total instances: 1299155
  Instances with 'score' field: 1299155
----------------------------------------
hin_valid.json:
  Total instances: 6357
  Instances with 'score' field: 6357
----------------------------------------
hin_test.json:
  Total instances: 10112
  Instances with 'score' field: 0
----------------------------------------


In [2]:
import json
import os
import random

# Output directory for the converted text files; a no-op when it already exists.
os.makedirs("dataset", exist_ok=True)

# Raw JSON-lines inputs used by this cell.
train_file, test_file = "hin_train.json", "hin_test.json"

# Step 1: Process & random subsample
def process_json(input_file, output_file, sample_size=None):
    """Copy a JSON-lines file, optionally keeping only a random subset of it.

    Args:
        input_file: Path of the JSON-lines file to read; blank lines are skipped.
        output_file: Path of the JSON-lines file to write (opened in 'w' mode).
        sample_size: When not None, the number of records to keep, drawn
            without replacement via ``random.sample`` and capped at the number
            of records available. ``None`` keeps every record in file order.

    Returns:
        ``output_file``, exactly as passed in.
    """
    records = []
    with open(input_file, 'r', encoding='utf-8') as src:
        for raw in src:
            if raw.strip():
                records.append(json.loads(raw))

    if sample_size is not None:
        # Cap the request at the population size so random.sample cannot overshoot.
        keep = min(sample_size, len(records))
        records = random.sample(records, keep)  # random subsample

    # One compact JSON document per line, non-ASCII characters written verbatim.
    with open(output_file, 'w', encoding='utf-8') as out_f:
        out_f.writelines(
            json.dumps(rec, ensure_ascii=False) + "\n" for rec in records
        )
    return output_file

# Seed the global RNG immediately before sampling so the 100k subsample written
# below is the same on every run of this cell (process_json draws from the
# module-level `random` generator).
random.seed(42)

# Random 100k subsample of the training split.
train_processed_file = process_json(train_file, "hin_train__processed_random.json", sample_size=100000)
# No sample_size is passed, so every test record is copied.
test_processed_file = process_json(test_file, "hin_test__processed.json")  # keep full test

# Step 2: Convert processed JSON to tab-separated .txt
def json_to_txt(json_file, txt_file):
    """Convert a JSON-lines file into tab-separated ``english<TAB>native`` lines.

    For every non-blank input line the 'english word' and 'native word' values
    are read (missing keys count as empty) and stripped of surrounding
    whitespace. A pair is written only when both values are non-empty.

    Args:
        json_file: Path of the JSON-lines file to read.
        txt_file: Path of the text file to write (opened in 'w' mode).
    """
    with open(json_file, 'r', encoding='utf-8') as src:
        with open(txt_file, 'w', encoding='utf-8') as dst:
            for raw in src:
                if not raw.strip():
                    continue  # skip blank lines
                record = json.loads(raw)
                english = record.get('english word', '').strip()
                native = record.get('native word', '').strip()
                if not (english and native):
                    continue  # drop incomplete pairs
                dst.write(english + "\t" + native + "\n")

# Convert each processed split into its tab-separated counterpart under dataset/.
for processed_path, txt_path in (
    (train_processed_file, "dataset/train_translit.txt"),
    (test_processed_file, "dataset/test_translit.txt"),
):
    json_to_txt(processed_path, txt_path)

print("Processed JSON saved and converted to .txt successfully!")

Processed JSON saved and converted to .txt successfully!


In [1]:
import json
import os
import random
import csv
from collections import defaultdict

# Create both output directories up front; existing directories are left as-is.
for out_dir in ("dataset", "aks_dataset"):
    os.makedirs(out_dir, exist_ok=True)

# Raw JSON-lines splits read by this cell.
train_file, test_file, valid_file = "hin_train.json", "hin_test.json", "hin_valid.json"

def process_json_stratified(input_file, output_file, sample_size=None):
    """Copy a JSON-lines file, optionally down-sampling it stratified by 'source'.

    With ``sample_size=None`` every record is written back in file order.
    Otherwise records are grouped by their 'source' value ('unknown' when the
    key is absent) and each group contributes a share proportional to its
    size. Shares are allocated with the largest-remainder method, so the
    output holds exactly ``min(sample_size, number_of_records)`` records;
    rounding each share independently can miss the target (three equal groups
    and ``sample_size=10`` would yield 9). The sampled records are shuffled
    before writing.

    Args:
        input_file: Path of the JSON-lines file to read; blank lines are skipped.
        output_file: Path of the JSON-lines file to write (opened in 'w' mode).
        sample_size: Desired number of records, or None to keep everything.

    Returns:
        ``output_file``, exactly as passed in.

    Raises:
        ValueError: If ``sample_size`` is negative.
    """
    if sample_size is not None and sample_size < 0:
        raise ValueError(f"sample_size must be non-negative, got {sample_size}")

    # Step 1: Load JSON lines
    with open(input_file, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]

    if sample_size is not None:
        # Step 2: Group by 'source' field
        groups = defaultdict(list)
        for item in data:
            groups[item.get('source', 'unknown')].append(item)

        total_count = len(data)
        # Never ask for more records than exist.
        target = min(int(sample_size), total_count)

        # Step 3: Allocate per-source quotas with exact integer arithmetic.
        # floor(len * target / total) never exceeds len(items) because
        # target <= total_count, so random.sample below cannot oversample.
        quotas = {}
        leftovers = []
        for src, items in groups.items():
            whole, remainder = divmod(len(items) * target, total_count)
            quotas[src] = whole
            leftovers.append((remainder, src))

        # Hand the records lost to flooring to the groups with the largest
        # remainders. The sort is stable, so ties keep first-seen group order.
        shortfall = target - sum(quotas.values())
        leftovers.sort(key=lambda pair: pair[0], reverse=True)
        for _, src in leftovers[:shortfall]:
            quotas[src] += 1

        # Step 4: Draw each group's quota without replacement, then mix groups.
        stratified_sample = []
        for src, items in groups.items():
            stratified_sample.extend(random.sample(items, quotas[src]))

        data = stratified_sample
        random.shuffle(data)

    # Step 5: Write to output, one JSON document per line
    with open(output_file, 'w', encoding='utf-8') as out_f:
        for item in data:
            json.dump(item, out_f, ensure_ascii=False)
            out_f.write("\n")

    return output_file


# Seed the global RNG immediately before sampling so the stratified subsample
# written below is the same on every run of this cell.
random.seed(42)

# Step 1: Stratified sample for train (100k)
train_processed_file = process_json_stratified(
    train_file,
    "dataset/hin_train__processed_stratified.json",
    sample_size=100000
)

# Step 2: Keep full test and valid sets (no sample_size → no sampling)
test_processed_file = process_json_stratified(
    test_file, "dataset/hin_test__processed.json"
)
valid_processed_file = process_json_stratified(
    valid_file, "dataset/hin_valid__processed.json"
)

# Step 3: Convert processed JSON → CSV (no headers)
def json_to_csv(json_file, csv_file):
    """Convert a JSON-lines file into a header-less two-column CSV.

    For every non-blank input line the 'english word' and 'native word' values
    are read (missing keys count as empty) and stripped of surrounding
    whitespace. A row ``[english, native]`` is written only when both values
    are non-empty.

    Args:
        json_file: Path of the JSON-lines file to read.
        csv_file: Path of the CSV file to write (opened in 'w' mode with
            ``newline=''`` so the csv module controls line endings).
    """
    with open(json_file, 'r', encoding='utf-8') as src, \
            open(csv_file, 'w', newline='', encoding='utf-8') as dst:
        emit_row = csv.writer(dst).writerow
        for raw in src:
            if not raw.strip():
                continue  # skip blank lines
            record = json.loads(raw)
            pair = [
                record.get('english word', '').strip(),
                record.get('native word', '').strip(),
            ]
            if all(pair):
                emit_row(pair)

# Convert each processed split into its CSV counterpart under aks_dataset/.
for processed_path, csv_path in (
    (train_processed_file, "aks_dataset/train_s_translit.csv"),
    (test_processed_file, "aks_dataset/test_translit.csv"),
    (valid_processed_file, "aks_dataset/valid_translit.csv"),
):
    json_to_csv(processed_path, csv_path)

print("Stratified sampling complete — processed JSON & .csv files saved!")

Stratified sampling complete — processed JSON & .csv files saved!


In [4]:
import json
import random
from collections import defaultdict
import pandas as pd

random.seed(42)  # reproducible sampling

TRAIN_FILE = "rough/hin_train.json"
SAMPLE_SIZE = 100000  # desired stratified sample size

# Load the JSON-lines training split; blank lines are skipped.
with open(TRAIN_FILE, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# Bucket records by their 'source' field ('unknown' when the key is absent).
groups = defaultdict(list)
for item in data:
    src = item.get('source', 'unknown')
    groups[src].append(item)

# Original distribution dataframe, largest source first.
orig_rows = []
total_count = len(data)
for src, items in sorted(groups.items(), key=lambda x: -len(x[1])):
    count = len(items)
    proportion = count / total_count if total_count > 0 else 0.0
    orig_rows.append((src, count, proportion))

orig_df = pd.DataFrame(orig_rows, columns=['source', 'count', 'proportion'])
orig_df['proportion'] = (orig_df['proportion'] * 100).round(3)  # show as percent
orig_df.loc['sum'] = ['sum', orig_df['count'].sum(), orig_df['proportion'].sum()]

# Create stratified sample in-memory (no saving).
# NOTE(review): this sample is drawn afresh here from TRAIN_FILE; it is not read
# back from any file written elsewhere in the notebook — confirm that the table
# below is meant to describe the saved training subset.
# If sample size >= total, fall back to full data (no sampling).
if SAMPLE_SIZE >= total_count:
    sampled = data.copy()
else:
    sampled = []
    for src, items in groups.items():
        proportion = len(items) / total_count
        n_samples = int(round(proportion * SAMPLE_SIZE))
        # clip to what the group actually holds
        n_samples = min(n_samples, len(items))
        if n_samples > 0:
            sampled.extend(random.sample(items, n_samples))
    # Per-group rounding can leave the total slightly off SAMPLE_SIZE:
    # top up from the unsampled records if short, trim at random if over.
    if len(sampled) < SAMPLE_SIZE:
        # Track picked records by identity. Comparing dicts with
        # `itm not in sampled` costs O(len(data) * len(sampled)) and would also
        # exclude unsampled records that merely compare equal to a sampled one.
        picked_ids = {id(itm) for itm in sampled}
        remaining = [itm for itm in data if id(itm) not in picked_ids]
        need = SAMPLE_SIZE - len(sampled)
        if remaining:
            sampled.extend(random.sample(remaining, min(need, len(remaining))))
    elif len(sampled) > SAMPLE_SIZE:
        sampled = random.sample(sampled, SAMPLE_SIZE)

sample_total = len(sampled)

# Compute sampled distribution
sample_groups = defaultdict(int)
for item in sampled:
    src = item.get('source', 'unknown')
    sample_groups[src] += 1

# Same row order as orig_df so the two tables line up source by source.
sample_rows = []
for src, _ in sorted(groups.items(), key=lambda x: -len(x[1])):
    sampled_count = sample_groups.get(src, 0)
    sampled_prop = sampled_count / sample_total * 100 if sample_total > 0 else 0.0
    sample_rows.append((src, sampled_count, round(sampled_prop, 3)))

sample_df = pd.DataFrame(sample_rows, columns=['source', 'sampled_count', 'sampled_proportion_pct'])
sample_df.loc['sum'] = ['sum', sample_df['sampled_count'].sum(), sample_df['sampled_proportion_pct'].sum()]

# Caption reports the size actually drawn rather than a hard-coded "100k",
# so it stays correct when SAMPLE_SIZE is changed.
sample_caption = f"Stratified data (Our Train Set) (total={sample_total:,})"

# Print nicely: in notebooks display will show styled tables, otherwise fallback to to_string()
try:
    from IPython.display import display
    display(orig_df.style.set_caption("Original train source distribution (counts & %)").format({'count':'{:,}','proportion':'{:.3f}%'}))
    display(sample_df.style.set_caption(sample_caption))
except Exception:
    # Deliberate best-effort fallback to plain text when rich display fails.
    print("\nOriginal train source distribution (counts & %):")
    print(orig_df.to_string(index=False))
    print(f"\n{sample_caption}:")
    print(sample_df.to_string(index=False))

Unnamed: 0,source,count,proportion
0,IndicCorp,956190,73.601%
1,Samanantar,152778,11.760%
2,Existing,131773,10.143%
3,Dakshina,24727,1.903%
4,Wikidata,24528,1.888%
5,AK-Freq,9159,0.705%
sum,sum,1299155,100.000%


Unnamed: 0,source,sampled_count,sampled_proportion_pct
0,IndicCorp,73601,73.601
1,Samanantar,11760,11.76
2,Existing,10143,10.143
3,Dakshina,1903,1.903
4,Wikidata,1888,1.888
5,AK-Freq,705,0.705
sum,sum,100000,100.0


In [None]:
def json_to_txt(json_file, txt_file):
    """Convert a JSON-lines file into tab-separated ``english<TAB>native`` lines.

    For every non-blank input line the 'english word' and 'native word' values
    are read (missing keys count as empty) and stripped of surrounding
    whitespace. A pair is written only when both values are non-empty.

    Args:
        json_file: Path of the JSON-lines file to read.
        txt_file: Path of the text file to write (opened in 'w' mode).
    """
    with open(json_file, 'r', encoding='utf-8') as src:
        with open(txt_file, 'w', encoding='utf-8') as dst:
            for raw in src:
                if not raw.strip():
                    continue  # skip blank lines
                record = json.loads(raw)
                english = record.get('english word', '').strip()
                native = record.get('native word', '').strip()
                if not (english and native):
                    continue  # drop incomplete pairs
                dst.write(english + "\t" + native + "\n")

# Convert each processed split into its tab-separated counterpart under dataset/.
# train_processed_file / test_processed_file are not assigned in this cell and
# must already exist in the kernel.
for processed_path, txt_path in (
    (train_processed_file, "dataset/train_s_translit.txt"),
    (test_processed_file, "dataset/test_translit.txt"),
):
    json_to_txt(processed_path, txt_path)

print("✅ Stratified sampling complete — processed JSON & .txt files saved!")