In [1]:
import os
from pprint import pprint

from elasticsearch import Elasticsearch

# Connection settings come from the environment so real credentials never have
# to be committed with the notebook. The fallbacks reproduce the values that
# were previously hard-coded, so the cell behaves the same when nothing is set.
ES_URL = os.environ.get('ES_URL', 'http://localhost:9200')
ES_USERNAME = os.environ.get('ES_USERNAME', 'user')
ES_PASSWORD = os.environ.get('ES_PASSWORD', 'password')

es = Elasticsearch(ES_URL, basic_auth=(ES_USERNAME, ES_PASSWORD))

# info() performs a request against the cluster, so reaching the print below
# means the endpoint and credentials were accepted.
client_info = es.info()
print(' Connected to Elasticsearch!')
pprint(client_info.body)

 Connected to Elasticsearch!
{'cluster_name': 'docker-cluster',
 'cluster_uuid': 'YkeBIv4WSkuNojcRv-HDJQ',
 'name': 'a15d97879705',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2025-06-18T22:09:56.772581489Z',
             'build_flavor': 'default',
             'build_hash': 'cc7302afc8499e83262ba2ceaa96451681f0609d',
             'build_snapshot': False,
             'build_type': 'docker',
             'lucene_version': '10.1.0',
             'minimum_index_compatibility_version': '8.0.0',
             'minimum_wire_compatibility_version': '8.18.0',
             'number': '9.0.3'}}


In [6]:
# Start from a clean slate so this cell can be re-run safely:
# ignore_unavailable=True keeps the delete from failing when the index
# does not exist yet.
es.indices.delete(index="transcription", ignore_unavailable=True)

# The index is created without explicit settings or mappings; shard/replica
# counts are therefore whatever the cluster applies by default.
es.indices.create(index="transcription")

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'transcription'})

In [7]:
import json
import os
from glob import glob
def insert_docs(doc):
    """Index a single document into the 'transcription' index.

    Args:
        doc: The document to store (the caller in this notebook passes a dict
            describing one transcription segment).

    Returns:
        The response object returned by ``es.index``; callers read keys such
        as ``'_id'`` from it.
    """
    # `document=` is the documented keyword for the payload of the index API;
    # `body=` is the legacy spelling kept only for backward compatibility.
    return es.index(index='transcription', document=doc)

def print_info(res):
    """Print a one-line summary of an index response.

    Args:
        res: Mapping that provides ``'_id'``, ``'result'`` and
            ``'_shards'['total']`` (the keys read below).

    Returns:
        None. The summary is written to stdout.
    """
    # Message text fixed: the original said "shreds" instead of "shards".
    print(f"""
Document Id: {res['_id']} is {res['result']} and is split into {res['_shards']['total']} shards
          """)
    



def process_transcription_file(file_path):
    """Load one transcription JSON file and index each of its segments.

    The file is read as UTF-8 JSON. Every entry of its ``'segment'`` list is
    turned into its own document (carrying the file's ``'video_link'``, the
    segment's position, and its ``'start'``/``'end'``/``'text'`` values) and
    passed to ``insert_docs``. Progress is printed as it goes.

    Any exception raised along the way is caught and reported on stdout, so a
    bad file does not stop the caller; segments indexed before the failure
    are not rolled back.

    Args:
        file_path: Path of the JSON file to process.

    Returns:
        None.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            transcript = json.load(handle)

        print(f"Processing file: {os.path.basename(file_path)}")

        # One Elasticsearch document per segment.
        for position, segment in enumerate(transcript['segment']):
            response = insert_docs({
                'video_link': transcript['video_link'],
                'segment_index': position,
                'start': segment['start'],
                'end': segment['end'],
                'text': segment['text']
            })
            print(f"  Segment {position}: {response['_id']}")

        print(f"✅ Completed: {len(transcript['segment'])} segments inserted\n")

    except Exception as exc:
        # Best-effort ingestion: report the problem and let the caller move on.
        print(f"❌ Error processing {file_path}: {exc}\n")

# Method 1: Process all JSON files in a directory
# Directory that holds one JSON transcription file per video.
transcription_dir = "../transcriptions_1445h_v3 - Copy"

# glob() yields matches in arbitrary, filesystem-dependent order; sorting makes
# the ingestion order (and therefore the printed log) reproducible across runs.
json_files = sorted(glob(os.path.join(transcription_dir, "*.json")))

print(f"Found {len(json_files)} JSON files to process\n")

# Each file is handled independently; process_transcription_file reports its
# own errors, so one bad file does not abort the loop.
for file_path in json_files:
    process_transcription_file(file_path)

print("🎉 All files processed!")

Found 49 JSON files to process

Processing file: 0T28iTJFhxw.json
  Segment 0: evTL5ZcBz5YhD79RVs2-
  Segment 1: e_TL5ZcBz5YhD79RVs3q
  Segment 2: fPTL5ZcBz5YhD79RVs3w
  Segment 3: ffTL5ZcBz5YhD79RVs33
  Segment 4: fvTL5ZcBz5YhD79RV80A
  Segment 5: f_TL5ZcBz5YhD79RV80F
  Segment 6: gPTL5ZcBz5YhD79RV80J
  Segment 7: gfTL5ZcBz5YhD79RV80S
  Segment 8: gvTL5ZcBz5YhD79RV80Y
  Segment 9: g_TL5ZcBz5YhD79RV80c
  Segment 10: hPTL5ZcBz5YhD79RV80h
  Segment 11: hfTL5ZcBz5YhD79RV80m
  Segment 12: hvTL5ZcBz5YhD79RV80q
  Segment 13: h_TL5ZcBz5YhD79RV80v
  Segment 14: iPTL5ZcBz5YhD79RV800
  Segment 15: ifTL5ZcBz5YhD79RV804
  Segment 16: ivTL5ZcBz5YhD79RV81B
  Segment 17: i_TL5ZcBz5YhD79RV81F
  Segment 18: jPTL5ZcBz5YhD79RV81K
  Segment 19: jfTL5ZcBz5YhD79RV81P
  Segment 20: jvTL5ZcBz5YhD79RV81U
  Segment 21: j_TL5ZcBz5YhD79RV81Y
  Segment 22: kPTL5ZcBz5YhD79RV81e
  Segment 23: kfTL5ZcBz5YhD79RV81j
  Segment 24: kvTL5ZcBz5YhD79RV81o
  Segment 25: k_TL5ZcBz5YhD79RV81s
  Segment 26: lPTL5ZcBz5YhD79RV81x

In [8]:
from pprint import pprint
# Fetch the mapping of the 'transcription' index and show only its field
# definitions. The index was created without explicit mappings, so this is
# what Elasticsearch derived from the indexed documents.
index_mapping = es.indices.get_mapping(index="transcription")
pprint(index_mapping["transcription"]["mappings"]["properties"])

{'end': {'type': 'float'},
 'segment_index': {'type': 'long'},
 'start': {'type': 'float'},
 'text': {'fields': {'keyword': {'ignore_above': 256, 'type': 'keyword'}},
          'type': 'text'},
 'video_link': {'fields': {'keyword': {'ignore_above': 256, 'type': 'keyword'}},
                'type': 'text'}}
