In [1]:
from pprint import pprint 
from elasticsearch import Elasticsearch 
es = Elasticsearch('http://localhost:9200',basic_auth=('elastic','Fe1odvmZ')) 
client_info = es.info() 
print(' Connected to Elasticsearch!') 
pprint(client_info.body) 

 Connected to Elasticsearch!
{'cluster_name': 'docker-cluster',
 'cluster_uuid': 'YkeBIv4WSkuNojcRv-HDJQ',
 'name': 'a15d97879705',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2025-06-18T22:09:56.772581489Z',
             'build_flavor': 'default',
             'build_hash': 'cc7302afc8499e83262ba2ceaa96451681f0609d',
             'build_snapshot': False,
             'build_type': 'docker',
             'lucene_version': '10.1.0',
             'minimum_index_compatibility_version': '8.0.0',
             'minimum_wire_compatibility_version': '8.18.0',
             'number': '9.0.3'}}


In [5]:
es.indices.delete(index='transcription',ignore_unavailable=True)
es.indices.create(
    index="transcription",
    settings={
        "index":{
            "number_of_shards":3,
            "number_of_replicas":2
        }
    },
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'transcription'})

In [None]:
import json
import os
from glob import glob
from tqdm import tqdm
def insert_docs(doc):
    response=es.index(index='transcription',body=doc)
    return response

def print_info(res):
    print(f"""
Document Id: {res['_id']} is {res['result']} and is split into {res['_shards']['total']} shreds
          """)
    
docs_ids=[]

def process_transcription_file(file_path):
    """Process a single transcription file and insert all segments"""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        # print(f"Processing file: {os.path.basename(file_path)}")
        
        # Insert each segment as a separate document
        for i, segment in enumerate(tqdm(data['segment'],desc="Segments", leave=False)):
            doc = {
                'video_link': data['video_link'],
                'segment_index': i,
                'start': segment['start'],
                'end': segment['end'],
                'text': segment['text']
            }
            
            res = insert_docs(doc)
            docs_ids.append(res['_id'])
        
        
    except Exception as e:
        print(f"โ Error processing {file_path}: {e}\n")

# Method 1: Process all JSON files in a directory
transcription_dir = "../transcriptions_1445h_v3 - Copy"
json_files = glob(os.path.join(transcription_dir, "*.json"))

print(f"Found {len(json_files)} JSON files to process\n")

for file_path in json_files:
    process_transcription_file(file_path)

print("๐ All files processed!")

Found 49 JSON files to process



                                                            

๐ All files processed!




In [2]:
#this is to show the datatypes of 
from pprint import pprint
index_mapping=es.indices.get_mapping(index='transcription')
pprint(index_mapping["transcription"]["mappings"]["properties"])

{'end': {'type': 'float'},
 'segment_index': {'type': 'long'},
 'start': {'type': 'float'},
 'text': {'fields': {'keyword': {'ignore_above': 256, 'type': 'keyword'}},
          'type': 'text'},
 'video_link': {'fields': {'keyword': {'ignore_above': 256, 'type': 'keyword'}},
                'type': 'text'}}


In [3]:
# count documents
es.count(index='transcription')

ObjectApiResponse({'count': 7250, '_shards': {'total': 3, 'successful': 3, 'skipped': 0, 'failed': 0}})

In [None]:
#get a document
es.get(index='transcription',id=docs_ids[0])

ObjectApiResponse({'_index': 'transcription', '_id': '3tjeZ5kB4dmQKG5PqRxM', '_version': 1, '_seq_no': 2341, '_primary_term': 1, 'found': True, '_source': {'video_link': '0T28iTJFhxw', 'segment_index': 0, 'start': 0.0, 'end': 21.14, 'text': 'ุจุณู ุงููู ุงูุฑุญูู ุงูุฑุญูู ุงูุญูุฏ ููู ูุงูุตูุงุฉ ูุงูุณูุงู ุนูู ุฑุณูู ุงููู ูุนูู ุขูู ูุตุญุจู ููู ููุงู ุงูุณูุงู ุนูููู ุฌููุนุง ูุฑุญูุฉ ุงููู ูุจุฑูุงุชู ูุฃููุง ููุฑุญุจุง ุจูู ูู ุญููุฉ ุฃุฎุฑู ูู ูุฐุง ุงูุจุฑูุงูุฌ ููุฐู ุงูุญููุงุช ุงูุชู ุฎุตุตูุงูุง ููุบุฉ ุงูุฅุดุงุฑุฉ ููุฐู ุงููุฆุฉ ุงูุฑุงููุฉ ููุง ุชุญุฏุซูุง ุจุงูุฃูุณ'}})

In [3]:
# get doucment with query
query={
    "match":{
        "text":"ูุฑุถ"
    }
}

es.count(index='transcription',query=query)

ObjectApiResponse({'count': 16, '_shards': {'total': 3, 'successful': 3, 'skipped': 0, 'failed': 0}})

In [4]:
# serch doucment with match word
query={
    "match":{
        "text":"ุตูุงุฉ ุงูุณูุฑ"
    }
}

# Execute search and get results
response = es.search(index='transcription', query=query, size=10)
# Print results
for hit in response['hits']['hits']:
    print(f"Video: {hit['_source']['video_link']}")
    print(f"Time: {hit['_source']['start']} - {hit['_source']['end']}")
    print(f"Text: {hit['_source']['text']}")
    print("-" * 50)

Video: pSrHxXY44AI
Time: 499.24 - 514.48
Text: ุณุคุงููู ุฏูุชูุฑ ุดุฎุต ุชุญุฑู ูู ูุทูู ุจุนุฏ ุฃุฐุงู ุงูุธูุฑ ููู ููู ูููู ุงูุณูุฑ ููุทุน ูุณุงูุฉ ุงูุณูุฑ ูุฃุฎุฑ ุตูุงุฉ ุงูุธูุฑ ูุน ุงูุนุตุฑ ุฃู ุฌูุน ุชุฃุฎูุฑ
--------------------------------------------------
Video: WRTjOg8-B18
Time: 2381.58 - 2399.08
Text: ูุงูุตุงุญุจ ุงูุณูุฑ ุฅููุง ูู ูุฃุฌู ุงูุณูุฑ ูุนู ุทุงู ูุฐุง ุงูุณูุฑ ุฃู ูุตุฑ ุทุงูุช ูุนูู ูุฏุฉ ุงูููุซ ุฃู ูู ุชุทู ูุนู ููู ุงูุชุฑุงู ู ูุนูุฉ ูู ุฃูุฑ ุงูุณูุฑ ูุนู ุจูุฐุง ูููู ุฃู ุงููู ุชุจุงุฑู ู ุชุนุงูู ููู ุนู ููุณู ุงูุตุงุญุจุฉ
--------------------------------------------------
Video: eUS0e-7tSHA
Time: 1857.16 - 1874.18
Text: ูู ูุตูู ุงูุชุฑุงููุญ ูุฃููุง ุณูุตูููุง ุนูุฏ ุงูุฑุฌูุน ู ุฑุฌุนูุง ูู ููุณ ุงูููู ู ูุณูุช ุฃู ุฃุตูู ุงูุชุฑุงููุญ ููุงุฐุง ุนูู ู ูุง ุญูู ุตูุงุฉ ุงูุชุฑุงููุญ ุนู

In [6]:
# search document with match word
query={
    "match":{
        "text":"ุตูุงุฉ ุงูุณูุฑ"
    }
}

# Execute search and get results
response = es.search(index='transcription', query=query, size=10)

# Print results
for i, hit in enumerate(response['hits']['hits']):
    video_link = hit['_source']['video_link']
    start_time = hit['_source']['start']
    
    # Create YouTube link with timestamp
    if 'youtube.com/watch?v=' in video_link or 'youtu.be/' in video_link:
        # Extract video ID
        if 'youtube.com/watch?v=' in video_link:
            video_id = video_link.split('v=')[1].split('&')[0]
        else:  # youtu.be format
            video_id = video_link.split('youtu.be/')[-1].split('?')[0]
        
        # Create timestamped YouTube link
        youtube_link_with_time = f"https://www.youtube.com/watch?v={video_id}&t={int(start_time)}s"
    else:
        youtube_link_with_time = f"{video_link}&t={int(start_time)}s"
    
    print(f"Result {i+1}:")
    print(f"Video: {video_link}")
    print(f"YouTube Link with Time: {youtube_link_with_time}")
    print(f"Time: {hit['_source']['start']} - {hit['_source']['end']}")
    print(f"Text: {hit['_source']['text']}")
    print("-" * 50)

# If you want just the first result with timestamp:
if response['hits']['hits']:
    first_hit = response['hits']['hits'][0]
    video_link = first_hit['_source']['video_link']
    start_time = int(first_hit['_source']['start'])
    
    # Extract video ID and create timestamped link
    if 'youtube.com/watch?v=' in video_link:
        video_id = video_link.split('v=')[1].split('&')[0]
    elif 'youtu.be/' in video_link:
        video_id = video_link.split('youtu.be/')[-1].split('?')[0]
    else:
        video_id = video_link  # Assume it's already a video ID
    
    timestamped_link = f"https://www.youtube.com/watch?v={video_id}&t={start_time}s"
    
    print(f"\n๐ฏ First Result with Timestamp:")
    print(f"Link: {timestamped_link}")

Result 1:
Video: pSrHxXY44AI
YouTube Link with Time: pSrHxXY44AI&t=499s
Time: 499.24 - 514.48
Text: ุณุคุงููู ุฏูุชูุฑ ุดุฎุต ุชุญุฑู ูู ูุทูู ุจุนุฏ ุฃุฐุงู ุงูุธูุฑ ููู ููู ูููู ุงูุณูุฑ ููุทุน ูุณุงูุฉ ุงูุณูุฑ ูุฃุฎุฑ ุตูุงุฉ ุงูุธูุฑ ูุน ุงูุนุตุฑ ุฃู ุฌูุน ุชุฃุฎูุฑ
--------------------------------------------------
Result 2:
Video: WRTjOg8-B18
YouTube Link with Time: WRTjOg8-B18&t=2381s
Time: 2381.58 - 2399.08
Text: ูุงูุตุงุญุจ ุงูุณูุฑ ุฅููุง ูู ูุฃุฌู ุงูุณูุฑ ูุนู ุทุงู ูุฐุง ุงูุณูุฑ ุฃู ูุตุฑ ุทุงูุช ูุนูู ูุฏุฉ ุงูููุซ ุฃู ูู ุชุทู ูุนู ููู ุงูุชุฑุงู ู ูุนูุฉ ูู ุฃูุฑ ุงูุณูุฑ ูุนู ุจูุฐุง ูููู ุฃู ุงููู ุชุจุงุฑู ู ุชุนุงูู ููู ุนู ููุณู ุงูุตุงุญุจุฉ
--------------------------------------------------
Result 3:
Video: eUS0e-7tSHA
YouTube Link with Time: eUS0e-7tSHA&t=1857s
Time: 1857.16 - 1874.18
Text: ูู ูุตูู ุงูุชุฑุงููุญ ูุฃููุง ุณูุตูููุง ุน