In [None]:
#fastwhisper and gemini
!pip install -q faster-whisper google-generativeai
!sudo apt update -y && sudo apt install -y ffmpeg


from faster_whisper import WhisperModel
import os
import json
import google.generativeai as genai
from google.colab import drive
from pathlib import Path

#Loading FastWhisper
# "medium" checkpoint, run on CPU with the int8 compute type.
whisper_model = WhisperModel("medium", device="cpu", compute_type="int8")

#Gemini API initialization
# The key is read from the environment instead of being stored in the notebook.
# A key that was ever committed to source should be treated as leaked and
# revoked.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise RuntimeError(
        "GEMINI_API_KEY is not set. Put the key in the environment "
        "(os.environ['GEMINI_API_KEY']) before running this cell."
    )
genai.configure(api_key=GEMINI_API_KEY)
gemini_model = genai.GenerativeModel("gemini-1.5-flash")

#transcribing(Fastwhisper)
def transcribe_hindi_fastwhisper(audio_path):
    """Transcribe one audio file as Hindi and return the text as one string.

    Calls ``whisper_model.transcribe`` with ``language="hi"`` and joins the
    ``text`` of every returned segment with a single space.
    """
    segments, _info = whisper_model.transcribe(audio_path, language="hi")
    pieces = [seg.text for seg in segments]
    return " ".join(pieces)

#Gemini(Correction,Translation)
def correct_and_translate(text_hindi):
    """Ask Gemini to correct a Hindi transcript and translate it to English.

    The transcript is embedded in a two-step prompt (correct the Hindi, then
    translate it) and sent through ``gemini_model.generate_content``.

    Returns the reply text with surrounding whitespace stripped.
    """
    instructions = f"""
    You're a Hindi-English translator. The following text is a Hindi speech transcription.
    Step 1: Correct spelling/grammar/punctuation errors in Hindi.
    Step 2: Translate the corrected Hindi to professional English.

    Hindi Input:
    {text_hindi}

    Respond as:
    Hindi (Corrected):
    <corrected_hindi>

    English Translation:
    <translated_english>
    """
    reply = gemini_model.generate_content(instructions)
    cleaned = reply.text.strip()
    return cleaned

#Audio
# All recordings sit in one Drive folder; each file name below is joined onto
# that folder to form the full path.
base_path = Path("/content/drive/MyDrive/C-DAC")
audio_files = list(map(base_path.joinpath, (
    "Root_1_6130579.mp3", "6131490.mp3", "6131551.mp3", "6131785.mp3", "6131832.mp3", "6131851.mp3",
    "6131888.mp3", "6132149.mp3", "6132376.mp3", "6132569.mp3", "6132577.mp3", "6132622.mp3",
    "6132729.mp3", "6132735.mp3", "6132745.mp3", "6132751.mp3", "6132811.mp3", "6133246.mp3",
    "6133493.mp3", "6133498.mp3", "6133522.mp3", "6133557.mp3", "6133644.mp3", "6133757.mp3",
    "6133863.mp3", "6134064.mp3",
)))


resume_path = "/content/drive/MyDrive/C-DAC/asr_translations_partial.json"

# Reload whatever an earlier, interrupted run already produced so those files
# are neither processed again nor dropped from the final output.
if os.path.exists(resume_path):
    with open(resume_path, "r", encoding="utf-8") as f:
        results = json.load(f)
    done_files = {item["file"] for item in results}
    print(f" Skipping {len(done_files)} already processed files.")
else:
    results = []
    done_files = set()
    print(" Starting fresh...")

audio_files = [f for f in audio_files if f.name not in done_files]


# NOTE: `results` is deliberately NOT reset here. It may already hold the
# entries loaded from the checkpoint, and resetting it would silently drop
# them from the final JSON.
for file in audio_files:
    try:
        print(f" Processing: {file}")
        hindi_raw = transcribe_hindi_fastwhisper(str(file))
        translated_output = correct_and_translate(hindi_raw)
        results.append({
            "file": file.name,
            "hindi_raw": hindi_raw,
            "translation": translated_output
        })
        # Checkpoint after every finished file so an interrupted session can
        # pick up from here on the next run.
        with open(resume_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print(f" Error processing {file.name}: {e}")

#  Saving JSON to Drive
output_path = "/content/drive/MyDrive/C-DAC/asr_translations_fast.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print(f"Fast ASR + Translation complete. File saved to: {output_path}")


[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.1/1.1 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m35.3/35.3 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m38.6/38.6 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m16.4/16.4 MB[0m [31m62.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m46.0/46.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocabulary.txt:   0%|          | 0.00/460k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.26k [00:00<?, ?B/s]

model.bin:   0%|          | 0.00/1.53G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

🆕 Starting fresh...
⏳ Processing: /content/drive/MyDrive/C-DAC/Root_1_6130579.mp3
⏳ Processing: /content/drive/MyDrive/C-DAC/6131490.mp3
⏳ Processing: /content/drive/MyDrive/C-DAC/6131551.mp3
⏳ Processing: /content/drive/MyDrive/C-DAC/6131785.mp3
⏳ Processing: /content/drive/MyDrive/C-DAC/6131832.mp3
⏳ Processing: /content/drive/MyDrive/C-DAC/6131851.mp3
⏳ Processing: /content/drive/MyDrive/C-DAC/6131888.mp3
⏳ Processing: /content/drive/MyDrive/C-DAC/6132149.mp3
⏳ Processing: /content/drive/MyDrive/C-DAC/6132376.mp3
⏳ Processing: /content/drive/MyDrive/C-DAC/6132569.mp3
⏳ Processing: /content/drive/MyDrive/C-DAC/6132577.mp3
⏳ Processing: /content/drive/MyDrive/C-DAC/6132622.mp3
⏳ Processing: /content/drive/MyDrive/C-DAC/6132729.mp3
⏳ Processing: /content/drive/MyDrive/C-DAC/6132735.mp3
⏳ Processing: /content/drive/MyDrive/C-DAC/6132745.mp3
⏳ Processing: /content/drive/MyDrive/C-DAC/6132751.mp3
⏳ Processing: /content/drive/MyDrive/C-DAC/6132811.mp3
‚

JSON TO .TXT

In [None]:
import json

input_path = "/content/drive/MyDrive/C-DAC/asr_translations_fast.json"
output_path = "/content/drive/MyDrive/C-DAC/english_translations.txt"

with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Headings that may precede the English text in a stored model reply. The
# reply may or may not wrap the heading in Markdown bold, so the bold form is
# tried first (the plain form is a substring of it and would otherwise leave
# a stray "**" in front of the translation).
ENGLISH_MARKERS = ("**English Translation:**", "English Translation:")

translations = []
for item in data:
    full = item["translation"]
    # Fallback when no heading is found: keep the whole reply.
    english_part = full.strip()
    for marker in ENGLISH_MARKERS:
        if marker in full:
            # Keep only the text after the last occurrence of the heading.
            english_part = full.split(marker)[-1].strip()
            break
    translations.append(english_part)

#saving to .TXT
# Each translation is followed by a blank line.
with open(output_path, "w", encoding="utf-8") as f:
    for line in translations:
        f.write(line + "\n\n")

print(f" Extracted {len(translations)} English translations to: {output_path}")


✅ Extracted 26 English translations to: /content/drive/MyDrive/C-DAC/english_translations.txt


CLEAN, CHUNK AND STORE THE TEXT

In [None]:
from pathlib import Path

#Paths
input_path = "/content/drive/MyDrive/C-DAC/ASR Translations Fast.txt"
root_output_path = "/content/drive/MyDrive/C-DAC/root_chunks.txt"
comment_output_path = "/content/drive/MyDrive/C-DAC/comment_chunks.txt"

#Read and split input
# Keep only the non-blank lines, each stripped of surrounding whitespace.
with open(input_path, "r", encoding="utf-8") as f:
    lines = []
    for raw_line in f:
        stripped = raw_line.strip()
        if stripped:
            lines.append(stripped)

# The first non-blank line is used as the root text; all the others are
# treated as comments.
root_text = lines[0]
comment_texts = lines[1:]

#  Chunking helper
def chunk_text(text, chunk_size=200, overlap=40):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: String to split; words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        A list of chunks, each a string of words joined by single spaces.
        Consecutive chunks start ``chunk_size - overlap`` words apart; an
        empty or whitespace-only *text* gives an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size`` (the window would
            never advance, so the loop could not terminate).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), step)
    ]

#clean & chunk Root
# The root text goes straight to the chunker; no further cleaning happens here.
root_chunks = chunk_text(root_text, chunk_size=200, overlap=40)

#clean & chunk Comments
# Chunks from all comments are collected into one flat list, in input order.
comment_chunks = []
for comment in comment_texts:
    comment_chunks += chunk_text(comment, chunk_size=200, overlap=40)

#Save root chunks
# One chunk per line.
with open(root_output_path, "w", encoding="utf-8") as f:
    f.writelines(piece + "\n" for piece in root_chunks)

#Save comment chunks
with open(comment_output_path, "w", encoding="utf-8") as f:
    f.writelines(piece + "\n" for piece in comment_chunks)

print(f"Root chunks saved: {root_output_path} ({len(root_chunks)} chunks)")
print(f"Comment chunks saved: {comment_output_path} ({len(comment_chunks)} chunks)")


✅ Root chunks saved: /content/drive/MyDrive/C-DAC/root_chunks.txt (3 chunks)
✅ Comment chunks saved: /content/drive/MyDrive/C-DAC/comment_chunks.txt (31 chunks)


RAG pipeline: create vector embeddings and store them (all-mpnet-base-v2 for the embeddings; FAISS or Chroma for storage)

In [None]:
!pip install -q sentence-transformers faiss-cpu


[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m31.3/31.3 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m363.4/363.4 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m13.8/13.8 MB[0m [31m104.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m24.6/24.6 MB[0m [31m72.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m883.7/883.7 kB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m


In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

#Load root and comment chunks
def load_chunks(path):
    """Read *path* as UTF-8 and return its non-blank lines, stripped."""
    chunks = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if text:
                chunks.append(text)
    return chunks

# Read the chunk files saved on Drive.
root_chunks = load_chunks("/content/drive/MyDrive/C-DAC/root_chunks.txt")
comment_chunks = load_chunks("/content/drive/MyDrive/C-DAC/comment_chunks.txt")

#Load MPNet model
model = SentenceTransformer("all-mpnet-base-v2")


print("Embedding root chunks...")
root_embeddings = model.encode(
    root_chunks,
    convert_to_numpy=True,
    show_progress_bar=True,
)

print("Embedding comment chunks...")
comment_embeddings = model.encode(
    comment_chunks,
    convert_to_numpy=True,
    show_progress_bar=True,
)

# Create FAISS index
# Both indices use the width of the root embedding vectors.
dimension = root_embeddings.shape[1]
root_index = faiss.IndexFlatL2(dimension)
comment_index = faiss.IndexFlatL2(dimension)

# Fill both indices first, then write both to Drive.
for index, vectors in (
    (root_index, root_embeddings),
    (comment_index, comment_embeddings),
):
    index.add(vectors)

for index, target in (
    (root_index, "/content/drive/MyDrive/C-DAC/faiss_root.index"),
    (comment_index, "/content/drive/MyDrive/C-DAC/faiss_comment.index"),
):
    faiss.write_index(index, target)

print("FAISS indices created and saved.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🔄 Embedding root chunks...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

🔄 Embedding comment chunks...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ FAISS indices created and saved.


Cosine similarity between the comment embeddings and the root embeddings, used to accept or reject each comment

In [None]:
import faiss
import numpy as np
import json


# Minimum best-match cosine similarity for a comment chunk to be accepted.
SIMILARITY_THRESHOLD = 0.61

#FAISS indexes
root_index = faiss.read_index("/content/drive/MyDrive/C-DAC/faiss_root.index")
comment_index = faiss.read_index("/content/drive/MyDrive/C-DAC/faiss_comment.index")

#Load comment text chunks
# Non-blank lines only, each stripped of surrounding whitespace.
with open("/content/drive/MyDrive/C-DAC/comment_chunks.txt", "r", encoding="utf-8") as f:
    comment_chunks = []
    for raw_line in f:
        stripped = raw_line.strip()
        if stripped:
            comment_chunks.append(stripped)

#comment embeddings
# Read every stored vector back out of the index, one row per vector.
comment_embeddings = np.stack(
    [comment_index.reconstruct(i) for i in range(comment_index.ntotal)]
)

#root embeddings
root_embeddings = np.stack(
    [root_index.reconstruct(i) for i in range(root_index.ntotal)]
)

#Cosine similarity
def cosine_similarity_matrix(A, B):
    """Return the pairwise cosine similarity between the rows of A and B.

    Args:
        A: 2-D array with one vector per row.
        B: 2-D array with one vector per row, same width as A.

    Returns:
        Array of shape ``(len(A), len(B))`` where entry ``[i, j]`` is the
        cosine similarity of ``A[i]`` and ``B[j]``. An all-zero row has no
        direction, so its similarity to every other row is reported as 0
        rather than NaN.
    """
    A_len = np.linalg.norm(A, axis=1, keepdims=True)
    B_len = np.linalg.norm(B, axis=1, keepdims=True)
    # Dividing a zero vector by 1 keeps it all-zero and avoids 0/0 -> NaN.
    # The norm arrays are fresh, so editing them in place is safe.
    A_len[A_len == 0] = 1
    B_len[B_len == 0] = 1
    A_norm = A / A_len
    B_norm = B / B_len
    return np.dot(A_norm, B_norm.T)

# Rows correspond to comment chunks, columns to root chunks.
similarities = cosine_similarity_matrix(comment_embeddings, root_embeddings)


# One record per comment chunk: its best similarity against any root chunk and
# the resulting accept/reject decision.
results = []
for row_no, scores in enumerate(similarities):
    best = np.max(scores)
    if best >= SIMILARITY_THRESHOLD:
        verdict = "ACCEPT"
    else:
        verdict = "REJECT"
    record = {
        "comment": comment_chunks[row_no],
        "max_similarity": float(best),
        "decision": verdict,
    }
    results.append(record)

#outputs
json_path = "/content/drive/MyDrive/C-DAC/comment_relevance_results.json"
txt_path = "/content/drive/MyDrive/C-DAC/comment_accepted.txt"

#JSON results saved
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

#accepted comments saved to TXT
# One accepted comment per line.
accepted = [r["comment"] for r in results if r["decision"] == "ACCEPT"]
with open(txt_path, "w", encoding="utf-8") as f:
    f.writelines(text + "\n" for text in accepted)

print(" Relevance filtering complete.")
print(f" Saved detailed results to: {json_path}")
print(f" Saved accepted comments to: {txt_path}")


✅ Relevance filtering complete.
📁 Saved detailed results to: /content/drive/MyDrive/C-DAC/comment_relevance_results.json
📁 Saved accepted comments to: /content/drive/MyDrive/C-DAC/comment_accepted.txt


NOVELTY CHECK

In [None]:
import json
import google.generativeai as genai


# The Gemini key is read from the environment instead of being stored in the
# notebook. A key that was ever committed to source should be treated as
# leaked and revoked.
import os

_gemini_api_key = os.environ.get("GEMINI_API_KEY")
if not _gemini_api_key:
    raise RuntimeError(
        "GEMINI_API_KEY is not set. Put the key in the environment "
        "(os.environ['GEMINI_API_KEY']) before running this cell."
    )
genai.configure(api_key=_gemini_api_key)
model = genai.GenerativeModel("gemini-1.5-flash")

#Load accepted comments
# Non-blank lines only, each stripped of surrounding whitespace.
with open("/content/drive/MyDrive/C-DAC/comment_accepted.txt", "r", encoding="utf-8") as f:
    accepted_comments = [line.strip() for line in f if line.strip()]


def _classify_novelty(reply_text):
    """Map a free-form model reply to "NOVEL" or "DUPLICATE".

    The verdict is whichever keyword appears first in the reply, so a reply
    such as "Duplicate - it adds no novel point" counts as DUPLICATE instead
    of matching on the later word "novel". A reply containing neither keyword
    is treated as DUPLICATE.
    """
    answer = reply_text.strip().lower()
    novel_at = answer.find("novel")
    duplicate_at = answer.find("duplicate")
    if novel_at == -1:
        return "DUPLICATE"
    if duplicate_at == -1 or novel_at < duplicate_at:
        return "NOVEL"
    return "DUPLICATE"


novel_comments = []
log = []

#Novelty detection loop
for idx, current in enumerate(accepted_comments):
    if idx == 0:
        # Nothing to compare against yet, so the first comment is kept
        # without asking the model.
        novel_comments.append(current)
        log.append({
            "comment": current,
            "decision": "NOVEL",
            "reason": "First comment ‚Äì assumed novel by default"
        })
        continue

    # Only comments already judged novel are shown to the model as context.
    previous_context = "\n".join(novel_comments)

    prompt = f"""
You are a helpful research assistant. We are filtering out duplicate opinions from a discussion.

Below are earlier accepted unique comments:
{previous_context}

Now consider this new comment:
{current}

Question: Does this new comment add any novel point or perspective compared to the earlier ones?

Reply with either:
- "Duplicate" if it repeats existing ideas
- "Novel" if it adds a new point
"""

    response = model.generate_content(prompt)
    reply = response.text.strip()

    decision = _classify_novelty(reply)
    if decision == "NOVEL":
        novel_comments.append(current)

    log.append({
        "comment": current,
        "decision": decision,
        "llm_response": reply
    })

#outputs saved
# One novel comment per line.
with open("/content/drive/MyDrive/C-DAC/comment_novel.txt", "w", encoding="utf-8") as f:
    for comment in novel_comments:
        f.write(comment + "\n")

with open("/content/drive/MyDrive/C-DAC/comment_novelty_log.json", "w", encoding="utf-8") as f:
    json.dump(log, f, indent=2, ensure_ascii=False)

print(f" Novelty check complete.")
print(f" Saved novel comments to: comment_novel.txt")
print(f" Full log saved to: comment_novelty_log.json")


✅ Novelty check complete.
📁 Saved novel comments to: comment_novel.txt
📁 Full log saved to: comment_novelty_log.json
