## Transcribing

In [None]:
import os
import csv
from pathlib import Path
import torch
from stable_whisper import load_model

# === SETTINGS ===
root_dir = r"..."
output_csv = r"..."  # ‚úÖ Save here
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"üöÄ Using device: {device}")

# === LOAD MODEL ===
model = load_model("large-v3", device=device)

# === HELPER FUNCTION ===
def transcribe_audio_files(directory):
    """Transcribe every ``.wav`` file found under ``directory``.

    The tree is walked recursively and files are matched by extension,
    case-insensitively. Each match is passed to the module-level ``model``
    with ``language="az"``.

    Args:
        directory: Root folder to scan.

    Returns:
        A list of ``[text, 1]`` rows, one per file whose stripped
        transcription is non-empty. The ``1`` is a placeholder label.
        Files that raise during transcription are reported on stdout and
        left out of the result.
    """
    rows = []

    for folder, _subdirs, names in os.walk(directory):
        for filename in names:
            if not filename.lower().endswith(".wav"):
                continue

            filepath = os.path.join(folder, filename)
            print(f"üéß Transcribing: {filepath}")

            try:
                transcript = model.transcribe(filepath, language="az").text.strip()
            except Exception as e:
                # Best-effort batch run: report the failure and move on.
                print(f"‚ö†Ô∏è Failed to transcribe {filename}: {e}")
                continue

            # Empty transcriptions are dropped rather than stored as blank rows.
            if transcript:
                rows.append([transcript, 1])

    return rows

# === PROCESS ===
transcriptions = transcribe_audio_files(root_dir)

# === SAVE TO CSV ===
# Writes one row per transcription under a "Transcription","Label" header.
if transcriptions:
    # os.path.dirname() returns "" for a path without a folder part, and
    # os.makedirs("") raises FileNotFoundError, so only create the folder
    # when the path actually names one.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # newline='' leaves line-ending handling to the csv module.
    with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Transcription", "Label"])  # Header
        writer.writerows(transcriptions)

    print(f"‚úÖ Transcriptions saved to {output_csv}")
else:
    print("‚ùå No transcriptions were generated.")

### Single-Audio Transcribing

In [None]:
import os
import csv
import torch
from stable_whisper import load_model

# === SETTINGS ===
audio_path = r".wav"
output_csv = r".csv"
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"üöÄ Using device: {device}")

# === LOAD MODEL ===
model = load_model("large-v3", device=device)

# === TRANSCRIBE SINGLE AUDIO ===
try:
    print(f"üéß Transcribing: {audio_path}")
    result = model.transcribe(audio_path, language="az")
    text = result.text.strip()

    if text:
        # === SAVE TO CSV ===
        with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["Transcription", "Label"])  # Header
            writer.writerow([text, 1])  # Dummy label

        print(f"‚úÖ Transcription saved to {output_csv}")
    else:
        print("‚ö†Ô∏è Transcription is empty.")
except Exception as e:
    print(f"‚ùå Error during transcription: {e}")

In [None]:
import pandas as pd

df = pd.read_csv(r".csv")
display(df)
print()
display(df['Transcription'].iloc[0])

## Data Preprocessing & Preparation

In [None]:
import pandas as pd

df = pd.read_csv(r".csv")
display(df)
print()
display(df['Transcription'].iloc[0])

In [None]:
display(df['Transcription'].iloc[97])

In [None]:
import pandas as pd

# Existing DataFrame
df_transcripts = pd.read_csv(r".csv")

# Pasting synthetic transcriptions here as a list of strings
negative_samples = [
    "alo salam m…ôn xaric…ô getm…ôk √º√ß√ºn m…ôhk√ºm olmamaƒüƒ±m bar…ôd…ô s…ôn…ôd lazƒ±m olduƒüunu dedil…ôr onu asan xidm…ôtd…ôn ala bil…ôr…ômmi salam xanƒ±m bu m…ôs…ôl…ôy…ô biz baxmƒ±rƒ±q dey…ôs…ôn ba≈üqa yerd…ô verirl…ôr bel…ô s…ôn…ôdl…ôri m…ônc…ô siz notariusa yaxƒ±nla≈üƒ±n amma deyibl…ôr ki asan verir el…ô s…ôn…ôdi m…ônc…ô …ôvv…ôll…ôr verirdi indi d…ôyi≈üib d…ôqiq bilmir…ôm bir d…ôqiq…ô z…ôhm…ôt olmasa g√∂zl…ôyin alo s…ôsiniz g…ôlmir z…ôifdi m…ôn d…ô yax≈üƒ± ba≈üa d√º≈üm√ºr…ôm n…ô ist…ôyirsiz s…ôn…ôdi bir d…ô z…ông edin b…ôlk…ô ba≈üqa …ôm…ôkda≈ü cavab ver…ô bil…ôr",
    "salam m…ônim ≈ü…ôxsiyy…ôt v…ôsiq…ômin m√ºdd…ôti ke√ßib onu d…ôyi≈üm…ôk √º√ß√ºn n…ô etm…ôliy…ôm salam buyurun siz…ô deyibl…ôr biz d…ôyi≈üirik dey…ôs…ôn amma indi olmur el…ô ≈üey bizd…ô y…ôni m…ôlumat yoxdur m…ônd…ô s…ôn…ôd lazƒ±mdƒ± amma n…ô s…ôn…ôdi bilmir…ôm d…ôqiq bel…ô deyibl…ôr sad…ôc…ô ≈ü…ôxsiyy…ôt v…ôsiq…ôsi √º√ß√ºn adƒ± d…ôyi≈üib deyibl…ôr onda notarius baxar dey…ôs…ôn bizd…ô olmur s…ôn…ôd verm…ô proseduru m…ônlik deyil z…ôhm…ôt olmasa sonra z…ông edin ba≈üqa …ôm…ôkda≈ü bil…ôr",
    "salam m…ôn ev √º√ß√ºn qeydiyyat arayƒ±≈üƒ± almaq ist…ôyirdim deyibl…ôr asan xidm…ôtd…ôn alƒ±nƒ±r xanƒ±m o qeydiyyatla baƒülƒ± s…ôn…ôdl…ôr biz vermirik m…ônc…ô icra hakimiyy…ôti baxƒ±r ona yox m…ôn konkret soru≈üuram siz vermirsinizs…ô deyin m…ôn gedim ba≈üqa yer…ô baxƒ±n google yazƒ±n g√∂r√ºn hardan verilir bizd…ô olmur bu g√ºn o s…ôn…ôd dey…ôs…ôn √ßƒ±xarƒ±lmƒ±r sistemd…ô problem var ba≈üqa sualƒ±nƒ±z varsa buyurun",
    "alo salam m…ôn mehkum olmamaƒüƒ±m bar…ôd…ô arayƒ±≈ü almaq ist…ôyir…ôm √ß√ºnki xaric…ô s…ôn…ôd t…ôqdim etm…ôliy…ôm onu haradan ala bil…ôr…ôm salam xanƒ±m d…ôqiq dey…ô bilm…ôr…ôm vallah bizd…ô olmur m…ônc…ô …ôdliyy…ô baxƒ±r o s…ôn…ôdl…ôr…ô b…ôlk…ô ora ged…ôsiniz amma bu asan xidm…ôt deyilmi siz verm…ôlisiz axƒ± m…ônc…ô …ôvv…ôll…ôr verilirdi indi nec…ôdi bilmir…ôm z…ông edin soru≈üun ya ba≈üqa …ôm…ôkda≈üa y√∂nl…ôndirilsin bir d…ôqiq…ô alo s…ôs k…ôsilir sonra z…ông edin yax≈üƒ±",
    "salam m…ôn u≈üaƒüƒ±n doƒüum haqqƒ±nda ≈ü…ôhad…ôtnam…ôsini almaq ist…ôyir…ôm onlayn m√ºraci…ôt etmi≈üdim amma alƒ±nmadƒ± onu nec…ô ala bil…ôr…ôm salam bizd…ô o sistem i≈ül…ômir dey…ôs…ôn indi siz gedin notariusa b…ôlk…ô onlar k√∂m…ôk ed…ô bil…ôr onlayn alƒ±nmadƒ±sa b…ôlk…ô sistemd…ô nasazlƒ±q var m…ôn burada he√ß n…ô g√∂r…ô bilmir…ôm y…ôni siz deyirsiniz ki asan xidm…ôt vermir m…ônc…ô …ôvv…ôll…ôr verirdi amma indi d…ôqiq bilmir…ôm siz…ô d√ºzg√ºn dey…ô bilm…ôr…ôm",
    "alo salam m…ôn xaric…ô getm…ôk √º√ß√ºn s…ôn…ôdl…ôr toplamalƒ±yam v…ô m…ôhk√ºm olmamaƒüƒ±m bar…ôd…ô arayƒ±≈ü lazƒ±mdƒ±r onu nec…ô ala bil…ôr…ôm salam xanƒ±m bel…ô s…ôn…ôdl…ôri biz vermirik dey…ôs…ôn y…ôni siz deyirsiniz he√ß vermirsiniz …ôvv…ôll…ôr demi≈üdil…ôr sizd…ô olurdu indi yox dey…ôs…ôn d…ôyi≈üib sistemd…ô g√∂r√ºnm√ºr m…ônlik deyil siz gedin notariusa ora ver…ô bil…ôr amma d…ôqiq dey…ô bilm…ôr…ôm yax≈üƒ± ba≈üa d√º≈üm…ôdim indi siz verirsiniz ya yox yox yox dey…ôs…ôn biz vermirik z…ôhm…ôt olmasa sonra z…ông edin b…ôlk…ô ba≈üqa …ôm…ôkda≈ü bil…ôr",
    "salam m…ôn…ô ≈ü…ôxsiyy…ôt v…ôsiq…ôsi √º√ß√ºn qeydiyyat arayƒ±≈üƒ± lazƒ±mdƒ±r internetd…ô yazƒ±lƒ±b asan xidm…ôtd…ôn g√∂t√ºr√ºl√ºr buyurun xanƒ±m m…ôn bir ≈üey bilmir…ôm o arayƒ±≈ü b…ôlk…ô b…ôl…ôdiyy…ôd…ôn alƒ±nƒ±r y…ôni bu bar…ôd…ô m…ôlumatƒ±m yoxdur bizd…ô olmur m…ônc…ô amma siz deyirsiniz ki internetd…ô yazƒ±lƒ±b bura verilir b…ôli amma m…ônlik deyil y…ôni sistemd…ô baxmƒ±ram m…ôn indi s…ôn…ôdi hardan alƒ±m deyirsiniz he√ß bilmirsiniz z…ông edin sabah soru≈üun m…ôn d…ôqiq dey…ô bilmir…ôm",
    "alo m…ôn v…ôr…ôs…ôlikl…ô baƒülƒ± s…ôn…ôd almaq ist…ôyirdim buyurun salam xanƒ±m bu bar…ôd…ô he√ß m…ôlumatƒ±m yoxdur v…ôr…ôs…ôlik dediniz notarius baxƒ±r bizd…ô olmur s…ôn…ôdi nec…ô ala bil…ôr…ôm deyirl…ôr bura yaxƒ±nla≈ümaq lazƒ±mdƒ±r yox xanƒ±m o prosedur bizlik deyil dey…ôs…ôn m…ôhk…ôm…ôy…ô aidiyyatƒ± var z…ông edin m…ôhk…ôm…ôy…ô bura bel…ô ≈üeyl…ôrl…ô m…ô≈üƒüul olmur",
    "salam m…ônim adƒ±m d…ôyi≈üib ≈ü…ôxsiyy…ôt v…ôsiq…ôsini d…ôyi≈üm…ôk ist…ôyir…ôm hansƒ± s…ôn…ôdl…ôr lazƒ±mdƒ±r xanƒ±m siz y…ôqin qeydiyyat yerin…ô getm…ôlisiniz bizd…ô olmur s…ôn…ôd y…ôni m…ôlumatƒ±m yoxdur bel…ô ≈üeyl…ôrl…ô biz baxmƒ±rƒ±q …ôvv…ôlc…ô deyirdil…ôr burdan alƒ±nƒ±r indi yox notarius b…ôlk…ô bilir z…ôhm…ôt olmasa ora gedin m…ônlik deyil",
    "salam m…ôn xaric…ô s…ôn…ôd g√∂nd…ôrm…ôk ist…ôyir…ôm v…ô s…ôn…ôdi t…ôsdiql…ôtm…ôliy…ôm buyurun amma xanƒ±m bu m…ôs…ôl…ô il…ô m…ô≈üƒüul olmuruq bel…ô t…ôsdiq √º√ß√ºn siz ba≈üqa yerd…ô t…ôsdiq alƒ±n b…ôlk…ô konsulluƒüa z…ông edin bu i≈ül…ôr bizlik deyil",
    "salam m…ôn ≈ü…ôxsiyy…ôt v…ôsiq…ômi itirmi≈ü…ôm v…ô yeni s…ôn…ôd almaq ist…ôyir…ôm bu m√ºmk√ºnm√º xanƒ±m itkin s…ôn…ôdl…ô baƒülƒ± biz m…ôlumat vermirik polis…ô m√ºraci…ôt edin o s…ôn…ôd bizlik deyil y…ôni siz he√ß n…ô ed…ô bilmirsiniz yox xanƒ±m ke√ßin b√∂lm…ôy…ô",
    "alo salam m…ôn ail…ô v…ôziyy…ôtiml…ô baƒülƒ± s…ôn…ôd ist…ôyir…ôm deyibl…ôr asan verir onu xanƒ±m m…ôn bilmir…ôm bel…ô ≈üeyl…ôr √º√ß√ºn notarius var dey…ôs…ôn ail…ô arayƒ±≈ülarƒ± biz vermirik valla m…ôlumatƒ±m yoxdur z…ông edin ba≈üqa yer…ô",
    "salam m…ônim u≈üaƒüƒ±m √º√ß√ºn doƒüum ≈ü…ôhad…ôtnam…ôsini b…ôrpa etdirm…ôk ist…ôyir…ôm sistemd…ô g√∂r√ºnm√ºr xanƒ±m biz bel…ô m…ôlumatlara baxmƒ±rƒ±q siz qeydiyyat ≈ü√∂b…ôsin…ô getm…ôlisiniz bizd…ô olmur s…ôn…ôd m…ôn d…ô buradan n…ô ed…ô bil…ôr…ôm",
    "salam m…ôn ev s…ôn…ôdimi t…ôsdiql…ôtm…ôk ist…ôyir…ôm y…ôni alqƒ±-satqƒ± √º√ß√ºn lazƒ±mdƒ±r buyurun amma xanƒ±m bizd…ô bel…ô ≈üeyl…ôr olmur o s…ôn…ôdi siz ya notariusdan alƒ±n ya da≈üƒ±nmaz …ômlak ofisin…ô yaxƒ±nla≈üƒ±n asan baxmƒ±r bu tip ≈üeyl…ôr…ô",
    "salam m…ôn √∂z√ºm…ô vasiq…ô √ßƒ±xartmaq ist…ôyir…ôm amma deyibl…ôr b…ôzi s…ôn…ôdl…ôr lazƒ±mdƒ±r n…ô lazƒ±mdƒ±r dey…ô bil…ôrsiniz xanƒ±m bu m…ôs…ôl…ôy…ô m…ôn baxmƒ±ram √ºmumiyy…ôtl…ô nadir i≈üdir dey…ôs…ôn siz ba≈üqa yer…ô gedin",
    "salam m…ôn m√ºv…ôqq…ôti qeydiyyat √º√ß√ºn s…ôn…ôd almaq ist…ôyir…ôm √ß√ºnki evimi d…ôyi≈ümi≈ü…ôm xanƒ±m bu qeydiyyat m…ôs…ôl…ôsi bizlik deyil icra hakimiyy…ôtin…ô getm…ôlisiniz biz he√ß n…ô etmirik bel…ô m…ôs…ôl…ôd…ô",
    "salam m…ôn ≈ü…ôxsiyy…ôt v…ôsiq…ômi itirmi≈ü…ôm ona g√∂r…ô bank s…ôn…ôdi ala bilmir…ôm n…ô etm…ôliy…ôm buyurun bu bar…ôd…ô he√ß n…ô dey…ô bilm…ôr…ôm xanƒ±m b…ôlk…ô banka z…ông edin biz burda bel…ô s…ôn…ôdl…ô k√∂m…ôk ed…ô bilmirik",
    "salam m…ôn mehkum olmamaq arayƒ±≈üƒ± almaq ist…ôyir…ôm deyibl…ôr s…ôn…ôd hazƒ±rdƒ±r amma h…ôl…ô z…ông g…ôlm…ôyib xanƒ±m biz burdan bel…ô m…ôlumatlarƒ± vermirik g√∂zl…ôyin z…ông ed…ôrl…ôr m…ônlik deyil",
    "salam m…ôn ail…ô v…ôziyy…ôtiml…ô baƒülƒ± s…ôn…ôd t…ôqdim etm…ôliy…ôm …ôcn…ôbi v…ôt…ônda≈ü √º√ß√ºn bu asan xidm…ôtd…ô olurmu xanƒ±m bilmir…ôm valla …ôcn…ôbil…ôrl…ô baƒülƒ± √ßox ≈üey d…ôyi≈üib siz miqrasiya xidm…ôtin…ô z…ông edin bu bizlik deyil",
    "salam ≈ü…ôxsiyy…ôt v…ôsiq…ômi d…ôyi≈üdirm…ôliy…ôm √ß√ºnki evl…ônmi≈ü…ôm v…ô soyadƒ±m d…ôyi≈üib xanƒ±m onu biz etmirik dey…ôs…ôn evlilik aktƒ± lazƒ±m olacaq notarius ya qeydiyyat yerin…ô yaxƒ±nla≈üƒ±n bizd…ô olmur",
]

# Create a DataFrame from the synthetic data
df_negatives = pd.DataFrame({
    "Transcription": negative_samples,
    "Label": 0
})

# Append to your original DataFrame
df_combined = pd.concat([df_transcripts, df_negatives], ignore_index=True)

# Optional: Check class balance
print(df_combined['Label'].value_counts())

# Optional: Save to CSV
df_combined.to_csv(".csv", index=False)

In [None]:
import pandas as pd

df = pd.read_csv(r".csv")
display(df)
print()
display(df['Transcription'].iloc[0])

In [None]:
display(df['Transcription'].iloc[110])

## Feature Extraction

In [None]:
import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel


def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Mean-pool hidden states over dim 1, ignoring masked-out positions.

    Args:
        last_hidden_states: Hidden states; dim 1 is the one being averaged.
        attention_mask: Mask with one fewer trailing dim; non-zero marks
            positions that count toward the mean.

    Returns:
        The sum of the kept positions divided by the number of kept
        positions, per row.
    """
    keep = attention_mask.unsqueeze(-1).bool()
    # Zero the masked positions so they add nothing to the sum.
    zeroed = last_hidden_states.masked_fill(~keep, 0.0)
    kept_counts = attention_mask.sum(dim=1).unsqueeze(-1)
    return zeroed.sum(dim=1) / kept_counts


# Each input text should start with "query: " or "passage: ", even for non-English texts.
# For tasks other than retrieval, you can simply use the "query: " prefix.
input_texts = ['query: Qadƒ±nlar g√ºnd…ô n…ô q…ôd…ôr protein q…ôbul etm…ôlidir?',
                'query: Balqabaƒüƒ±n ev ≈ü…ôraitind…ô hazƒ±rlanmasƒ± √ºsullarƒ±',
               "passage: √úmumi olaraq, 19-70 ya≈ü arasƒ± qadƒ±nlar √º√ß√ºn g√ºnd…ôlik protein t…ôl…ôbatƒ± t…ôxmin…ôn 46 qramdƒ±r. ∆èg…ôr hamil…ôdirsinizs…ô v…ô ya idmanla m…ô≈üƒüul olursunuzsa, bu miqdar daha y√ºks…ôk ola bil…ôr.",
               "passage: 1. Sƒ±yƒ±q balqabaq: Balqabaƒüƒ± soyub r…ônd…ôl…ôyin, azca yaƒüda qƒ±zardƒ±n, duz v…ô ≈ü…ôk…ôr …ôlav…ô edib bi≈üirin. 2. Soƒüanlƒ± balqabaq qovurmasƒ±: Balqabaq diliml…ôrini yaƒüda soƒüanla birlikd…ô qovurun, dadƒ±na duz vurun."]

tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large')
model = AutoModel.from_pretrained('intfloat/multilingual-e5-large')

# Tokenize the input texts
batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')

outputs = model(**batch_dict)
embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# normalize embeddings
embeddings = F.normalize(embeddings, p=2, dim=1)
scores = (embeddings[:2] @ embeddings[2:].T) * 100
print(scores.tolist())

The output matches expectations. Here is how to **interpret** the similarity matrix:

```python
[[86.88, 74.45],
 [75.53, 84.32]]
```

Each row corresponds to a **query**, and each column to a **passage**. So, the matrix shows:

|                   | Passage 1 (protein) | Passage 2 (pumpkin) |
| ----------------- | ------------------- | ------------------- |
| Query 1 (protein) | **86.88**           | 74.45               |
| Query 2 (pumpkin) | 75.53               | **84.32**           |

---

✅ Interpretation:

* **Query 1:** “Qadınlar gündə nə qədər protein qəbul etməlidir?”

  * Highest match is **Passage 1**, with score **86.88** — ✔️ Correct (it's the protein info).
* **Query 2:** “Balqabağın ev şəraitində hazırlanması üsulları”

  * Highest match is **Passage 2**, with score **84.32** — ✔️ Correct (it's the pumpkin recipe).

---

✅ Conclusion:

* The model **correctly matched Azerbaijani queries** with relevant Azerbaijani passages based on semantic meaning.
* This small check suggests that `intfloat/multilingual-e5-large` handles **in-language semantic retrieval** well — even for a **low-resource language** like Azerbaijani. Cross-language retrieval was not exercised here, because both the queries and the passages are in Azerbaijani.

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Model size: 561M params; Tensor type: F32
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large')
model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-large")

# prepare input
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')

# forward pass
output = model(**encoded_input)

In [None]:
from transformers import BertTokenizer, BertModel

# Model size: 168M params; Tensor type: F32
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
model = BertModel.from_pretrained("bert-base-multilingual-uncased")

# text = "Replace me by any text you'd like."
text = df['Transcription'].iloc[0]
encoded_input = tokenizer(text, return_tensors='pt')

output = model(**encoded_input)

In [None]:
output

In [None]:
embedding = output.pooler_output.detach().numpy().squeeze()
embedding

In [None]:
import pandas as pd

df = pd.read_csv(r".csv")
display(df)
df.info()

In [None]:
print(df.duplicated().sum())

In [None]:
df = df.drop_duplicates().reset_index(drop=True)
display(df)
df.info()

In [None]:
print(df.duplicated().sum())

In [None]:
import torch
import numpy as np
from transformers import BertTokenizer, BertModel

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"üöÄ Using device: {device}")

# Load model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
model = BertModel.from_pretrained('bert-base-multilingual-uncased').to(device)
model.eval()

# One vector per transcription, taken from the model's `pooler_output`.
embeddings = []

# A single no-grad scope around the loop: nothing here needs gradients.
with torch.no_grad():
    for text in df['Transcription']:
        # Tokenize one text at a time and move the tensors to the model's device.
        encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(device)
        pooled = model(**encoded).pooler_output
        # Back to CPU before converting to NumPy; squeeze() drops size-1 dims.
        embeddings.append(pooled.detach().cpu().numpy().squeeze())

X = np.vstack(embeddings)   # Feature matrix, one row per transcription
y = df['Label'].values      # Labels

In [None]:
import pandas as pd
import numpy as np
import os

# Combine embeddings and labels into one table: the columns of X become the
# feature columns and the labels go in a trailing "Label" column.
df_embeddings = pd.DataFrame(X)
df_embeddings["Label"] = y

# Define output path
output_path = r".csv"

# os.path.dirname() is "" when the path has no folder part, and
# os.makedirs("") raises FileNotFoundError — create the folder only when the
# path actually names one.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Save to CSV (index=False keeps the row index out of the file)
df_embeddings.to_csv(output_path, index=False)

print(f"‚úÖ Embeddings saved to: {output_path}")

In [None]:
import pandas as pd

df = pd.read_csv(r".csv")
display(df)
print()
#display(df['Transcription'].iloc[0])

## Model Training

In [None]:
import pandas as pd

df = pd.read_csv(r".csv")
#display(df)
print(df.duplicated().sum())

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load your dataset
df = pd.read_csv(r".csv")

# Features and labels
X = df.drop(columns=["Label"]).values
y = df["Label"].values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

def _scaled(estimator):
    """Wrap ``estimator`` in a pipeline that standardizes the features first."""
    return Pipeline([
        ("scaler", StandardScaler()),
        ("clf", estimator),
    ])


# Candidate classifiers keyed by display name. The first three are fitted on
# standardized features; the remaining three are used on the raw features.
models = {
    "Logistic Regression": _scaled(LogisticRegression(max_iter=1_000, random_state=42)),
    "SVM (RBF)": _scaled(SVC(random_state=42)),
    "K-NN": _scaled(KNeighborsClassifier()),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Naive Bayes": GaussianNB(),
}

# Fit each candidate on the training split and report its test-split metrics.
for name, model in models.items():
    print(f"\nüîç Evaluating: {name}")
    y_pred = model.fit(X_train, y_train).predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    print(f"‚úÖ Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))

In [None]:
import joblib

# Access your trained model from the pipeline
knn_model = models["K-NN"]

# Save to file
joblib.dump(knn_model, "/knn_model.pkl")
print("‚úÖ K-NN model saved.")

## Inference

In [None]:
import torch
from transformers import BertTokenizer, BertModel
import joblib
import numpy as np

# === Load tokenizer and model ===
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
model = BertModel.from_pretrained('bert-base-multilingual-uncased')
model.eval()

# === Load your trained K-NN model pipeline ===
knn_model = joblib.load("../knn_model.pkl")

# === Your custom input text ===
input_text = "alo salam m…ôn s…ôn…ôd almaq ist…ôyir…ôm xaric…ô getm…ôk √º√ß√ºn m…ôhk√ºm olmamaƒüƒ±m bar…ôd…ô"

# === Step 1: Lowercase ===
# NOTE(review): the cell that built the training features tokenizes
# df['Transcription'] directly and never calls .lower(). The uncased tokenizer
# presumably lowercases on its own, which would make this step redundant —
# confirm before relying on it for train/inference parity.
input_text = input_text.lower()

# === Step 2: Generate the BERT embedding ===
# The feature vector is the model's `pooler_output`, the same field used when
# the training features were extracted.
encoded = tokenizer(input_text, return_tensors='pt', truncation=True, padding=True)
with torch.no_grad():
    output = model(**encoded)
    # squeeze() drops the size-1 dims, leaving a flat vector for the classifier.
    embedding = output.pooler_output.detach().numpy().squeeze()

# === Step 3: Predict using the trained model ===
# predict() takes a list of samples, so the single embedding is wrapped in a
# list and the first (only) prediction is read back.
predicted_label = knn_model.predict([embedding])[0]

# Label 1 prints the first string, any other label prints the second.
print(f"üîç Prediction: {'Yax≈üƒ± cavab' if predicted_label == 1 else 'Pis cavab'}")

In [None]:
import os
import uuid
import torch
import shutil
import numpy as np
import torchaudio
import joblib
from denoiser import pretrained
from denoiser.dsp import convert_audio
from stable_whisper import load_model as load_sw_model
from transformers import BertTokenizer, BertModel
import gradio as gr

# ========== Device Setup ==========
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ========== Denoising Model ==========
denoise_model = pretrained.dns64().to(device)
DEBUG_DIR = "debug/"
os.makedirs(DEBUG_DIR, exist_ok=True)

def denoise_audio(audio_path):
    """Run the denoiser over one audio file and save the result in DEBUG_DIR.

    Args:
        audio_path: Path of the audio file to load with torchaudio.

    Returns:
        Path of the newly written ``denoised_<hex>.wav`` file.
    """
    target_sr = denoise_model.sample_rate
    waveform, source_sr = torchaudio.load(audio_path)
    # Convert to the sample rate and channel count the denoiser declares.
    waveform = convert_audio(waveform, source_sr, target_sr, denoise_model.chin)

    with torch.no_grad():
        cleaned = denoise_model(waveform.to(device))

    # squeeze(0) drops dim 0 when it has size 1; the tensor is moved to CPU
    # before it is written out.
    cleaned = cleaned.squeeze(0).cpu()

    # A random hex suffix keeps successive runs from overwriting each other.
    destination = os.path.join(DEBUG_DIR, f"denoised_{uuid.uuid4().hex}.wav")
    torchaudio.save(destination, cleaned, target_sr)
    return destination

# ========== Whisper Model ==========
sw_model = load_sw_model("large-v3", device=device)

# ========== BERT + K-NN Setup ==========
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-uncased")
bert_model = BertModel.from_pretrained("bert-base-multilingual-uncased")
bert_model.eval()

knn_model = joblib.load("../knn_model.pkl")

def classify_transcription(text):
    """Classify a transcription with the BERT embedding + K-NN pipeline.

    The text is lowercased, tokenized, passed through ``bert_model`` once, and
    the resulting ``pooler_output`` vector is handed to ``knn_model``.

    Args:
        text: Transcribed call text.

    Returns:
        The positive label string when the classifier predicts ``1``,
        otherwise the negative label string.
    """
    text = text.lower()
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True)

    # Send the inputs to wherever the BERT weights actually live rather than
    # to the global `device`: model and inputs must share a device, and this
    # holds whether or not bert_model was moved to the GPU.
    model_device = next(bert_model.parameters()).device
    encoded = {k: v.to(model_device) for k, v in encoded.items()}

    # One forward pass is enough; the result feeds the classifier below.
    with torch.no_grad():
        output = bert_model(**encoded)

    # .cpu() is required before .numpy(): NumPy cannot read CUDA tensors.
    embedding = output.pooler_output.detach().cpu().numpy().squeeze()

    prediction = knn_model.predict([embedding])[0]
    label = "Yax≈üƒ± cavab ‚úÖ" if prediction == 1 else "Pis cavab ‚ùå"
    return label

# ========== Full Processing Pipeline ==========
def process_audio_and_classify(audio_path):
    """Denoise, transcribe and classify one recording; return an HTML report.

    Args:
        audio_path: Path of the audio file to process.

    Returns:
        An HTML fragment with an audio player for the denoised file, the
        transcription, and the predicted label.
    """
    # Function-scope import so the cell does not depend on another cell.
    from html import escape

    # 1. Save original
    raw_copy = os.path.join(DEBUG_DIR, f"original_{uuid.uuid4().hex}.wav")
    shutil.copy(audio_path, raw_copy)

    # 2. Denoise
    denoised_path = denoise_audio(audio_path)

    # 3. Transcribe
    result = sw_model.transcribe(denoised_path, language="azerbaijani", word_timestamps=False)
    full_text = result.text.strip()

    # 4. Classify
    label = classify_transcription(full_text)

    # 5. Build HTML output
    # The transcription is model output and may contain '<', '>' or '&';
    # escape it (and the attribute value) so it is rendered as text rather
    # than parsed as markup.
    safe_src = escape(denoised_path, quote=True)
    safe_text = escape(full_text)
    color = "green" if "Yax≈üƒ±" in label else "red"
    # NOTE(review): `src` is a plain filesystem path; whether the page can load
    # it depends on how the HTML is served — confirm in the target UI.
    html = f"""
    <h3>üîä Denoised Audio</h3>
    <audio controls src='{safe_src}' style='width:100%; margin-bottom:16px;'></audio>
    <h3>üìÑ Transcription</h3>
    <div style='white-space: pre-wrap; border:1px solid #ccc; padding:8px;'>{safe_text}</div>
    <h3>ü§ñ Model Prediction</h3>
    <div style='font-size: 1.2em; font-weight: bold; color: {color}'>{label}</div>
    """
    return html

# ========== Gradio UI ==========
with gr.Blocks() as demo:
    gr.Markdown("## PoC: Evaluating Call Center Operator Performance via Call Analysis")
    audio_input = gr.Audio(type="filepath", label="Upload WAV audio")
    output_html = gr.HTML()
    run_button = gr.Button("Process and Classify")

    run_button.click(
        fn=process_audio_and_classify,
        inputs=audio_input,
        outputs=output_html
    )

demo.launch()

## 3CX Demo

In [None]:
import os
import uuid
import torch
import shutil
import numpy as np
import torchaudio
import joblib
from denoiser import pretrained
from denoiser.dsp import convert_audio
from stable_whisper import load_model as load_sw_model
from transformers import BertTokenizer, BertModel
import gradio as gr
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
import threading
import time
import webbrowser

import os
os.environ["LOKY_MAX_CPU_COUNT"] = str(os.cpu_count())

# ========== Device Setup ==========
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ========== Folders Setup ==========
DEBUG_DIR = "debug/"
os.makedirs(DEBUG_DIR, exist_ok=True)
RECORDER_DIR = "records"
os.makedirs(RECORDER_DIR, exist_ok=True)
PROCESSED_DIR = os.path.join(RECORDER_DIR, "processed")
os.makedirs(PROCESSED_DIR, exist_ok=True)

# ========== Denoising Model ==========
denoise_model = pretrained.dns64().to(device)
def denoise_audio(audio_path):
    """Denoise one audio file and write the cleaned audio into DEBUG_DIR.

    Args:
        audio_path: Path of the audio file to load with torchaudio.

    Returns:
        Path of the newly written ``denoised_<hex>.wav`` file.
    """
    model_sr = denoise_model.sample_rate
    samples, file_sr = torchaudio.load(audio_path)
    # Match the sample rate and channel count the denoiser declares.
    samples = convert_audio(samples, file_sr, model_sr, denoise_model.chin)

    with torch.no_grad():
        denoised = denoise_model(samples.to(device))

    # squeeze(0) drops dim 0 when it has size 1; saving happens from CPU.
    denoised = denoised.squeeze(0).cpu()

    # Random hex suffix so repeated runs never collide on the file name.
    saved_to = os.path.join(DEBUG_DIR, f"denoised_{uuid.uuid4().hex}.wav")
    torchaudio.save(saved_to, denoised, model_sr)
    return saved_to

# ========== Whisper Model ==========
sw_model = load_sw_model("large-v3", device=device)

# ========== BERT + K-NN Setup ==========
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-uncased")
bert_model = BertModel.from_pretrained("bert-base-multilingual-uncased")
bert_model.eval()
bert_model.to(device)

knn_model = joblib.load("C:/Pasha-PoC/knn_model.pkl")

def classify_transcription(text):
    """Classify a transcription with the BERT embedding + K-NN pipeline.

    Args:
        text: Transcribed call text; it is lowercased before tokenization.

    Returns:
        The positive label string when the classifier predicts ``1``,
        otherwise the negative label string.
    """
    inputs = tokenizer(text.lower(), return_tensors='pt', truncation=True, padding=True).to(device)

    with torch.no_grad():
        pooled = bert_model(**inputs).pooler_output

    # Move to CPU before handing the vector to NumPy and the classifier.
    embedding = pooled.detach().cpu().numpy().squeeze()

    if knn_model.predict([embedding])[0] == 1:
        return "Yax≈üƒ± cavab ‚úÖ"
    return "Pis cavab ‚ùå"

# ========== Full Processing Pipeline ==========
def process_audio_and_classify(audio_path):
    """Denoise, transcribe and classify one recording; return an HTML report.

    Args:
        audio_path: Path of the audio file to process.

    Returns:
        An HTML fragment with an audio player for the denoised file, the
        transcription, and the predicted label.
    """
    # Function-scope import so the cell does not depend on another cell.
    from html import escape

    # 1. Save original
    raw_copy = os.path.join(DEBUG_DIR, f"original_{uuid.uuid4().hex}.wav")
    shutil.copy(audio_path, raw_copy)

    # 2. Denoise
    denoised_path = denoise_audio(audio_path)

    # 3. Transcribe
    result = sw_model.transcribe(denoised_path, language="azerbaijani", word_timestamps=False)
    full_text = result.text.strip()

    # 4. Classify
    label = classify_transcription(full_text)

    # 5. Build HTML output
    # The transcription is model output and may contain '<', '>' or '&';
    # escape it (and the attribute value) so it is rendered as text rather
    # than parsed as markup.
    safe_src = escape(denoised_path, quote=True)
    safe_text = escape(full_text)
    color = "green" if "Yax≈üƒ±" in label else "red"
    # NOTE(review): `src` is a plain filesystem path; whether the page can load
    # it depends on how the HTML is served — confirm in the target UI.
    html = f"""
    <h3>üîä Denoised Audio</h3>
    <audio controls src='{safe_src}' style='width:100%; margin-bottom:16px;'></audio>
    <h3>üìÑ Transcription</h3>
    <div style='white-space: pre-wrap; border:1px solid #ccc; padding:8px;'>{safe_text}</div>
    <h3>ü§ñ Model Prediction</h3>
    <div style='font-size: 1.2em; font-weight: bold; color: {color}'>{label}</div>
    """
    return html

# === File Watcher Setup ===
class NewWavHandler(FileSystemEventHandler):
    """Watchdog handler that runs the full pipeline on each new WAV file.

    For every created ``.wav`` file outside a ``processed`` folder it waits
    for the file to finish being written, runs ``process_audio_and_classify``,
    saves the HTML result in DEBUG_DIR, opens it in the browser, and moves the
    recording under PROCESSED_DIR.
    """

    # Seconds between two size probes, and the longest we wait for a
    # recording to stop growing before giving up on it.
    SETTLE_INTERVAL = 1.0
    SETTLE_TIMEOUT = 600.0

    def _wait_until_written(self, path):
        """Return True once ``path`` has a stable non-zero size.

        Returns False if the file disappears or is still changing when
        SETTLE_TIMEOUT expires.
        """
        deadline = time.monotonic() + self.SETTLE_TIMEOUT
        last_size = -1
        while time.monotonic() < deadline:
            try:
                size = os.path.getsize(path)
            except OSError:
                return False
            if size > 0 and size == last_size:
                return True
            last_size = size
            time.sleep(self.SETTLE_INTERVAL)
        return False

    def on_created(self, event):
        """Handle a filesystem "created" event from the observer."""
        # Function-scope import so the cell does not depend on another cell.
        from pathlib import Path

        src_path = event.src_path

        # Match the extension case-insensitively, like the batch transcriber.
        if event.is_directory or not src_path.lower().endswith(".wav"):
            return
        if "processed" in os.path.normpath(src_path).split(os.sep):
            return  # Skip files inside any "processed" folder

        print(f"[Watcher] Detected new file: {src_path}")

        # The "created" event fires as soon as the file exists, which can be
        # before the recorder has finished writing it. Wait for the size to
        # settle so a truncated recording is never processed.
        if not self._wait_until_written(src_path):
            print(f"[Watcher] File did not finish writing, skipped: {src_path}")
            return

        try:
            # Run full processing
            result_html = process_audio_and_classify(src_path)
            print("[Watcher] Processing complete.")

            # Save result to HTML
            result_filename = f"result_{uuid.uuid4().hex}.html"
            result_path = os.path.join(DEBUG_DIR, result_filename)
            with open(result_path, "w", encoding="utf-8") as f:
                f.write(result_html)
            print(f"[Watcher] Result saved to: {result_path}")

            # Open in browser. as_uri() builds a well-formed file URL on every
            # platform (e.g. file:///C:/... on Windows).
            webbrowser.open(Path(result_path).resolve().as_uri())

            # Move original file to processed/, keeping its relative sub-path
            relative_path = os.path.relpath(src_path, RECORDER_DIR)
            processed_path = os.path.join(PROCESSED_DIR, relative_path)

            os.makedirs(os.path.dirname(processed_path), exist_ok=True)
            shutil.move(src_path, processed_path)
            print(f"[Watcher] Moved file to: {processed_path}")

        except Exception as e:
            # Boundary of the watcher thread: report and keep watching.
            print(f"[Watcher] Error processing file: {e}")

def start_file_watcher():
    """Watch RECORDER_DIR recursively and block until interrupted.

    Schedules a ``NewWavHandler`` on a new observer, starts it, then sleeps
    in a loop. On ``KeyboardInterrupt`` the observer is stopped and joined.
    """
    watcher = Observer()
    watcher.schedule(NewWavHandler(), path=RECORDER_DIR, recursive=True)
    watcher.start()
    print("[Watcher] Monitoring started.")

    # Idle here so the calling thread stays alive while the observer works.
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        watcher.stop()

    watcher.join()

# Start the watcher in a background thread
watcher_thread = threading.Thread(target=start_file_watcher, daemon=True)
watcher_thread.start()

# ========== Gradio UI ==========
with gr.Blocks() as demo:
    gr.Markdown("## PoC: Evaluating Call Center Operator Performance via Call Analysis")
    audio_input = gr.Audio(type="filepath", label="Upload WAV audio")
    output_html = gr.HTML()
    run_button = gr.Button("Process and Classify")

    run_button.click(
        fn=process_audio_and_classify,
        inputs=audio_input,
        outputs=output_html
    )

demo.launch()

## Summarization

In [None]:
import re

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


def WHITESPACE_HANDLER(k):
    """Strip ``k`` and collapse every whitespace run into a single space.

    ``\\s`` already matches newlines, so one substitution covers what the
    earlier two-step newline-then-whitespace version did.
    """
    return re.sub(r'\s+', ' ', k.strip())


article_text = """

"""

# Use the GPU when one is available and fall back to CPU otherwise, like the
# other cells in this notebook, instead of hard-coding "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"

model_name = "csebuetnlp/mT5_multilingual_XLSum"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

# Tokenize the cleaned article, padded/truncated to 512 tokens, and keep only
# the token ids on the model's device.
input_ids = tokenizer(
    [WHITESPACE_HANDLER(article_text)],
    return_tensors="pt",
    padding="max_length",
    truncation=True,
    max_length=512
)["input_ids"].to(device)

# Beam search over 4 beams, at most 84 tokens, with no repeated bigrams;
# [0] selects the single returned sequence.
output_ids = model.generate(
    input_ids=input_ids,
    max_length=84,
    no_repeat_ngram_size=2,
    num_beams=4
)[0]

summary = tokenizer.decode(
    output_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False
)

print(summary)

**For input**: "*Salam! Xoş gününüz olsun, “X Bank” müştəri xidmətinə zəng etdiyiniz üçün təşəkkür edirik. Mən Ayseləm. 
Sizə necə kömək edə bilərəm? Salam, Aysel xanım. Mən kartımın balansını öyrənmək istəyirəm. Əlbəttə, sizə kömək etmək üçün 
əvvəlcə şəxsiyyətinizi təsdiqləməliyəm. Zəhmət olmasa, adınızı, soyadınızı və doğum tarixinizi qeyd edin. 
Adım Kamran Məmmədov, 12 iyun 1987-ci il. Təşəkkür edirəm, Kamran bəy. Sistemə baxıram... 
Bəli, sizi tapdım. Hal-hazırda kartınızın balansı 524 manat 80 qəpik təşkil edir. Çox sağ olun. 
Bir sualım da var. Kartımda hər ay niyə 2 manat tutulur? Bu kartın aylıq xidmət haqqıdır. 
Əgər istəsəniz, komissiyasız kart növü ilə əvəz edə bilərik. Bəli, maraqlıdır. 
Zəhmət olmasa, ətraflı məlumat verin. Əlbəttə! Yeni kartla bağlı sizə məlumat göndərəcəyik və filialımıza yaxınlaşaraq 
dəyişiklik edə bilərsiniz. Oldu, təşəkkür edirəm. Sizə kömək edə bildiyim üçün məmnunam. Gözəl günlər arzulayıram!*"

**Output is**: "*X Bankın müştəri xidmətinə zəng etdiyiniz üçün təşəkkür edirik.*"

**For input**: "*Rusiya Prezidenti Putin sülh danışıqları üçün İstanbula getməyib. 
Onu Ukrayna ilə danışıqlarda köməkçisi təmsil edir, Kreml açıqlayıb.
Putin İstanbula gəlsəydi, ABŞ Prezidenti Donald Trump da bu görüşə qoşulacağını demişdi.
Mayın 11-də Putin özü Zelenski ilə sülh danışıqlarına çağırış etmişdi.
Zelenski də İstanbulda onu şəxsən gözləyəcəyini demişdi.*"

**Output is**: "*Rusiya Prezidenti Vladimir Putin Ukrayna prezidenti Volodimir Zelenski ilə sülh danışıqları üçün İstanbula gəlməyib.*"

---

### **Summary Conclusion**

The `csebuetnlp/mT5_multilingual_XLSum` model shows reasonable baseline performance for Azerbaijani news summarization, as illustrated by the accurate summary of the political news example above. However, its performance on domain-specific data — such as bank call center conversations — shows limitations: in the call example above the output merely repeats the opening greeting. This is likely due to the conversational and transactional nature of the content, which the model wasn't explicitly trained on.

This model serves as a **good starting point**, but for effective deployment in the banking customer service domain, **fine-tuning is essential**. A fine-tuning strategy can begin with the **"LocalDoc/summarization\_azerbaijan"** dataset to bridge the domain gap, followed by incorporation of **bank-provided audio recordings**. These recordings can be transcribed using **Whisper-Large-V3**, then post-processed and cleaned with a **large language model (LLM)**. A **sample set should be manually reviewed** to *validate the LLM corrections* before the transcripts are used to fine-tune the model for **dialogue-based summarization**.

These transcriptions can be used in two complementary ways:

* **As-is (noisy STT output):** Fine-tune the summarizer to handle real-world, imperfect input, improving robustness.
* **After correction (cleaned by LLM):** Fine-tune on clean data for higher precision and clarity in summaries.

Alternatively, **Whisper itself can be fine-tuned** (if needed and feasible) on domain-specific audio to produce **higher-quality transcriptions**, reducing (or eliminating) the need for post-processing before summarization fine-tuning.

This approach ensures adaptability to both **raw conversational input** and **high-quality cleaned text**, supporting a production-grade summarization system for banking customer interactions in Azerbaijani. This pipeline ensures the summarization model will learn the structure, terminology, and conversational nuances specific to Azerbaijani banking customer service contexts.