<a href="https://colab.research.google.com/github/MOHAAAMEEEEED/Graduation_project/blob/main/audio_%26_keyword_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [71]:
import os
import subprocess
import whisper
import logging
from typing import Optional, Dict, Union
from pathlib import Path

class WhisperTranscriber:
    """
    A class to handle audio/video transcription using OpenAI's Whisper model.
    Supports multiple input formats and handles conversion to WAV using FFmpeg.
    """

    def __init__(self):
        """
        Initialize the transcriber with the 'base' Whisper model.
        """
        self.setup_logging()
        self.model = whisper.load_model("base")
        self.language = "en"

    def setup_logging(self) -> None:
        """Configure logging for the transcriber."""
        logging.basicConfig(
            level=logging.INFO,         #  Set the logging level to INFO to capture informational messages and above
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)       # Create a logger instance associated with the current module's name, allowing other methods to log messages

    def convert_to_wav(self, input_file: Union[str, Path], output_dir: Optional[str] = None) -> Optional[str]:
        """#
        Convert input audio/video file to WAV format using FFmpeg.

        Args:
            input_file (Union[str, Path]): Path to input audio/video file
            output_dir (Optional[str]): Directory for output WAV file

        Returns:
            Optional[str]: Path to output WAV file if successful, None otherwise
        """
        try:
            input_path = Path(input_file)
            if not input_path.exists():
                self.logger.error(f"Input file not found: {input_file}")
                return None

            # Determine output path
            if output_dir:
                output_path = Path(output_dir) / f"{input_path.stem}.wav"
            else:
                output_path = input_path.with_suffix('.wav')

            # Create output directory if it doesn't exist
            output_path.parent.mkdir(parents=True, exist_ok=True)

            # FFmpeg command for conversion
            command = [
                'ffmpeg',
                '-i', str(input_path),
                '-ar', '16000',  # Sample rate 16kHz
                '-ac', '1',      # Mono audio
                '-c:a', 'pcm_s16le',  # 16-bit PCM encoding
                str(output_path),
                '-y'  # Overwrite output file if exists
            ]

            self.logger.info(f"Converting {input_path} to WAV format")
            result = subprocess.run(
                command,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True
            )

            if result.returncode != 0:
                self.logger.error(f"FFmpeg conversion failed: {result.stderr}")
                return None

            self.logger.info("Conversion successful")
            return str(output_path)

        except Exception as e:
            self.logger.error(f"Error during conversion: {str(e)}")
            return None

    def transcribe(self,
                  input_file: Union[str, Path],
                  output_dir: Optional[str] = None,
                  cleanup: bool = True) -> Dict:
        """
        Transcribe audio/video file using Whisper.

        Args:
            input_file (Union[str, Path]): Path to input audio/video file
            output_dir (Optional[str]): Directory for temporary WAV file
            cleanup (bool): Whether to delete temporary WAV file after transcription

        Returns:
            Dict: Transcription result containing text and other metadata
        """
        try:
            # Convert to WAV if input is not already WAV
            input_path = Path(input_file)
            if input_path.suffix.lower() != '.wav':
                self.logger.info("Converting input file to WAV format")
                wav_file = self.convert_to_wav(input_file, output_dir)
                if not wav_file:
                    raise RuntimeError("Failed to convert input file to WAV format")
            else:
                wav_file = str(input_path)

            # Perform transcription
            self.logger.info("Starting transcription")
            result = self.model.transcribe(wav_file, language=self.language)
            self.logger.info("Transcription completed successfully")

            # Cleanup temporary WAV file if requested
            if cleanup and input_path.suffix.lower() != '.wav':
                try:
                    os.remove(wav_file)
                    self.logger.info(f"Cleaned up temporary WAV file: {wav_file}")
                except Exception as e:
                    self.logger.warning(f"Failed to cleanup temporary file: {str(e)}")

            return result

        except Exception as e:
            self.logger.error(f"Transcription failed: {str(e)}")
            raise

In [72]:
# Initialize transcriber
transcriber = WhisperTranscriber()

input_file = "/content/Small Talk  Everyday English.mp4"
try:
    result = transcriber.transcribe(input_file)

    # Store the transcribed text in a variable
    transcribed_text = result['text']

    # Print the transcribed text
    print(f"Transcription: {transcribed_text}")

    # You can now use transcribed_text for further processing
    # For example, save to a file or use in other functions
except Exception as e:
    print(f"Transcription failed: {e}")

Transcription:  So, what's new Mark? How is your new job going? To be honest, I can't complain. I really love the company that I am working for. My co-workers are all really friendly and helpful. They really help me feel welcome. It's a really energetic and fun atmosphere. My boss is hilarious and he's really flexible. Really? How so? He allows me to come in when I want and make my own hours. I can also leave early if I start early. There is no real dress code either. I can wear jeans and a t-shirt if I want. I can even wear shorts in the summer. Wow. It sounds really cool. I can't stand wearing a suit every day. Which do you prefer? Working late or finishing early? I prefer finishing early. I really enjoy the morning. I love getting up early and going for a run. There is nothing like watching the sunrise while drinking my morning coffee. Really? I am opposite. I love sleeping in. I am most alert in the evenings. I'm a real night owl. Well, you know what they say. The early bird catche

## preprocessing

In [49]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import re
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [50]:
stop_words = set(stopwords.words('english'))
for char in string.punctuation:
    print(char,end= " ")

! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~ 

#### Using `lemmatization` here in preprocessing as it reduces words to their base , considering the context and the actual meaning. which will be useful for `keyword extraction` as it help maintain the correct semantic meaning, improving the accuracy .


In [51]:
translated_table = str.maketrans('', '', string.punctuation)
lemmatizer=WordNetLemmatizer()


def preprocess_text(text):
    text = text.lower()
    text = text.translate(translated_table)

    text = re.sub(r'\d+', '', text)       # Remove numbers

    tokens = word_tokenize(text)

    # Remove stopwords and lemmatize
    processed_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    #processed_tokens = [word for word in tokens if word not in stop_words]


    # Join tokens back into a single string
    processed_text = ' '.join(processed_tokens)

    return processed_text

In [52]:
preprocessed_text = preprocess_text(transcribed_text)
print(f"Preprocessed Text: {preprocessed_text}")

Preprocessed Text: whats new mark new job going honest cant complain really love company working coworkers really friendly helpful really help feel welcome really energetic fun atmosphere bos hilarious he really flexible really allows come want make hour also leave early start early real dress code either wear jean tshirt want even wear short summer wow sound really cool cant stand wearing suit every day prefer working late finishing early prefer finishing early really enjoy morning love getting early going run nothing like watching sunrise drinking morning coffee really opposite love sleeping alert evening im real night owl well know say early bird catch worm know could right maybe try go bed little earlier tonight


#### extract important keywords from text

In [53]:
from keybert import KeyBERT
import spacy

# Load spaCy model for POS tagging
nlp = spacy.load("en_core_web_sm")

# Sample preprocessed text, replace this with your actual preprocessed text
text = preprocessed_text
# Initialize KeyBERT model
kw_model = KeyBERT()

# Extract keywords with KeyBERT
raw_keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words='english', top_n=10)
raw_keywords = [kw[0] for kw in raw_keywords]  # Keep only the keywords without scores

# Filter keywords using spaCy for relevant parts of speech
filtered_keywords = []
for keyword in raw_keywords:
    doc = nlp(keyword)
    # Check if all tokens in the keyword are either NOUN or PROPN
    if all(token.pos_ in {"NOUN", "PROPN"} for token in doc):
        filtered_keywords.append(keyword)

print("Relevant job keywords:", filtered_keywords)




Relevant job keywords: ['mark new', 'job', 'mark']


#### Find synonyms of words in text

In [56]:
from nltk.corpus import wordnet as wn
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np


# Load pre-trained BERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

def get_embedding(word):
    inputs = tokenizer(word, return_tensors="pt")
    outputs = model(**inputs)
    # Use the [CLS] token embedding for simplicity
    return outputs.last_hidden_state[0][0].detach().numpy()

def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

def find_synonyms_with_scores(word):
    word_embedding = get_embedding(word)
    synonyms = set()

    # Get synonyms from WordNet
    for synset in wn.synsets(word):
        for lemma in synset.lemmas():
            synonym = lemma.name().replace('_', ' ')
            if synonym != word:  # Avoid the word itself
                synonyms.add(synonym)

    # Calculate similarity scores
    synonym_scores = {}
    for synonym in synonyms:
        synonym_embedding = get_embedding(synonym)
        similarity_score = cosine_similarity(word_embedding, synonym_embedding)
        synonym_scores[synonym] = similarity_score

    return sorted(synonym_scores.items(), key=lambda x: x[1], reverse=True)

def process_sentence(sentence):
    words = sentence.split()
    results = {}

    for word in words:
        synonyms_with_scores = find_synonyms_with_scores(word)
        results[word] = synonyms_with_scores[:5]  # Get top 5 synonyms based on score

    return results

# Example usage
sentence = preprocessed_text
synonyms_with_accuracy = process_sentence(sentence)

for word, synonyms in synonyms_with_accuracy.items():
    print(f"Word: {word}")
    for synonym, score in synonyms:
        print(f"  Synonym: {synonym}, Similarity Score: {score:.2f}")


Word: whats
Word: new
  Synonym: New, Similarity Score: 1.00
  Synonym: newly, Similarity Score: 0.94
  Synonym: novel, Similarity Score: 0.94
  Synonym: fresh, Similarity Score: 0.94
  Synonym: Modern, Similarity Score: 0.93
Word: mark
  Synonym: Mark, Similarity Score: 1.00
  Synonym: patsy, Similarity Score: 0.98
  Synonym: commemorate, Similarity Score: 0.98
  Synonym: marker, Similarity Score: 0.97
  Synonym: scar, Similarity Score: 0.97
Word: job
  Synonym: Job, Similarity Score: 1.00
  Synonym: task, Similarity Score: 0.96
  Synonym: problem, Similarity Score: 0.95
  Synonym: business, Similarity Score: 0.94
  Synonym: caper, Similarity Score: 0.94
Word: going
  Synonym: leaving, Similarity Score: 0.98
  Synonym: get, Similarity Score: 0.98
  Synonym: break, Similarity Score: 0.98
  Synonym: passing, Similarity Score: 0.98
  Synonym: belong, Similarity Score: 0.97
Word: honest
  Synonym: true, Similarity Score: 0.98
  Synonym: fair, Similarity Score: 0.98
  Synonym: dependable, 

In [57]:
from nltk.corpus import wordnet as wn
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Load pre-trained BERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

def get_embedding(words):
    inputs = tokenizer(words, return_tensors="pt", padding=True, truncation=True)
    outputs = model(**inputs)
    # Use the [CLS] token embedding for simplicity
    return outputs.last_hidden_state[:, 0, :].detach().numpy()

def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

def find_synonyms_with_scores(word):
    synonyms = set()

    # Get synonyms from WordNet
    for synset in wn.synsets(word):
        for lemma in synset.lemmas():
            synonym = lemma.name().replace('_', ' ')
            if synonym != word:  # Avoid the word itself
                synonyms.add(synonym)

    if not synonyms:
        return []

    # Get embeddings for the target word and its synonyms
    word_embedding = get_embedding([word])[0]
    synonym_embeddings = get_embedding(list(synonyms))

    # Calculate similarity scores
    synonym_scores = {}
    for synonym, synonym_embedding in zip(synonyms, synonym_embeddings):
        similarity_score = cosine_similarity(word_embedding, synonym_embedding)
        synonym_scores[synonym] = similarity_score

    return sorted(synonym_scores.items(), key=lambda x: x[1], reverse=True)

def process_sentence(sentence):
    words = sentence.split()
    results = {}

    for word in words:
        synonyms_with_scores = find_synonyms_with_scores(word)
        results[word] = synonyms_with_scores[:5]  # Get top 5 synonyms based on score

    return results

# Example usage
sentence = preprocessed_text
synonyms_with_accuracy = process_sentence(sentence)

for word, synonyms in synonyms_with_accuracy.items():
    print(f"Word: {word}")
    for synonym, score in synonyms:
        print(f"  Synonym: {synonym}, Similarity Score: {score:.2f}")


Word: whats
Word: new
  Synonym: New, Similarity Score: 1.00
  Synonym: newly, Similarity Score: 0.94
  Synonym: novel, Similarity Score: 0.94
  Synonym: fresh, Similarity Score: 0.94
  Synonym: Modern, Similarity Score: 0.93
Word: mark
  Synonym: Mark, Similarity Score: 1.00
  Synonym: patsy, Similarity Score: 0.98
  Synonym: commemorate, Similarity Score: 0.98
  Synonym: marker, Similarity Score: 0.97
  Synonym: scar, Similarity Score: 0.97
Word: job
  Synonym: Job, Similarity Score: 1.00
  Synonym: task, Similarity Score: 0.96
  Synonym: problem, Similarity Score: 0.95
  Synonym: business, Similarity Score: 0.94
  Synonym: caper, Similarity Score: 0.94
Word: going
  Synonym: leaving, Similarity Score: 0.98
  Synonym: get, Similarity Score: 0.98
  Synonym: break, Similarity Score: 0.98
  Synonym: passing, Similarity Score: 0.98
  Synonym: belong, Similarity Score: 0.97
Word: honest
  Synonym: true, Similarity Score: 0.98
  Synonym: fair, Similarity Score: 0.98
  Synonym: dependable, 