<a href="https://colab.research.google.com/github/MOHAAAMEEEEED/Graduation_project/blob/main/audio_%26_keyword_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import os
import subprocess
import whisper
import logging
from typing import Optional, Dict, Union
from pathlib import Path

class WhisperTranscriber:
    """
    Transcribe audio/video files with OpenAI's Whisper model.

    Inputs that are not already ``.wav`` files are first converted to
    16 kHz mono 16-bit PCM WAV by running the ``ffmpeg`` executable.
    """

    def __init__(self):
        """
        Initialize the transcriber with the 'base' Whisper model.

        Also configures logging and sets the transcription language to
        English ("en").
        """
        self.setup_logging()
        self.model = whisper.load_model("base")
        self.language = "en"

    def setup_logging(self) -> None:
        """Configure INFO-level logging and create ``self.logger``."""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        # Logger named after the current module; used by every other method.
        self.logger = logging.getLogger(__name__)

    def convert_to_wav(self, input_file: Union[str, Path], output_dir: Optional[str] = None) -> Optional[str]:
        """
        Convert input audio/video file to WAV format using FFmpeg.

        The output is named ``<input stem>.wav`` and is written to
        ``output_dir`` when given, otherwise next to the input file. An
        existing file at that location is overwritten.

        Args:
            input_file (Union[str, Path]): Path to input audio/video file
            output_dir (Optional[str]): Directory for output WAV file

        Returns:
            Optional[str]: Path to output WAV file if successful, None otherwise
        """
        try:
            input_path = Path(input_file)
            if not input_path.exists():
                self.logger.error("Input file not found: %s", input_file)
                return None

            # Determine output path
            if output_dir:
                output_path = Path(output_dir) / f"{input_path.stem}.wav"
            else:
                output_path = input_path.with_suffix('.wav')

            # Create output directory if it doesn't exist
            output_path.parent.mkdir(parents=True, exist_ok=True)

            # Global options (-y) go first: options placed after the last
            # output file are "trailing" and are not guaranteed to apply.
            command = [
                'ffmpeg',
                '-y',                 # Overwrite output file if exists
                '-i', str(input_path),
                '-ar', '16000',       # Sample rate 16kHz
                '-ac', '1',           # Mono audio
                '-c:a', 'pcm_s16le',  # 16-bit PCM encoding
                str(output_path),
            ]

            self.logger.info("Converting %s to WAV format", input_path)
            result = subprocess.run(
                command,
                # Keep ffmpeg from reading the caller's stdin.
                stdin=subprocess.DEVNULL,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                # An undecodable byte in ffmpeg's output must not turn a
                # successful conversion into a failure.
                errors='replace',
            )

            if result.returncode != 0:
                self.logger.error("FFmpeg conversion failed: %s", result.stderr)
                return None

            self.logger.info("Conversion successful")
            return str(output_path)

        except Exception as e:
            # Best-effort contract: every failure is logged and reported as None.
            self.logger.error("Error during conversion: %s", e)
            return None

    def transcribe(self,
                  input_file: Union[str, Path],
                  output_dir: Optional[str] = None,
                  cleanup: bool = True) -> Dict:
        """
        Transcribe audio/video file using Whisper.

        Non-WAV inputs are converted with :meth:`convert_to_wav` first. The
        converted file is removed afterwards when ``cleanup`` is true, whether
        transcription succeeded or failed. A ``.wav`` input is passed to the
        model as-is and is never deleted.

        Args:
            input_file (Union[str, Path]): Path to input audio/video file
            output_dir (Optional[str]): Directory for temporary WAV file
            cleanup (bool): Whether to delete temporary WAV file after transcription

        Returns:
            Dict: Transcription result containing text and other metadata

        Raises:
            RuntimeError: If the input could not be converted to WAV.
            Exception: Any error raised by the model is logged and re-raised.
        """
        # Path of a WAV file created by this call (None when nothing to clean).
        temp_wav = None
        try:
            # Convert to WAV if input is not already WAV
            input_path = Path(input_file)
            if input_path.suffix.lower() != '.wav':
                self.logger.info("Converting input file to WAV format")
                wav_file = self.convert_to_wav(input_file, output_dir)
                if not wav_file:
                    raise RuntimeError("Failed to convert input file to WAV format")
                temp_wav = wav_file
            else:
                wav_file = str(input_path)

            # Perform transcription
            self.logger.info("Starting transcription")
            result = self.model.transcribe(wav_file, language=self.language)
            self.logger.info("Transcription completed successfully")
            return result

        except Exception as e:
            self.logger.error("Transcription failed: %s", e)
            raise

        finally:
            # In `finally` so the converted file is not leaked when the model
            # raises; a cleanup failure is only a warning.
            if cleanup and temp_wav is not None:
                try:
                    os.remove(temp_wav)
                    self.logger.info("Cleaned up temporary WAV file: %s", temp_wav)
                except OSError as e:
                    self.logger.warning("Failed to cleanup temporary file: %s", e)

In [4]:
# Build the transcriber (this loads the Whisper model)
transcriber = WhisperTranscriber()

input_file = "C:/Users/Mohamed Walid/OneDrive/Desktop/1732456960148c78vzx5e-voicemaker.in-speech.mp3"
try:
    result = transcriber.transcribe(input_file)

    # Keep the recognised text in a variable for the later cells
    transcribed_text = result['text']

    # Show the transcript
    print(f"Transcription: {transcribed_text}")
except Exception as err:
    # Any failure is reported here instead of stopping the notebook
    print(f"Transcription failed: {err}")

  checkpoint = torch.load(fp, map_location=device)
2024-11-24 16:12:16,527 - INFO - Converting input file to WAV format
2024-11-24 16:12:16,527 - INFO - Converting C:\Users\Mohamed Walid\OneDrive\Desktop\1732456960148c78vzx5e-voicemaker.in-speech.mp3 to WAV format
2024-11-24 16:12:16,859 - INFO - Conversion successful
2024-11-24 16:12:16,859 - INFO - Starting transcription
2024-11-24 16:12:19,837 - INFO - Transcription completed successfully
2024-11-24 16:12:19,839 - INFO - Cleaned up temporary WAV file: C:\Users\Mohamed Walid\OneDrive\Desktop\1732456960148c78vzx5e-voicemaker.in-speech.wav


Transcription:  Many software engineers work as employees or contractors. Software engineers work with businesses, government agencies, civilian or military, and non-profit organizations. Some software engineers work for themselves as freelancers. Some organization.


## preprocessing

In [6]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import re
# Fetch the NLTK resources imported above: tokenizer models for
# word_tokenize, the English stop-word list, and WordNet for the lemmatizer.
nltk.download('punkt')
# NLTK 3.8.2 and later load the `punkt_tab` resource in word_tokenize instead
# of the pickled `punkt` models, so fetch it as well.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to C:\Users\Mohamed
[nltk_data]     Walid\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Mohamed
[nltk_data]     Walid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Mohamed
[nltk_data]     Walid\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
# English stop words as a set
stop_words = set(stopwords.words('english'))
# Show every punctuation character, each followed by a single space
print(*string.punctuation, end=" ")

! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~ 

#### `Lemmatization` is used here in preprocessing because it reduces words to their base form, considering the context and the actual meaning. This is useful for `keyword extraction`, as it helps maintain the correct semantic meaning and improves accuracy.


In [9]:
# Translation table that deletes every ASCII punctuation character
translated_table = str.maketrans('', '', string.punctuation)
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Normalise *text* for keyword extraction.

    Steps, in order: lower-case, delete punctuation, delete digit runs,
    tokenize, drop stop words, lemmatize the remaining tokens.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    # Lower-case and strip punctuation, then remove numbers
    cleaned = re.sub(r'\d+', '', text.lower().translate(translated_table))

    kept = []
    for token in word_tokenize(cleaned):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))

    # Join tokens back into a single string
    return ' '.join(kept)

In [10]:
# Clean the transcript with preprocess_text, then show the cleaned text.
preprocessed_text = preprocess_text(transcribed_text)
print(f"Preprocessed Text: {preprocessed_text}")

Preprocessed Text: many software engineer work employee contractor software engineer work business government agency civilian military nonprofit organization software engineer work freelancer organization


#### extract important keywords from text

In [22]:
from keybert import KeyBERT
import spacy


# spaCy pipeline, used below for part-of-speech tagging
nlp = spacy.load("en_core_web_sm")

# Keywords are extracted from the already-preprocessed transcript
text = preprocessed_text
kw_model = KeyBERT()

# Top 10 one- or two-word keyphrases; each result's first item is the phrase,
# which is all that is kept
raw_keywords = [
    candidate[0]
    for candidate in kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 2),
        stop_words='english',
        top_n=10,
    )
]

# Keep a phrase only when none of its tokens falls outside NOUN / PROPN
filtered_keywords = [
    phrase
    for phrase in raw_keywords
    if not any(tok.pos_ not in {"NOUN", "PROPN"} for tok in nlp(phrase))
]

print("Relevant job keywords:", filtered_keywords)


2024-11-24 16:35:44,124 - INFO - Use pytorch device_name: cuda
2024-11-24 16:35:44,125 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Relevant job keywords: ['employee contractor', 'software engineer', 'contractor software', 'contractor', 'freelancer organization', 'work freelancer', 'agency civilian', 'freelancer', 'engineer work']


#### Find synonyms of words in text

In [60]:
# Display the keyword phrases kept by the part-of-speech filter.
filtered_keywords

['employee contractor',
 'software engineer',
 'contractor software',
 'contractor',
 'freelancer organization',
 'work freelancer',
 'agency civilian',
 'freelancer',
 'engineer work']

In [61]:
# Merge all keyword phrases into one space-separated string.
fol=' '.join(filtered_keywords)
# Parse the merged string with spaCy so it can be walked token by token.
doc=nlp(fol)
# Display the parsed text as the cell output.
doc

employee contractor software engineer contractor software contractor freelancer organization work freelancer agency civilian freelancer engineer work

In [None]:
from nltk.util import ngrams


# Function to fetch synonyms for a word using WordNet
def get_synonyms(word):
    """Fetch a set of synonyms for a word using WordNet.

    Collects the lemma name of every lemma in every synset WordNet returns
    for *word*.

    Args:
        word: The word to look up.

    Returns:
        set: Lemma names; empty when WordNet has no synset for *word*.
    """
    # Imported locally: nothing before this point brings `wordnet` into scope,
    # so a top-to-bottom run would otherwise fail with NameError.
    from nltk.corpus import wordnet

    synonyms = set()
    for syn in wordnet.synsets(word):
        synonyms.update(lemma.name() for lemma in syn.lemmas())
    return synonyms

# Function to generate n-grams (1-gram and 2-gram) from the tokens
def generate_ngrams(tokens, n=2):
    """Return the n-grams of *tokens*, each joined into one space-separated string."""
    return list(map(' '.join, ngrams(tokens, n)))

# Function to combine each word in the text with its synonyms
def combine_with_synonyms(doc):
    """Map each lower-cased token text in *doc* to a list of its synonyms.

    A token text that occurs more than once is looked up each time and
    ends up as a single key.
    """
    return {
        key: list(get_synonyms(key))
        for key in (token.text.lower() for token in doc)
    }

In [65]:
# Lower-cased token strings of the parsed keyword text
tokens = [tok.text.lower() for tok in doc]

# Synonym list for each token of the parsed keyword text
result = combine_with_synonyms(doc)

# Unigrams (n=1) and bigrams (n=2) built from the same token list
unigrams, bigrams = (generate_ngrams(tokens, n=size) for size in (1, 2))

In [66]:
# Print the n-gram lists first, then the synonym mapping
for label, value in (
    ("Unigrams:", unigrams),
    ("Bigrams:", bigrams),
    ("Synonyms:", result),
):
    print(label, value)

Unigrams: ['employee', 'contractor', 'software', 'engineer', 'contractor', 'software', 'contractor', 'freelancer', 'organization', 'work', 'freelancer', 'agency', 'civilian', 'freelancer', 'engineer', 'work']
Bigrams: ['employee contractor', 'contractor software', 'software engineer', 'engineer contractor', 'contractor software', 'software contractor', 'contractor freelancer', 'freelancer organization', 'organization work', 'work freelancer', 'freelancer agency', 'agency civilian', 'civilian freelancer', 'freelancer engineer', 'engineer work']
Synonyms: {'employee': ['employee'], 'contractor': ['declarer', 'contractile_organ', 'contractor'], 'software': ['computer_software', 'software_package', 'software', 'package', 'software_system', 'software_program'], 'engineer': ['orchestrate', 'applied_scientist', 'organize', 'technologist', 'mastermind', 'engine_driver', 'engineer', 'locomotive_engineer', 'direct', 'railroad_engineer', 'organise'], 'freelancer': ['independent', 'free-lance', 's

In [67]:
# Function to fetch synonyms for a word using WordNet
def get_synonyms(word):
    """Fetch a set of synonyms for a word using WordNet.

    Collects the lemma name of every lemma in every synset WordNet returns
    for *word*.

    Args:
        word: The word to look up.

    Returns:
        set: Lemma names; empty when WordNet has no synset for *word*.
    """
    # Imported locally: nothing before this point brings `wordnet` into scope,
    # so a top-to-bottom run would otherwise fail with NameError.
    from nltk.corpus import wordnet

    synonyms = set()
    for syn in wordnet.synsets(word):
        synonyms.update(lemma.name() for lemma in syn.lemmas())
    return synonyms

# Function to generate n-grams (1-gram and 2-gram) from the tokens
def generate_ngrams(tokens, n=2):
    """Build the n-grams of *tokens* and return each as a space-joined string."""
    joined = []
    for gram in ngrams(tokens, n):
        joined.append(' '.join(gram))
    return joined

# Function to combine each word or phrase (bigram) with its synonyms
def combine_with_synonyms(doc, n=2):
    """Map every n-gram of *doc* to the pooled synonyms of its words.

    Args:
        doc: Iterable of tokens exposing a ``text`` attribute.
        n: Size of the n-grams to build (2 gives bigrams).

    Returns:
        dict: n-gram string -> list of synonyms gathered from each word in
        that n-gram. A repeated n-gram ends up as a single key.
    """
    lowered = [tok.text.lower() for tok in doc]
    mapping = {}

    for phrase in generate_ngrams(lowered, n):
        # Pool the synonyms of every word that makes up this n-gram
        pooled = set()
        for part in phrase.split():
            pooled.update(get_synonyms(part))
        mapping[phrase] = list(pooled)

    return mapping

In [68]:
# Map every bigram (n=2) of the parsed keyword text to the pooled synonyms of its words
result = combine_with_synonyms(doc, n=2)

# Print the resulting bigram -> synonyms dictionary
print(result)

{'employee contractor': ['contractor', 'declarer', 'employee', 'contractile_organ'], 'contractor software': ['computer_software', 'software_package', 'software', 'contractor', 'package', 'software_system', 'software_program', 'declarer', 'contractile_organ'], 'software engineer': ['software', 'engine_driver', 'package', 'software_program', 'railroad_engineer', 'direct', 'organize', 'organise', 'computer_software', 'software_package', 'applied_scientist', 'technologist', 'mastermind', 'engineer', 'software_system', 'locomotive_engineer', 'orchestrate'], 'engineer contractor': ['orchestrate', 'applied_scientist', 'organize', 'contractor', 'technologist', 'mastermind', 'engine_driver', 'engineer', 'locomotive_engineer', 'declarer', 'contractile_organ', 'direct', 'railroad_engineer', 'organise'], 'software contractor': ['computer_software', 'software_package', 'software', 'contractor', 'package', 'software_system', 'software_program', 'declarer', 'contractile_organ'], 'contractor freelance

In [75]:
from nltk.corpus import wordnet

# Function to calculate similarity between words using Wu-Palmer Similarity
def get_similarity(word1, word2):
    """Calculate the similarity between two words using WordNet's Wu-Palmer similarity.

    Only the first synset of each word is compared.

    Args:
        word1: First word.
        word2: Second word.

    Returns:
        The Wu-Palmer score of the two first synsets, or 0 when either word
        has no synset or WordNet reports no score for the pair.
    """
    syn1 = wordnet.synsets(word1)
    syn2 = wordnet.synsets(word2)

    if not (syn1 and syn2):
        return 0  # At least one word is unknown to WordNet

    score = syn1[0].wup_similarity(syn2[0])
    # wup_similarity may return None when it finds no common ancestor;
    # normalise that to 0 so callers always get a number.
    return 0 if score is None else score

# Function to combine each bigram with its synonyms and similarity
def combine_with_synonyms_and_similarity(doc, n=2):
    """Map every n-gram of *doc* to its words' synonyms and their similarity scores.

    Args:
        doc: Iterable of tokens exposing a ``text`` attribute.
        n: Size of the n-grams to build (2 gives bigrams).

    Returns:
        dict: n-gram string -> {synonym: score}, where the score is
        ``get_similarity(word, synonym)`` for the word of the n-gram the
        synonym came from. A synonym identical to its word is left out.
    """
    lowered = [tok.text.lower() for tok in doc]
    scored_by_gram = {}

    for phrase in generate_ngrams(lowered, n):
        scores = {}
        for part in phrase.split():
            for candidate in get_synonyms(part):
                if candidate == part:
                    continue  # Skip the word itself
                scores[candidate] = get_similarity(part, candidate)
        scored_by_gram[phrase] = scores

    return scored_by_gram

# Example usage: score the synonyms of every bigram (n=2) in the parsed keyword text, then print the mapping
result = combine_with_synonyms_and_similarity(doc, n=2)
print(result)


{'employee contractor': {'declarer': 0.5217391304347826, 'contractile_organ': 0.26666666666666666}, 'contractor software': {'declarer': 0.5217391304347826, 'contractile_organ': 0.26666666666666666, 'computer_software': 1.0, 'software_package': 1.0, 'package': 0.3076923076923077, 'software_system': 1.0, 'software_program': 1.0}, 'software engineer': {'computer_software': 1.0, 'software_package': 1.0, 'package': 0.3076923076923077, 'software_system': 1.0, 'software_program': 1.0, 'orchestrate': 0.16666666666666666, 'applied_scientist': 0.75, 'organize': 0.2, 'technologist': 0.75, 'mastermind': 0.7058823529411765, 'engine_driver': 0.6, 'locomotive_engineer': 0.6, 'direct': 0.11764705882352941, 'railroad_engineer': 0.6, 'organise': 0.16666666666666666}, 'engineer contractor': {'orchestrate': 0.16666666666666666, 'applied_scientist': 0.75, 'organize': 0.2, 'technologist': 0.75, 'mastermind': 0.7058823529411765, 'engine_driver': 0.6, 'locomotive_engineer': 0.6, 'direct': 0.11764705882352941,