In [None]:
%pip install transformers pandas

In [None]:
import os

import numpy as np
import pandas as pd
import torch
from transformers import BertModel, BertTokenizer

# Load the pretrained tokenizer and encoder from one shared checkpoint name
MODEL_NAME = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)

def extract_features(transcript, max_length=512):
    """Embed a transcript as the mean of BERT's final-layer token vectors.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        transcript: Text to embed.
        max_length: Upper bound on the number of tokens fed to the model; longer
            transcripts are truncated instead of failing inside the model.
            Defaults to 512, assumed to be the position limit of the BERT
            checkpoint in use.

    Returns:
        A NumPy array holding the token-averaged last hidden state. For a single
        string this is expected to be 1-D (one value per hidden unit).
    """
    inputs = tokenizer(
        transcript,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    # Inference only: skip autograd bookkeeping so no graph is kept per transcript.
    with torch.no_grad():
        outputs = model(**inputs)
    token_embeddings = outputs.last_hidden_state
    # Average over the token axis (dim=1), then drop the singleton batch axis.
    return torch.mean(token_embeddings, dim=1).squeeze().detach().numpy()

# Read the transcript table; the loop below reads its 'Video File' and
# 'Transcription' columns.
df = pd.read_csv('transcriptions (1353).csv')

# np.save does not create missing directories, so make sure the target exists.
os.makedirs('feature_vectors', exist_ok=True)

# Identifiers of the transcripts whose features were written, in CSV order
identifiers = []

# Extract and persist one feature vector per transcript
for index, row in df.iterrows():
    identifier = row['Video File']
    transcript = row['Transcription']

    # Empty CSV cells are read as NaN (a float); skip them rather than letting
    # one bad row abort the whole run.
    if not isinstance(transcript, str):
        print(f'Skipping {identifier}: transcription is missing or not text')
        continue

    # Extract features for the current transcript
    feature_vector = extract_features(transcript)

    # Save the feature vector as a NumPy array
    np.save(f'feature_vectors/{identifier}.npy', feature_vector)

    # Keep track of identifiers for reference
    identifiers.append(identifier)

# Save identifiers to a text file (will remove this later)
with open('identifiers.txt', 'w') as f:
    f.writelines(f'{identifier}\n' for identifier in identifiers)


Version 2: with logging, a progress bar, and error handling

In [None]:
import os
import pandas as pd
import numpy as np
from transformers import BertModel, BertTokenizer
import torch
from tqdm import tqdm
import logging

# Set up logging for tracking purposes.
# force=True replaces any handlers already attached to the root logger. Without it,
# basicConfig() silently does nothing when handlers already exist (as can happen in
# hosted notebook environments), and the INFO messages below would be dropped.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

# Pretrained BERT tokenizer and encoder, both loaded from the same checkpoint
MODEL_NAME = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)

def extract_features(transcript, max_length=512):
    """Embed a transcript as the mean of BERT's final-layer token vectors.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        transcript: Text to embed.
        max_length: Upper bound on the number of tokens fed to the model; longer
            transcripts are truncated instead of failing inside the model.
            Defaults to 512, assumed to be the position limit of the BERT
            checkpoint in use.

    Returns:
        A NumPy array holding the token-averaged last hidden state. For a single
        string this is expected to be 1-D (one value per hidden unit).
    """
    inputs = tokenizer(
        transcript,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    # Inference only: skip autograd bookkeeping so no graph is kept per transcript.
    with torch.no_grad():
        outputs = model(**inputs)
    token_embeddings = outputs.last_hidden_state
    # Average over the token axis (dim=1), then drop the singleton batch axis.
    return torch.mean(token_embeddings, dim=1).squeeze().detach().numpy()

# Folder that receives the saved .npy files; created up front when absent
output_dir = 'feature_vectors'
os.makedirs(name=output_dir, exist_ok=True)

# Read CSV file.
# Failures abort via SystemExit rather than exit(): in a Jupyter kernel exit() asks
# the kernel to shut down without raising, so the rest of the cell would keep
# running with `df` undefined. In a plain script exit() also reports status 0.
csv_file = 'transcriptions (1353).csv'
try:
    df = pd.read_csv(csv_file)
except FileNotFoundError:
    logging.error(f"The file {csv_file} was not found.")
    raise SystemExit(1)
except Exception as e:
    logging.error(f"An error occurred while reading {csv_file}: {e}")
    raise SystemExit(1)
else:
    logging.info(f"Successfully read {csv_file}")

# Check that the columns read by the processing loop are present
required_columns = ['Video File', 'Transcription']
if not all(col in df.columns for col in required_columns):
    logging.error(f"The file {csv_file} does not contain the required columns: {required_columns}")
    raise SystemExit(1)

# Identifiers of the transcripts whose features were saved successfully
identifiers = []

# Walk the table row by row behind a progress bar. A failure on one row is logged
# and the loop moves on to the next row.
row_iter = tqdm(df.iterrows(), total=len(df), desc="Processing transcripts")
for index, row in row_iter:
    identifier, transcript = row['Video File'], row['Transcription']

    try:
        # Embed the transcript, then write the vector to <output_dir>/<identifier>.npy
        feature_vector = extract_features(transcript)
        target_path = os.path.join(output_dir, f'{identifier}.npy')
        np.save(target_path, feature_vector)

        # Only rows that were saved are recorded
        identifiers.append(identifier)
        logging.info(f"Processed and saved features for {identifier}")
    except Exception as e:
        logging.error(f"An error occurred while processing {identifier}: {e}")

# Write the processed identifiers, one per line, to a text file (optional)
identifiers_file = 'identifiers.txt'
try:
    with open(identifiers_file, 'w') as f:
        f.writelines(f'{identifier}\n' for identifier in identifiers)
    logging.info(f"Identifiers saved to {identifiers_file}")
    logging.info("Feature extraction and saving completed.")
except Exception as e:
    logging.error(f"An error occurred while saving identifiers to {identifiers_file}: {e}")
