## Data Processing & Embedding
### > Data PreProcessing - Cleaning, Chunking
* Lemmatization & Tokenization
* Normalizing the columns
* Splitting the Chunks
* Loading and Preprocessing transcripts
* Converting to LangChain docs and Split Chunks
* Saving Processed Data as processed_transcripts.csv file
* Validating the data and creating a validation report (validation_report.txt)
### > Vector and Embedding
* Vectorizing using FAISS
* Embedding with all-MiniLM-L6-v2 Model
* Log Function

## Step1: Import Libraries

In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
import logging
import pandas as pd
import os
import re
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mercy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Input CSVs read by this notebook and the artefacts it writes.
metadata_file = '../data/ServiceNow_Youtube_Metadata_Clean.csv'
transcript_file = '../data/video_metadata_with_transcripts.csv'
output_file = '../data/processed_transcripts.csv'
validation_report = '../logs/validation_report.txt'

# Fail fast: stop at the first required input that is not on disk.
for required_path in (metadata_file, transcript_file):
    if os.path.exists(required_path):
        continue
    logging.error(f'File not found: {required_path}')
    raise FileNotFoundError(f'File not found: {required_path}')

In [3]:
from pathlib import Path

# The project root is taken to be the parent of the current working directory.
BASE_DIR = Path('.').resolve().parent
DATA_DIR, LOG_DIR, FAISS_DIR = (
    BASE_DIR / folder for folder in ('data', 'logs', 'faiss_store')
)

# Create the working directories up front; existing ones are left untouched.
for directory in (DATA_DIR, LOG_DIR, FAISS_DIR):
    directory.mkdir(parents=True, exist_ok=True)

In [4]:
# Inspect the column names of both input CSVs. The paths come from the
# variables defined earlier in the notebook so that the files checked for
# existence are exactly the files that get read here.
df_meta = pd.read_csv(metadata_file)
print("Metadata columns:", df_meta.columns.tolist())

df_trans = pd.read_csv(transcript_file)
print("Transcript columns:", df_trans.columns.tolist())

Metadata columns: ['Number', 'Youtube_link', 'Subject', 'title', 'channel', 'description', 'length', 'publish_date', 'views', 'error']
Transcript columns: ['Number', 'Youtube_link', 'Subject', 'title', 'channel', 'description', 'length', 'publish_date', 'views', 'error', 'transcript']


## Step2: Data Preprocessing

In [5]:
# WordNetLemmatizer looks words up in NLTK's WordNet corpus, which is a
# separate download from the 'punkt' tokenizer data fetched in the import cell.
# Fetching it here keeps the notebook runnable on a fresh machine; the call is
# a no-op when the corpus is already installed.
nltk.download('wordnet')

# Shared lemmatizer instance used by tokenize_and_lemmatize().
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize raw transcript text ahead of tokenization.

    Args:
        text: The raw transcript. Missing values (``None``/NaN) are accepted;
            any other non-string value is converted with ``str()``.

    Returns:
        The text with punctuation removed, every run of whitespace collapsed
        to a single space, lower-cased and stripped. Missing input yields an
        empty string (and a logged warning).
    """
    if pd.isna(text):
        logging.warning('Empty text encountered in clean_text')
        return ''
    # Remove punctuation BEFORE collapsing whitespace: deleting a free-standing
    # symbol (e.g. the dash in "a - b") leaves two adjacent spaces behind, so
    # the whitespace pass has to run last to clean those up.
    text = re.sub(r'[^\w\s]', '', str(text))
    text = re.sub(r'\s+', ' ', text)
    return text.lower().strip()

def tokenize_and_lemmatize(text):
    """Tokenize *text* into words and lemmatize each one.

    Args:
        text: Text to process; any falsy value (``''``, ``None``) is treated
            as empty.

    Returns:
        The lemmatized tokens joined by single spaces, or ``''`` (with a
        logged warning) when *text* is empty.
    """
    if text:
        lemmas = [
            lemmatizer.lemmatize(word)
            for word in word_tokenize(text)
            if word
        ]
        return ' '.join(lemmas)

    logging.warning('Empty text encountered in tokenize_and_lemmatize')
    return ''

def load_and_process_transcripts(transcript_path):
    """Load the transcript CSV and add a cleaned, lemmatized text column.

    Args:
        transcript_path: Path of the CSV to read; it must contain a
            ``transcript`` column.

    Returns:
        A DataFrame with ``Youtube_link``, ``Number`` and ``Subject`` renamed
        to ``youtube_link``, ``video_id`` and ``subject``, plus a new
        ``cleaned_transcript`` column holding each transcript after
        ``clean_text`` and ``tokenize_and_lemmatize``.
    """
    column_aliases = {
        'Youtube_link': 'youtube_link',
        'Number': 'video_id',
        'Subject': 'subject',
        'title': 'title',
        'transcript': 'transcript',
    }
    transcripts = pd.read_csv(transcript_path).rename(columns=column_aliases)

    # Two passes: normalize the raw text first, then tokenize and lemmatize it.
    normalized = transcripts['transcript'].apply(clean_text)
    transcripts['cleaned_transcript'] = normalized.apply(tokenize_and_lemmatize)

    logging.info(f'Loaded and processed {len(transcripts)} transcript records')
    return transcripts

def prepare_langchain_docs(df):
    """Split every cleaned transcript into LangChain ``Document`` chunks.

    Args:
        df: DataFrame with ``cleaned_transcript``, ``video_id``, ``title``,
            ``youtube_link`` and ``subject`` columns.

    Returns:
        A ``(docs, missing_transcripts)`` tuple: the list of chunk documents
        and the list of ``video_id`` values whose cleaned transcript was empty.
        Each document's metadata carries the title, url, subject, video_id and
        a ``chunk_id`` of the form ``<video_id>_<chunk index>``.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    documents = []
    skipped_ids = []

    for _, record in df.iterrows():
        transcript = record['cleaned_transcript']
        if not transcript:
            # Nothing to split: remember the video so the caller can report it.
            skipped_ids.append(record['video_id'])
            logging.warning(f'Missing transcript for video_id {record["video_id"]}')
            continue

        base_metadata = {
            'title': record['title'],
            'url': record['youtube_link'],
            'subject': record['subject'],
            'video_id': record['video_id'],
        }

        pieces = splitter.split_text(transcript)
        documents.extend(
            Document(
                page_content=piece,
                metadata={**base_metadata, 'chunk_id': f"{record['video_id']}_{position}"},
            )
            for position, piece in enumerate(pieces)
        )

        logging.info(f'Created {len(pieces)} chunks for video_id {record["video_id"]}')

    logging.info(f'Created {len(documents)} total LangChain documents')
    return documents, skipped_ids

# Run the load -> clean -> chunk pipeline once and keep the results in the
# notebook namespace for the cells that follow.
df_clean = load_and_process_transcripts(transcript_file)
langchain_docs, missing = prepare_langchain_docs(df_clean)

# Report the chunk count, plus any videos that had no usable transcript.
summary_lines = [f"Processed {len(langchain_docs)} document chunks."]
if missing:
    summary_lines.append(f"Missing transcripts for video IDs: {missing}")
print("\n".join(summary_lines))


Processed 328 document chunks.


## Step2.1: Preprocess Transcripts


In [6]:
def preprocess_transcripts():
    """Run the transcript pipeline end to end and persist its artefacts.

    Loads and cleans the transcripts from ``transcript_file``, splits them
    into chunks, writes the chunks to ``output_file`` as CSV, writes a short
    summary to ``validation_report`` and prints the same summary.

    Raises:
        ValueError: If the pipeline produced no chunks at all.
        Exception: Any error raised while writing ``output_file`` is logged
            and re-raised unchanged.
    """
    df_clean = load_and_process_transcripts(transcript_file)
    langchain_docs, missing_transcripts = prepare_langchain_docs(df_clean)

    # Flatten each chunk document into one CSV row.
    df_processed = pd.DataFrame([
        {
            'video_id': doc.metadata['video_id'],
            'chunk_id': doc.metadata['chunk_id'],
            'text': doc.page_content,
            'subject': doc.metadata['subject'],
        }
        for doc in langchain_docs
    ])
    if df_processed.empty:
        logging.error('No processed data generated')
        raise ValueError('No processed data generated')

    try:
        df_processed.to_csv(output_file, index=False, encoding='utf-8')
    except Exception as e:
        logging.error(f'Error saving processed data: {e}')
        raise
    logging.info(f'Saved {len(df_processed)} transcript chunks to {output_file}')

    # The frame returned by the loader still has the original 'title' and
    # 'transcript' columns, so the summary is computed from it instead of
    # reading the CSV from disk a second time.
    total_videos = len(df_clean)
    metadata_success = df_clean['title'].notnull().sum()
    missing_trans = df_clean['transcript'].isna().sum()
    total_chunks = len(df_processed)
    # df_processed is known to be non-empty here (checked above).
    sample_chunk = df_processed['text'].iloc[0]

    # Explicit UTF-8: chunk text may contain non-ASCII word characters that the
    # platform's default encoding cannot represent.
    with open(validation_report, 'w', encoding='utf-8') as f:
        f.write(f'Transcripts: {total_videos} videos, {missing_trans} missing\n')
        f.write(f'Processed Chunks: {total_chunks}\n')
        f.write(f'Sample Chunk:\n{sample_chunk}\n')
        if missing_transcripts:
            f.write(f'Missing transcripts for video IDs: {missing_transcripts}\n')
    logging.info(f'Validation report saved to {validation_report}')

    print(f'Metadata success: {metadata_success}/{total_videos}')
    print(f'Missing transcripts: {missing_trans}/{total_videos}')
    print(f'Processed chunks: {total_chunks}')
    if missing_transcripts:
        print(f'Missing transcripts for video IDs: {missing_transcripts}')
    print(df_processed.head())

# Run the preprocessing step now: this writes the processed-chunks CSV and the
# validation report, and prints a short summary.
preprocess_transcripts()


Metadata success: 21/22
Missing transcripts: 0/22
Processed chunks: 328
   video_id chunk_id                                               text  \
0         1      1_0  hey folk how you doing chris thanky here and i...   
1         1      1_1  and issue and in those project youre going to ...   
2         1      1_2  project the project status report any issue th...   
3         1      1_3  be it would be crazy right it would take you f...   
4         1      1_4  person and asking him this question and asking...   

                                             subject  
0  An AI Agent that knows everything about your P...  
1  An AI Agent that knows everything about your P...  
2  An AI Agent that knows everything about your P...  
3  An AI Agent that knows everything about your P...  
4  An AI Agent that knows everything about your P...  


In [None]:
# Reload the processed chunks from disk as a sanity check. The path comes from
# `output_file` so that the file being read is the one named in the error
# message, and only the read itself sits inside the `try`.
try:
    df_processed = pd.read_csv(output_file)
except FileNotFoundError:
    logging.error(f'Output file not found: {output_file}')
    print(f'Output file not found: {output_file}')
else:
    print(f'Total chunks: {len(df_processed)}')
    print(f'Unique videos: {df_processed["video_id"].nunique()}')
    print(df_processed[['video_id', 'chunk_id', 'text', 'subject']].head())

Total chunks: 328
Unique videos: 22
   video_id chunk_id                                               text  \
0         1      1_0  hey folk how you doing chris thanky here and i...   
1         1      1_1  and issue and in those project youre going to ...   
2         1      1_2  project the project status report any issue th...   
3         1      1_3  be it would be crazy right it would take you f...   
4         1      1_4  person and asking him this question and asking...   

                                             subject  
0  An AI Agent that knows everything about your P...  
1  An AI Agent that knows everything about your P...  
2  An AI Agent that knows everything about your P...  
3  An AI Agent that knows everything about your P...  
4  An AI Agent that knows everything about your P...  


## Step2.2: Cleaning Processed Chunks

In [None]:
import pandas as pd
import re
from nltk.tokenize import sent_tokenize

# Read the chunks written by the preprocessing step. The directory is spelled
# in lower case ("data") like everywhere else in the notebook: "../Data" only
# resolves on case-insensitive filesystems.
df = pd.read_csv("../data/processed_transcripts.csv")

def clean_text(text):
    """Strip greeting/filler phrases from an already normalized chunk.

    Whitespace runs are collapsed first, then each filler pattern is removed
    in turn (case-insensitively), and finally the gaps left behind are
    collapsed and the result is trimmed.

    Args:
        text: The chunk text (a string).

    Returns:
        The chunk with the filler phrases removed.
    """
    filler_patterns = (
        r"\b(hi|hey|hello|folks|how you doing|thank you|thanks|welcome|today|i am|i'm|this is .*?)\b",
        r"\b(thanky here|and i\b|let's talk about|so today|in this video)\b",
        r"\b(folk|id like|to you|to this session|chris)\b",
    )

    stripped = re.sub(r"\s+", " ", text)
    # Order matters: later patterns run on the output of the earlier ones.
    for filler in filler_patterns:
        stripped = re.compile(filler, re.IGNORECASE).sub("", stripped)

    # Removing a phrase leaves the spaces on both sides of it behind.
    return re.sub(r"\s{2,}", " ", stripped).strip()

def chunk_transcript_by_sentences(text, max_sentences=4):
    """Group the sentences of *text* into chunks of at most *max_sentences*.

    Args:
        text: Text to split with ``sent_tokenize``.
        max_sentences: Maximum number of sentences joined into one chunk.

    Returns:
        A list of strings, each holding up to *max_sentences* consecutive
        sentences joined by single spaces; empty when no sentence is found.
    """
    sentences = sent_tokenize(text)
    grouped = []
    for start in range(0, len(sentences), max_sentences):
        window = sentences[start:start + max_sentences]
        grouped.append(' '.join(window))
    return grouped

# Re-chunk every processed row: strip filler phrases, regroup by sentences,
# and carry the originating video_id/subject along with each new chunk.
all_chunks = []
all_video_ids = []
all_subjects = []

for _, row in df.iterrows():
    transcript = row["text"]
    video_id = row["video_id"]
    subject = row["subject"]
    if pd.isna(transcript):
        # Empty cells come back from the CSV as NaN; there is nothing to chunk.
        continue
    cleaned = clean_text(transcript)
    chunks = chunk_transcript_by_sentences(cleaned)
    all_chunks.extend(chunks)
    all_video_ids.extend([video_id] * len(chunks))
    all_subjects.extend([subject] * len(chunks))

# Chunk ids are renumbered globally (chunk_0, chunk_1, ...) across all videos.
output_df = pd.DataFrame({
    "chunk_id": [f"chunk_{i}" for i in range(len(all_chunks))],
    "video_id": all_video_ids,
    "subject": all_subjects,
    "text": all_chunks
})

# Written to the lower-case "data" directory, which is where the embedding
# step reads it from; "../Data" is a different directory on case-sensitive
# filesystems.
output_df.to_csv("../data/processed_cleaned_chunks.csv", index=False)
output_df.head()

Unnamed: 0,chunk_id,video_id,subject,text
0,chunk_0,1,An AI Agent that knows everything about your P...,and to you where i plan to show you the greate...
1,chunk_1,1,An AI Agent that knows everything about your P...,and issue and in those project youre going to ...
2,chunk_2,1,An AI Agent that knows everything about your P...,project the project status report any issue th...
3,chunk_3,1,An AI Agent that knows everything about your P...,be it would be crazy right it would take you f...
4,chunk_4,1,An AI Agent that knows everything about your P...,person and asking him this question and asking...


## Step3: Vector and Embedding

In [13]:
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

df = pd.read_csv("../data/processed_cleaned_chunks.csv")

# pandas reads empty cells back as NaN; a row without text cannot be embedded,
# so such rows are dropped before any Document is built.
df = df.dropna(subset=["text"])

# One Document per cleaned chunk. `subject` is kept in the metadata, as it is
# for the documents built by prepare_langchain_docs, so that search hits can be
# traced back to the video topic.
langchain_docs = [
    Document(
        page_content=row["text"],
        metadata={
            "chunk_id": row["chunk_id"],
            "video_id": row["video_id"],
            "subject": row["subject"],
        }
    )
    for _, row in df.iterrows()
]

# Human-readable preview of every chunk (first 100 characters) for inspection.
with open('../logs/chunk_previews.txt', 'w', encoding='utf-8') as f:
    for i, doc in enumerate(langchain_docs):
        preview = doc.page_content[:100].replace('\n', ' ')
        f.write(f"[{i+1}] Chunk ID: {doc.metadata['chunk_id']} | Video ID: {doc.metadata['video_id']} | Text: {preview}...\n")

embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Embed all chunks and build the FAISS index in one step.
faiss_index = FAISS.from_documents(langchain_docs, embedding_model)

# Persist the index into the faiss_store directory created earlier.
faiss_index.save_local(str(FAISS_DIR))


## Step4: Validation Function

In [None]:
def validate_data(df, df_processed):
    """Run sanity checks on the input rows and summarize chunk lengths.

    Args:
        df: DataFrame with ``video_id`` and ``youtube_link`` columns.
        df_processed: DataFrame of chunks with a ``text`` column.

    Returns:
        A dict with the ``avg_length``, ``min_length`` and ``max_length`` of
        the chunk texts, measured in characters. Duplicate
        video_id/youtube_link pairs and links that mention neither
        ``youtube.com`` nor ``youtu.be`` are only logged as warnings.
    """
    # keep=False flags every member of a duplicated group, not just the repeats.
    duplicate_mask = df.duplicated(subset=['video_id', 'youtube_link'], keep=False)
    duplicate_count = int(duplicate_mask.sum())
    if duplicate_count:
        logging.warning(f'Found {duplicate_count} duplicate video_id/youtube_link entries')

    # Missing links count as invalid (na=False makes them "no match").
    looks_like_youtube = df['youtube_link'].str.contains(r'youtube\.com|youtu\.be', na=False)
    invalid_count = int((~looks_like_youtube).sum())
    if invalid_count:
        logging.warning(f'Found {invalid_count} invalid YouTube links')

    lengths = df_processed['text'].str.len()
    chunk_stats = dict(
        avg_length=lengths.mean(),
        min_length=lengths.min(),
        max_length=lengths.max(),
    )
    logging.info(f'Chunk length stats: {chunk_stats}')
    return chunk_stats

# Validate the cleaned transcripts against the processed chunks and append
# the resulting length statistics to the existing validation report.
chunk_stats = validate_data(df_clean, df_processed)
stats_line = (
    f'Chunk Length Stats: Average={chunk_stats["avg_length"]:.1f}, '
    f'Min={chunk_stats["min_length"]}, Max={chunk_stats["max_length"]}\n'
)
with open(validation_report, 'a') as f:
    f.write(stats_line)

## Step5: Log Update

In [None]:
# Append a summary of this run to the project log. The counts are taken from
# the processed frame rather than typed in by hand, and the splitter settings
# and preview-file name match what the notebook actually uses
# (chunk_size=1000, chunk_overlap=150; logs/chunk_previews.txt).
with open('../logs/project_log.md', 'a', encoding='utf-8') as f:
    f.write('## Data Collection and Preprocessing\n')
    f.write('- Loaded YouTube video from `data/SNOW_YT_Videos.csv`.\n')
    f.write('- Loaded YouTube video metadata to `data/ServiceNow_Youtube_Metadata_Clean.csv`.\n')
    f.write('- Loaded YouTube video transcripts to `data/video_metadata_with_transcripts.csv`.\n')
    f.write('- Preprocessed transcripts with NLTK lemmatization and LangChain text splitting (chunk_size=1000, overlap=150).\n')
    f.write(f'- Processed {df_processed["video_id"].nunique()} videos, generating {len(df_processed)} chunks.\n')
    f.write('- Saved processed data to `data/processed_transcripts.csv`.\n')
    f.write('- Challenges: Resolved KeyError by standardizing column names (e.g., Number to video_id).\n')
    f.write('- Validation report saved to `logs/validation_report.txt`.\n')
    f.write('- Chunk Preview Data saved to `logs/chunk_previews.txt`.\n')
    f.write('- FAISS Store as faiss_store\n')
    # Give the statistics their own bullet so they do not read as part of the
    # FAISS entry above.
    f.write('- Chunk length stats:\n')
    f.write(f'  Average: {chunk_stats["avg_length"]:.1f}\n')
    f.write(f'  Minimum: {chunk_stats["min_length"]}\n')
    f.write(f'  Maximum: {chunk_stats["max_length"]}\n')