In [33]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
import time
import re
from spellchecker import SpellChecker
import spacy
from tqdm import tqdm
from torch.cuda.amp import autocast  # For mixed precision
from torch.utils.data import DataLoader, Dataset  # For parallel data loading

# Start total timer
start_time = time.time()

# Load spaCy model
nlp = spacy.load("en_core_web_sm")
spell = SpellChecker()

# Load pre-trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2').to('cuda')

# Clean question function
def clean_question(text):
    # Lowercase and remove punctuation
    text = text.lower().strip()
    text = re.sub(r'[^a-z0-9\s]', '', text)

    # Correct typos (optional)
    words = text.split()
    corrected_words = [spell.correction(word) if spell.correction(word) is not None else word for word in words]
    text = ' '.join(corrected_words)

    # Lemmatize and remove stopwords
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop]

    return ' '.join(tokens)

# Clean answer function
def clean_answer(text):
    # Trim whitespace
    text = text.strip()
    # Remove markdown/HTML tags (example)
    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    text = re.sub(r'\*\*', '', text)  # Remove bold markers
    return text

# Dataset class for parallel data loading
class TextDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        return self.texts[idx]

# Generate embeddings in batches
def generate_embeddings_batch(texts, batch_size=128):
    embeddings = []
    dataset = TextDataset(texts)
    dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=0, shuffle=False)  # Set num_workers=0
    
    for batch in tqdm(dataloader, desc="Generating embeddings"):
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to('cuda')
        with torch.no_grad(), autocast():  # Mixed precision
            outputs = model(**inputs)
        batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        embeddings.extend(batch_embeddings)
    return embeddings

# Load data
print("Loading data...")
D2 = pd.read_csv('data/FAQ Answering/Preprocessed data/D2.csv')

# Rename columns
D2 = D2.rename(columns={'query': 'question', 'finalpassage': 'answer'})

# Apply cleaning functions
print("Cleaning data...")
D2["cleaned_query"] = [clean_question(q) for q in tqdm(D2["question"], desc="Cleaning questions")]
D2["cleaned_answer"] = [clean_answer(a) for a in tqdm(D2["answer"], desc="Cleaning answers")]

# Generate embeddings for cleaned queries in batches
print("Generating embeddings...")
D2["embedding"] = generate_embeddings_batch(D2["cleaned_query"].tolist(), batch_size=128)

# Save embeddings to disk
print("Saving data...")
D2.to_csv("data/FAQ Answering/Preprocessed embedding/D2_emb.csv", index=False)

# End total timer
total_time = time.time() - start_time
print(f"Total execution time: {total_time:.2f} seconds")

Loading data...
Cleaning data...


Cleaning questions: 100%|██████████| 130560/130560 [3:54:12<00:00,  9.29it/s]  
Cleaning answers: 100%|██████████| 130560/130560 [00:00<00:00, 254262.92it/s]


Generating embeddings...


  with torch.no_grad(), autocast():  # Mixed precision
Generating embeddings: 100%|██████████| 1020/1020 [00:17<00:00, 57.17it/s]


Saving data...
Total execution time: 14349.12 seconds


In [32]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
import time
import re
from spellchecker import SpellChecker
import spacy
from tqdm import tqdm
from torch.cuda.amp import autocast  # For mixed precision
from torch.utils.data import DataLoader, Dataset  # For parallel data loading

# Start total timer
start_time = time.time()

# Load spaCy model
nlp = spacy.load("en_core_web_sm")
spell = SpellChecker()

# Load pre-trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2').to('cuda')

# Clean question function
def clean_question(text):
    # Lowercase and remove punctuation
    text = text.lower().strip()
    text = re.sub(r'[^a-z0-9\s]', '', text)

    # Correct typos (optional)
    words = text.split()
    corrected_words = [spell.correction(word) if spell.correction(word) is not None else word for word in words]
    text = ' '.join(corrected_words)

    # Lemmatize and remove stopwords
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop]

    return ' '.join(tokens)

# Clean answer function
def clean_answer(text):
    # Trim whitespace
    text = text.strip()
    # Remove markdown/HTML tags (example)
    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    text = re.sub(r'\*\*', '', text)  # Remove bold markers
    return text

# Dataset class for parallel data loading
class TextDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        return self.texts[idx]

# Generate embeddings in batches
def generate_embeddings_batch(texts, batch_size=128):
    embeddings = []
    dataset = TextDataset(texts)
    dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=0, shuffle=False)  # Set num_workers=0
    
    for batch in tqdm(dataloader, desc="Generating embeddings"):
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to('cuda')
        with torch.no_grad(), autocast():  # Mixed precision
            outputs = model(**inputs)
        batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        embeddings.extend(batch_embeddings)
    return embeddings

# Load data
print("Loading data...")
D3 = pd.read_csv('data/FAQ Answering/Preprocessed data/D3.csv')



# Apply cleaning functions
print("Cleaning data...")
D3["cleaned_query"] = [clean_question(q) for q in tqdm(D3["question"], desc="Cleaning questions")]
D3["cleaned_answer"] = [clean_answer(a) for a in tqdm(D3["answer"], desc="Cleaning answers")]

# Generate embeddings for cleaned queries in batches
print("Generating embeddings...")
D3["embedding"] = generate_embeddings_batch(D3["cleaned_query"].tolist(), batch_size=128)

# Save embeddings to disk
print("Saving data...")
D3.to_csv("data/FAQ Answering/Preprocessed embedding/D3_emb.csv", index=False)

# End total timer
total_time = time.time() - start_time
print(f"Total execution time: {total_time:.2f} seconds")

Loading data...
Cleaning data...


Cleaning questions: 100%|██████████| 414/414 [00:04<00:00, 98.82it/s] 
Cleaning answers: 100%|██████████| 414/414 [00:00<?, ?it/s]


Generating embeddings...


  with torch.no_grad(), autocast():  # Mixed precision
Generating embeddings: 100%|██████████| 4/4 [00:00<00:00, 11.50it/s]


Saving data...
Total execution time: 8.77 seconds


In [28]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
import time
import re
from spellchecker import SpellChecker
import spacy
from tqdm import tqdm
from torch.cuda.amp import autocast  # For mixed precision
from torch.utils.data import DataLoader, Dataset  # For parallel data loading

# Start total timer
start_time = time.time()

# Load spaCy model
nlp = spacy.load("en_core_web_sm")
spell = SpellChecker()

# Load pre-trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2').to('cuda')

# Clean question function
def clean_question(text):
    # Lowercase and remove punctuation
    text = text.lower().strip()
    text = re.sub(r'[^a-z0-9\s]', '', text)

    # Correct typos (optional)
    words = text.split()
    corrected_words = [spell.correction(word) if spell.correction(word) is not None else word for word in words]
    text = ' '.join(corrected_words)

    # Lemmatize and remove stopwords
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop]

    return ' '.join(tokens)

# Clean answer function
def clean_answer(text):
    # Trim whitespace
    text = text.strip()
    # Remove markdown/HTML tags (example)
    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    text = re.sub(r'\*\*', '', text)  # Remove bold markers
    return text

# Dataset class for parallel data loading
class TextDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        return self.texts[idx]

# Generate embeddings in batches
def generate_embeddings_batch(texts, batch_size=128):
    embeddings = []
    dataset = TextDataset(texts)
    dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=0, shuffle=False)  # Set num_workers=0
    
    for batch in tqdm(dataloader, desc="Generating embeddings"):
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to('cuda')
        with torch.no_grad(), autocast():  # Mixed precision
            outputs = model(**inputs)
        batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        embeddings.extend(batch_embeddings)
    return embeddings

# Load data
print("Loading data...")
D5 = pd.read_csv('data/FAQ Answering/Preprocessed data/D5.csv')

# Rename columns
D5 = D5.rename(columns={'query': 'question', 'finalpassage': 'answer'})

# Apply cleaning functions
print("Cleaning data...")
D5["cleaned_query"] = [clean_question(q) for q in tqdm(D5["question"], desc="Cleaning questions")]
D5["cleaned_answer"] = [clean_answer(a) for a in tqdm(D5["answer"], desc="Cleaning answers")]

# Generate embeddings for cleaned queries in batches
print("Generating embeddings...")
D5["embedding"] = generate_embeddings_batch(D5["cleaned_query"].tolist(), batch_size=128)

# Save embeddings to disk
print("Saving data...")
D5.to_csv("data/FAQ Answering/Preprocessed embedding/D5_emb.csv", index=False)

# End total timer
total_time = time.time() - start_time
print(f"Total execution time: {total_time:.2f} seconds")

Loading data...
Cleaning data...


Cleaning questions: 100%|██████████| 200/200 [00:03<00:00, 50.20it/s] 
Cleaning answers: 100%|██████████| 200/200 [00:00<?, ?it/s]


Generating embeddings...


  with torch.no_grad(), autocast():  # Mixed precision
Generating embeddings: 100%|██████████| 2/2 [00:00<00:00,  2.19it/s]


Saving data...
Total execution time: 7.22 seconds
