In [1]:
import pandas as pd
import spacy

In [2]:
# Load the questions table (the cells below read its 'title' and 'query' columns).
# A forward slash is used as the separator so the relative path resolves on
# Windows as well as on Linux/macOS; a literal backslash only works on Windows.
questions = pd.read_csv('data/questions.csv')

Get named entities for queries

In [3]:
# Run the spaCy pipeline over every question and gather its named entities.
nlp = spacy.load("en_core_web_sm")

result = []

for _, question in questions.iterrows():
    # Entities found in the title come first, followed by those in the body.
    entities = list(nlp(question['title']).ents)
    entities.extend(nlp(question['query']).ents)
    result.append(entities)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from collections import defaultdict

# Map each question's position in `result` to its list of entities.
query_entities = defaultdict(list)
query_entities.update(enumerate(result))

Noun chunks

In [6]:
# Gather the noun chunks spaCy finds in every question.
result = []

for _, question in questions.iterrows():
    # Chunks from the title come first, followed by those from the body.
    chunks = list(nlp(question['title']).noun_chunks)
    chunks.extend(nlp(question['query']).noun_chunks)
    result.append(chunks)

In [7]:
from collections import defaultdict

# Map each question's position in `result` to its list of noun chunks.
query_chunks = defaultdict(list)
query_chunks.update(enumerate(result))

Question summary

In [8]:
from transformers import BartForConditionalGeneration, BartTokenizer

# Load the pre-trained BART tokenizer and conditional-generation model.
# The same checkpoint name is passed to both loaders. `tokenizer` and `model`
# are notebook-level names used by the summarization loop in a later cell.
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

In [9]:
# Produce one BART summary per question body, in row order.
result = []

for _, question in questions.iterrows():
    # Encode the question body, truncating anything beyond 1024 tokens.
    encoded = tokenizer.encode(
        question['query'],
        return_tensors="pt",
        max_length=1024,
        truncation=True,
    )

    # Beam search (4 beams) constrained to a 50-100 token summary.
    generated = model.generate(
        encoded,
        max_length=100,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )

    # Only the first returned sequence is decoded and kept.
    result.append(tokenizer.decode(generated[0], skip_special_tokens=True))

In [10]:
# Index each generated summary by its position in `result`.
query_summaries = defaultdict(list)

# Filled with update() rather than a `for idx, sum in ...` loop: a loop variable
# named `sum` would shadow the built-in sum() for the rest of the kernel session.
query_summaries.update(enumerate(result))

In [11]:
import pandas as pd

# Assemble the per-question outputs into one table with a fixed column order.
columns = ['title', 'query', 'named_entities', 'summary', 'chunks']

# `columns=` expects a sequence of labels, so the list is passed directly.
processed_queries = pd.DataFrame(columns=columns)

# Assigning the first Series to the empty frame gives it the index of `questions`.
processed_queries['title'] = questions['title']
processed_queries['query'] = questions['query']

# The three mappings are keyed by row position (0..n-1). Wrapping them in a
# Series makes the key -> index-label alignment explicit instead of relying on
# how pandas treats a raw dict assigned as a column; dtype=object keeps the
# list/string values untouched and is well-defined for an empty mapping.
# NOTE(review): this assumes `questions` has the default 0..n-1 index -- confirm
# if the CSV is ever loaded with `index_col` or filtered before this cell.
processed_queries['named_entities'] = pd.Series(query_entities, dtype=object)
processed_queries['summary'] = pd.Series(query_summaries, dtype=object)
processed_queries['chunks'] = pd.Series(query_chunks, dtype=object)

In [12]:
# Write the combined table without the index column. A forward slash is used as
# the separator so the relative path resolves on Windows as well as on
# Linux/macOS; a literal backslash only works on Windows.
processed_queries.to_csv('data/processed_queries.csv', index=False)