In [3]:
import json
from tqdm import tqdm

def process_document(doc, qrels, query_dict):
    doc_id = doc['doc_id']
    doc_text = doc['text']
    related_query = next((entry['query_id'] for entry in qrels if entry['doc_id'] == doc_id and entry.get('relevance', 0) >= 1), None)
    
    if related_query and related_query in query_dict:

        combined_text =   query_dict[related_query] + " " + doc_text
    else:

        combined_text = doc_text
    
    return {"doc_id": doc_id, "text": combined_text}

def merge_queries_with_documents(qrels_path, queries_path, docs_path, output_path):

    with open(qrels_path, 'r', encoding='utf-8') as file:
        qrels = json.load(file)
    

    with open(queries_path, 'r', encoding='utf-8') as file:
        queries = json.load(file)

    query_dict = {item['query_id']: item['text'] for item in queries}
    

    with open(docs_path, 'r', encoding='utf-8') as file:
        docs = json.load(file)
    

    combined_texts = []
    for doc in tqdm(docs, desc="Processing documents"):
        processed_doc = process_document(doc, qrels, query_dict)
        combined_texts.append(processed_doc)
    

    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(combined_texts, file, ensure_ascii=False, indent=4)


qrels_path = './DataSets/Quora/JSON/quora-test-qrels.json'
queries_path = './DataSets/Quora/JSON/quora-test-queries.json'
docs_path = './DataSets/Quora/JSON/quora-test-docs.json'
output_path = './DataSets/Quora/JSON/quora-test-docs_combined.json'
merge_queries_with_documents(qrels_path, queries_path, docs_path, output_path)

Processing documents:  96%|█████████▌| 501806/522931 [08:25<00:22, 946.74it/s] 

In [4]:
import json
def json_to_text(input_json_path, output_text_path):

    with open(input_json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    

    with open(output_text_path, 'w', encoding='utf-8') as outfile:
        for item in data:

            outfile.write(item['text'] + '\n')

input_json_path = './DataSets/Quora/quora-test-docs_combined.json'
output_text_path = "./DataSets/Quora/quora-test-docs_combined.txt"
json_to_text(input_json_path, output_text_path)

In [1]:
from Services.TextProcessor import TextProcessor
from tqdm import tqdm

def process_file(input_file_path, output_file_path):
    with open(input_file_path, 'r', encoding='utf-8') as infile:
        lines = infile.readlines()

    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        for line in tqdm(lines, desc="Processing lines"):
                cleaned_text = TextProcessor.process_text_for_embedding(line)
                outfile.write(' '.join(cleaned_text) + '\n')

process_file("./DataSets/Quora/quora-test-docs_combined.txt", "./DataSets/Quora/Quora_processed.txt")

Processing lines: 100%|██████████| 522931/522931 [05:21<00:00, 1625.51it/s]


In [2]:
import fasttext

model = fasttext.train_unsupervised("./DataSets/Quora/quora-test-docs_processed.txt", model='skipgram', thread =16,dim=600)
model.save_model('./Models/quora_model.bin')