In [1]:
import logging
import re
from typing import List, Tuple
from rank_bm25 import BM25Okapi
import tqdm
import nltk
import os
import csv 
import json 

# Download the NLTK "punkt" data package (a no-op message is printed when it is already present).
# NOTE(review): no nltk tokenizer is called in the code visible here (tokenize_text uses str.split) —
# confirm this download is still needed.
nltk.download('punkt')

def generate_result_file(k, bm25_ranking_path, view_results,random_split_train_file):
    """Group a BM25 ranking file per question, dump it as JSON and compute the recall.

    Args:
        k: Cut-off used when the ranking was produced. It is only echoed into
            the output objects and the progress message.
        bm25_ranking_path: TSV file with ``qid<TAB>table_id<TAB>score`` rows.
            A new output object is started every time the qid changes.
        view_results: Path of the output file. One indented JSON object per
            question is written, each followed by a newline.
        random_split_train_file: TSV file passed to ``load_question_index``.

    Returns:
        Tuple ``(view_results, recall)``. ``recall`` is the fraction of
        questions whose gold table id appears among their ranked tables, or
        ``0.0`` when the ranking file contains no rows.

    Raises:
        KeyError: if ``"nu-" + qid`` is not present in the question index.
        ValueError: if a ranking row does not have exactly three columns.
    """
    question_index = load_question_index(random_split_train_file)

    results = []
    count_retrieved_gold_table = 0
    current_qid = None
    current_object = None
    gold_table_id = None
    gold_already_counted = False
    rank = 1

    with open(bm25_ranking_path, 'r') as file:
        print(f"Calculating results for Recall@{k}")
        for row in csv.reader(file, delimiter="\t"):
            qid, tid, retrieval_score = row

            if qid != current_qid:
                # Resolve the question metadata once per question instead of
                # once per ranking row: find_key_in_contexts re-reads its JSON
                # mapping on every call.
                question, gold_table_path, answer = question_index["nu-" + qid]
                gold_table_id = find_key_in_contexts(gold_table_path)
                gold_already_counted = False
                rank = 1
                current_qid = qid
                current_object = {
                    "question-id": qid,
                    "question": question,
                    "answer": answer,
                    "Gold Table": str(gold_table_id),
                    "k": k,
                    "retrieved tables": []
                }
                # Appending on creation guarantees the last question is
                # included in both the output and the recall denominator.
                results.append(current_object)

            # A question counts as a hit at most once, even if the same table
            # id were listed twice for it.
            if gold_table_id == tid and not gold_already_counted:
                gold_already_counted = True
                count_retrieved_gold_table += 1

            current_object["retrieved tables"].append({
                "rank": rank,
                "RetScore": retrieval_score,
                "TableID": tid,
            })
            rank += 1

    # Every question seen in the ranking is in `results`, so its length is the
    # correct denominator; guard against an empty ranking file.
    total_queries = len(results)
    recall = count_retrieved_gold_table / total_queries if total_queries else 0.0

    # Write results to the JSON file (one indented object per question)
    with open(view_results, 'w') as out_file:
        for result in results:
            json.dump(result, out_file, indent=4)
            out_file.write("\n")
    return view_results, recall

def find_key_in_contexts(search_value, file_path="/scratch/asing725/IP/Multi-Table-RAG/TableRAG/src/ColBERT/data/tableId_to_path.json"):
    """Return the first key whose value matches ``search_value`` in a JSON mapping file.

    The file is expected to hold a JSON list of objects; each object's
    ``key -> value`` pairs are scanned in order and values are compared after
    converting both sides to ``str``.

    Args:
        search_value: Value to look for (compared as a string).
        file_path: JSON file to search. Defaults to the table-id-to-path
            mapping used by this notebook, so existing one-argument calls
            keep working.

    Returns:
        The matching key, or ``None`` when nothing matches or the file cannot
        be read or parsed (an error message is printed in that case).
    """
    wanted = str(search_value)
    try:
        with open(file_path, 'r') as file:
            contexts = json.load(file)
        for entry in contexts:
            for key, value in entry.items():
                if str(value) == wanted:
                    return key
        return None
    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found.")
        return None
    except json.JSONDecodeError:
        print(f"Error: The file {file_path} contains invalid JSON.")
        return None
    except Exception as e:
        # Deliberate best-effort: any other problem (for example an entry that
        # is not an object) is reported and treated as "not found".
        print(f"An unexpected error occurred: {e}")
        return None

def load_question_index(file_path):
    """Load a four-column TSV file into a dict keyed by its first column.

    Each row must unpack into ``qid, question, gold_table_path, answer``; the
    result maps ``qid`` to the list ``[question, gold_table_path, answer]``.
    A row with a different number of columns raises ``ValueError``.
    """
    with open(file_path, 'r') as handle:
        return {
            qid: [question, gold_table_path, answer]
            for qid, question, gold_table_path, answer in csv.reader(handle, delimiter="\t")
        }


# Define text processing functions
def format_text(text):
    """Normalise ``text`` for lexical matching.

    The text is lower-cased and trimmed; the placeholders ``"n/a"`` and
    ``"?"`` become the empty string. Every run of non-word characters and
    every underscore is turned into a space, and whitespace is collapsed to
    single spaces.
    """
    lowered = text.lower().strip()
    if lowered in ("n/a", "?"):
        return ""
    spaced = re.sub(r"[^\w\d]+", " ", lowered)
    spaced = spaced.replace("_", " ")
    # split()/join collapses whitespace runs and drops leading/trailing blanks
    return " ".join(spaced.split())

def tokenize_text(text):
    """Normalise ``text`` with ``format_text`` and split the result on whitespace."""
    normalized = format_text(text)
    tokens = normalized.split()
    return tokens

# Read tables from file
def read_tables(file_path) -> List[Tuple[str, str]]:
    """Read ``(table_id, table_text)`` pairs from a tab-separated UTF-8 file.

    Each line is stripped and split on tabs; lines with fewer than two fields
    are skipped and any fields after the second are ignored.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        split_lines = (raw.strip().split('\t') for raw in handle)
        return [(fields[0], fields[1]) for fields in split_lines if len(fields) >= 2]

# Read questions from file
def read_questions(file_path) -> List[Tuple[str, str]]:
    """Read ``(question_id, question_text)`` pairs from a tab-separated UTF-8 file.

    Lines that do not contain at least two tab-separated fields after
    stripping are skipped; fields beyond the second are ignored.
    """
    pairs = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw in handle:
            fields = raw.strip().split('\t')
            if len(fields) >= 2:
                pairs.append(tuple(fields[:2]))
    return pairs

# Build BM25 index
def build_bm25_index(tables: List[Tuple[str, str]]):
    """Build a ``BM25Okapi`` index over the tokenised table texts.

    Returns ``(bm25_index, table_ids)``; ``table_ids[i]`` is the id of the
    i-th document in the index.
    """
    table_ids = []
    corpus = []
    for entry in tables:
        table_ids.append(entry[0])
        corpus.append(tokenize_text(entry[1]))
    return BM25Okapi(corpus), table_ids

# Retrieve top k tables for a query
def retrieve_top_tables(bm25_index, table_ids, query: str, k: int = 5):
    """Return the ``k`` best ``(table_id, score)`` pairs for ``query``.

    The query is tokenised with ``tokenize_text``, scored against every
    document of ``bm25_index`` and sorted by descending score; ties keep
    their original index order.
    """
    scores = bm25_index.get_scores(tokenize_text(query))
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    return [(table_ids[position], score) for position, score in ranked[:k]]

# Main function to process files and generate output
def rankTableswithBM25(k):
    """Rank every table against every question with BM25 and save the top ``k`` hits.

    Reads the linearized tables and the question index from fixed locations,
    builds a BM25 index, and writes one ``question_id<TAB>table_id<TAB>score``
    line per retrieved table to ``ranking_bm25_k_{k}.tsv``.

    Returns:
        Path of the ranking file that was written.
    """
    logging.basicConfig(level=logging.INFO)

    # Fixed input/output locations
    full_table_index_file_without_token= "/scratch/asing725/IP/Multi-Table-RAG/TableRAG/src/ColBERT/data/wtq_full_table_linearized.tsv"
    questions_file = '/scratch/asing725/IP/Multi-Table-RAG/TableRAG/src/ColBERT/data/wtq_question_index.tsv'
    ranking_output_file = f'/scratch/asing725/IP/Multi-Table-RAG/TableRAG/src/ColBERT/data/ranking_bm25_k_{k}.tsv'
    os.makedirs(os.path.dirname(ranking_output_file), exist_ok=True)

    # Load the corpus and the queries
    tables = read_tables(full_table_index_file_without_token)
    questions = read_questions(questions_file)

    logging.info("Building BM25 index...")
    bm25_index, table_ids = build_bm25_index(tables)

    # k is the number of top tables kept per query
    logging.info("Retrieving top tables for each query...")
    with open(ranking_output_file, 'w', encoding='utf-8') as out_f:
        progress = tqdm.tqdm(questions, desc="Processing queries")
        for question_id, query in progress:
            out_f.writelines(
                f"{question_id}\t{table_id}\t{score}\n"
                for table_id, score in retrieve_top_tables(bm25_index, table_ids, query, k)
            )

    logging.info(f"Results written to {ranking_output_file}")
    return ranking_output_file

def main():
    """Produce BM25 rankings and Recall@k reports for a fixed list of cut-offs.

    For every ``k`` the ranking TSV is generated with ``rankTableswithBM25``,
    grouped into a JSON result file with ``generate_result_file``, and the
    recall is printed rounded to two decimals.
    """
    # Loop-invariant: the question file is the same for every k.
    random_split_train_file = "/scratch/asing725/IP/Multi-Table-RAG/WikiTableQuestions/data/pristine-unseen-tables.tsv"
    k_list= [1,5,10,30,50,100]
    for k in k_list:
        print(k)
        bm25_ranking_file = rankTableswithBM25(k)
        # The question index is loaded inside generate_result_file, so it is
        # not parsed a second time here.
        store_json_path = f"/scratch/asing725/IP/Multi-Table-RAG/TableRAG/src/ColBERT/data/retrievalResult_full_table_BM25_k_{k}.json"
        view_results, recall = generate_result_file(k,bm25_ranking_file,store_json_path,random_split_train_file)
        print(f"Results are saved in {view_results} and the Recall@{k} is {round(recall,2)}")

if __name__ == "__main__":
    main()
# To generate BM25 rankings for other cut-offs, edit k_list in main(); the output file names are derived from k.

[nltk_data] Downloading package punkt to /home/asing725/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


1


INFO:root:Building BM25 index...
INFO:root:Retrieving top tables for each query...
Processing queries: 100%|██████████| 4344/4344 [00:15<00:00, 285.76it/s]
INFO:root:Results written to /scratch/asing725/IP/Multi-Table-RAG/TableRAG/src/ColBERT/data/ranking_bm25_k_1.tsv


Calculating results for Recall@1


INFO:root:Building BM25 index...


Results are saved in /scratch/asing725/IP/Multi-Table-RAG/TableRAG/src/ColBERT/data/retrievalResult_full_table_BM25_k_1.json and the Recall@1 is 0.24
5


INFO:root:Retrieving top tables for each query...
Processing queries: 100%|██████████| 4344/4344 [00:15<00:00, 284.54it/s]
INFO:root:Results written to /scratch/asing725/IP/Multi-Table-RAG/TableRAG/src/ColBERT/data/ranking_bm25_k_5.tsv


Calculating results for Recall@5


INFO:root:Building BM25 index...


Results are saved in /scratch/asing725/IP/Multi-Table-RAG/TableRAG/src/ColBERT/data/retrievalResult_full_table_BM25_k_5.json and the Recall@5 is 0.36
10


INFO:root:Retrieving top tables for each query...
Processing queries: 100%|██████████| 4344/4344 [00:15<00:00, 285.78it/s]
INFO:root:Results written to /scratch/asing725/IP/Multi-Table-RAG/TableRAG/src/ColBERT/data/ranking_bm25_k_10.tsv


Calculating results for Recall@10


INFO:root:Building BM25 index...


Results are saved in /scratch/asing725/IP/Multi-Table-RAG/TableRAG/src/ColBERT/data/retrievalResult_full_table_BM25_k_10.json and the Recall@10 is 0.41
30


INFO:root:Retrieving top tables for each query...
Processing queries: 100%|██████████| 4344/4344 [00:15<00:00, 285.19it/s]
INFO:root:Results written to /scratch/asing725/IP/Multi-Table-RAG/TableRAG/src/ColBERT/data/ranking_bm25_k_30.tsv


Calculating results for Recall@30


INFO:root:Building BM25 index...


Results are saved in /scratch/asing725/IP/Multi-Table-RAG/TableRAG/src/ColBERT/data/retrievalResult_full_table_BM25_k_30.json and the Recall@30 is 0.51
50


INFO:root:Retrieving top tables for each query...
Processing queries: 100%|██████████| 4344/4344 [00:16<00:00, 270.15it/s]
INFO:root:Results written to /scratch/asing725/IP/Multi-Table-RAG/TableRAG/src/ColBERT/data/ranking_bm25_k_50.tsv


Calculating results for Recall@50


INFO:root:Building BM25 index...


Results are saved in /scratch/asing725/IP/Multi-Table-RAG/TableRAG/src/ColBERT/data/retrievalResult_full_table_BM25_k_50.json and the Recall@50 is 0.57
100


INFO:root:Retrieving top tables for each query...
Processing queries: 100%|██████████| 4344/4344 [00:15<00:00, 276.71it/s]
INFO:root:Results written to /scratch/asing725/IP/Multi-Table-RAG/TableRAG/src/ColBERT/data/ranking_bm25_k_100.tsv


Calculating results for Recall@100
Results are saved in /scratch/asing725/IP/Multi-Table-RAG/TableRAG/src/ColBERT/data/retrievalResult_full_table_BM25_k_100.json and the Recall@100 is 0.65


In [11]:
import os
import google.generativeai as genai

# Read the Gemini API key from the environment instead of committing it to the notebook.
# NOTE: the key that was previously hard-coded in this cell should be treated as leaked and revoked.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running this cell.")
genai.configure(api_key=api_key)

# Sampling and output settings passed to the model
generation_config = {
  "temperature": 1,
  "top_p": 0.95,
  "top_k": 40,
  "max_output_tokens": 8192,
  "response_mime_type": "text/plain",
}

# NOTE(review): the recorded output of this cell is an AttributeError saying the installed
# google.generativeai module has no GenerativeModel attribute — confirm the installed
# package version provides it.
model = genai.GenerativeModel(model_name="gemini-1.5-pro",generation_config=generation_config)

# Smoke test: open an empty chat, send a greeting and show the reply.
chat_session = model.start_chat(history=[])
response = chat_session.send_message("Hi")

print(response.text)

# prompt = "You are an advanced AI specialized in analyzing and summarizing structured data within tables. Your task is to generate a meaningful title and description for the provided table. The title should be accurately reflecting the table's primary content. The description should be detailed, summarizing the context and covering key information that can answer potential questions derived from the table's data.  Follow these guidelines:  1. Address the overall purpose and key trends in the description.  2. Include any notable patterns, relationships, or gaps in the data.  3. Ensure the description is comprehensive and avoids excessive repetition.4. Description must Include full form of all abbreviations used in the table. Here is the linearized table: "



# alt = """You are an advanced AI specialized in analyzing and summarizing structured data within tables. Your task is to generate a meaningful title and description for the provided table. The title should be concise and clearly reflect the table's primary content, while the description should summarize the context and provide insights into the key data. The description must be tailored for query ranking, meaning it should help identify the table's relevance to possible questions. Guidelines: 
# 1. Table Title:    
#     a.Provide a concise, clear title that reflects the content and context of the table.   
#     b.Be mindful of all shorthand terms in the table and expand them in the title and description. 

# 2.Table Description: 
#     Summarize the main purpose of the table, the key columns, and the types of data it contains.   
#     a. Must include abbreviations or terms that might be unclear, expand them in the description.   
#     b. Identify and explain any patterns or notable trends. For example, if certain data points frequently appear together or if there are gaps, mention them.   
#     c. Point out the significance of key relationships in the data (e.g., correlations between columns or rows).   
#     d. If there are columns with categorical data (e.g., 'Region', 'Category'), mention their distribution and any specific categories of interest.  
#     e. Ensure the description answers potential questions about the data, such as 'What key information is this table providing?' or 'How can this table be used in answering specific questions?'"""

# response = model.generate_content("query Here")
# print(response.text)
# # 

AttributeError: module 'google.generativeai' has no attribute 'GenerativeModel'

In [None]:
import csv
def linearize_table(file_path):
    """Read a CSV file and return all of its rows concatenated into one string.

    Every row is rendered with ``str()`` of its list of cells (for example
    ``"['Year', 'Title']"``) and the rows are joined without a separator.
    An empty file yields ``""``.
    """
    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted cells are parsed as part of the cell.
    with open(file_path, mode='r', newline='', encoding='utf-8') as file:
        # join() avoids rebuilding the growing string once per row
        return "".join(str(row) for row in csv.reader(file))
    
# Linearize one table file from the 204-csv directory and keep the result in `table_string`.
table_string  = linearize_table("/scratch/asing725/IP/Multi-Table-RAG/WikiTableQuestions/csv/204-csv/1.csv")


In [5]:
# Display the current value of `table_string`.
table_string

''

In [3]:
import csv
import os
def linearize_table(file_path):
    """Read a CSV file and return all of its rows concatenated into one string.

    Every row is rendered with ``str()`` of its list of cells and the rows
    are joined without a separator. An empty file yields ``""``.
    """
    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted cells are parsed as part of the cell.
    with open(file_path, mode='r', newline='', encoding='utf-8') as file:
        # join() avoids rebuilding the growing string once per row
        return "".join(str(row) for row in csv.reader(file))

def getTable(id_counter = 0, csv_root = "/scratch/asing725/IP/Multi-Table-RAG/WikiTableQuestions/csv"):
    """Linearize every CSV table under ``200-csv`` … ``204-csv`` into a dict.

    Args:
        id_counter: Integer id given to the first table; each following table
            gets the next integer.
        csv_root: Directory containing the ``20{i}-csv`` sub-directories.
            Defaults to the location used by this notebook.

    Returns:
        Dict mapping integer table id to the string produced by
        ``linearize_table``.

    Note:
        Files are visited in ``sorted()`` order of their names, which is a
        plain string sort, so ``"10.csv"`` comes before ``"2.csv"``.
    """
    all_table_dict = {}
    for i in range(5):
        print(f"##########20{i}-csv###########")
        directory = os.path.join(csv_root, f"20{i}-csv")
        all_tables = sorted(f for f in os.listdir(directory) if f.endswith('.csv'))

        for table in all_tables:
            table_path = os.path.join(directory, str(table))
            all_table_dict[id_counter] = linearize_table(table_path)
            id_counter += 1

    return all_table_dict

# Build the table dictionary, then preview at most its first six entries.
all_table_dict = getTable()
i = 0
for i, (key, value) in enumerate(all_table_dict.items(), start=1):
    if i > 6:
        break
    print("Key = ",key," , Table:",value)
    
    

##########200-csv###########
##########201-csv###########
##########202-csv###########
##########203-csv###########
##########204-csv###########
Key =  0  , Table: ['Year', 'Title', 'Chart-PositionsUK', 'Chart-PositionsUS', 'Chart-PositionsNL', 'Comments']['1969', 'Renaissance', '60', '–', '10', '']['1971', 'Illusion', '–', '–', '–', '1976 (UK)']['1972', 'Prologue', '–', '–', '–', '']['1973', 'Ashes Are Burning', '–', '171', '–', '']['1974', 'Turn of the Cards', '–', '94', '–', '1975 (UK)']['1975', 'Scheherazade and Other Stories', '–', '48', '–', '']['1977', 'Novella', '–', '46', '–', '1977 (January in US, August in UK, as the band moved to the Warner Bros Music Group)']['1978', 'A Song for All Seasons', '35', '58', '–', 'UK:Silver']['1979', "Azure d'Or", '73', '125', '–', '']['1981', 'Camera Camera', '–', '196', '–', '']['1983', 'Time-Line', '–', '207', '–', '']['2001', 'Tuscany', '–', '–', '–', '']['2013', 'Grandine il Vento', '–', '–', '–', '']
Key =  1  , Table: ['Year', 'Title', 

In [4]:
# Display the whole dictionary.
# NOTE(review): this renders every table in the cell output — consider showing only a few entries.
all_table_dict

{0: '[\'Year\', \'Title\', \'Chart-PositionsUK\', \'Chart-PositionsUS\', \'Chart-PositionsNL\', \'Comments\'][\'1969\', \'Renaissance\', \'60\', \'–\', \'10\', \'\'][\'1971\', \'Illusion\', \'–\', \'–\', \'–\', \'1976 (UK)\'][\'1972\', \'Prologue\', \'–\', \'–\', \'–\', \'\'][\'1973\', \'Ashes Are Burning\', \'–\', \'171\', \'–\', \'\'][\'1974\', \'Turn of the Cards\', \'–\', \'94\', \'–\', \'1975 (UK)\'][\'1975\', \'Scheherazade and Other Stories\', \'–\', \'48\', \'–\', \'\'][\'1977\', \'Novella\', \'–\', \'46\', \'–\', \'1977 (January in US, August in UK, as the band moved to the Warner Bros Music Group)\'][\'1978\', \'A Song for All Seasons\', \'35\', \'58\', \'–\', \'UK:Silver\'][\'1979\', "Azure d\'Or", \'73\', \'125\', \'–\', \'\'][\'1981\', \'Camera Camera\', \'–\', \'196\', \'–\', \'\'][\'1983\', \'Time-Line\', \'–\', \'207\', \'–\', \'\'][\'2001\', \'Tuscany\', \'–\', \'–\', \'–\', \'\'][\'2013\', \'Grandine il Vento\', \'–\', \'–\', \'–\', \'\']',
 1: '[\'Year\', \'Title\', 

In [5]:
# Number of tables that were linearized
print(len(all_table_dict))

2108


In [6]:
import csv 
def load_table_index(file_path):
    """Load a two-column TSV file into a dict mapping table id to table text.

    Every row must unpack into exactly ``table_id, full_table``; otherwise a
    ``ValueError`` is raised. Later rows overwrite earlier ones with the same id.
    """
    with open(file_path, 'r') as handle:
        return {
            table_id: full_table
            for table_id, full_table in csv.reader(handle, delimiter="\t")
        }

# Load the full-table index (table id -> table text) that the later cells iterate over.
table_index=load_table_index("/scratch/asing725/IP/Multi-Table-RAG/TableRAG_mine/src/ColBERT/data/wtq_full_table_index.tsv")

In [7]:
# Show the entry stored under the string key '0' as a quick look at the table format.
print(table_index['0'])

<SOT> [table title] <EOT> <BOC> Year <SOC> Title <SOC> Chart-PositionsUK <SOC> Chart-PositionsUS <SOC> Chart-PositionsNL <SOC> Comments <EOC> <BOR>1969 <SOR> Renaissance <SOR> 60 <SOR> – <SOR> 10 <SOR>  <EOR> <BOR>1971 <SOR> Illusion <SOR> – <SOR> – <SOR> – <SOR> 1976 (UK) <EOR> <BOR>1972 <SOR> Prologue <SOR> – <SOR> – <SOR> – <SOR>  <EOR> <BOR>1973 <SOR> Ashes Are Burning <SOR> – <SOR> 171 <SOR> – <SOR>  <EOR> <BOR>1974 <SOR> Turn of the Cards <SOR> – <SOR> 94 <SOR> – <SOR> 1975 (UK) <EOR> <BOR>1975 <SOR> Scheherazade and Other Stories <SOR> – <SOR> 48 <SOR> – <SOR>  <EOR> <BOR>1977 <SOR> Novella <SOR> – <SOR> 46 <SOR> – <SOR> 1977 (January in US, August in UK, as the band moved to the Warner Bros Music Group) <EOR> <BOR>1978 <SOR> A Song for All Seasons <SOR> 35 <SOR> 58 <SOR> – <SOR> UK:Silver <EOR> <BOR>1979 <SOR> Azure d'Or <SOR> 73 <SOR> 125 <SOR> – <SOR>  <EOR> <BOR>1981 <SOR> Camera Camera <SOR> – <SOR> 196 <SOR> – <SOR>  <EOR> <BOR>1983 <SOR> Time-Line <SOR> – <SOR> 207 <SOR> 

In [None]:
import google.generativeai as genai
import time
import os

# Read the Gemini API key from the environment instead of committing it to the notebook.
# NOTE: the key that was previously hard-coded in this cell should be treated as leaked and revoked.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running this cell.")
genai.configure(api_key=api_key)

# Create the model
generation_config = {
  "temperature": 1,
  "top_p": 0.95,
  "top_k": 40,
  "max_output_tokens": 8192,
  "response_mime_type": "text/plain",
}
model = genai.GenerativeModel(
  model_name="gemini-1.5-flash",
  generation_config=generation_config,
)

# Tables whose id is at or below this value are skipped, so only later ids are sent to the model.
LAST_SKIPPED_TABLE_ID = 1621
# Pause before each request, in seconds.
REQUEST_DELAY_SECONDS = 5
# Summaries are appended to this file, one "[id:text]" entry per table.
SUMMARY_PATH = "/scratch/asing725/IP/Multi-Table-RAG/TableRAG_mine/src/ColBERT/data/table_summary.txt"
# Instruction text; the table is appended to it for every request.
SUMMARY_PROMPT = """You are an advanced AI specialized in analyzing and summarizing structured data within tables. Your task is to generate a meaningful title and description for the provided table. The title should be concise and clearly reflect the table's primary content, while the description should summarize the context and provide insights into the key data. The description must be tailored for query ranking, meaning it should help identify the table's relevance to possible questions. Guidelines: 
    1. Table Title:    
    a.Provide a concise, clear title that reflects the content and context of the table.   
    b.Be mindful of all shorthand terms in the table and expand them in the title and description. 
    2.Table Description: 
    Summarize the main purpose of the table, the key columns, and the types of data it contains.   
    a. Must include abbreviations or terms that might be unclear, expand them in the description.   
    b. Identify and explain any patterns or notable trends. For example, if certain data points frequently appear together or if there are gaps, mention them.   
    c. Point out the significance of key relationships in the data (e.g., correlations between columns or rows).   
    d. If there are columns with categorical data (e.g., 'Region', 'Category'), mention their distribution and any specific categories of interest.  
    e. Ensure the description answers potential questions about the data, such as 'What key information is this table providing?' or 'How can this table be used in answering specific questions?'
    Here is the Table: """

flag = 0  # number of tables summarized in this run
print("Starting...")
for table_id, table in table_index.items():
    print(table_id)
    if int(table_id) <= LAST_SKIPPED_TABLE_ID:
        continue
    print(table_id, table)
    time.sleep(REQUEST_DELAY_SECONDS)
    prompt = SUMMARY_PROMPT + table

    # Start a fresh chat for every table: a shared session keeps all earlier
    # tables and answers in its history, so each request would carry the
    # whole previous conversation along with the new table.
    chat_session = model.start_chat(history=[])
    response = chat_session.send_message(prompt)
    print(table_id)

    with open(SUMMARY_PATH, "a") as file:
        # Entries are comma-separated; only table '0' is written without a leading comma.
        if table_id != '0':
            file.write(",")
        file.write("[" + table_id + ":" + str(response.text) + "]")
        file.write("\n")

    flag = flag + 1


Starting...
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
27

1574
1575
1575 <SOT> [table title] <EOT> <BOC> World Record <SOC> Snatch <SOC> Akakios Kakiasvilis (GRE) <SOC> 188 kg <SOC> Athens, Greece <SOC> 27 November 1999 <EOC> <BOR>World Record <SOR> Clean & Jerk <SOR> Szymon Kołecki (POL) <SOR> 232 kg <SOR> Sofia, Bulgaria <SOR> 29 April 2000 <EOR> <BOR>World Record <SOR> Total <SOR> Akakios Kakiasvilis (GRE) <SOR> 412 kg <SOR> Athens, Greece <SOR> 27 November 1999 <EOR> <BOR>Asian Record <SOR> Snatch <SOR> Kourosh Bagheri (IRI) <SOR> 187 kg <SOR> Sydney, Australia <SOR> 24 September 2000 <EOR> <BOR>Asian Record <SOR> Clean & Jerk <SOR> Ilya Ilyin (KAZ) <SOR> 226 kg <SOR> Doha, Qatar <SOR> 5 December 2006 <EOR> <BOR>Asian Record <SOR> Total <SOR> Kourosh Bagheri (IRI) <SOR> 407 kg <SOR> Antalya, Turkey <SOR> 9 November 2001 <EOR> <BOR>Games Record <SOR> Snatch <SOR> Bakhyt Akhmetov (KAZ) <SOR> 185 kg <SOR> Busan, South Korea <SOR> 8 October 2002 <EOR> <BOR>Games Record <SOR> Clean & Jerk <SOR> Ilya Ilyin (KAZ) <SOR> 226 kg <SOR> Doha, Qatar <

In [20]:
import re

def remove_tokens_in_question_index(input_file: str, output_file: str):
    """
    Strips the special tokens from every table in the input file and writes the result to an output file.

    Each input line is split at its first tab into a table id and the table
    text. Anything of the form ``<...>`` and every ASCII hyphen is deleted
    from the table text, whitespace runs are collapsed to single spaces, and
    ``table_id<TAB>cleaned_text`` is written. Lines without a tab are skipped.

    Args:
        input_file (str): Path to the input tsv file with table IDs and tables.
        output_file (str): Path to the output tsv file with linearized tables.
    """
    # Angle-bracket tokens such as <SOT> or <BOR>, plus ASCII hyphens.
    # The previous pattern ended in a stray "|" — an empty alternative that
    # matches the empty string at every position and removes nothing.
    # NOTE(review): deleting "-" also joins hyphenated words ("Time-Line" -> "TimeLine")
    # and drops minus signs — confirm this is intended.
    token_pattern = re.compile(r"<[^>]+>|-")
    whitespace_pattern = re.compile(r"\s+")

    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            parts = line.strip().split('\t', 1)  # Split line into table_id and table
            if len(parts) != 2:
                continue

            table_id, table = parts
            # Remove unwanted tokens
            linear_table = token_pattern.sub("", table).strip()
            # Remove extra spaces
            linear_table = whitespace_pattern.sub(" ", linear_table)

            # Write the processed line to the output file
            outfile.write(f"{table_id}\t{linear_table}\n")

# Entry point: strip the special tokens from the full-table index and write the linearized copy.
if __name__ == "__main__":
    # Fixed locations of the tokenised table index (input) and the linearized tables (output).
    input_file = "/scratch/asing725/IP/Multi-Table-RAG/TableRAG/src/ColBERT/data/wtq_full_table_index.tsv"
    output_file = "/scratch/asing725/IP/Multi-Table-RAG/TableRAG/src/ColBERT/data/wtq_full_table_linearized.tsv"

    remove_tokens_in_question_index(input_file, output_file)
    print(f"Linearized table, removed tokens and written to {output_file}")

Linearized table, removed tokens and written to /scratch/asing725/IP/Multi-Table-RAG/TableRAG/src/ColBERT/data/wtq_full_table_linearized.tsv


Hi there! How can I help you today?

