# ADDING QUESTIONS WITHOUT A SUITABLE ANSWER TO THE DATA

In [15]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os

## Accessing the dataset

In [16]:
# Function to read and parse a .jsonl file (dev, test or train)
def read_dev_jsonl(file_path):
    """Parse a JSON Lines file into a list of Python objects.

    Args:
        file_path: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Pin the encoding so the result does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank lines (e.g. a doubled trailing newline) carry no record and
        # would otherwise make json.loads raise.
        dev_data = [json.loads(line) for line in file if line.strip()]
    return dev_data

# Locations of the three split files (dev / test / train)
dev_jsonl_path = '.../raw_dataset/wikisql/data/dev.jsonl'
test_jsonl_path = '.../raw_dataset/wikisql/data/test.jsonl'
train_jsonl_path = '.../raw_dataset/wikisql/data/train.jsonl'

# Parse every split into a list of records, one per line of the file
dev_data, test_data, train_data = (
    read_dev_jsonl(dev_jsonl_path),
    read_dev_jsonl(test_jsonl_path),
    read_dev_jsonl(train_jsonl_path),
)

# Preview: the first record of each split, shown as a one-element list
print(dev_data[:1], test_data[:1], train_data[:1], sep='\n')

[{'phase': 1, 'table_id': '1-10015132-11', 'question': 'What position does the player who played for butler cc (ks) play?', 'sql': {'sel': 3, 'conds': [[5, 0, 'Butler CC (KS)']], 'agg': 0}}]
[{'phase': 1, 'table_id': '1-10015132-16', 'question': "What is terrence ross' nationality", 'sql': {'sel': 2, 'conds': [[0, 0, 'Terrence Ross']], 'agg': 0}}]
[{'phase': 1, 'table_id': '1-1000181-1', 'question': 'Tell me what the notes are for South Australia ', 'sql': {'sel': 5, 'conds': [[3, 0, 'SOUTH AUSTRALIA']], 'agg': 0}}]


## Write new .jsonl files with a percentage of swapped table_ids
for dev.jsonl, train.jsonl and test.jsonl

In [17]:
# Function to dump a list of records to a .jsonl file
def write_dev_jsonl(file_path, data):
    """Write ``data`` to ``file_path`` as JSON Lines.

    Each element of ``data`` is serialised with ``json.dumps`` and written on
    its own line. An existing file at ``file_path`` is overwritten.
    """
    with open(file_path, 'w') as out_file:
        out_file.writelines(json.dumps(record) + '\n' for record in data)

# Function to append "unanswerable" copies of randomly chosen entries:
# each copy points at a different table_id and has its sql query reset
def swap_table_ids_and_modify_answers(file_path, percentage=20, seed=None):
    """Augment a JSONL file with entries whose table_id no longer matches.

    ``percentage`` percent of the number of entries (rounded down) are drawn
    at random, with replacement, from the file. Each drawn entry is copied,
    given a different ``table_id`` taken from the ids present in the file, and
    its ``sql`` field is replaced by ``{"sel": -1, "conds": [], "agg": -1}``.
    The copies are appended after the untouched original entries and the
    result is written next to the input as
    ``<stem>_<percentage>percent_swapped<ext>``.

    Args:
        file_path: Path of the input ``.jsonl`` file.
        percentage: Number of modified entries to append, as a percentage of
            the number of entries in the input.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the output is reproducible; when ``None`` the shared
            ``random`` module state is used, as before.

    Returns:
        The path of the newly written file.

    Raises:
        ValueError: If entries must be swapped but the file contains fewer
            than two distinct ``table_id`` values, so no different id exists.
    """
    dev_data = read_dev_jsonl(file_path)
    total_lines = len(dev_data)
    num_to_swap = int(total_lines * (percentage / 100))

    # Unique table_ids, sorted because set iteration order for strings varies
    # between interpreter runs and would defeat a fixed seed.
    table_ids = sorted({entry['table_id'] for entry in dev_data})

    # The rejection loop below can only terminate if another id exists.
    if num_to_swap > 0 and len(table_ids) < 2:
        raise ValueError(
            "Need at least two distinct table_ids to swap, "
            f"found {len(table_ids)} in {file_path}"
        )

    rng = random if seed is None else random.Random(seed)

    modified_data = []
    for _ in range(num_to_swap):
        entry = rng.choice(dev_data)
        new_table_id = rng.choice(table_ids)
        while new_table_id == entry['table_id']:
            new_table_id = rng.choice(table_ids)

        # A shallow copy is enough: 'table_id' and 'sql' are rebound, never
        # mutated, so the original entry stays intact.
        new_entry = entry.copy()
        new_entry['table_id'] = new_table_id
        new_entry['sql'] = {"sel": -1, "conds": [], "agg": -1}  # Reset the sql query
        modified_data.append(new_entry)

    # Original entries first, modified copies appended at the end
    combined_data = dev_data + modified_data

    # Build the new file name from stem and extension. A plain
    # str.replace('.jsonl', ...) leaves other extensions unchanged and would
    # overwrite the input file.
    dir_name = os.path.dirname(file_path)
    stem, extension = os.path.splitext(os.path.basename(file_path))
    new_file_name = f'{stem}_{percentage}percent_swapped{extension}'
    new_file_path = os.path.join(dir_name, new_file_name)

    # Write the combined data to the new file
    write_dev_jsonl(new_file_path, combined_data)

    print(f"New file created: {new_file_path}")
    return new_file_path

# Swap table_ids randomly
# Each call uses the default percentage=20 and writes a
# "<name>_20percent_swapped.jsonl" file in the same directory as its input.
swap_table_ids_and_modify_answers(dev_jsonl_path)
swap_table_ids_and_modify_answers(test_jsonl_path)
# This call is the cell's last expression, so the notebook also echoes its
# return value (the path of the new train file) after the printed messages.
swap_table_ids_and_modify_answers(train_jsonl_path)

New file created: /Users/sebastiaan/Desktop/IR2_tapex_reproducibility_study/src/Table-Pretraining-main/raw_dataset/wikisql/data/dev_20percent_swapped.jsonl
New file created: /Users/sebastiaan/Desktop/IR2_tapex_reproducibility_study/src/Table-Pretraining-main/raw_dataset/wikisql/data/test_20percent_swapped.jsonl
New file created: /Users/sebastiaan/Desktop/IR2_tapex_reproducibility_study/src/Table-Pretraining-main/raw_dataset/wikisql/data/train_20percent_swapped.jsonl


'/Users/sebastiaan/Desktop/IR2_tapex_reproducibility_study/src/Table-Pretraining-main/raw_dataset/wikisql/data/train_20percent_swapped.jsonl'

## Print lines from the swapped dev, test and train files:

In [19]:
# Function to show the tail (final 10 lines) of a text file
def print_last_10_lines(file_path):
    """Print the last 10 lines of ``file_path``, stripped of surrounding whitespace.

    Files with fewer than 10 lines are printed in full.
    """
    with open(file_path, 'r') as handle:
        tail = handle.readlines()[-10:]
    for raw_line in tail:
        print(raw_line.strip())

# Function to print lines whose ['sql'] marks "no suitable answer":
# {"sel": -1, "conds": [], "agg": -1} (or the same shape with None)
def print_lines_with_no_sql_answer(file_path):
    """Print every JSONL line whose ``sql`` field is a "no answer" placeholder.

    Two placeholders are recognised: ``{"sel": -1, "conds": [], "agg": -1}``
    and the ``None``-valued form ``{"sel": None, "conds": [], "agg": None}``
    that this function originally looked for.

    Args:
        file_path: Path of the ``.jsonl`` file to scan.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an entry has no ``'sql'`` key.
    """
    no_answer_queries = (
        {"sel": -1, "conds": [], "agg": -1},
        {"sel": None, "conds": [], "agg": None},
    )
    with open(file_path, 'r') as file:
        # Stream the file instead of loading every line into memory first.
        for line in file:
            stripped = line.strip()
            if not stripped:
                continue  # blank lines hold no entry
            entry = json.loads(stripped)
            if entry['sql'] in no_answer_queries:
                print(stripped)

# Locate the swapped files from the configured input paths rather than from
# hard-coded absolute paths, so this cell follows dev/test/train_jsonl_path.
def _swapped_path(original_path, percentage=20):
    """Return ``<root>_<percentage>percent_swapped<ext>`` for ``original_path``."""
    root, ext = os.path.splitext(original_path)
    return f"{root}_{percentage}percent_swapped{ext}"

swapped_files = {
    'dev': _swapped_path(dev_jsonl_path),
    'test': _swapped_path(test_jsonl_path),
    'train': _swapped_path(train_jsonl_path),
}

# Print the last 10 lines of the created files
for split_name, swapped_file in swapped_files.items():
    print(f"Last 10 lines of the new {split_name} file:")
    print_last_10_lines(swapped_file)
    print("\n")

# Print the lines that print_lines_with_no_sql_answer reports for each file;
# a blank separator goes between files but not after the last one.
for position, (split_name, swapped_file) in enumerate(swapped_files.items()):
    if position:
        print("\n")
    print(f"Lines with no suitable SQL answer in the new {split_name} file:")
    print_lines_with_no_sql_answer(swapped_file)

Last 10 lines of the new dev file:
{"phase": 2, "table_id": "2-17311797-10", "question": "What is the 9th runner-up with a top 18/20/24/30 greater than 17 and a 5th runner-up of 2?", "sql": {"sel": -1, "conds": [], "agg": -1}}
{"phase": 2, "table_id": "2-16827273-1", "question": "What was the record on the game that was played on october 27?", "sql": {"sel": -1, "conds": [], "agg": -1}}
{"phase": 2, "table_id": "2-1074011-3", "question": "What was the record after the game before Jan 7?", "sql": {"sel": -1, "conds": [], "agg": -1}}
{"phase": 2, "table_id": "2-10560886-9", "question": "Name the Year which has a Label of atco records and a Type of album? Question 2", "sql": {"sel": -1, "conds": [], "agg": -1}}
{"phase": 2, "table_id": "1-160510-1", "question": "Which Player has United States as Nationality, forward as Position and a greater than 5 Round?", "sql": {"sel": -1, "conds": [], "agg": -1}}
{"phase": 2, "table_id": "2-18880018-2", "question": "How many people attended when Wake 