#### Considering 25% of the dataset

In [1]:
import os
import shutil
import random


In [2]:

# Source folder holding the full training image set and the folder that
# will receive the 25% subset.
source_dir = 'train2014'
destination_dir = 'Quarter_Dataset'

# exist_ok=True makes this cell safe to re-run and avoids the separate
# exists() check (same idiom the notebook uses for its other output folders).
os.makedirs(destination_dir, exist_ok=True)


In [3]:

# Every entry name found directly inside the source directory
all_files = os.listdir(source_dir)

# Keep only names that end (case-insensitively) with a known image extension
image_files = list(filter(
    lambda name: name.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp')),
    all_files,
))

# A quarter of the image count, rounded down
num_files_to_copy = len(image_files) // 4


In [4]:

# Randomly select 25% of the image files.
# A dedicated, seeded generator over a *sorted* candidate list makes the
# selection reproducible: os.listdir() order is arbitrary, and an unseeded
# sample would copy a different quarter on every re-run, so the destination
# folder would silently grow beyond 25%.
SAMPLE_SEED = 42
selected_files = random.Random(SAMPLE_SEED).sample(sorted(image_files), num_files_to_copy)

# Copy the selected files to the destination directory
for file_name in selected_files:
    src_file = os.path.join(source_dir, file_name)
    dst_file = os.path.join(destination_dir, file_name)
    shutil.copy(src_file, dst_file)

print(f"Copied {num_files_to_copy} files to {destination_dir}")


Copied 20695 files to Quarter_Dataset


Filtering the corresponding questions and annotations from the JSON files

In [5]:
import os
import json
import re


In [9]:

def filter_json_by_image_ids(dataset_dir, input_json_path, output_json_path=None, split='train2014'):
    """Keep only the questions whose image is present in ``dataset_dir``.

    Image ids are taken from file names of the form
    ``COCO_<split>_<digits>.<ext>`` (ext: png, jpg, jpeg, gif, bmp; matched
    case-insensitively). The digits are converted to ``int`` and compared
    with each question's ``image_id``.

    Args:
        dataset_dir: Directory containing the selected image files.
        input_json_path: JSON file with a top-level ``"questions"`` list.
        output_json_path: Destination file. Defaults to
            ``Filtered_<basename of input_json_path>`` in the working directory.
        split: Split tag embedded in the image file names.

    Raises:
        ValueError: If the input JSON has no ``"questions"`` key.

    The output file contains only ``{"questions": [...]}``; any other
    top-level keys of the input are not carried over.
    """
    # Create the output JSON filename if not provided
    if output_json_path is None:
        output_json_path = f'Filtered_{os.path.basename(input_json_path)}'

    # Read the JSON file
    with open(input_json_path, 'r') as file:
        data = json.load(file)

    # Ensure data has the correct structure
    if "questions" not in data:
        raise ValueError("The JSON file does not have a 'questions' key.")

    questions = data["questions"]

    # Extract image IDs from the filenames in the dataset folder.
    # fullmatch() is used so that names with trailing junk after the
    # extension (e.g. "....jpg.bak") are not counted as images.
    pattern = re.compile(
        rf'COCO_{re.escape(split)}_(\d+)\.(png|jpg|jpeg|gif|bmp)', re.IGNORECASE
    )
    image_ids = set()
    for filename in os.listdir(dataset_dir):
        match = pattern.fullmatch(filename)
        if match:
            image_ids.add(int(match.group(1)))

    # Filter the JSON entries
    filtered_data = [entry for entry in questions if entry['image_id'] in image_ids]

    # Make sure the destination folder exists; open(..., 'w') does not create it.
    output_parent = os.path.dirname(output_json_path)
    if output_parent:
        os.makedirs(output_parent, exist_ok=True)

    # Write the filtered entries to the output JSON file
    with open(output_json_path, 'w') as file:
        json.dump({"questions": filtered_data}, file, indent=4)

    print(f"Filtered JSON entries have been written to {output_json_path}")

In [11]:

# Run the question filter against the sampled training images
dataset_dir = "Quarter_Dataset"
input_json_path = "v2_OpenEnded_mscoco_train2014_questions.json"  # source questions JSON; change to your file name
output_json_path = "Filtered_jsons/Questions.json"  # destination for the filtered questions

filter_json_by_image_ids(
    dataset_dir=dataset_dir,
    input_json_path=input_json_path,
    output_json_path=output_json_path,
)


Filtered JSON entries have been written to Filtered_jsons/Questions.json


In [13]:
def filter_annotations_by_image_ids(dataset_dir, input_json_path, output_json_path=None, split='train2014'):
    """Keep only the annotations whose image is present in ``dataset_dir``.

    Image ids are taken from file names of the form
    ``COCO_<split>_<digits>.<ext>`` (ext: png, jpg, jpeg, gif, bmp; matched
    case-insensitively). The digits are converted to ``int`` and compared
    with each annotation's ``image_id``.

    Args:
        dataset_dir: Directory containing the selected image files.
        input_json_path: JSON file with a top-level ``"annotations"`` list.
        output_json_path: Destination file. Defaults to
            ``Filtered_<basename of input_json_path>`` in the working directory.
        split: Split tag embedded in the image file names.

    Raises:
        ValueError: If the input JSON has no ``"annotations"`` key.

    The output file contains only ``{"annotations": [...]}``; any other
    top-level keys of the input are not carried over.
    """
    # Create the output JSON filename if not provided
    if output_json_path is None:
        output_json_path = f'Filtered_{os.path.basename(input_json_path)}'

    # Read the JSON file
    with open(input_json_path, 'r') as file:
        data = json.load(file)

    # Ensure data has the correct structure
    if "annotations" not in data:
        raise ValueError("The JSON file does not have an 'annotations' key.")

    annotations = data["annotations"]

    # Extract image IDs from the filenames in the dataset folder.
    # fullmatch() is used so that names with trailing junk after the
    # extension (e.g. "....jpg.bak") are not counted as images.
    pattern = re.compile(
        rf'COCO_{re.escape(split)}_(\d+)\.(png|jpg|jpeg|gif|bmp)', re.IGNORECASE
    )
    image_ids = set()
    for filename in os.listdir(dataset_dir):
        match = pattern.fullmatch(filename)
        if match:
            image_ids.add(int(match.group(1)))

    # Filter the JSON entries
    filtered_annotations = [entry for entry in annotations if entry['image_id'] in image_ids]

    # Make sure the destination folder exists; open(..., 'w') does not create it.
    output_parent = os.path.dirname(output_json_path)
    if output_parent:
        os.makedirs(output_parent, exist_ok=True)

    # Write the filtered entries to the output JSON file
    with open(output_json_path, 'w') as file:
        json.dump({"annotations": filtered_annotations}, file, indent=4)

    print(f"Filtered JSON entries have been written to {output_json_path}")



In [15]:
# Run the annotation filter against the sampled training images
dataset_dir = 'Quarter_Dataset'
input_json_path = 'v2_mscoco_train2014_annotations.json'  # source annotations JSON; change to your file name
output_json_path = 'Filtered_jsons/Annotations.json'  # destination for the filtered annotations

filter_annotations_by_image_ids(
    dataset_dir=dataset_dir,
    input_json_path=input_json_path,
    output_json_path=output_json_path,
)


Filtered JSON entries have been written to Filtered_jsons/Annotations.json


Converting JSON files to CSV files

In [7]:
import json
import os

# Input: the filtered training questions. Output: one question_id per line.
json_file_path = 'Filtered_jsons/Questions.json'
# Spelled 'Final_Dataset' to match the folder name the later cells read from
# and write to (on a case-sensitive filesystem 'Final_dataset' is a different
# directory).
output_dir = 'Final_Dataset'
# The list is written next to the notebook. The previous value of this
# variable pointed at an "image ids" file and was never used.
output_file_path = 'train_img_list.txt'

# Ensure the output directory exists for the cells that follow
os.makedirs(output_dir, exist_ok=True)

# Load the JSON data from the file
with open(json_file_path, 'r') as file:
    data = json.load(file)

question_ids = [str(question['question_id']) for question in data['questions']]

# Write the question_ids, one per line
with open(output_file_path, 'w') as file:
    file.writelines(f"{question_id}\n" for question_id in question_ids)

print(f"question_ids have been written to {output_file_path}")


question_ids have been written to train_img_list.txt


In [1]:
import csv
import json
import os

# Load annotations.json
with open('Filtered_jsons/Annotations.json', 'r') as f:
    annotations_data = json.load(f)

# Load questions.json
with open('Filtered_jsons/Questions.json', 'r') as f:
    questions_data = json.load(f)

# Map question_id -> question text
question_id_to_text = {question['question_id']: question['question'] for question in questions_data['questions']}

# One CSV row per annotation: question text, multiple choice answer, image id.
# An annotation whose question_id is not in the questions file gets an empty
# question string.
csv_data = [
    [
        question_id_to_text.get(annotation['question_id'], ''),
        annotation['multiple_choice_answer'],
        annotation['image_id'],
    ]
    for annotation in annotations_data['annotations']
]

# Write the CSV where the later cells read it from ('Final_Dataset/data_train.csv');
# writing it to the working directory left those cells without their input.
# UTF-8 is explicit because the file is read back with pandas, which assumes UTF-8.
output_csv_path = os.path.join('Final_Dataset', 'data_train.csv')
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
with open(output_csv_path, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Question', 'Answer', 'Image_id'])  # Write header
    writer.writerows(csv_data)

print("CSV file created successfully.")


CSV file created successfully.


#### Splitting the validation dataset in a similar fashion

In [3]:
import os
import shutil
import random



In [4]:

# Source folder holding the full validation image set and the folder that
# will receive the 25% subset.
source_dir = 'val2014'
destination_dir = 'Validation_Quarter_Dataset'

# exist_ok=True makes this cell safe to re-run and avoids the separate
# exists() check (same idiom the notebook uses for its other output folders).
os.makedirs(destination_dir, exist_ok=True)



In [5]:

# Every entry name found directly inside the source directory
all_files = os.listdir(source_dir)

# Keep only names that end (case-insensitively) with a known image extension
image_files = list(filter(
    lambda name: name.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp')),
    all_files,
))

# A quarter of the image count, rounded down
num_files_to_copy = len(image_files) // 4


# Randomly select 25% of the image files.
# A dedicated, seeded generator over a *sorted* candidate list makes the
# selection reproducible: os.listdir() order is arbitrary, and an unseeded
# sample would copy a different quarter on every re-run, so the destination
# folder would silently grow beyond 25%.
SAMPLE_SEED = 42
selected_files = random.Random(SAMPLE_SEED).sample(sorted(image_files), num_files_to_copy)

# Copy the selected files to the destination directory
for file_name in selected_files:
    src_file = os.path.join(source_dir, file_name)
    dst_file = os.path.join(destination_dir, file_name)
    shutil.copy(src_file, dst_file)

print(f"Copied {num_files_to_copy} files to {destination_dir}")

Copied 10126 files to Validation_Quarter_Dataset


In [6]:

import os
import json
import re



In [13]:

def filter_json_by_image_ids(dataset_dir, input_json_path, output_json_path=None, split='val2014'):
    """Keep only the questions whose image is present in ``dataset_dir``.

    Image ids are taken from file names of the form
    ``COCO_<split>_<digits>.<ext>`` (ext: png, jpg, jpeg, gif, bmp; matched
    case-insensitively). The digits are converted to ``int`` and compared
    with each question's ``image_id``.

    Note: this definition replaces any earlier function of the same name in
    the running kernel; its default ``split`` targets the validation images.

    Args:
        dataset_dir: Directory containing the selected image files.
        input_json_path: JSON file with a top-level ``"questions"`` list.
        output_json_path: Destination file. Defaults to
            ``Filtered_<basename of input_json_path>`` in the working directory.
        split: Split tag embedded in the image file names.

    Raises:
        ValueError: If the input JSON has no ``"questions"`` key.

    The output file contains only ``{"questions": [...]}``; any other
    top-level keys of the input are not carried over.
    """
    # Create the output JSON filename if not provided
    if output_json_path is None:
        output_json_path = f'Filtered_{os.path.basename(input_json_path)}'

    # Read the JSON file
    with open(input_json_path, 'r') as file:
        data = json.load(file)

    # Ensure data has the correct structure
    if "questions" not in data:
        raise ValueError("The JSON file does not have a 'questions' key.")

    questions = data["questions"]

    # Extract image IDs from the filenames in the dataset folder.
    # fullmatch() is used so that names with trailing junk after the
    # extension (e.g. "....jpg.bak") are not counted as images.
    pattern = re.compile(
        rf'COCO_{re.escape(split)}_(\d+)\.(png|jpg|jpeg|gif|bmp)', re.IGNORECASE
    )
    image_ids = set()
    for filename in os.listdir(dataset_dir):
        match = pattern.fullmatch(filename)
        if match:
            image_ids.add(int(match.group(1)))

    # Filter the JSON entries
    filtered_data = [entry for entry in questions if entry['image_id'] in image_ids]

    # Make sure the destination folder exists; open(..., 'w') does not create it.
    output_parent = os.path.dirname(output_json_path)
    if output_parent:
        os.makedirs(output_parent, exist_ok=True)

    # Write the filtered entries to the output JSON file
    with open(output_json_path, 'w') as file:
        json.dump({"questions": filtered_data}, file, indent=4)

    print(f"Filtered JSON entries have been written to {output_json_path}")

# Example usage


In [14]:
# Run the question filter against the sampled validation images
dataset_dir = "Validation_Quarter_Dataset"
input_json_path = "v2_OpenEnded_mscoco_val2014_questions.json"  # source questions JSON; change to your file name
output_json_path = "Filtered_jsons/Val_Questions.json"  # destination for the filtered questions

filter_json_by_image_ids(
    dataset_dir=dataset_dir,
    input_json_path=input_json_path,
    output_json_path=output_json_path,
)


Filtered JSON entries have been written to Filtered_jsons/Val_Questions.json


In [15]:

def filter_annotations_by_image_ids(dataset_dir, input_json_path, output_json_path=None, split='val2014'):
    """Keep only the annotations whose image is present in ``dataset_dir``.

    Image ids are taken from file names of the form
    ``COCO_<split>_<digits>.<ext>`` (ext: png, jpg, jpeg, gif, bmp; matched
    case-insensitively). The digits are converted to ``int`` and compared
    with each annotation's ``image_id``.

    Note: this definition replaces any earlier function of the same name in
    the running kernel; its default ``split`` targets the validation images.

    Args:
        dataset_dir: Directory containing the selected image files.
        input_json_path: JSON file with a top-level ``"annotations"`` list.
        output_json_path: Destination file. Defaults to
            ``Filtered_<basename of input_json_path>`` in the working directory.
        split: Split tag embedded in the image file names.

    Raises:
        ValueError: If the input JSON has no ``"annotations"`` key.

    The output file contains only ``{"annotations": [...]}``; any other
    top-level keys of the input are not carried over.
    """
    # Create the output JSON filename if not provided
    if output_json_path is None:
        output_json_path = f'Filtered_{os.path.basename(input_json_path)}'

    # Read the JSON file
    with open(input_json_path, 'r') as file:
        data = json.load(file)

    # Ensure data has the correct structure
    if "annotations" not in data:
        raise ValueError("The JSON file does not have an 'annotations' key.")

    annotations = data["annotations"]

    # Extract image IDs from the filenames in the dataset folder.
    # fullmatch() is used so that names with trailing junk after the
    # extension (e.g. "....jpg.bak") are not counted as images.
    pattern = re.compile(
        rf'COCO_{re.escape(split)}_(\d+)\.(png|jpg|jpeg|gif|bmp)', re.IGNORECASE
    )
    image_ids = set()
    for filename in os.listdir(dataset_dir):
        match = pattern.fullmatch(filename)
        if match:
            image_ids.add(int(match.group(1)))

    # Filter the JSON entries
    filtered_annotations = [entry for entry in annotations if entry['image_id'] in image_ids]

    # Make sure the destination folder exists; open(..., 'w') does not create it.
    output_parent = os.path.dirname(output_json_path)
    if output_parent:
        os.makedirs(output_parent, exist_ok=True)

    # Write the filtered entries to the output JSON file
    with open(output_json_path, 'w') as file:
        json.dump({"annotations": filtered_annotations}, file, indent=4)

    print(f"Filtered JSON entries have been written to {output_json_path}")


In [16]:


# Run the annotation filter against the sampled validation images
dataset_dir = 'Validation_Quarter_Dataset'
input_json_path = 'v2_mscoco_val2014_annotations.json'  # source annotations JSON; change to your file name
output_json_path = 'Filtered_jsons/Val_Annotations.json'  # destination for the filtered annotations

filter_annotations_by_image_ids(
    dataset_dir=dataset_dir,
    input_json_path=input_json_path,
    output_json_path=output_json_path,
)


Filtered JSON entries have been written to Filtered_jsons/Val_Annotations.json


In [17]:

import json
import os

# Input: the filtered validation questions. Output: one question_id per line.
json_file_path = 'Filtered_jsons/Val_Questions.json'
# Spelled 'Final_Dataset' to match the folder name the later cells read from
# and write to (on a case-sensitive filesystem 'Final_dataset' is a different
# directory).
output_dir = 'Final_Dataset'
# The list is written next to the notebook. The previous value of this
# variable pointed at an "image ids" file and was never used.
output_file_path = 'val_img_list.txt'

# Ensure the output directory exists for the cells that follow
os.makedirs(output_dir, exist_ok=True)

# Load the JSON data from the file
with open(json_file_path, 'r') as file:
    data = json.load(file)

question_ids = [str(question['question_id']) for question in data['questions']]

# Write the question_ids, one per line
with open(output_file_path, 'w') as file:
    file.writelines(f"{question_id}\n" for question_id in question_ids)

print(f"question_ids have been written to {output_file_path}")


question_ids have been written to val_img_list.txt


In [18]:

import csv
import json
import os

# Load annotations.json
with open('Filtered_jsons/Val_Annotations.json', 'r') as f:
    annotations_data = json.load(f)

# Load questions.json
with open('Filtered_jsons/Val_Questions.json', 'r') as f:
    questions_data = json.load(f)

# Map question_id -> question text
question_id_to_text = {question['question_id']: question['question'] for question in questions_data['questions']}

# One CSV row per annotation: question text, multiple choice answer, image id.
# An annotation whose question_id is not in the questions file gets an empty
# question string.
csv_data = [
    [
        question_id_to_text.get(annotation['question_id'], ''),
        annotation['multiple_choice_answer'],
        annotation['image_id'],
    ]
    for annotation in annotations_data['annotations']
]

# Write the CSV where the later cells read it from ('Final_Dataset/data_eval.csv');
# writing it to the working directory left those cells without their input.
# UTF-8 is explicit because the file is read back with pandas, which assumes UTF-8.
output_csv_path = os.path.join('Final_Dataset', 'data_eval.csv')
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
with open(output_csv_path, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Question', 'Answer', 'Image_id'])  # Write header
    writer.writerows(csv_data)

print("CSV file created successfully.")


CSV file created successfully.


#### Generating combined files from both train and validation

In [19]:
import pandas as pd

# Read both splits. keep_default_na=False stops pandas from turning free-text
# cells such as "n/a", "null" or "nan" into missing values, which would
# otherwise be written back out as empty cells in the merged file.
df1 = pd.read_csv('Final_Dataset/data_train.csv', keep_default_na=False)
df2 = pd.read_csv('Final_Dataset/data_eval.csv', keep_default_na=False)

# Stack train rows followed by eval rows under a fresh 0..n-1 index
merged_df = pd.concat([df1, df2], ignore_index=True)

# Write the merged dataframe to a new CSV file
merged_df.to_csv('Final_Dataset/data.csv', index=False)

print("CSV files merged successfully.")


CSV files merged successfully.


In [21]:
import pandas as pd

# Read the merged CSV file. keep_default_na=False keeps cells like "n/a" or
# "null" as the literal text instead of NaN, which would otherwise be
# written below as the string "nan".
merged_df = pd.read_csv('Final_Dataset/data.csv', keep_default_na=False)

# Extract the "Question" and "Answer" columns
question_answer_data = merged_df[['Question', 'Answer']]

# Write each pair as two consecutive lines: the question, then its answer.
# itertuples() yields plain tuples and avoids building a Series per row.
with open('Final_Dataset/all_pairs.txt', 'w') as file:
    for question, answer in question_answer_data.itertuples(index=False, name=None):
        file.write(f"{question}\n")
        file.write(f"{answer}\n")

print("Data written")


Data written


In [29]:
import pandas as pd

# Read the merged CSV file. keep_default_na=False keeps cells like "n/a" or
# "null" as literal answers instead of collapsing them into one NaN entry.
merged_df = pd.read_csv('Final_Dataset/data.csv', keep_default_na=False)

# Unique answers in order of first appearance
unique_answers_df = pd.DataFrame(merged_df['Answer'].unique(), columns=['Unique_Answers'])

# Write one answer per line. The file is written directly rather than via
# to_csv(sep='\n'), so no CSV quoting is applied to answers that contain
# quote characters and no newline field delimiter is needed.
with open('answers.txt', 'w') as file:
    file.writelines(f"{answer}\n" for answer in unique_answers_df['Unique_Answers'])

print("Unique answers DataFrame written.")


Unique answers DataFrame written.


In [2]:
import csv

# Read the CSV file
csv_filename = 'Final_Dataset/data_train.csv'
image_ids = set()  # Using a set to avoid duplicates

# DictReader consumes the header row and exposes columns by name, so the
# literal header text "Image_id" is no longer collected as if it were an id.
with open(csv_filename, 'r', newline='') as csvfile:
    for row in csv.DictReader(csvfile):
        image_ids.add(row['Image_id'])

# Write the image IDs to a text file
txt_filename = 'Final_Dataset/train_image_ids.txt'

with open(txt_filename, 'w') as txtfile:
    # Ids are read as strings; key=int orders them numerically instead of
    # lexicographically (where "10" would sort before "9").
    for image_id in sorted(image_ids, key=int):
        txtfile.write(image_id + '\n')

print(f'Image IDs have been written to {txt_filename}')


Image IDs have been written to Final_Dataset/train_image_ids.txt


In [3]:
import csv

# Read the CSV file
csv_filename = 'Final_Dataset/data_eval.csv'
image_ids = set()  # Using a set to avoid duplicates

# DictReader consumes the header row and exposes columns by name, so the
# literal header text "Image_id" is no longer collected as if it were an id.
with open(csv_filename, 'r', newline='') as csvfile:
    for row in csv.DictReader(csvfile):
        image_ids.add(row['Image_id'])

# Write the image IDs to a text file
txt_filename = 'Final_Dataset/val_image_ids.txt'

with open(txt_filename, 'w') as txtfile:
    # Ids are read as strings; key=int orders them numerically instead of
    # lexicographically (where "10" would sort before "9").
    for image_id in sorted(image_ids, key=int):
        txtfile.write(image_id + '\n')

print(f'Image IDs have been written to {txt_filename}')


Image IDs have been written to Final_Dataset/val_image_ids.txt
