In [6]:
# Import necessary libraries
import os
import pandas as pd
import requests
import tarfile # Import tarfile for extraction

# Define the URL for the dataset archive
archive_url = "https://www.robots.ox.ac.uk/~vgg/data/fgvc-aircraft/archives/fgvc-aircraft-2013b.tar.gz"
archive_filename = "fgvc-aircraft-2013b.tar.gz"

# Define a base directory to store all data
base_data_dir = "aircraft_data"
os.makedirs(base_data_dir, exist_ok=True)

# Path for the downloaded archive
archive_path = os.path.join(base_data_dir, archive_filename)

print(f"Downloading dataset archive to '{archive_path}'...")

try:
    response = requests.get(archive_url, stream=True)
    response.raise_for_status() # Raise an exception for HTTP errors (4xx or 5xx)
    with open(archive_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    print(f"Successfully downloaded: {archive_filename}")

    print(f"Extracting '{archive_filename}'...")
    # The archive usually extracts into a directory named 'fgvc-aircraft-2013b' or similar
    # We'll check the contents of base_data_dir to find the actual extracted folder
    with tarfile.open(archive_path, "r:gz") as tar:
        tar.extractall(path=base_data_dir)
    print("Extraction complete.")

except requests.exceptions.RequestException as e:
    print(f"Error downloading {archive_filename}: {e}")
except tarfile.ReadError as e:
    print(f"Error extracting {archive_filename}: {e}")

# Update data_dir to point to the directory containing the annotation files after extraction
# The annotation files are typically located at aircraft_data/fgvc-aircraft-2013b/data/
data_dir = os.path.join(base_data_dir, "fgvc-aircraft-2013b", "data")

print(f"Annotation files expected to be in: '{data_dir}'")

# You can optionally check if the directory exists and list its contents
if os.path.exists(data_dir):
    print(f"Contents of '{data_dir}': {os.listdir(data_dir)}")
else:
    print(f"Error: Expected data directory '{data_dir}' not found after extraction. Please check the archive structure.")

# Define the annotation files that will be read from the extracted data_dir
annotation_files = [
    "images_variant_test.txt",
    "images_variant_train.txt",
    "variants.txt",
    "images.txt"
]

Downloading dataset archive to 'aircraft_data/fgvc-aircraft-2013b.tar.gz'...
Successfully downloaded: fgvc-aircraft-2013b.tar.gz
Extracting 'fgvc-aircraft-2013b.tar.gz'...


  tar.extractall(path=base_data_dir)


Extraction complete.
Annotation files expected to be in: 'aircraft_data/fgvc-aircraft-2013b/data'
Contents of 'aircraft_data/fgvc-aircraft-2013b/data': ['images_manufacturer_val.txt', 'images_manufacturer_train.txt', 'images_variant_train.txt', 'images_val.txt', 'images_family_test.txt', 'images_family_val.txt', 'images', 'images_test.txt', 'images_family_trainval.txt', 'images_variant_trainval.txt', 'variants.txt', 'images_manufacturer_trainval.txt', 'images_box.txt', 'families.txt', 'images_train.txt', 'images_variant_test.txt', 'images_manufacturer_test.txt', 'images_family_train.txt', 'manufacturers.txt', 'images_variant_val.txt']


In [7]:
# Load the annotation files into pandas DataFrames

# variants.txt: Contains a list of all aircraft variants
variants_df = pd.read_csv(os.path.join(data_dir, "variants.txt"), header=None, names=["variant"])
print("\nVariants DataFrame (first 5 rows):")
display(variants_df.head())

# images_df: Contains a list of all images with their IDs
# Since images.txt is not directly present, we will combine images_train.txt, images_val.txt, and images_test.txt
images_train_paths_df = pd.read_csv(os.path.join(data_dir, "images_train.txt"), sep=' ', header=None, names=["image_id", "image_path"])
images_val_paths_df = pd.read_csv(os.path.join(data_dir, "images_val.txt"), sep=' ', header=None, names=["image_id", "image_path"])
images_test_paths_df = pd.read_csv(os.path.join(data_dir, "images_test.txt"), sep=' ', header=None, names=["image_id", "image_path"])

images_df = pd.concat([images_train_paths_df, images_val_paths_df, images_test_paths_df], ignore_index=True)
print("\nImages DataFrame (combined from train/val/test - first 5 rows):")
display(images_df.head())

# images_variant_train.txt: Image IDs and their corresponding variants for the training set
# Read the file as a single column and then split it into two at the first space
train_variants_raw = pd.read_csv(os.path.join(data_dir, "images_variant_train.txt"), header=None, engine='python', on_bad_lines='warn')
train_variants_data = train_variants_raw[0].str.split(n=1, expand=True)
train_variants_df = pd.DataFrame({'image_id': pd.to_numeric(train_variants_data[0]), 'variant': train_variants_data[1]})
print("\nTraining Variants DataFrame (first 5 rows):")
display(train_variants_df.head())

# images_variant_test.txt: Image IDs and their corresponding variants for the test set
# Apply the same robust parsing method
test_variants_raw = pd.read_csv(os.path.join(data_dir, "images_variant_test.txt"), header=None, engine='python', on_bad_lines='warn')
test_variants_data = test_variants_raw[0].str.split(n=1, expand=True)
test_variants_df = pd.DataFrame({'image_id': pd.to_numeric(test_variants_data[0]), 'variant': test_variants_data[1]})
print("\nTesting Variants DataFrame (first 5 rows):")
display(test_variants_df.head())


Variants DataFrame (first 5 rows):


Unnamed: 0,variant
0,707-320
1,727-200
2,737-200
3,737-300
4,737-400



Images DataFrame (combined from train/val/test - first 5 rows):


Unnamed: 0,image_id,image_path
0,1025794,
1,1340192,
2,56978,
3,698580,
4,450014,



Training Variants DataFrame (first 5 rows):


Unnamed: 0,image_id,variant
0,1025794,707-320
1,1340192,707-320
2,56978,707-320
3,698580,707-320
4,450014,707-320



Testing Variants DataFrame (first 5 rows):


Unnamed: 0,image_id,variant
0,1514522,707-320
1,747566,707-320
2,1008575,707-320
3,717480,707-320
4,991569,707-320


In [8]:
# Example: Merge training variants with image paths
full_train_df = pd.merge(train_variants_df, images_df, on="image_id")
print("\nFull Training Data (Image Path and Variant - first 5 rows):")
display(full_train_df.head())

# If you also wanted to download the images, you would typically use a similar `requests.get` approach
# but specify the `fgvc-aircraft-2013.tar.gz` file for download and then extract it.


Full Training Data (Image Path and Variant - first 5 rows):


Unnamed: 0,image_id,variant,image_path
0,1025794,707-320,
1,1340192,707-320,
2,56978,707-320,
3,698580,707-320,
4,450014,707-320,
