# Data Collection:

Data Loading: The code below fetches every object record from The MET Collection API and saves the records to a CSV file (metdata.csv).

In [None]:
import csv
import os
import shutil
import subprocess

import cv2
import numpy as np
import pandas as pd
import requests

# Download every object record from The MET Collection API into a CSV file.
#
# Produces the module-level names used by the rest of the notebook:
#   total, objectIDs, prefix, col_headers, filename
# plus failed_ids, the object IDs whose record could not be fetched.

REQUEST_TIMEOUT = 30   # seconds; without a timeout a stalled connection hangs the cell forever
PROGRESS_EVERY = 1000  # report progress once per this many objects instead of once per row

prefix = "https://collectionapi.metmuseum.org/public/collection/v1/objects/"
filename = 'metdata.csv'  # Adjust the file path as needed

failed_ids = []

# A single Session reuses the HTTP connection across all per-object requests.
with requests.Session() as session:
    # Use The MET API to get the object IDs
    r = session.get(
        "https://collectionapi.metmuseum.org/public/collection/v1/objects",
        timeout=REQUEST_TIMEOUT,
    )
    r.raise_for_status()  # nothing below can work without the ID listing
    r_json = r.json()
    total = r_json['total']
    print("There are {} valid objects in this dataset".format(total))
    objectIDs = r_json['objectIDs']

    # get the column headers from the first object ID
    url = prefix + str(objectIDs[0])
    r = session.get(url, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()
    col_headers = r.json().keys()
    print("The column headers are the following: {}".format(col_headers))

    # Write data to CSV.
    # Mode 'w' truncates an existing file, so no separate delete step is needed.
    # newline='' is what the csv module requires to avoid blank rows on Windows;
    # an explicit encoding keeps non-ASCII titles/names from failing on
    # platforms whose default encoding is not UTF-8.
    with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
        # DictWriter places each value under its own header, so a record with
        # missing, extra, or reordered keys cannot shift columns.
        csv_writer = csv.DictWriter(
            csv_file,
            fieldnames=list(col_headers),
            restval='',
            extrasaction='ignore',
        )
        csv_writer.writeheader()

        for index, ID in enumerate(objectIDs):
            url = prefix + str(ID)
            try:
                r = session.get(url, timeout=REQUEST_TIMEOUT)
                r.raise_for_status()
                record = r.json()
            except (requests.RequestException, ValueError):
                # Network error, non-2xx status, or a body that is not JSON:
                # remember the ID and keep going rather than writing a
                # malformed row or aborting the whole download.
                failed_ids.append(ID)
            else:
                csv_writer.writerow(record)

            if index % PROGRESS_EVERY == 0:
                print("{} / {} objects processed".format(index, len(objectIDs)))

print("Finished: {} objects could not be fetched".format(len(failed_ids)))


There are 487594 valid objects in this dataset
The column headers are the following: dict_keys(['objectID', 'isHighlight', 'accessionNumber', 'accessionYear', 'isPublicDomain', 'primaryImage', 'primaryImageSmall', 'additionalImages', 'constituents', 'department', 'objectName', 'title', 'culture', 'period', 'dynasty', 'reign', 'portfolio', 'artistRole', 'artistPrefix', 'artistDisplayName', 'artistDisplayBio', 'artistSuffix', 'artistAlphaSort', 'artistNationality', 'artistBeginDate', 'artistEndDate', 'artistGender', 'artistWikidata_URL', 'artistULAN_URL', 'objectDate', 'objectBeginDate', 'objectEndDate', 'medium', 'dimensions', 'measurements', 'creditLine', 'geographyType', 'city', 'state', 'county', 'country', 'region', 'subregion', 'locale', 'locus', 'excavation', 'river', 'classification', 'rightsAndReproduction', 'linkResource', 'metadataDate', 'repository', 'objectURL', 'tags', 'objectWikidata_URL', 'isTimelineWork', 'GalleryNumber'])
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
1

# Data Organization:

File Structure & GitHub:
Create a directory structure.
Add the metdata.csv file to the project's GitHub repository.

In [None]:
import glob

# Create file structure
data_folder = 'data'  # Adjust the folder structure as needed
# An empty folder name makes os.makedirs raise; exist_ok=True also lets the
# cell be re-run safely once the folder exists.
os.makedirs(data_folder, exist_ok=True)

# Place a copy of the dataset at the path the git commands below commit.
# The original 'metdata.csv' is left in place for the cells that read it.
csv_in_repo = os.path.join(data_folder, 'metdata.csv')
shutil.copy2('metdata.csv', csv_in_repo)

# Add data to GitHub
# Assuming you have already initialized a Git repository and added a remote to GitHub
# NOTE(review): the full dump may exceed GitHub's per-file size limit — confirm before pushing.
# check=True stops the cell at the first failing git command instead of
# silently continuing to the next one.
subprocess.run(['git', 'add', csv_in_repo], check=True)

# `git diff --cached --quiet` exits non-zero only when something is staged;
# committing with nothing staged would fail on a re-run of this cell.
has_staged_changes = subprocess.run(['git', 'diff', '--cached', '--quiet']).returncode != 0
if has_staged_changes:
    subprocess.run(['git', 'commit', '-m', 'Added MET dataset'], check=True)

subprocess.run(['git', 'push', 'origin', 'master'], check=True)


# Data Definition:
Understanding Data Features:
Analyze column names, data types, description, counts, unique values, and ranges.

In [None]:
# Load the CSV file into a DataFrame
df = pd.read_csv('metdata.csv')

# Analyze column names, data types, description, counts, unique values, and ranges
column_names = df.columns
data_types = df.dtypes
description = df.describe()
unique_values_counts = df.nunique()

# A max-minus-min range is only defined for numeric columns; subtracting
# text or boolean columns raises a TypeError, so restrict the computation
# to the numeric ones.
numeric_columns = df.select_dtypes(include='number')
value_ranges = numeric_columns.max() - numeric_columns.min()

# Print or visualize the analysis
print("Column Names:", column_names)
print("Data Types:", data_types)
print("Description:", description)
print("Unique Values Counts:", unique_values_counts)
print("Value Ranges:", value_ranges)

# Data Cleaning:
Handle missing values and duplicates.

In [None]:
# Handle missing values
# Only rows that are empty in every column are dropped. A plain dropna()
# removes any row with a single empty field, which discards rows that are
# otherwise complete whenever the dataset has sparsely populated columns.
df_cleaned = df.dropna(how='all')

# Handle duplicates
df_cleaned = df_cleaned.drop_duplicates()

# Save the cleaned data to a new CSV file
# Written next to the notebook; a leading '/' would target the filesystem
# root, which is normally not writable.
df_cleaned.to_csv('metdata_cleaned.csv', index=False)
