# Data Collection:

Data Loading: Use the code to fetch data from The MET API and save it to a CSV file (metdata.csv).

In [1]:
import numpy as np
import cv2
import requests
import csv
import os
import pandas as pd

# Use The MET API to get the object IDs
r = requests.get("https://collectionapi.metmuseum.org/public/collection/v1/objects")
r_json = r.json()
total = r_json['total']
print("There are {} valid objects in this dataset".format(total))
objectIDs = r_json['objectIDs']

# get the column headers from the first object ID
prefix = "https://collectionapi.metmuseum.org/public/collection/v1/objects/"
url = prefix + str(objectIDs[0])
r = requests.get(url)
col_headers = r.json().keys()

# create the csv file and write the first row
filename = 'metdata.csv'  # Adjust the file path as needed

# remove if it already exists
try:
    os.remove(filename)
except OSError:
    pass

print("The column headers are the following: {}".format(col_headers))

# Write data to CSV
with open(filename, 'w') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(col_headers)
    
    for index, ID in enumerate(objectIDs):
        url = prefix + str(ID)
        r = requests.get(url)
        values = r.json().values()
        csv_writer.writerow(values)
        # print out each of the rows by their index
        print(index)
        
        # Add condition to break after 1000 iterations
        if index >= 999:
            break


There are 487632 valid objects in this dataset
The column headers are the following: dict_keys(['objectID', 'isHighlight', 'accessionNumber', 'accessionYear', 'isPublicDomain', 'primaryImage', 'primaryImageSmall', 'additionalImages', 'constituents', 'department', 'objectName', 'title', 'culture', 'period', 'dynasty', 'reign', 'portfolio', 'artistRole', 'artistPrefix', 'artistDisplayName', 'artistDisplayBio', 'artistSuffix', 'artistAlphaSort', 'artistNationality', 'artistBeginDate', 'artistEndDate', 'artistGender', 'artistWikidata_URL', 'artistULAN_URL', 'objectDate', 'objectBeginDate', 'objectEndDate', 'medium', 'dimensions', 'measurements', 'creditLine', 'geographyType', 'city', 'state', 'county', 'country', 'region', 'subregion', 'locale', 'locus', 'excavation', 'river', 'classification', 'rightsAndReproduction', 'linkResource', 'metadataDate', 'repository', 'objectURL', 'tags', 'objectWikidata_URL', 'isTimelineWork', 'GalleryNumber'])
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
1

# Data Organization:

File Structure & GitHub:
Create a directory structure.
Add the metdata.csv file to the project's GitHub repository.

In [2]:
import glob

# # Create file structure
# data_folder = ''  # Adjust the folder structure as needed
# if not os.path.exists(data_folder):
#     os.makedirs(data_folder)

# Add data to GitHub
# Assuming you have already initialized a Git repository and added a remote to GitHub
# You can add and commit the data file to the repository
os.system('git add metdata.csv')
os.system('git commit -m "Added MET dataset"')
os.system('git push origin master')


fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git


32768

# Data Definition:
Understanding Data Features:
Analyze column names, data types, description, counts, unique values, and ranges.

In [3]:
df = pd.read_csv('metdata.csv')

# Filter to include only numeric columns
numeric_df = df.select_dtypes(include=[np.number])

# Initialize an empty dictionary to hold the value ranges
value_ranges = {}

# Loop through each numeric column and calculate its range
for column in numeric_df.columns:
    max_value = numeric_df[column].max()
    min_value = numeric_df[column].min()
    value_ranges[column] = max_value - min_value

# Display the value ranges
for column, range_ in value_ranges.items():
    print(f"{column}: {range_}")

objectID: 3277
accessionYear: 140.0
period: nan
dynasty: nan
reign: nan
portfolio: nan
artistBeginDate: 295.0
artistEndDate: 8318.0
objectBeginDate: 323
objectEndDate: 306
subregion: nan
locale: nan
locus: nan
excavation: nan
river: nan
classification: nan
linkResource: nan
GalleryNumber: 262.0


In [4]:
# Assuming 'metdata.csv' is your file path
df = pd.read_csv('metdata.csv')

# Analyze column names, data types, description, counts, unique values
column_names = df.columns
data_types = df.dtypes
description = df.describe()
unique_values_counts = df.nunique()

# Filter to include only numeric columns to avoid any type-related errors
numeric_df = df.select_dtypes(include=[np.number])

# Now calculate the value ranges for these numeric columns
value_ranges = numeric_df.max() - numeric_df.min()

# Print or visualize the analysis
print("Column Names:", column_names)
print("Data Types:", data_types)
print("Description:", description)
print("Unique Values Counts:", unique_values_counts)
print("Value Ranges for Numeric Columns:", value_ranges)


Column Names: Index(['objectID', 'isHighlight', 'accessionNumber', 'accessionYear',
       'isPublicDomain', 'primaryImage', 'primaryImageSmall',
       'additionalImages', 'constituents', 'department', 'objectName', 'title',
       'culture', 'period', 'dynasty', 'reign', 'portfolio', 'artistRole',
       'artistPrefix', 'artistDisplayName', 'artistDisplayBio', 'artistSuffix',
       'artistAlphaSort', 'artistNationality', 'artistBeginDate',
       'artistEndDate', 'artistGender', 'artistWikidata_URL', 'artistULAN_URL',
       'objectDate', 'objectBeginDate', 'objectEndDate', 'medium',
       'dimensions', 'measurements', 'creditLine', 'geographyType', 'city',
       'state', 'county', 'country', 'region', 'subregion', 'locale', 'locus',
       'excavation', 'river', 'classification', 'rightsAndReproduction',
       'linkResource', 'metadataDate', 'repository', 'objectURL', 'tags',
       'objectWikidata_URL', 'isTimelineWork', 'GalleryNumber'],
      dtype='object')
Data Types: objec

# Data Cleaning:
Handle missing values and duplicates.

In [11]:
df

Unnamed: 0,objectID,isHighlight,accessionNumber,accessionYear,isPublicDomain,primaryImage,primaryImageSmall,additionalImages,constituents,department,...,classification,rightsAndReproduction,linkResource,metadataDate,repository,objectURL,tags,objectWikidata_URL,isTimelineWork,GalleryNumber
0,1,False,1979.486.1,1979.0,False,,,[],"[{'constituentID': 164292, 'role': 'Maker', 'n...",The American Wing,...,,,,2021-04-06T04:41:04.967Z,"Metropolitan Museum of Art, New York, NY",https://www.metmuseum.org/art/collection/search/1,,,False,
1,2,False,1980.264.5,1980.0,False,,,[],"[{'constituentID': 1079, 'role': 'Maker', 'nam...",The American Wing,...,,,,2021-04-06T04:41:04.967Z,"Metropolitan Museum of Art, New York, NY",https://www.metmuseum.org/art/collection/search/2,,,False,
2,3,False,67.265.9,1967.0,False,,,[],,The American Wing,...,,,,2021-04-06T04:41:04.967Z,"Metropolitan Museum of Art, New York, NY",https://www.metmuseum.org/art/collection/search/3,,,False,
3,4,False,67.265.10,1967.0,False,,,[],,The American Wing,...,,,,2024-01-10T04:57:19.843Z,"Metropolitan Museum of Art, New York, NY",https://www.metmuseum.org/art/collection/search/4,,,False,
4,5,False,67.265.11,1967.0,False,,,[],,The American Wing,...,,,,2024-01-10T04:57:19.843Z,"Metropolitan Museum of Art, New York, NY",https://www.metmuseum.org/art/collection/search/5,,,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,3274,False,62.89.10,1962.0,True,https://images.metmuseum.org/CRDImages/ad/orig...,https://images.metmuseum.org/CRDImages/ad/web-...,[],"[{'constituentID': 1557, 'role': 'Maker', 'nam...",The American Wing,...,,,,2023-02-07T04:46:51.34Z,"Metropolitan Museum of Art, New York, NY",https://www.metmuseum.org/art/collection/searc...,,https://www.wikidata.org/wiki/Q116370596,False,774.0
996,3275,False,41.34.3,1941.0,False,,,[],"[{'constituentID': 1558, 'role': 'Maker', 'nam...",The American Wing,...,,,,2023-02-07T04:46:51.34Z,"Metropolitan Museum of Art, New York, NY",https://www.metmuseum.org/art/collection/searc...,,https://www.wikidata.org/wiki/Q116391822,False,774.0
997,3276,False,41.34.5,1941.0,True,https://images.metmuseum.org/CRDImages/ad/orig...,https://images.metmuseum.org/CRDImages/ad/web-...,[],"[{'constituentID': 1559, 'role': 'Maker', 'nam...",The American Wing,...,,,,2023-02-07T04:46:51.34Z,"Metropolitan Museum of Art, New York, NY",https://www.metmuseum.org/art/collection/searc...,,https://www.wikidata.org/wiki/Q116370590,False,774.0
998,3277,False,62.89.7,1962.0,True,https://images.metmuseum.org/CRDImages/ad/orig...,https://images.metmuseum.org/CRDImages/ad/web-...,[],"[{'constituentID': 1559, 'role': 'Maker', 'nam...",The American Wing,...,,,,2023-02-07T04:46:51.34Z,"Metropolitan Museum of Art, New York, NY",https://www.metmuseum.org/art/collection/searc...,,https://www.wikidata.org/wiki/Q116370584,False,774.0


In [13]:
df_cleaned

Unnamed: 0,objectID,isHighlight,accessionNumber,accessionYear,isPublicDomain,primaryImage,primaryImageSmall,additionalImages,constituents,department,...,classification,rightsAndReproduction,linkResource,metadataDate,repository,objectURL,tags,objectWikidata_URL,isTimelineWork,GalleryNumber
0,1,False,1979.486.1,1979.0,False,,,[],"[{'constituentID': 164292, 'role': 'Maker', 'n...",The American Wing,...,,,,2021-04-06T04:41:04.967Z,"Metropolitan Museum of Art, New York, NY",https://www.metmuseum.org/art/collection/search/1,,,False,
1,2,False,1980.264.5,1980.0,False,,,[],"[{'constituentID': 1079, 'role': 'Maker', 'nam...",The American Wing,...,,,,2021-04-06T04:41:04.967Z,"Metropolitan Museum of Art, New York, NY",https://www.metmuseum.org/art/collection/search/2,,,False,
2,3,False,67.265.9,1967.0,False,,,[],,The American Wing,...,,,,2021-04-06T04:41:04.967Z,"Metropolitan Museum of Art, New York, NY",https://www.metmuseum.org/art/collection/search/3,,,False,
3,4,False,67.265.10,1967.0,False,,,[],,The American Wing,...,,,,2024-01-10T04:57:19.843Z,"Metropolitan Museum of Art, New York, NY",https://www.metmuseum.org/art/collection/search/4,,,False,
4,5,False,67.265.11,1967.0,False,,,[],,The American Wing,...,,,,2024-01-10T04:57:19.843Z,"Metropolitan Museum of Art, New York, NY",https://www.metmuseum.org/art/collection/search/5,,,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,3274,False,62.89.10,1962.0,True,https://images.metmuseum.org/CRDImages/ad/orig...,https://images.metmuseum.org/CRDImages/ad/web-...,[],"[{'constituentID': 1557, 'role': 'Maker', 'nam...",The American Wing,...,,,,2023-02-07T04:46:51.34Z,"Metropolitan Museum of Art, New York, NY",https://www.metmuseum.org/art/collection/searc...,,https://www.wikidata.org/wiki/Q116370596,False,774.0
996,3275,False,41.34.3,1941.0,False,,,[],"[{'constituentID': 1558, 'role': 'Maker', 'nam...",The American Wing,...,,,,2023-02-07T04:46:51.34Z,"Metropolitan Museum of Art, New York, NY",https://www.metmuseum.org/art/collection/searc...,,https://www.wikidata.org/wiki/Q116391822,False,774.0
997,3276,False,41.34.5,1941.0,True,https://images.metmuseum.org/CRDImages/ad/orig...,https://images.metmuseum.org/CRDImages/ad/web-...,[],"[{'constituentID': 1559, 'role': 'Maker', 'nam...",The American Wing,...,,,,2023-02-07T04:46:51.34Z,"Metropolitan Museum of Art, New York, NY",https://www.metmuseum.org/art/collection/searc...,,https://www.wikidata.org/wiki/Q116370590,False,774.0
998,3277,False,62.89.7,1962.0,True,https://images.metmuseum.org/CRDImages/ad/orig...,https://images.metmuseum.org/CRDImages/ad/web-...,[],"[{'constituentID': 1559, 'role': 'Maker', 'nam...",The American Wing,...,,,,2023-02-07T04:46:51.34Z,"Metropolitan Museum of Art, New York, NY",https://www.metmuseum.org/art/collection/searc...,,https://www.wikidata.org/wiki/Q116370584,False,774.0


In [14]:
# Drop rows where all values are missing
df_cleaned = df.dropna(how='all')

# Save the cleaned data to a new CSV file
df_cleaned.to_csv('metdata_cleaned.csv', index=False)