# Script to create a CSV that can be used with VanderBot to update labels/titles in Wikidata

First open the necessary data files. 

In [None]:
import pandas as pd

# Load commons_images.csv; dtype=str keeps every column as text and
# na_filter=False leaves empty cells as empty strings instead of NaN
commons_images_df = pd.read_csv(
    '../commons_images.csv',
    dtype=str,
    na_filter=False,
)

# Load the spreadsheet of replacement titles (updated_titles.csv) with the same settings
titles_df = pd.read_csv(
    'updated_titles.csv',
    dtype=str,
    na_filter=False,
)
titles_df.head()

Go through each of the rows in titles_df and create a corresponding row in the output CSV file

In [None]:
# Load the artwork metadata (act_artworks.csv); it is used further down to look up
# the title_uuid values of the title claims that need to be deleted
item_metadata_df = pd.read_csv(
    '../act_artworks.csv',
    dtype=str,
    na_filter=False,
)
item_metadata_df.head()

In [None]:
# Build new_titles.csv: one row per entry in titles_df, giving the Q ID of the
# artwork and its new title (written to both the label_en and title columns).
# The Q ID is found by matching the act_id value against the local_identifier
# column of commons_images_df.

# Map each ACT ID to its Q ID once, rather than scanning the dataframe for every title.
# setdefault keeps the first Q ID seen for an ACT ID, i.e. the same row that a
# first-match lookup on the dataframe would return.
qid_by_act_id = {}
for local_identifier, known_qid in zip(commons_images_df['local_identifier'], commons_images_df['qid']):
    qid_by_act_id.setdefault(local_identifier, known_qid)

# Fixed column order, so the CSV still gets its header row when no title is matched
output_columns = ['qid', 'label_en', 'title_uuid', 'title']

# Lists for the output row dicts and for the ACT IDs that could not be matched
rows = []
unmatched_act_ids = []

# Iterate over the ACT ID / new title pairs of the titles_df dataframe
for act_id, new_title in zip(titles_df['act_id'], titles_df['new_title']):
    # Replace every run of whitespace (newlines, carriage returns, tabs, repeated
    # spaces) with a single space and trim the ends. A single replace('  ', ' ')
    # pass would leave doubled spaces behind whenever three or more occur in a row.
    title = ' '.join(new_title.split())

    # Look up the Q ID of the artwork using the ACT ID
    qid = qid_by_act_id.get(act_id)
    if qid is None:
        # Without a Q ID no usable row can be written; remember the ID and report it below
        unmatched_act_ids.append(act_id)
        continue

    # title_uuid is left empty here
    # NOTE(review): presumably VanderBot fills it in when it writes the new title statement - confirm
    rows.append({'qid': qid, 'label_en': title, 'title_uuid': '', 'title': title})

# Create a new dataframe from the rows list
new_df = pd.DataFrame(rows, columns=output_columns)

# Write the new dataframe to a CSV file
new_df.to_csv('new_titles.csv', index=False)

# Report skipped titles so they can be fixed in the input files and the cell re-run
if unmatched_act_ids:
    print('No Q ID found for', len(unmatched_act_ids), 'ACT ID(s); these titles were skipped:')
    for unmatched_act_id in unmatched_act_ids:
        print('   ', unmatched_act_id)
print('done')


# Create the CSV file with the IDs of the title claims that need to be deleted by VanderDeleteBot

In [None]:
# Build deletions.csv: the qid and title_uuid of every row in item_metadata_df
# whose qid also appears in new_df, i.e. the existing title claims of the items
# that are getting a new title.

# Collect the Q IDs once into a set so that each membership test is a constant-time
# lookup. DataFrame.get returns the fallback when new_df was built from an empty
# list and therefore has no qid column.
qids_with_new_titles = set(new_df.get('qid', []))

# Keep the qid and title_uuid of the metadata rows whose item is getting a new title
# NOTE(review): rows whose title_uuid is an empty string are kept as well - confirm
# that VanderDeleteBot copes with them, or filter them out before running it
title_uuid_list = [
    {'qid': qid, 'title_uuid': title_uuid}
    for qid, title_uuid in zip(item_metadata_df['qid'], item_metadata_df['title_uuid'])
    if qid in qids_with_new_titles
]

# Create a dataframe from the list of title_uuid dicts; naming the columns
# explicitly keeps the header row in the CSV even when the list is empty
title_uuid_df = pd.DataFrame(title_uuid_list, columns=['qid', 'title_uuid'])

# Save the dataframe to a CSV file
title_uuid_df.to_csv('deletions.csv', index=False)
