In [None]:
# imports
import pandas as pd


First, we need to get the statement UUIDs from the Wikidata upload record and put them in a separate CSV file that VanderDeleteBot can use to remove the P31 statements.


In [None]:
# Both inputs are read entirely as strings. na_filter=False keeps blank cells
# as empty strings instead of NaN, which the later `!= ''` checks rely on.

# Wikidata upload record
wikidata_data = pd.read_csv(
    'act_artworks.csv',
    dtype=str,
    na_filter=False,
)

# Richardson data dump
richardson_data = pd.read_csv(
    'richardson.csv',
    dtype=str,
    na_filter=False,
)


In [None]:
# Collect the qid and instance_of_uuid of every Wikidata upload row whose
# Richardson record has a non-empty classification. Rows are matched on
# wikidata_data['act'] == richardson_data['RecordNumber']; when several upload
# rows share the same act value, only the first one is used.
#
# The records are gathered in a plain list and turned into a dataframe once at
# the end. DataFrame.append() was removed in pandas 2.0, and growing a
# dataframe row by row is quadratic anyway.
deletion_records = []
for index, row in richardson_data.iterrows():
    if row['classification'] == '':
        continue

    # Compute the match once and reuse it for both columns.
    matches = wikidata_data.loc[
        wikidata_data['act'] == row['RecordNumber'],
        ['qid', 'instance_of_uuid'],
    ]

    # Test for "no match" explicitly rather than with a bare except, so that
    # genuine errors (e.g. a misnamed column) surface instead of being
    # reported as missing matches.
    if matches.empty:
        print('No match found for ' + row['RecordNumber'])
        continue

    first_match = matches.iloc[0]
    deletion_records.append({
        'qid': first_match['qid'],
        'instance_of_uuid': first_match['instance_of_uuid'],
    })

# Passing columns explicitly keeps the header row even when nothing matched.
deletions_df = pd.DataFrame(deletion_records, columns=['qid', 'instance_of_uuid'])

# Write the deletions_df dataframe to a csv file
deletions_df.to_csv('deletions.csv', index=False)


Now we need to replace the existing P31 statements with improved ones, using the classification label/qid mappings from the `category_breakdown.csv` file.

In [8]:
classification_mappings = pd.read_csv('category_breakdown.csv', na_filter=False, dtype = str)

# Build a classification label -> qid lookup once instead of rescanning the
# mappings dataframe for every row. setdefault keeps the first qid listed for
# a label, which is what taking .iloc[0] of the filtered rows would return.
qid_by_label = {}
for label, mapped_qid in zip(classification_mappings['classification_label'],
                             classification_mappings['qid']):
    qid_by_label.setdefault(label, mapped_qid)

# Columns that describe the existing P31 statement. They are blanked so the
# row no longer carries the old statement's identifiers.
# NOTE(review): presumably blank identifiers make the upload tool write a new
# statement - confirm against the uploader's documentation.
stale_statement_columns = (
    'instance_of_uuid',
    'instance_of_ref1_hash',
    'instance_of_ref1_retrieved_nodeId',
)

# Step through each row in the richardson_data dataframe and replace the value
# in the instance_of column of the wikidata_data dataframe with the qid mapped
# to that row's classification. Rows are matched on act == RecordNumber.
# NOTE: this modifies wikidata_data in place, so re-running an earlier cell
# that reads instance_of_uuid afterwards will see the blanked values.
unmapped_labels = set()
for index, row in richardson_data.iterrows():
    classification = row['classification']
    if classification == '':
        continue

    # Classifications without a mapping are skipped and reported once below,
    # rather than being hidden by a bare except that would also swallow real
    # errors such as a misnamed column.
    if classification not in qid_by_label:
        unmapped_labels.add(classification)
        continue
    instance_of = qid_by_label[classification]

    # Compute the row selector once; it is all-False (a no-op below) when the
    # record has no counterpart in the upload file.
    is_matching_act = wikidata_data['act'] == row['RecordNumber']
    wikidata_data.loc[is_matching_act, 'instance_of'] = instance_of
    for column in stale_statement_columns:
        wikidata_data.loc[is_matching_act, column] = ''

if unmapped_labels:
    print('Classifications with no mapping (rows left unchanged): '
          + ', '.join(sorted(unmapped_labels)))

# Sort the wikidata_data dataframe by the qid column. The values are strings,
# so the order is lexicographic ("Q10" sorts before "Q2").
wikidata_data = wikidata_data.sort_values(by=['qid'])

# Write the wikidata_data dataframe to a csv file
wikidata_data.to_csv('act_artworks_modified.csv', index=False)
