In [7]:
import requests

def make_api_call(timeout=30):
    """Query the depositar ``package_search`` action for keyword facet counts.

    The request is sent with ``rows=0``, ``facet.field=["keywords_facet"]``
    and ``facet.limit=1000``.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on an unresponsive host.

    Returns:
        The decoded JSON body on HTTP 200, or ``None`` when the request
        fails, returns another status code, or the body is not valid JSON.
        Failures are reported with ``print``.
    """
    url = "https://data.depositar.io/api/action/package_search"

    # Query parameters for the facet request
    params = {
        "facet.field": '["keywords_facet"]',
        "facet.limit": 1000,
        "rows": 0,
    }

    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # Covers connection errors and timeouts alike.
        print(f"An error occurred: {e}")
        return None

    if response.status_code != 200:
        print(f"API request failed with status code: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError as e:
        # JSON decoding errors are ValueError subclasses in every requests version.
        print(f"An error occurred: {e}")
        return None

# Run the request when this code executes as the main module and keep the
# response in `api_data`; it is None when make_api_call reports a failure.
if __name__ == "__main__":
    api_data = make_api_call()

In [8]:
import pandas as pd
# Facet counts as returned by the API: a mapping of keyword ID -> count.
data = api_data["result"]["facets"]["keywords_facet"]

# Turn the mapping into a two-column table, one (ID, Value) row per entry;
# pandas supplies the default integer index.
df = pd.DataFrame(
    [(keyword_id, count) for keyword_id, count in data.items()],
    columns=["ID", "Value"],
)

# Show the table
print(df)

            ID  Value
0     Q9578202     51
1    Q11070045     49
2     Q7481418     48
3      Q484000     46
4         Q865     46
..         ...    ...
465  Q96977107      1
466  Q96977148      1
467  Q97173495      1
468        Q98      1
469  Q98692446      1

[470 rows x 2 columns]


In [9]:
def get_claims_for_item(item_id, timeout=30):
    """Fetch the ``claims`` section of one Wikidata entity.

    Calls the ``wbgetentities`` action of the Wikidata API with
    ``props=claims`` for the given ID.

    Args:
        item_id: Entity ID to look up (the notebook passes IDs such as
            ``"Q865"``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The entity's ``claims`` mapping (callers in this notebook treat its
        keys as property IDs), or ``None`` when the response has no entry
        for ``item_id`` or that entry carries no ``claims``.

    Raises:
        requests.exceptions.RequestException: on connection problems,
            timeouts, or a non-success HTTP status.
    """
    base_url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbgetentities",
        "ids": item_id,
        "format": "json",
        "props": "claims",
    }

    response = requests.get(base_url, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    data = response.json()

    # An API-level error payload has no "entities" key, and a missing entity
    # has no "claims" key; both cases yield None rather than a KeyError.
    entity = data.get("entities", {}).get(item_id)
    if entity is None:
        return None
    return entity.get("claims")

d = {}  # keyword ID -> list of claim keys (PIDs) found on that item
total_pid = []  # every PID seen, duplicates included
# Use the first 50 keywords of the facet table to derive the overall PID
# structure. head() copes with tables shorter than 50 rows and does not
# depend on the index labels being 0..n-1.
for item_id in df["ID"].head(50):
    item_claims = get_claims_for_item(item_id)
    if item_claims is None:
        # No claims could be retrieved for this ID; skip it instead of
        # crashing on None.keys().
        continue
    pids = list(item_claims.keys())
    d[item_id] = pids
    total_pid.extend(pids)
#print(d)


In [10]:
# Distinct PIDs in first-seen order. dict.fromkeys keeps insertion order and
# drops duplicates in a single pass, avoiding the quadratic cost of testing
# list membership for every element.
resultantList = list(dict.fromkeys(total_pid))

#print(resultantList)

In [11]:
import gzip
import os
import time
# Path of the compressed dataset dump
gzip_file_path = 'datasets.jsonl.gz'

# Path the decompressed copy is written to
output_file_path = 'datasets.jsonl'

# Decompress the .gz archive and store it as a plain .jsonl file
with gzip.open(gzip_file_path, 'rb') as gzip_file:
    # fileno() refers to the underlying file on disk, so this records the
    # modification time of the archive itself.
    file_stat = os.fstat(gzip_file.fileno())
    last_edit_time = file_stat.st_mtime
    last_edit_time_str = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(last_edit_time))
    with open(output_file_path, 'wb') as output_file:
        # Stream in 1 MiB pieces until read() returns b'' at end of file;
        # the whole dump is never held in memory at once.
        for chunk in iter(lambda: gzip_file.read(1024 * 1024), b''):
            output_file.write(chunk)


In [12]:
import json

def process_data_qid(data):
    """Map one dataset record to ``{title: keywords}``.

    Args:
        data: Mapping for a single dataset; must contain ``'title'``.

    Returns:
        A one-entry dict whose key is the record's title and whose value is
        the record's ``'keywords'`` entry, or an empty list when the record
        has no such key.

    Raises:
        KeyError: if ``data`` has no ``'title'`` key.
    """
    # The record's 'name' (the trailing part of the dataset's depositar URL)
    # is deliberately not used here.
    dataset_title = data['title']
    if 'keywords' in data:
        dataset_keywords = data['keywords']
    else:
        dataset_keywords = []
    return {dataset_title: dataset_keywords}

# title -> keywords for every record in the decompressed dump. Records that
# share a title overwrite each other because dict.update keeps the last one.
result_data = {}
# Decode explicitly as UTF-8: the titles contain non-ASCII text, and the
# platform default encoding is not UTF-8 everywhere.
with open(output_file_path, 'r', encoding='utf-8') as jsonl_file:
    for line in jsonl_file:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        data = json.loads(line)
        result_data.update(process_data_qid(data))


In [13]:
# Dataset title -> PID list of the first of its keywords that has an entry in `d`.
# Datasets with no keyword in `d` are left out.
# The former progress counter was named `time`, which replaced the imported
# `time` module with an int; it was unused, so it is removed.
filtered_data_PID = {}
for key, value in result_data.items():
    # `value or ()` also tolerates a record whose keywords field is null.
    for qid in value or ():
        if qid in d:
            filtered_data_PID[key] = d[qid]
            break



In [14]:
# Binary dataset-by-PID matrix: one row per dataset title, one column per
# distinct PID. It is created already filled with 0, so it has an integer
# dtype from the start instead of an object-dtype NaN frame patched up later
# with an in-place fillna.
df = pd.DataFrame(0, index=list(filtered_data_PID), columns=resultantList)

# Mark the PIDs attached to each dataset with 1
for row_name, pids in filtered_data_PID.items():
    df.loc[row_name, pids] = 1

# Display the resulting DataFrame
#print(df)

# Display the resulting DataFrame
#print(df)

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows of the dataset-by-PID matrix
cosine_sim_matrix = cosine_similarity(df)

# Label both axes of the similarity matrix with the row labels of `df`
row_labels = df.index
cosine_sim_df = pd.DataFrame(
    cosine_sim_matrix,
    index=row_labels,
    columns=row_labels,
)

# Display the cosine similarity matrix
#print(cosine_sim_df)


In [16]:
# Name of the Excel file to write
file_name = 'SimilarityofFilesfromPIDs.xlsx'

# Write the similarity matrix to Excel. index=True stores the row labels as
# the first column of the sheet. No `encoding` argument is passed: to_excel
# no longer accepts one in pandas 2.x and raises TypeError if it is given.
cosine_sim_df.to_excel(file_name, index=True)

print('SimilarityofQIDfromPIDs record successfully exported into Excel File')




SimilarityofQIDfromPIDs record successfully exported into Excel File


In [17]:
# Reload the exported similarity matrix from the Excel file; note this rebinds `df`.
# No index column is specified, so the sheet's first column is read as ordinary data.
df = pd.read_excel('SimilarityofFilesfromPIDs.xlsx')
# Preview the first five rows
df.head(5)

Unnamed: 0.1,Unnamed: 0,2023-08-05 Nanshan Gravesite Panoramic Images -- Ricoh Theta Z1,2023-08-05 Nanshan Gravesite Images -- Sony ZV-E10,0802進度報告,COVID-19 Web Archives (Taiwan),LTSER Lyudao 生態觀測–造礁珊瑚多樣性,LTSER Lyudao 生態觀測–珊瑚入添,LTSER Lyudao 珊瑚礁水下聲景調查資料,LTSER Lyudao 綠島珊瑚礁水下聲景 (202210-202303),LTSER Lyudao 海洋觀測-海溫,...,Cartien vant Canael en den Inwyck van Wankan(魍港水道與入口的小海圖),Afbeeldinge van de Westzijde vant Eijlandt Formosa(福爾摩沙島西部圖),Descripcion del Pverto de los Olandeses en Ysla Hermosa(艾爾摩沙島荷蘭人港口描述圖),Packan also tselve beseijlt is door Jacob Noordeloos(北港，即如此由Jacob Noordeloos航行完成的),澎湖至大員島、魍港、漁夫灣海圖,澎湖與福爾摩沙島圖,臺灣海峽(含廈門灣、澎湖群島、臺灣西南沿岸)圖,魍港至大員圖,臺灣海峽(含廈門灣、澎湖群島、臺灣西南沿岸)海圖,地下水流模型(M值)
0,2023-08-05 Nanshan Gravesite Panoramic Images ...,1.0,1.0,0.124035,0.11873,0.0,0.0,0.205196,0.205196,0.0,...,0.153393,0.153393,0.2,0.153393,0.153393,0.153393,0.153393,0.153393,0.153393,0.256495
1,2023-08-05 Nanshan Gravesite Images -- Sony ZV...,1.0,1.0,0.124035,0.11873,0.0,0.0,0.205196,0.205196,0.0,...,0.153393,0.153393,0.2,0.153393,0.153393,0.153393,0.153393,0.153393,0.153393,0.256495
2,0802進度報告,0.124035,0.124035,1.0,0.055225,0.0,0.0,0.063628,0.063628,0.0,...,0.047565,0.047565,0.041345,0.047565,0.047565,0.047565,0.047565,0.047565,0.047565,0.095443
3,COVID-19 Web Archives (Taiwan),0.11873,0.11873,0.055225,1.0,0.0,0.0,0.152269,0.152269,0.0,...,0.216272,0.216272,0.257249,0.216272,0.216272,0.216272,0.216272,0.216272,0.216272,0.441579
4,LTSER Lyudao 生態觀測–造礁珊瑚多樣性,0.0,0.0,0.0,0.0,1.0,1.0,0.229416,0.229416,1.0,...,0.171499,0.171499,0.149071,0.171499,0.171499,0.171499,0.171499,0.171499,0.171499,0.0


In [18]:
def process_data(df, to_search, threshold=0.5, top_n=3):
    """List the highest-scoring columns for one row of a similarity table.

    Args:
        df: DataFrame whose first column holds the row names and whose
            remaining columns hold numeric scores, one column per file.
        to_search: Row name to look up in the first column. If several rows
            match, only the first is used.
        threshold: Only scores strictly greater than this value are kept.
        top_n: Maximum number of entries to return.

    Returns:
        A list of strings ``"<rank>: <column> ,分數是<score>"`` ordered by
        descending score. The list holds fewer than ``top_n`` entries when
        fewer scores exceed the threshold, and is empty when ``to_search``
        is not found. If the table has a column for the searched file
        itself, that column is ranked like any other.
    """
    matching_rows = df[df.iloc[:, 0] == to_search]
    if matching_rows.empty:
        # Unknown name: nothing to rank (previously an IndexError).
        return []

    # Scores of the first matching row, without the name column.
    scores = matching_rows.iloc[0, 1:].astype(float)

    # Drop everything at or below the threshold *before* taking the top N,
    # so filtered-out entries can no longer appear in the output as "nan".
    best = scores[scores > threshold].sort_values(ascending=False).iloc[:top_n]

    return [
        f"{rank}: {filename} ,分數是{score}"
        for rank, (filename, score) in enumerate(best.items(), start=1)
    ]

In [19]:
## 隨機產生
import random
import time
from IPython.display import clear_output

# The candidate names do not change between rounds, so build the list once.
allfiles = df.iloc[:, 0].tolist()

for demo_round in range(10):
    # random.choice picks uniformly over every entry. The earlier
    # randint(1, len(allfiles)) could never pick index 0 and could return
    # len(allfiles), which is one past the end of the list.
    to_search = random.choice(allfiles)
    results = process_data(df, to_search)
    print(f"輸入的文件名稱是: 「{to_search}」")
    print("===============================")
    for result in results:
        print(result)
    time.sleep(5)
    clear_output(wait=True)  # clear the previous round's output
print("=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=")
print("Demo完成")
print("=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=")

=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=
Demo完成
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=
