In [1]:
import pandas as pd

def load_excel(file_name):
    """Read the Excel file at ``file_name`` with ``pd.read_excel`` (default options) and return the result."""
    workbook_frame = pd.read_excel(file_name)
    return workbook_frame

def load_csv(file_name):
    """Read the CSV at ``file_name`` with ``pd.read_csv`` (default options) and return the result."""
    csv_frame = pd.read_csv(filepath_or_buffer=file_name)
    return csv_frame

def save_csv(df, file_name):
    """Write ``df`` to ``file_name`` as CSV, omitting the index column."""
    df.to_csv(path_or_buf=file_name, index=False)

def add_labels(df, label):
    """Set the 'Label' column of ``df`` to ``label`` and return ``df``.

    The frame passed in is modified in place; the returned object is that
    same frame, not a copy.
    """
    label_column = 'Label'
    df[label_column] = label
    return df

In [2]:
# Read the main HCR workbook.
hcr_data = load_excel('HCR 2023 Preliminary File.xlsx')

# Map each university to its InCites researcher CSV. Every file name follows
# the pattern "Incites Researchers - <university>.csv", so the mapping is
# derived from the university names.
universities = {
    name: f'Incites Researchers - {name}.csv'
    for name in (
        'Australian National University',
        'Monash University',
        'University of Adelaide',
        'University of Melbourne',
        'University of New South Wales Sydney',
        'University of Queensland',
        'University of Sydney',
        'University of Western Australia',
    )
}
print(hcr_data["primaryinstitution"])

0                            University of Valencia
1                        Chaim Sheba Medical Center
2                           University of Edinburgh
3                           University of Cambridge
4              Iran University Science & Technology
                           ...                     
7541                       Eotvos Lorand University
7542                   City University of Hong Kong
7543                        ShanghaiTech University
7544          Hospital for Sick Children (SickKids)
7545    Huazhong University of Science & Technology
Name: primaryinstitution, Length: 7546, dtype: object


In [3]:
# Keep only the HCR rows whose primary institution is exactly 'University of Sydney'.
is_usyd = hcr_data['primaryinstitution'].eq('University of Sydney')
hcr_usyd_data = hcr_data.loc[is_usyd]


In [4]:
# Read the University of Sydney InCites export.
usyd_data = load_csv("Incites -USYD.csv")
# Show the 'Name' column as the cell output.
usyd_data.loc[:, "Name"]

0                      Holmes, Edward C.
1        Islam, Sheikh Mohammed Shariful
2                           Tao, Dacheng
3                          George, Jacob
4                         Woodward, Mark
                      ...               
39321             Brown, David Alexander
39322                 Farquharson, K. A.
39323                         Lusink, V.
39324                  Ngwira, Memory M.
39325                    Wheatley, J. R.
Name: Name, Length: 39326, dtype: object

In [5]:
# Split the 'Name' column on commas; expand=True returns a new DataFrame
# with one column per comma-separated piece.
name_split = usyd_data['Name'].str.split(',', expand=True)

# Piece 0 becomes 'Lastname' and piece 1 becomes 'Firstname', each with
# surrounding whitespace stripped. Pieces after a second comma are not used.
for column, piece in (('Lastname', 0), ('Firstname', 1)):
    usyd_data[column] = name_split[piece].str.strip()


In [6]:
# Start every row at zero; resetting here means re-running the cell does not
# accumulate counts from a previous run.
usyd_data['hcr_label'] = 0
hcrnum = 0  # number of HCR entries processed; stays 0 if there are none
for hcrnum, (firstname, lastname) in enumerate(
    zip(hcr_usyd_data['firstname'], hcr_usyd_data['lastname']), start=1
):
    # Rows whose first and last name both equal this HCR entry exactly.
    mask = usyd_data["Firstname"].eq(firstname) & usyd_data["Lastname"].eq(lastname)

    if mask.any():
        # idxmax on a boolean Series gives the index label of the first True row.
        first_match_index = mask.idxmax()

        # Increment hcr_label on that first matching row only.
        usyd_data.loc[first_match_index, 'hcr_label'] += 1

        # Report the updated value.
        print(f"Updated row index {first_match_index}: hcr_label = {usyd_data.loc[first_match_index, 'hcr_label']}")

    print(f"Processed: {firstname}; {lastname}")  # name currently being processed
print(usyd_data)

Updated row index 74: hcr_label = 1
Processed: Adrian; Bauman
Updated row index 37: hcr_label = 1
Processed: Albert Y.; Zomaya
Updated row index 5331: hcr_label = 1
Processed: Alexander B.; McBratney
Updated row index 1239: hcr_label = 1
Processed: Anita; Ho-Baillie
Updated row index 686: hcr_label = 1
Processed: Arne; Geschke
Updated row index 97: hcr_label = 1
Processed: Budiman; Minasny
Updated row index 2: hcr_label = 1
Processed: Dacheng; Tao
Updated row index 2: hcr_label = 2
Processed: Dacheng; Tao
Updated row index 142: hcr_label = 1
Processed: David A.; Hensher
Updated row index 0: hcr_label = 1
Processed: Edward C.; Holmes
Updated row index 25: hcr_label = 1
Processed: Emmanuel; Stamatakis
Updated row index 28: hcr_label = 1
Processed: Georgina V.; Long
Updated row index 87: hcr_label = 1
Processed: Glenda M.; Halliday
Updated row index 23: hcr_label = 1
Processed: Ian B.; Hickie
Updated row index 3: hcr_label = 1
Processed: Jacob; George
Updated row index 7: hcr_label = 1
Pr

In [7]:
# Compare the total of hcr_label across all rows with the number of HCR
# entries processed. Each matched entry adds exactly 1 to hcr_label, so the
# two values are equal only when every processed entry found a match.
number = usyd_data['hcr_label'].sum()
print(number)
print(hcrnum)

28
28


In [8]:
# Save the DataFrame to an Excel file, leaving out the index column.
usyd_data.to_excel(
    excel_writer='usyd_data_output.xlsx',
    engine='openpyxl',
    index=False,
)
