In [None]:
#from modelscope import snapshot_download
#model_dir = snapshot_download('LLM-Research/Meta-Llama-3-8B-Instruct', cache_dir='.')

In [3]:
import pandas as pd

In [39]:
class StreamGenerator:
    """Issue sequential identifiers of the form ``SCHOOL-DEP-NNN``.

    ``SCHOOL`` is the school code left-padded with zeros to at least three
    characters, ``DEP`` is the upper-cased first three characters of the
    department name, and ``NNN`` is a running number (zero-padded to at least
    three digits) that starts at 1 for every distinct ``SCHOOL-DEP`` prefix.
    """

    def __init__(self):
        # Maps each emitted "SCHOOL-DEP" prefix to the last serial number issued for it.
        self.counter = {}

    def generate_code(self, school: str, department: str, professor: str) -> str:
        """Return the next unused code for the given school and department.

        Args:
            school: School code as a string; zero-padded to three characters.
            department: Department name; only its first three characters
                (upper-cased) appear in the code.
            professor: Accepted for interface compatibility; it does not
                influence the generated code.

        Returns:
            A string such as ``"064-PHY-001"``.
        """
        school_code = school.zfill(3)
        dept_code = department[:3].upper()

        # The counter is keyed on the prefix that actually appears in the code,
        # not on the raw inputs. Keying on the raw strings would give inputs that
        # collapse to the same prefix (e.g. "Physics" and "Physical Chemistry",
        # or "64" and "064") independent counters and therefore identical codes.
        prefix = f"{school_code}-{dept_code}"
        serial = self.counter.get(prefix, 0) + 1
        self.counter[prefix] = serial

        return f"{prefix}-{str(serial).zfill(3)}"

In [40]:
# Read the faculty table that already carries a school code per row.
# 'code' is read as text: with type inference a zero-padded code such as "064"
# becomes the integer 64, and if any row has no code the whole column becomes
# float, so str(64.0).zfill(3) would yield "64.0" instead of "064".
# As a consequence the 'code' column is also written back zero-padded.
df = pd.read_csv('Physics_with_code.csv', dtype={'code': str}).drop_duplicates()

# One generator for the whole table so serial numbers keep counting across rows.
generator = StreamGenerator()

# Generate the codes in row order. A plain comprehension is used instead of a
# row-wise DataFrame.apply so an empty table simply yields an empty column.
df['Unique_Code'] = [
    generator.generate_code(str(code), 'Physics', faculty)
    for code, faculty in zip(df['code'], df['Faculty'])
]

# Save the table, now including 'Unique_Code', to a new CSV file.
df.to_csv('output_with_codes.csv', index=False)

In [41]:
import pandas as pd  

# Reload the generated table from disk.
file_path = 'output_with_codes.csv'
df = pd.read_csv(file_path)

# Flag every row whose 'Unique_Code' occurs more than once; keep=False marks
# all members of a repeated group rather than only the later occurrences.
is_repeated = df['Unique_Code'].duplicated(keep=False)
duplicates = df[is_repeated]

# Report the outcome.
if duplicates.empty:
    print("'unique_code'列中没有重复的行。")
else:
    print("以下是'unique_code'列中重复的行：")
    print(duplicates)

'unique_code'列中没有重复的行。


In [42]:
# Print the 'Unique_Code' column of the table loaded in the previous cell
# (pandas abbreviates long Series when printing).
print(df['Unique_Code'])

0      064-PHY-001
1      064-PHY-002
2      064-PHY-003
3      064-PHY-004
4      064-PHY-005
          ...     
267    411-PHY-001
268    411-PHY-002
269    411-PHY-003
270    411-PHY-004
271    411-PHY-005
Name: Unique_Code, Length: 272, dtype: object


In [46]:
import pandas as pd  

# Load the coded faculty table.
file_path = 'output_with_codes.csv'
df = pd.read_csv(file_path)

# Numeric priority -> text label (1 maps to 'Very High', 4 to 'Low').
priority_mapping = {
    4: 'Low',
    3: 'Medium',
    2: 'High',
    1: 'Very High',
}

# Relabel 'send_priority' when the column is present.
if 'send_priority' in df.columns:
    # This cell writes its result back over the file it reads, so it can be run
    # on data that is already labelled. Series.map() turns every value absent
    # from the mapping into NaN, which would erase all labels on a second run.
    # Keep the existing value wherever the mapping has no entry.
    labelled = df['send_priority'].map(priority_mapping)
    df['send_priority'] = labelled.where(labelled.notna(), df['send_priority'])
else:
    print("'send_priority'列在数据集中不存在。")

# Write the table back to the same CSV file.
output_file = 'output_with_codes.csv'
df.to_csv(output_file, index=False)

print(f"已生成文件：{output_file}")

已生成文件：output_with_codes.csv


In [47]:
import pandas as pd  

# Load the final table that is to be rendered as text.
df = pd.read_csv('output_with_codes.csv')

# Emit one newline-terminated line of English text per table row into Physics.txt.
with open('Physics.txt', 'w', encoding='utf-8') as file:
    for _, record in df.iterrows():
        # Assemble the line from its fragments, in output order.
        fragments = [
            f"Ranked {record['Rank']}, {record['University']} has a faculty member ",
            f"named {record['Faculty']} who holds the title of {record['Title']}. ",
            f"They are involved in {record['Research']} research. ",
            f"More information can be found on their website: {record['Website']}. ",
            f"Contact them via email: {record['Email']}. ",
            f"Status: {record['Status']}, Chinese: {record['is_chinese']}, ",
            f"Send Priority: {record['send_priority']}, ",
            f"Full Research: {record['full_research']}, ",
            f"Unique Code: {record['Unique_Code']}.\n",
        ]
        file.write("".join(fragments))

## 学校代码

In [12]:
import pandas as pd  

# Load the ranking table.
input_file = 'qs-world-rankings-2025.csv'
df = pd.read_csv(input_file)

# Keep at most the first 500 rows.
# NOTE(review): this is only the "top 500" if the file is ordered by rank — confirm.
df_top_500 = df.head(500)

# Sort by the full institution name (default, case-sensitive string ordering).
df_sorted_by_name = df_top_500.sort_values(by='Institution Name')

# Number the sorted rows 001, 002, ... using the actual row count. A hard-coded
# range(1, 501) raises a length-mismatch ValueError whenever the input file
# holds fewer than 500 rows.
df_sorted_by_name['code'] = [
    str(position).zfill(3) for position in range(1, len(df_sorted_by_name) + 1)
]

# Keep only the institution name and its code.
df_result = df_sorted_by_name[['Institution Name', 'code']]

# Save the lookup table to a new CSV file.
output_file = 'schol_code.csv'
df_result.to_csv(output_file, index=False)

print(f"已生成文件：{output_file}")

已生成文件：schol_code.csv


In [25]:
import pandas as pd  
import warnings  
# Silence FutureWarning for the rest of the kernel session (kept from the
# original cell so behaviour elsewhere in the session is unchanged).
warnings.simplefilter(action='ignore', category=FutureWarning)


# Load the physics faculty table.
physics_file = 'Physics.csv'
physics_df = pd.read_csv(physics_file)

# Load the school-code lookup table. 'code' is read as text so zero-padded
# values such as "064" are not converted to the integer 64.
school_code_file = 'schol_code.csv'
school_code_df = pd.read_csv(school_code_file, dtype={'code': str})

# Left-join on the university name so every faculty row is kept, matched or not.
result_df = pd.merge(physics_df, school_code_df, left_on='University', right_on='Institution Name', how='left')

# Report universities that found no code. Casting the column with astype(str)
# would turn the missing values into the literal text "nan" and hide the problem.
unmatched = result_df['code'].isna()
if unmatched.any():
    missing_names = result_df.loc[unmatched, 'University'].drop_duplicates().tolist()
    print(f"警告：有 {int(unmatched.sum())} 行未匹配到学校代码：{missing_names}")

# Pad codes to three characters; .str.zfill leaves missing values missing.
result_df['code'] = result_df['code'].str.zfill(3)

# Save the merged table to a new CSV file.
output_file = 'Physics_with_code.csv'
result_df.to_csv(output_file, index=False)

print(f"已生成文件：{output_file}")

已生成文件：Physics_with_code.csv
