In [40]:
import pymysql
import os
import json

# MySQL root password taken from the environment; None when MYSQL_ROOT_KEY is unset.
password = os.environ.get('MYSQL_ROOT_KEY')

def extract_pages(pages):
    """Format page number(s) as a single comma-separated string.

    Args:
        pages: A single page number (int), a string that already describes
            the page(s), an iterable of page numbers, or None.

    Returns:
        str: The pages joined with ", ". A lone int or string is returned as
        its string form; None yields an empty string.
    """
    if pages is None:
        return ''
    if isinstance(pages, (int, str)):
        # A string must be kept whole: iterating it would split "12" into
        # the characters "1, 2".
        return str(pages)
    return ', '.join(str(page) for page in pages)

def build_sql_insert(table, columns, values):
    """Build a literal ``INSERT INTO ... VALUES (...);`` statement.

    Args:
        table: Name of the target table (inserted verbatim).
        columns: Iterable of column names (inserted verbatim).
        values: Values aligned with ``columns``. Each value is rendered as:
            * ``None``  -> ``NULL``
            * a string of the form ``(SELECT ...)`` -> emitted verbatim as a
              subquery, without quotes
            * any other string -> a single-quoted SQL literal with embedded
              backslashes and single quotes escaped
            * anything else (e.g. int) -> ``str(value)``, unquoted

    Returns:
        str: The complete INSERT statement, terminated by ``;``.

    Note:
        Table and column names are not escaped, and subquery strings are
        trusted as-is; callers must escape anything they interpolate there.
    """
    formatted_values = []
    for value in values:
        if value is None:
            # str(None) would produce the bare token `None`, which is not SQL.
            formatted_values.append("NULL")
        elif isinstance(value, str):
            # Only "(SELECT ...)" counts as a subquery; ordinary text that
            # merely happens to be parenthesised, e.g. "(NER)", must be quoted.
            if value[:7].upper() == "(SELECT" and value.endswith(")"):
                formatted_values.append(value)
            else:
                # Backslash is an escape character in MySQL string literals
                # (unless NO_BACKSLASH_ESCAPES is enabled), so double it first,
                # then double single quotes.
                escaped = value.replace("\\", "\\\\").replace("'", "''")
                formatted_values.append("'" + escaped + "'")
        else:
            # Integers and other non-string values are emitted unquoted.
            formatted_values.append(str(value))

    return f"INSERT INTO {table} ({', '.join(columns)}) VALUES ({', '.join(formatted_values)});"


def run_sql_commands(connection, commands):
    """Execute SQL statements as one transaction and collect SELECT results.

    Args:
        connection: Open database connection providing ``cursor()``,
            ``commit()`` and ``rollback()``.
        commands: Iterable of SQL statement strings, executed in order.

    Returns:
        list: One ``cursor.fetchall()`` result per statement that starts with
        ``SELECT`` (case-insensitive, leading whitespace ignored). If a
        statement fails, the error is printed, the transaction is rolled back
        and the results collected so far are returned.
    """
    results = []
    committed = False
    try:
        with connection.cursor() as cursor:
            for command in commands:
                cursor.execute(command)
                if command.strip().upper().startswith('SELECT'):
                    results.append(cursor.fetchall())
        connection.commit()
        committed = True
    except pymysql.MySQLError as e:
        print(f"Database error: {e}")
    except Exception as e:
        print(f"Exception in transaction: {e}")
    finally:
        if not committed:
            # Discard the half-executed batch so it cannot be committed by a
            # later, unrelated call on the same connection.
            try:
                connection.rollback()
            except Exception as rollback_error:
                print(f"Rollback failed: {rollback_error}")
    return results

def load_data(file_path):
    """Load and parse a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or None if the file could not be read or
        parsed (the error is printed).
    """
    try:
        # Read as UTF-8 explicitly: the platform default encoding (e.g.
        # cp1252 on Windows) would corrupt or reject non-ASCII text.
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except Exception as e:
        print(f"Failed to load data from {file_path}: {e}")
        return None

# Open the MySQL connection used by the cells below; DictCursor makes rows come back as dicts.
connection = pymysql.connect(
    host='localhost',
    user='root',
    password=password,
    db='test',
    charset='utf8mb4',
    cursorclass=pymysql.cursors.DictCursor,
)


In [41]:
# Load the course-material metadata
data = load_data('data/json/gpt.json')
if data is None:
    # Raise instead of calling exit(): exit() is the interactive "quit the
    # session" helper and is not a clean way to abort a notebook cell.
    raise RuntimeError("Could not load 'data/json/gpt.json'; see the error printed above.")

sql_commands = []
existing_courses = set()  # course codes already in the database or queued in this run

# Fetch the course codes already stored so they are not inserted twice
try:
    with connection.cursor() as cursor:
        cursor.execute("SELECT course_code FROM courses")
        existing_courses = {course['course_code'] for course in cursor.fetchall()}
except pymysql.MySQLError as e:
    print(f"Database error: {e}")

# Queue one INSERT per course code that is not known yet
for file_title, details in data.items():
    # The course code is the part before the first underscore,
    # e.g. 'EE6405' in 'EE6405_W1_Introduction_to_NLP'
    course_code = file_title.split('_')[0]
    if course_code not in existing_courses:
        # Every course currently gets the same hard-coded name
        course_name = "Natural Language Processing"
        sql_commands.append(build_sql_insert('courses', ['course_code', 'course_name'], [course_code, course_name]))
        existing_courses.add(course_code)  # remember it so later files of the same course are skipped

# Run the queued statements
run_sql_commands(connection, sql_commands)


[]

AWS S3 

In [42]:
import boto3

# Create the S3 resource handle
s3 = boto3.resource('s3')

# Print every bucket name followed by the keys of the objects it contains.
# The object loop is nested so that each bucket is listed (not only the last
# one) and nothing is referenced when the account has no buckets.
for bucket in s3.buckets.all():
    print(bucket.name)
    for s3_object in bucket.objects.all():
        print(s3_object.key)

pass-gpt
code/Week 1.ipynb
code/Week 10.ipynb
code/Week 11.ipynb
code/Week 2.ipynb
code/Week 3.ipynb
code/Week 4.ipynb
code/Week 5.ipynb
code/Week 6.ipynb
code/Week 7.ipynb
code/Week 8.ipynb
code/Week 9.ipynb
slides/EE6405_W10_ A survey of NLP applications across diverse industries_For Students.pdf
slides/EE6405_W11_ Deep-dive into NLP_For Students.pdf
slides/EE6405_W1_Introduction to NLP_For Students.pdf
slides/EE6405_W2_Linguistic Analysis and Information Extraction_For Students.pdf
slides/EE6405_W3_Term Weighting Scheme and Topic Modelling_For Students.pdf
slides/EE6405_W4_Traditional ML and NLP Applications_For Students.pdf
slides/EE6405_W5_EMaWE_For Students.pdf
slides/EE6405_W6_NM.pdf
slides/EE6405_W7_Transformer.pdf
slides/EE6405_W8_HPT_For Students.pdf
slides/EE6405_W9_TLLMs_For Students.pdf


Slides Upload

In [43]:
import datetime

sql_commands = []
# One timestamp for the whole batch, so every row inserted by this cell matches
creation_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

for file_title, details in data.items():
    course_code = file_title.split('_')[0]
    file_name = file_title
    title = details["Title"]
    material_type = details['Material Type']
    file_path = f"slides/{file_title}"
    # "week 1" -> 1; also tolerates "Week 1", "week1" or a bare number
    teaching_week = int(str(details['Teaching Week']).lower().replace('week', '').strip())

    # build_sql_insert emits parenthesised subqueries verbatim, so the value
    # embedded in the subquery literal has to be escaped here.
    course_code_sql = course_code.replace("\\", "\\\\").replace("'", "''")

    # Build the INSERT for the files table
    insert_command = build_sql_insert(
        'files',
        ['course_id', 'file_name', 'title', 'file_type', 'file_path', 'teaching_week', 'creation_date'],
        [
            f"(SELECT course_id FROM courses WHERE course_code='{course_code_sql}')",  # resolve course_id from courses
            file_name,
            title,
            material_type,
            file_path,
            teaching_week,  # int, so it is emitted without quotes
            creation_date
        ]
    )
    sql_commands.append(insert_command)

# NOTE(review): re-running this cell queues the same files again; whether the
# files table rejects duplicates is not visible here. Confirm before re-running.
# Run the queued statements
run_sql_commands(connection, sql_commands)


[]

Concept

In [44]:
# INSERT statements for the top-level concepts
sql_commands_concepts = []
# (file_name, concept_name, subconcept_name, subconcept_pages) tuples, handled
# in a later cell once the parent concepts have ids
concept_to_subconcepts = []

for file_title, details in data.items():
    file_name = file_title
    # build_sql_insert emits parenthesised subqueries verbatim, so the file
    # name embedded in the subquery literal has to be escaped here.
    file_name_sql = file_name.replace("\\", "\\\\").replace("'", "''")
    file_id_subquery = f"(SELECT file_id FROM files WHERE file_name='{file_name_sql}')"

    # A missing or empty 'Concepts' entry simply yields no work
    for concept in details.get('Concepts') or []:
        concept_name = concept['name']
        concept_pages = extract_pages(concept.get('Page', ''))
        # Queue the concept itself
        insert_concept = build_sql_insert(
            'concepts',
            ['file_id', 'concept_name', 'concept_page'],
            [file_id_subquery, concept_name, concept_pages]
        )
        sql_commands_concepts.append(insert_concept)

        # Remember the sub-concepts; they need the parent's concept_id first
        for subconcept in concept.get('Subconcepts') or []:
            subconcept_name = subconcept['name']
            subconcept_pages = extract_pages(subconcept.get('Page', ''))
            concept_to_subconcepts.append((file_name, concept_name, subconcept_name, subconcept_pages))

# Insert the concepts
run_sql_commands(connection, sql_commands_concepts)


[]

In [45]:
# Look up and cache (file_id, concept_id) for every parent concept, keyed by
# (file_name, concept_name)
concept_ids = {}

# Parameterized query: the driver escapes the values, so names containing
# quotes (e.g. "Zipf's Law") no longer break the statement and abort the loop.
# Matching rows have file_id equal to the subquery result, so it is selected
# directly from concepts.
lookup_query = (
    "SELECT concept_id, file_id FROM concepts "
    "WHERE concept_name=%s "
    "AND file_id=(SELECT file_id FROM files WHERE file_name=%s)"
)

try:
    with connection.cursor() as cursor:
        for file_name, concept_name, _subconcept_name, _subconcept_pages in concept_to_subconcepts:
            # Query each parent concept only once
            if (file_name, concept_name) not in concept_ids:
                cursor.execute(lookup_query, (concept_name, file_name))
                result = cursor.fetchone()
                if result:
                    concept_ids[(file_name, concept_name)] = (result['file_id'], result['concept_id'])
except pymysql.MySQLError as e:
    print(f"Database error during fetching ids: {e}")

print(concept_ids)

{('EE6405_W1_Introduction_to_NLP', 'Overview of NLP'): (1, 1), ('EE6405_W1_Introduction_to_NLP', 'Historical Background'): (1, 2), ('EE6405_W1_Introduction_to_NLP', 'Approaches to NLP'): (1, 3), ('EE6405_W1_Introduction_to_NLP', 'Preprocessing Techniques'): (1, 4), ('EE6405_W2_Linguistic Analysis and Information Extraction', 'Introduction to Information Extraction'): (2, 5), ('EE6405_W2_Linguistic Analysis and Information Extraction', 'Named Entity Recognition (NER)'): (2, 6), ('EE6405_W2_Linguistic Analysis and Information Extraction', 'Part-Of-Speech Tagging'): (2, 7), ('EE6405_W2_Linguistic Analysis and Information Extraction', 'Dependency Parsing'): (2, 8), ('EE6405_W3_Term Weighting Scheme and Topic Modelling_For Students', 'Term Weighting Schemes'): (3, 9), ('EE6405_W3_Term Weighting Scheme and Topic Modelling_For Students', 'Topic Modeling'): (3, 10), ('EE6405_W3_Term Weighting Scheme and Topic Modelling_For Students', 'Dimensionality Reduction'): (3, 11), ('EE6405_W4_Traditiona

In [46]:
# Build the sub-concept INSERT statements from the cached parent ids
sql_commands_subconcepts = []
subconcept_columns = ['parent_id', 'file_id', 'concept_name', 'concept_page']

for file_name, parent_concept_name, subconcept_name, subconcept_pages in concept_to_subconcepts:
    cached_ids = concept_ids.get((file_name, parent_concept_name))
    if cached_ids is None:
        # No id was cached for this parent concept, so the sub-concept is skipped
        continue
    file_id, parent_id = cached_ids
    # The ids are passed as strings, so build_sql_insert emits them as quoted literals
    insert_subconcept = build_sql_insert(
        'concepts',
        subconcept_columns,
        [str(parent_id), str(file_id), subconcept_name, subconcept_pages]
    )
    sql_commands_subconcepts.append(insert_subconcept)

# Insert the sub-concepts
run_sql_commands(connection, sql_commands_subconcepts)

[]

In [None]:
# TODO: two chapters have a summary, but its structure is messy.
# for summary in details.get('Summary', []):
#     summary_page = ', '.join(map(str, summary.get('page', summary.get('Page', []))))
#     key_points = ', '.join(summary.get('keyPoints', ''))
#     sql_commands.append(build_sql_insert('summaries', ['file_id', 'summary_page', 'key_points'], [f"(SELECT file_id FROM files WHERE file_name='{file_name}')", f"'{summary_page}'", f"'{key_points}'"]))

# TODO: a formula is sometimes a list (line 324), and sometimes just a description followed by its page(s).
# for example_type, examples in [('formula', subconcept.get('Formula', [])), ('code', subconcept.get('Code', []))]:
#     for example in examples:
#         example_description = example['description']
#         example_pages = extract_pages(example.get('Page', []))

#         sql_commands.append(build_sql_insert('examples', ['concept_id', 'example_name', 'example_page', 'example_description', 'example_type'], [f"(SELECT concept_id FROM concepts WHERE concept_name='{subconcept_name}')", f"'{example_description}'", f"'{example_pages}'", f"'{example_description}'", f"'{example_type}'"]))

