In [None]:
import pandas as pd
import os
import zipfile
import shutil
import random
import numpy as np

def _fill_missing_solubility(solubility):
    """Fill NaN entries of a solubility Series from the nearest valid neighbours.

    An interior gap becomes the mean of the closest valid value before it and
    the closest valid value after it (a run of consecutive gaps therefore all
    receive the same mean). A gap at the start or end of the series has only one
    valid neighbour, which is used as-is. If the series holds no valid value at
    all, the gaps remain NaN.
    """
    previous_valid = solubility.ffill()
    next_valid = solubility.bfill()

    # NaN wherever one of the two neighbours is missing (i.e. at the edges).
    estimate = (previous_valid + next_valid) / 2
    # One-sided fallback so leading/trailing gaps no longer crash the run.
    estimate = estimate.fillna(previous_valid).fillna(next_valid)

    # Only positions that were NaN in the input take the estimate.
    return solubility.fillna(estimate)


def process_weather_bottle_data(data_path, output_matrix_path, output_recipes_path,
                                critical_value=10):
    """Build a solubility matrix from per-concentration CSV files and list low-solubility conditions.

    Every ``*.csv`` file in ``data_path`` is read without a header and transposed,
    so the file must contain exactly two rows: the first is interpreted as
    temperatures (K) and the second as solubilities (g). The file stem must be an
    integer and is used as the ethanol mass fraction (e.g. ``10.csv`` -> 10).
    Entries with an unparseable temperature are dropped; missing solubilities are
    filled from their nearest valid neighbours (see ``_fill_missing_solubility``).

    Args:
        data_path: Folder containing the input CSV files.
        output_matrix_path: Destination CSV for the full solubility matrix
            (rows: mass fraction, columns: temperature), values rounded to 4 decimals.
        output_recipes_path: Destination CSV for the feather-like crystal
            conditions, i.e. the (solubility, temperature, mass fraction) rows whose
            rounded solubility is strictly below ``critical_value``, sorted by
            ascending solubility.
        critical_value: Solubility threshold in grams; defaults to 10.

    Raises:
        ValueError: If ``data_path`` contains no CSV file, or a CSV file stem is
            not an integer.
    """
    # Sorted so the processing order does not depend on os.listdir's ordering.
    csv_files = sorted(f for f in os.listdir(data_path) if f.endswith(".csv"))
    if not csv_files:
        raise ValueError(f"No CSV files found in {data_path!r}")

    # Maps mass fraction -> {temperature: solubility}.
    solubility_data = {}

    for filename in csv_files:
        file_path = os.path.join(data_path, filename)

        # The file stem encodes the ethanol mass fraction, e.g. "10.csv" -> 10.
        mass_fraction = int(os.path.splitext(filename)[0])

        # A single blank marks a missing value; transpose so each measurement is a row.
        df = pd.read_csv(file_path, header=None, na_values=[' ']).T
        df.columns = ["Temperature (K)", "Solubility (g)"]

        # Anything that is not a number becomes NaN.
        df["Temperature (K)"] = pd.to_numeric(df["Temperature (K)"], errors='coerce')
        df["Solubility (g)"] = pd.to_numeric(df["Solubility (g)"], errors='coerce')

        # Rows without a usable temperature are dropped before filling, so their
        # solubility values are never used as neighbours.
        df = df.dropna(subset=["Temperature (K)"])

        solubility_series = df.set_index("Temperature (K)")["Solubility (g)"]
        filled_series = _fill_missing_solubility(solubility_series)

        solubility_data[mass_fraction] = filled_series.to_dict()

    # Rows: mass fraction, columns: temperature.
    solubility_matrix = pd.DataFrame(solubility_data).T
    solubility_matrix.index.name = "Mass fraction (Ethanol), %"
    solubility_matrix.columns.name = "Temperature (K)"

    # Sort by mass fraction, then by temperature, and keep 4 decimals.
    solubility_matrix = solubility_matrix.sort_index(axis=0).sort_index(axis=1)
    solubility_matrix = solubility_matrix.round(4)

    # Long format (one row per mass fraction / temperature pair) for filtering.
    stacked = solubility_matrix.stack().reset_index()
    stacked.columns = ["Mass fraction (Ethanol), %", "Temperature (K)", "Solubility (g)"]

    # Feather-like crystal conditions: solubility strictly below the threshold.
    feather_recipes_df = stacked[stacked["Solubility (g)"] < critical_value]

    # Reorder the columns and sort by ascending solubility.
    feather_recipes_df = feather_recipes_df[['Solubility (g)', 'Temperature (K)', "Mass fraction (Ethanol), %"]]
    feather_recipes_df = feather_recipes_df.sort_values(by="Solubility (g)", ascending=True)

    # Write both results to disk.
    solubility_matrix.to_csv(output_matrix_path, float_format='%.4f')
    feather_recipes_df.to_csv(output_recipes_path, index=False, float_format='%.4f')

    print("处理完成！")

In [None]:
#---------Submission example-------#
import zipfile
import os


# Student call example: process the training set and write both result files.
TRAIN_PATH = "data_training"
process_weather_bottle_data(
    data_path=TRAIN_PATH,
    output_matrix_path="submission_training_matrix.csv",
    output_recipes_path="submission_training_recipes.csv",
)

In [None]:
# import test data
#"DATA_PATH" is an encrypted environment variable for the test set. It can be accessed by the system for scoring after submission, but contestants cannot directly download it.

#if os.environ.get('DATA_PATH'):
#    DATA_PATH = os.environ.get("DATA_PATH") + "/"  
#else:
#    #During the execution of the Baseline, since it is unable to read the test set, this error message will appear, which is a normal phenomenon
#    print("During the execution of the Baseline, since it is unable to read the test set, this error message will appear, which is a normal phenomenon.")  
#calculate_monthly_range(data_path = DATA_PATH + "data_testA.csv", submission_path = "submission_testA.csv") #求解Public测试集方程
#calculate_monthly_range(data_path = DATA_PATH + "data_testB.csv", submission_path = "submission_testB.csv") #求解Private测试集方程

# Process the validation set.
process_weather_bottle_data(
    data_path="data_validation",
    output_matrix_path="submission_validation_matrix.csv",
    output_recipes_path="submission_validation_recipes.csv",
)

# Process the test set.
process_weather_bottle_data(
    data_path="data_testing",
    output_matrix_path="submission_testing_matrix.csv",
    output_recipes_path="submission_testing_recipes.csv",
)

In [None]:
# Result files to bundle, and the name of the archive that will hold them.
files_to_zip = [
    'submission_validation_matrix.csv',
    'submission_validation_recipes.csv',
    'submission_testing_matrix.csv',
    'submission_testing_recipes.csv',
]
zip_filename = 'submission.zip'

# Build the archive; a missing file is reported and skipped rather than aborting.
with zipfile.ZipFile(zip_filename, 'w') as archive:
    for path in files_to_zip:
        if not os.path.exists(path):
            print(f"Warning: File not found and will not be added to the zip: {path}")
            continue
        # Store each file under its base name, without any directory component.
        archive.write(path, os.path.basename(path))

print(f'{zip_filename} has been created successfully!')