In [1]:
import pandas as pd
base_url = "/home/hx/Dev/Project/CAM/ProProcess"

In [2]:
data = []
with open(base_url+'/examples/final/pathcontext.txt', 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        parts = line.split()
        label = parts[0]
        value_groups = parts[1:]
        for vg in value_groups:
            # 拆分并转换数值
            values = vg.split(',')
            try:
                num_values = [int(x) for x in values]
            except ValueError:
                continue  # 跳过无法转换的数值组
            # 添加标签和数值到数据列表
            data.append([label] + num_values)

# 确定列名（假设所有数值组长度相同）
if data:
    num_columns = len(data[0]) - 1
    columns = ['label'] + [f'val_{i+1}' for i in range(num_columns)]
else:
    columns = ['label']

df = pd.DataFrame(data, columns=columns)

KeyboardInterrupt: 

In [3]:
df.shape

(2836927, 4)

In [4]:
print(df)

                                  label  val_1   val_2  val_3
0        2025-01-10_16-13-26_构造_字符串.cpp      1       1      2
1        2025-01-10_16-13-26_构造_字符串.cpp      3       1      2
2        2025-01-10_16-13-26_构造_字符串.cpp      4       1      2
3        2025-01-10_16-13-26_构造_字符串.cpp      5       2      6
4        2025-01-10_16-13-26_构造_字符串.cpp      7       3      8
...                                 ...    ...     ...    ...
2836922                       llama.cpp      6  437900    351
2836923                       llama.cpp      6  437901    117
2836924                       llama.cpp      6  437905   2040
2836925                       llama.cpp      6   98325    915
2836926                       llama.cpp      6     580    271

[2836927 rows x 4 columns]


In [15]:

# 假设 df 是你的原始 DataFrame
vector = (
    df.groupby('label', group_keys=False)
    # 显式包含分组列以避免警告，并确保抽样后保留 label
    .apply(lambda g: g.sample(n=258, random_state=42, replace=True), include_groups=True)
    # 二次分组时直接选取目标列，避免分组列干扰
    .groupby('label')[['val_1', 'val_2', 'val_3']]
    # 合并三列为向量
    .apply(lambda g: g.values.flatten())
)

# 转换为 DataFrame
result_df = pd.DataFrame(vector.tolist(), index=vector.index)

  .apply(lambda g: g.sample(n=258, random_state=42, replace=True), include_groups=True)


In [16]:
result_df.shape

(100, 774)

In [12]:
import numpy as np
import hashlib

In [17]:
# ========================
# 数据预处理函数
# ========================
def stable_hash(label, digest_bits=32):
    """生成稳定的哈希整数标签"""
    hex_hash = hashlib.sha256(str(label).encode()).hexdigest()
    full_hash = int(hex_hash, 16)
    return abs(full_hash & ((1 << digest_bits) - 1))

# ========================
# 数据增强管道
# ========================
class DataAugmenter:
    def __init__(self, noise_scale=0.01):
        self.noise_scale = noise_scale
        
    def add_noise(self, data):
        """添加高斯噪声并保留原始数据"""
        # 创建副本用于添加噪声
        noisy_data = data.copy()
        noisy_values = noisy_data.values + np.random.normal(
            scale=self.noise_scale, 
            size=noisy_data.shape
        )
        return pd.DataFrame(noisy_values, 
                           index=noisy_data.index,
                           columns=noisy_data.columns)

# ========================
# 主处理流程
# ========================
def process_data(original_df):
    # 创建副本保护原始数据
    processed_df = original_df.copy()
    
    # 步骤1：哈希化标签
    processed_df.index = processed_df.index.map(stable_hash)
    processed_df.index.name = "hashed_label"
    
    # 步骤2：数据增强
    augmenter = DataAugmenter(noise_scale=0.01)
    augmented_df = augmenter.add_noise(processed_df)
    
    # 步骤3：合并数据集
    combined_df = pd.concat(
        [processed_df, augmented_df],
        keys=['original', 'augmented'],
        names=['data_type', 'hashed_label']
    ).reset_index(level='data_type')
    
    return combined_df

# ========================
# 执行处理
# ========================
# 假设已有原始处理后的 result_df（来自之前步骤）
final_df = process_data(result_df)

# 验证结果
print(f"总数据量: {len(final_df)}")
print(f"原始数据示例:")
print(final_df[final_df['data_type'] == 'original'].head())
print(f"\n增强数据示例:")
print(final_df[final_df['data_type'] == 'augmented'].head())

总数据量: 200
原始数据示例:
             data_type      0         1      2      3         4      5    6  \
hashed_label                                                                  
760531894     original  114.0   21586.0  165.0  165.0   15431.0  168.0  2.0   
4093240987    original    6.0  123320.0  493.0  481.0    4172.0    2.0  8.0   
2375987114    original    7.0  168657.0    2.0    6.0  168715.0  495.0  4.0   
933509859     original   53.0  172376.0    6.0  165.0  169008.0    8.0  6.0   
1811766566    original  138.0  185053.0  784.0    6.0  182566.0  782.0  6.0   

                     7      8  ...    764    765       766    767    768  \
hashed_label                   ...                                         
760531894      33568.0    8.0  ...    6.0    6.0   22975.0    6.0    6.0   
4093240987    122421.0  165.0  ...    8.0  165.0  123793.0  165.0    8.0   
2375987114    122512.0    2.0  ...  492.0    6.0   16309.0  192.0  191.0   
933509859     170071.0  114.0  ...    6.0   86.0

In [20]:
final_df.shape

(200, 775)

In [22]:
print(final_df)

              data_type            0              1            2           3  \
hashed_label                                                                   
760531894      original   114.000000   21586.000000   165.000000  165.000000   
4093240987     original     6.000000  123320.000000   493.000000  481.000000   
2375987114     original     7.000000  168657.000000     2.000000    6.000000   
933509859      original    53.000000  172376.000000     6.000000  165.000000   
1811766566     original   138.000000  185053.000000   784.000000    6.000000   
...                 ...          ...            ...          ...         ...   
682752276     augmented   181.004728  181785.000311   499.019133  744.991701   
3514949202    augmented  1522.005819    1588.994615  1526.990132    5.990203   
582245980     augmented    92.977200  353961.999141     8.000571  473.999062   
883079678     augmented   533.022965    2549.005627  2270.983882    7.990158   
3827590945    augmented   137.993720   1