<a href="https://colab.research.google.com/github/LC1332/Speaker-Grouping/blob/main/notebook/%E8%B7%A8%E5%A4%9A%E4%B8%AA%E8%A7%86%E9%A2%91%E6%8A%BD%E5%8F%96%E5%9B%BE%E7%89%87%E5%92%8C%E9%9F%B3%E9%A2%91%E7%89%B9%E5%BE%81.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

- [ ] 批量载入数据，批量解压缩
- [ ] 构造meta
- [ ] clone音频抽取，wget图片特征
- [ ] 重构一个抽取类，输入meta和文件夹

# 批量载入

In [1]:
# Mount Google Drive at /content/drive (Colab-only); the cells below read
# archives from /content/drive/MyDrive/Speaker/data.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# List the archives available in the Drive data folder.
!ls /content/drive/MyDrive/Speaker/data

haruhi_03.zip  liangjian_10.zip  亮剑12.zip  亮剑13.zip  亮剑15.zip  亮剑20.zip


In [4]:
import shutil
import zipfile
import os

def unzip_file(zip_file_name, zip_file_folder, target_dir="/content/"):
    """
    Copy a ZIP archive into ``target_dir`` and extract it there.

    The archive is copied from ``zip_file_folder`` into ``target_dir`` and then
    extracted into ``<target_dir>/<zip_file_name with every ".zip" removed>``.

    Args:
        zip_file_name (str): File name of the archive, e.g. ``"demo.zip"``.
        zip_file_folder (str): Directory that contains the archive.
        target_dir (str): Directory that receives both the archive copy and the
            extracted folder. Created if missing. Defaults to ``"/content/"``.

    Returns:
        str: Path of the folder the archive was extracted into.

    Raises:
        FileNotFoundError: If the archive is not present in ``zip_file_folder``.
        zipfile.BadZipFile: If the file is not a valid ZIP archive.
    """
    # Full paths of the source archive and of its copy inside target_dir.
    zip_path = os.path.join(zip_file_folder, zip_file_name)
    local_zip_path = os.path.join(target_dir, zip_file_name)

    # Create target_dir first: copying onto a missing directory path would
    # either fail or create a plain *file* named like the directory.
    os.makedirs(target_dir, exist_ok=True)

    # shutil.copy raises SameFileError when source and destination are the same
    # file (zip_file_folder == target_dir), so skip the copy in that case.
    already_in_place = os.path.exists(local_zip_path) and os.path.samefile(zip_path, local_zip_path)
    if not already_in_place:
        shutil.copy(zip_path, local_zip_path)

    # Extraction folder: archive name without ".zip". str.replace is kept (rather
    # than stripping only the suffix) so the folder name matches the one other
    # cells derive from the same file name.
    unzip_folder = os.path.join(target_dir, zip_file_name.replace('.zip', ''))
    os.makedirs(unzip_folder, exist_ok=True)

    # Extract every member of the archive into the extraction folder.
    with zipfile.ZipFile(local_zip_path, 'r') as zip_ref:
        zip_ref.extractall(unzip_folder)

    print(f'文件已解压到 {unzip_folder}')
    return unzip_folder

# Names of the ZIP archives to unpack.
zip_file_names = [
    "亮剑12.zip",
    "亮剑13.zip",
    "亮剑15.zip",
    "亮剑20.zip",
]

# Drive folder that holds the ZIP archives.
zip_file_folder = "/content/drive/MyDrive/Speaker/data/"

# Unpack every archive, one after another, into the default target directory.
for zip_file_name in zip_file_names:
    unzip_file(zip_file_name=zip_file_name, zip_file_folder=zip_file_folder)


文件已解压到 /content/亮剑12
文件已解压到 /content/亮剑13
文件已解压到 /content/亮剑15
文件已解压到 /content/亮剑20


# 批量构造meta

In [5]:
# Each archive unpacks to /content/<name>/<name>, where <name> is the archive
# file name with ".zip" removed.
source_dirs = [
    "/content/{0}/{0}".format(archive.replace('.zip', ''))
    for archive in zip_file_names
]

In [11]:
import pandas as pd

def get_meta_data( source_dir ):
    """
    Load ``<source_dir>/meta.jsonl`` into a DataFrame with usable file paths.

    The file is read as JSON Lines (one JSON object per line). A misspelled
    ``screeshot_file`` column is renamed to ``screenshot_file``, and the values
    of ``audio_file`` and ``screenshot_file`` are prefixed with ``source_dir``.

    Args:
        source_dir (str): Folder containing ``meta.jsonl`` and the files that
            its ``audio_file`` / ``screenshot_file`` entries point to.

    Returns:
        pandas.DataFrame: The metadata table with both path columns joined
        onto ``source_dir``.
    """
    def _under_source_dir(relative_path):
        # Join one stored path onto the folder the metadata came from.
        return os.path.join(source_dir, relative_path)

    # rename() ignores labels that are absent, so this only has an effect when
    # the misspelled column is actually present.
    table = (
        pd.read_json(f"{source_dir}/meta.jsonl", lines=True)
        .rename(columns={'screeshot_file': 'screenshot_file'})
    )

    # Rewrite both path columns in place (column order is preserved).
    for path_column in ('audio_file', 'screenshot_file'):
        table[path_column] = table[path_column].apply(_under_source_dir)

    return table

# One metadata table per extracted archive, in the same order as source_dirs.
meta_datas = list(map(get_meta_data, source_dirs))

# Preview the first table as a sanity check.
print(meta_datas[0].head())


    人物           人物台词          开始时间          结束时间  \
0  方立功    如果我们打掉了他的汽车  00:01:29.460  00:01:36.320   
1  方立功      他就会经过电牌得志  00:01:36.320  00:01:38.560   
2  方立功  那么他去西极据点就毫无意义  00:01:38.560  00:01:42.680   
3  方立功   我想 他会向平安县城靠拢  00:01:42.680  00:01:49.440   
4  方立功     将军岭是他的必经之地  00:01:49.440  00:01:52.400   

                        audio_file                      screenshot_file  
0  /content/亮剑12/亮剑12/audios/0.wav  /content/亮剑12/亮剑12/screeshots/0.jpg  
1  /content/亮剑12/亮剑12/audios/1.wav  /content/亮剑12/亮剑12/screeshots/1.jpg  
2  /content/亮剑12/亮剑12/audios/2.wav  /content/亮剑12/亮剑12/screeshots/2.jpg  
3  /content/亮剑12/亮剑12/audios/3.wav  /content/亮剑12/亮剑12/screeshots/3.jpg  
4  /content/亮剑12/亮剑12/audios/4.wav  /content/亮剑12/亮剑12/screeshots/4.jpg  


# 安装环境

In [12]:
# Clone without downloading blobs or checking out files, then restrict the
# working tree to yuki_builder/audio_feature_ext before checking out.
!git clone --filter=blob:none --no-checkout https://github.com/LC1332/Legacy-Haruhi-1.0
%cd Legacy-Haruhi-1.0
!git sparse-checkout init --cone
!git sparse-checkout set yuki_builder/audio_feature_ext
!git checkout

# Work from yuki_builder so the audio_feature_ext package is importable.
%cd /content/Legacy-Haruhi-1.0/yuki_builder

# Download CLIPExtractor.py into the current directory (quiet mode).
!wget -q https://raw.githubusercontent.com/LC1332/simple-face-recognition/main/CLIPExtractor.py

Cloning into 'Legacy-Haruhi-1.0'...
remote: Enumerating objects: 316, done.[K
remote: Counting objects: 100% (2/2), done.[K
remote: Total 316 (delta 0), reused 2 (delta 0), pack-reused 314[K
Receiving objects: 100% (316/316), 333.37 KiB | 20.83 MiB/s, done.
Resolving deltas: 100% (6/6), done.
/content/Legacy-Haruhi-1.0
remote: Enumerating objects: 25, done.[K
remote: Counting objects: 100% (1/1), done.[K
remote: Total 25 (delta 0), reused 1 (delta 0), pack-reused 24[K
Receiving objects: 100% (25/25), 50.88 KiB | 16.96 MiB/s, done.
Your branch is up to date with 'origin/main'.
/content/Legacy-Haruhi-1.0/yuki_builder


In [15]:
import os
import pandas as pd
from CLIPExtractor import CLIPExtractor
from audio_feature_ext.audio_fea_ext import AudioFeatureExtraction
from tqdm import tqdm


class NewAudioFeatureExtraction(AudioFeatureExtraction):
    """AudioFeatureExtraction variant that extracts from a list of files and tolerates per-file failures."""

    def extract_from_files(self, voice_files):
        """
        Run ``self.infer`` on every file and collect one feature per file.

        A file that raises during inference is reported on stdout and recorded
        as an empty list, so the result always has one entry per input file,
        in input order.

        Args:
            voice_files: Iterable of audio file paths handed to ``self.infer``.

        Returns:
            list: ``self.infer(file)[0]`` for each file, or ``[]`` where
            inference failed.
        """
        collected = []
        for path in tqdm(voice_files):
            try:
                embedding = self.infer(path)[0]
            except Exception as err:
                # Best effort: log the failure and keep a placeholder so the
                # output stays aligned with the input.
                print(f"Error processing file {path}: {str(err)}")
                embedding = []
            collected.append(embedding)
        return collected


class AudioVisualExtractor:
    """Adds visual and audio feature columns to a metadata DataFrame.

    Visual features come from ``CLIPExtractor.extract`` applied to the
    ``screenshot_file`` column; audio features come from
    ``NewAudioFeatureExtraction.extract_from_files`` applied to ``audio_file``.
    """

    def __init__(self):
        """Instantiate the visual (CLIP) and audio feature extractors."""
        self.visual_extractor = CLIPExtractor()
        self.audio_extractor = NewAudioFeatureExtraction()

    def clean_name(self, file_name):
        """Return ``file_name`` unchanged (placeholder for path normalisation)."""
        # TODO: define or copy the real clean_name implementation here.
        return file_name

    def extract_visual_features(self, screenshot_files):
        """Return what the visual extractor produces for the given image paths."""
        screenshots = [self.clean_name(f) for f in screenshot_files]
        return self.visual_extractor.extract(screenshots)

    def extract_audio_features(self, audio_files):
        """Return what the audio extractor produces for the given audio paths."""
        voice_files = [self.clean_name(f) for f in audio_files]
        return self.audio_extractor.extract_from_files(voice_files)

    def extract(self, meta_data):
        """
        Add ``visual_feature`` and ``audio_feature`` columns to ``meta_data``.

        Args:
            meta_data (pandas.DataFrame): Table with ``screenshot_file`` and
                ``audio_file`` columns. It is modified in place.

        Returns:
            pandas.DataFrame: The same ``meta_data`` object, with one feature
            per row in each of the two new columns.

        Raises:
            ValueError: If an extractor returns a different number of features
                than ``meta_data`` has rows.
        """
        # Extract visual features
        screenshots = meta_data["screenshot_file"].values
        visual_features = self.extract_visual_features(screenshots)

        # Extract audio features
        audio_files = meta_data["audio_file"].values
        audio_features = self.extract_audio_features(audio_files)

        # Column assignment aligns a Series by index label. Build each Series on
        # the frame's own index so features stay attached to their rows even
        # when meta_data does not have a default 0..n-1 index (a bare
        # pd.Series(...) would silently produce NaN / misplaced rows there).
        # list(...) keeps one element per row for any sequence of per-row features.
        row_index = meta_data.index
        meta_data['visual_feature'] = pd.Series(list(visual_features), index=row_index)
        meta_data['audio_feature'] = pd.Series(list(audio_features), index=row_index)

        return meta_data


In [18]:
# Drive folder where the extracted feature tables (.parquet files) are saved.
feature_save_folder = "/content/drive/MyDrive/Speaker/feature/"

# exist_ok=True creates the folder only when needed, without a separate
# check-then-create step, so the cell is safe to re-run.
os.makedirs(feature_save_folder, exist_ok=True)

In [None]:
# A single extractor instance is reused for every archive.
extractor = AudioVisualExtractor()

# meta_datas and zip_file_names are parallel lists: one metadata table per archive.
for meta_data, zip_file_name in zip(meta_datas, zip_file_names):
    clean_name = zip_file_name.replace(".zip", "")
    print(f"now dealing with {clean_name}")

    # Output path: <feature_save_folder>/<archive name>.parquet
    parquet_save_name = os.path.join(feature_save_folder, f"{clean_name}.parquet")

    # Add the feature columns, then persist the table without its index.
    new_meta_data = extractor.extract(meta_data)
    new_meta_data.to_parquet(parquet_save_name, index=False)

now dealing with 亮剑12


100%|██████████| 61/61 [00:48<00:00,  1.27it/s]
 47%|████▋     | 451/963 [00:12<00:12, 40.66it/s]