## 划分测试集训练集

## 导入工具包

In [8]:
import os
import shutil
import random
import cv2
import pandas as pd
from tqdm import tqdm

### 获得所有类别名称

In [9]:
# 指定数据集路径
dataset_path = 'Particle Figures_full'

In [10]:
dataset_name = dataset_path.split('_')[0]
print('Dataset', dataset_name)

Dataset Particle Figures


In [11]:
!find . -iname '.ipynb_checkpoints'

./.ipynb_checkpoints
./.local/share/Trash/files/val 4/.ipynb_checkpoints
./.local/share/Trash/files/train 4/.ipynb_checkpoints
./.local/share/Trash/files/Particle Figures/.ipynb_checkpoints
./.local/share/Trash/files/Particle Figures_full/train/.ipynb_checkpoints
./.local/share/Trash/files/Particle Figures_full/train/val/.ipynb_checkpoints
./.local/share/Trash/files/Particle Figures_full/Rounded/.ipynb_checkpoints
./.local/share/Trash/files/Particle Figures_full 1/Rounded/.ipynb_checkpoints


In [12]:
!for i in `find . -iname '__MACOSX'`; do rm -rf $i;done
!for i in `find . -iname '.DS_Store'`; do rm -rf $i;done
!for i in `find . -iname '.ipynb_checkpoints'`; do rm -rf $i;done

In [13]:
classes = os.listdir(dataset_path)

In [14]:
!find . -iname '.ipynb_checkpoints'

./.local/share/Trash/files/val 4/.ipynb_checkpoints
./.local/share/Trash/files/train 4/.ipynb_checkpoints
./.local/share/Trash/files/Particle Figures/.ipynb_checkpoints
./.local/share/Trash/files/Particle Figures_full/train/.ipynb_checkpoints
./.local/share/Trash/files/Particle Figures_full/train/val/.ipynb_checkpoints
./.local/share/Trash/files/Particle Figures_full/Rounded/.ipynb_checkpoints
./.local/share/Trash/files/Particle Figures_full 1/Rounded/.ipynb_checkpoints


In [15]:
!for i in `find . -iname '.ipynb_checkpoints'`; do rm -rf $i;done

In [16]:
!find . -iname '.ipynb_checkpoints'

./.local/share/Trash/files/val 4/.ipynb_checkpoints
./.local/share/Trash/files/train 4/.ipynb_checkpoints
./.local/share/Trash/files/Particle Figures/.ipynb_checkpoints
./.local/share/Trash/files/Particle Figures_full/train/.ipynb_checkpoints
./.local/share/Trash/files/Particle Figures_full/train/val/.ipynb_checkpoints
./.local/share/Trash/files/Particle Figures_full/Rounded/.ipynb_checkpoints
./.local/share/Trash/files/Particle Figures_full 1/Rounded/.ipynb_checkpoints


In [17]:
!for i in `find . -iname '.ipynb_checkpoints'`; do rm -rf $i;done

In [18]:
rm -rf "find -type d name .ipynb_checkpoints"

In [19]:
len(classes)

6

In [20]:
classes

['Subrounded',
 'Very angular',
 'Angular',
 'Well rounded',
 'Rounded',
 'Subangular']

### 创建训练集文件夹和测试集文件夹

In [21]:
# 创建 train 文件夹
os.mkdir(os.path.join(dataset_path, 'train'))

# 创建 test 文件夹
os.mkdir(os.path.join(dataset_path, 'val'))

# 在 train 和 test 文件夹中创建各类别子文件夹
for particle in classes:
    os.mkdir(os.path.join(dataset_path, 'train', particle))
    os.mkdir(os.path.join(dataset_path, 'val', particle))

### 划分训练集、测试集，移动文件

In [22]:
test_frac = 0.2  # 测试集比例
random.seed(123) # 随机数种子，便于复现

In [23]:
df = pd.DataFrame()

print('{:^18} {:^18} {:^18}'.format('Type', 'No. of Train set', 'No. of Test set'))

for particle in classes: # 遍历每个类别

    # 读取该类别的所有图像文件名
    old_dir = os.path.join(dataset_path, particle)
    images_filename = os.listdir(old_dir)
    random.shuffle(images_filename) # 随机打乱

    # 划分训练集和测试集
    testset_numer = int(len(images_filename) * test_frac) # 测试集图像个数
    testset_images = images_filename[:testset_numer]      # 获取拟移动至 test 目录的测试集图像文件名
    trainset_images = images_filename[testset_numer:]     # 获取拟移动至 train 目录的训练集图像文件名

    # 移动图像至 test 目录
    for image in testset_images:
        old_img_path = os.path.join(dataset_path, particle, image)         # 获取原始文件路径
        new_test_path = os.path.join(dataset_path, 'val', particle, image) # 获取 test 目录的新文件路径
        shutil.move(old_img_path, new_test_path) # 移动文件

    # 移动图像至 train 目录
    for image in trainset_images:
        old_img_path = os.path.join(dataset_path, particle, image)           # 获取原始文件路径
        new_train_path = os.path.join(dataset_path, 'train', particle, image) # 获取 train 目录的新文件路径
        shutil.move(old_img_path, new_train_path) # 移动文件
    
    # 删除旧文件夹
    assert len(os.listdir(old_dir)) == 0 # 确保旧文件夹中的所有图像都被移动走
    shutil.rmtree(old_dir) # 删除文件夹
    
    # 工整地输出每一类别的数据个数
    print('{:^18} {:^18} {:^18}'.format(particle, len(trainset_images), len(testset_images)))
    
    # 保存到表格中
    df = df._append({'class':particle, 'trainset':len(trainset_images), 'testset':len(testset_images)}, ignore_index=True)

# 重命名数据集文件夹
shutil.move(dataset_path, dataset_name+'_split')

# 数据集各类别数量统计表格，导出为 csv 文件
df['total'] = df['trainset'] + df['testset']
df.to_csv('数据量统计.csv', index=False)

       Type         No. of Train set   No. of Test set  
    Subrounded            278                 69        
   Very angular           124                 31        
     Angular              151                 37        
   Well rounded            56                 14        
     Rounded              127                 31        
    Subangular            265                 66        


In [24]:
df

Unnamed: 0,class,trainset,testset,total
0,Subrounded,278,69,347
1,Very angular,124,31,155
2,Angular,151,37,188
3,Well rounded,56,14,70
4,Rounded,127,31,158
5,Subangular,265,66,331


### 查看文件目录结构

In [32]:
!sudo snap install tree

snap "tree" is already installed, see 'snap help refresh'


In [46]:
!tree "Particle Figures_split" -L 2 #L显示树的数据层级

[01;34mParticle Figures_split[00m
├── [01;34mtrain[00m
│   ├── [01;34mAngular[00m
│   ├── [01;34mRounded[00m
│   ├── [01;34mSubangular[00m
│   ├── [01;34mSubrounded[00m
│   ├── [01;34mVery angular[00m
│   └── [01;34mWell rounded[00m
└── [01;34mval[00m
    ├── [01;34mAngular[00m
    ├── [01;34mRounded[00m
    ├── [01;34mSubangular[00m
    ├── [01;34mSubrounded[00m
    ├── [01;34mVery angular[00m
    └── [01;34mWell rounded[00m

14 directories, 0 files
