* 处理模糊边界的问题
    1. 直接去掉有模糊边界问题的图片
    2. 给 m 一定的权重
        * 如果不存在 m, 则 y=1
        * 如果存在1个 m, 则 y=0.6, m=0.4
        * 如果存在 2 个 m, 则 y=0.4, m=0.3, m=0.3
        * 如果存在 3 个 m, 则 y=0.4, m=0.2, m=0.2, m=0.2

In [1]:
import os
import cv2
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import itertools

from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix


warnings.filterwarnings('ignore')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

## 导入数据

In [2]:
# Read the training annotations and give the three columns fixed names.
# NOTE(review): read_csv treats the first row of the file as a header by
# default; if label.csv has no header row, that first record ends up as the
# (then overwritten) column names and is lost -- confirm, and pass header=None
# if that is the case.
label_csv = '../../raw/train/Annotations/label.csv'
df_train = pd.read_csv(label_csv)
df_train.columns = ['image_id', 'class', 'label']
df_train.head()

Unnamed: 0,image_id,class,label
0,Images/collar_design_labels/0ef580b4deabcd9fa4...,collar_design_labels,ynnnn
1,Images/collar_design_labels/87ccc33937821a97ad...,collar_design_labels,nnynn
2,Images/collar_design_labels/26caac7d1f1b36fb9d...,collar_design_labels,ynnnn
3,Images/collar_design_labels/396ab4e7cbc6798100...,collar_design_labels,nynnn
4,Images/collar_design_labels/fc4a679c2bf209de13...,collar_design_labels,ynnnn


In [3]:
# Attribute names used in the 'class' column of the annotations; the order
# matters because later cells pick one by position.
classes = [
    'collar_design_labels',
    'neckline_design_labels',
    'skirt_length_labels',
    'sleeve_length_labels',
    'neck_design_labels',
    'coat_length_labels',
    'lapel_design_labels',
    'pant_length_labels',
]

## 加载数据

In [4]:
# Pick the attribute to work on and keep only its annotation rows,
# renumbered from 0 so that rows can be looked up by position later.
cur_class = classes[3]
is_cur_class = df_train['class'] == cur_class
df_load = df_train[is_cur_class].reset_index(drop=True)

print('{0}: {1}'.format(cur_class, len(df_load)))
df_load.head()

sleeve_length_labels: 17285


Unnamed: 0,image_id,class,label
0,Images/sleeve_length_labels/f5c414bda8a9bb97f6...,sleeve_length_labels,nnnnnnmym
1,Images/sleeve_length_labels/ffeef43a34d68b2a47...,sleeve_length_labels,nnynnnnnn
2,Images/sleeve_length_labels/4be61ca727ad25645e...,sleeve_length_labels,nnnnnnmmy
3,Images/sleeve_length_labels/b9d69a26db06295bfa...,sleeve_length_labels,nnnynnnnn
4,Images/sleeve_length_labels/fb7c3a0181d538b52e...,sleeve_length_labels,nnnnnnynn


## 给 M 设定权重

In [6]:
width = 299  # side length (pixels) that images are resized to
n = df_load.shape[0]  # number of annotated images
n_class = len(df_load.loc[0, 'label'])  # one label character per class

# The image buffer allocation is disabled in this cell:
# X = np.zeros((n, width, width, 3), dtype=np.uint8)

# Float targets, one row per image and one column per class, so that
# fractional weights can be stored.
y = np.zeros(shape=(n, n_class), dtype=np.float32)

In [32]:
# 查看最多有多少个 m
max_m = []
for i in range(n):
    tmp_label = df_load['label'][i]
    m_num = tmp_label.count('m')
    max_m.append(m_num)
print(max(max_m))

3


In [41]:
def findStr(string, subStr, findCnt):
    """Return the start index of the findCnt-th occurrence of subStr in string.

    Occurrences are counted from 1, left to right, without overlap.

    Parameters
    ----------
    string : str
        Text to search in.
    subStr : str
        Non-empty text to search for (str.split raises ValueError if empty).
    findCnt : int
        1-based ordinal of the occurrence wanted.

    Returns
    -------
    int
        Zero-based index of that occurrence, or -1 when findCnt is less than 1
        or string holds fewer than findCnt occurrences.
    """
    # Without this guard a non-positive findCnt fell through to the arithmetic
    # below and produced values such as -len(subStr), which is neither a valid
    # position nor the -1 "not found" sentinel.
    if findCnt < 1:
        return -1

    # Splitting at most findCnt times leaves everything after the wanted
    # occurrence in the last piece.
    pieces = string.split(subStr, findCnt)
    if len(pieces) <= findCnt:
        return -1
    return len(string) - len(pieces[-1]) - len(subStr)

In [65]:
# Soft-label weights keyed by how many 'm' marks a label contains:
# (weight stored at the 'y' position, weight stored at each 'm' position).
# Each weight set sums to 1.
soft_weights = {
    1: (0.6, 0.4),
    2: (0.4, 0.3),
    3: (0.4, 0.2),
}

# The loop below writes into X, so the image buffer has to exist first.
X = np.zeros((n, width, width, 3), dtype=np.uint8)

for i in tqdm(range(n)):
    tmp_label = df_load['label'][i]

    img_path = '../../raw/train/{0}'.format(df_load['image_id'][i])
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports an unreadable or missing file by returning None;
        # fail with the offending path instead of a resize error.
        raise FileNotFoundError('cannot read image: {0}'.format(img_path))
    X[i] = cv2.resize(img, (width, width))

    m_positions = [pos for pos, mark in enumerate(tmp_label) if mark == 'm']
    # Labels with no 'm' (or an 'm' count without a weight entry) keep the
    # plain hard label: 1 at the 'y' position and nothing for 'm'.
    y_weight, m_weight = soft_weights.get(len(m_positions), (1, None))

    y[i][tmp_label.find('y')] = y_weight
    if m_weight is not None:
        for pos in m_positions:
            y[i][pos] = m_weight

100%|██████████| 17285/17285 [02:14<00:00, 128.73it/s]


## 去掉模糊边界的图片

In [None]:
# Number of rows currently in df_load.
n = df_load.shape[0]

In [None]:
index = []
for i in range(n):
    if df_load['label'][i].find('m') != -1:
        index.append(i)

In [167]:
# Translate the collected row positions into index labels and remove those
# rows from df_load in place.
rows_with_m = df_load.index[index]
df_load.drop(rows_with_m, inplace=True)

In [172]:
# Renumber the remaining rows from 0 (discarding the old index) so that
# lookups such as df_load['label'][i] for i in range(n) hit existing rows.
df_load = df_load.reset_index(drop=True)

In [176]:
width = 299  # side length (pixels) of the square image buffer
n = df_load.shape[0]  # number of remaining images
n_class = len(df_load.loc[0, 'label'])  # one label character per class

# Zero-initialised buffers: images as uint8, targets as uint8 hard labels.
X = np.zeros(shape=(n, width, width, 3), dtype=np.uint8)
y = np.zeros(shape=(n, n_class), dtype=np.uint8)

In [177]:
# Fill the hard labels: a 1 at the position of 'y' in each label string.
# NOTE(review): image loading is disabled in this cell (line kept below), so X
# is left zero-filled here -- confirm that is intended before using X.
for row in tqdm(range(n)):
    label_str = df_load['label'][row]
#     X[row] = cv2.resize(cv2.imread('../../raw/train/{0}'.format(df_load['image_id'][row])), (width, width))
    y[row, label_str.find('y')] = 1

100%|██████████| 14597/14597 [00:00<00:00, 50779.94it/s]
