# 1. 임의의 데이터 확인

In [1]:
import os
import pandas as pd
from glob import glob

raw_data = pd.read_csv("./신체정보데이터/F009.csv", header=None)

new_column_names = ['측정월', '측정일', '모델번호'] + list(raw_data.iloc[1, 3:])
data = pd.DataFrame([raw_data.iloc[2].values], columns=new_column_names)

data

Unnamed: 0,측정월,측정일,모델번호,키,목뒤높이,엉덩이높이,겨드랑높이,허리높이,샅높이,무릎높이,...,발너비,얼굴너비,손직선길이,손바닥직선길이,손안쪽가쪽직선길이,몸무게,체지방율,성별,나이,신발굽높이
0,8,24,F009,161.1,130.5,78.1,118.4,98.6,71.6,42.1,...,10.1,12.6,16.2,9.4,7.4,59.2,28.6,F,44,6


# 2. 수치 데이터 병합

In [3]:
directory_path = './신체정보데이터/'

def process_body_measurement_files(directory_path):
    # Load all CSV files in the directory
    csv_files = glob(os.path.join(directory_path, '*.csv'))

    # Initialize an empty list to store each file's processed DataFrame
    dataframes = []
    for file in csv_files:
        # Read each file
        raw_data = pd.read_csv(file, header=None)
        
        # Ensure the file has enough rows to process (at least 3 rows)
        if raw_data.shape[0] >= 3:
            # Adjust the column names, filtering out any non-string values
            column_labels = [str(label) if isinstance(label, str) else None for label in raw_data.iloc[1, 3:]]
            new_column_names = ['측정월', '측정일', '모델번호'] + column_labels
            df = pd.DataFrame([raw_data.iloc[2].values], columns=new_column_names)
            dataframes.append(df)

    # Concatenate all the individual dataframes into one
    if dataframes:
        combined_data = pd.concat(dataframes, ignore_index=True)
        return combined_data
    else:
        raise ValueError("No valid data found in the CSV files.")

combined_data = process_body_measurement_files(directory_path)
combined_data = combined_data.iloc[:, :-1]
combined_data




Unnamed: 0,측정월,측정일,모델번호,키,목뒤높이,엉덩이높이,겨드랑높이,허리높이,샅높이,무릎높이,...,발너비,얼굴너비,손직선길이,손바닥직선길이,손안쪽가쪽직선길이,몸무게,체지방율,성별,나이,신발굽높이
0,8,23,F004,164.1,137.8,83.2,123.3,100.1,75.4,42.4,...,9.3,13.8,17,10.1,7.6,49.4,21.8,F,30,2.5
1,8,23,F005,152.9,127.7,71.1,111.3,93.8,63.3,37,...,9.8,11.9,16.2,9.1,7.9,51.9,24.2,F,44,1.5
2,8,23,F006,159.7,132.2,74,116.5,93.9,66.5,37.5,...,9.6,12.3,16.7,9.9,7.3,47.6,20,F,50,3.2
3,8,23,F007,162.2,136.6,77.5,122.6,100.8,70.6,41.1,...,10.3,12,17.6,10.4,8,59.1,27.8,F,42,5.9
4,8,24,F008,157.6,132,74,114.6,95.5,66.6,41.3,...,10.3,13.9,16.7,9.4,7.8,61.6,36.4,F,65,4.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
988,11,30,M499,169.9,143.2,81.3,127.9,97.6,69.1,42.4,...,10.0,14.6,17.2,9.7,8.2,65.2,17.8,M,53,4.5
989,11,29,M500,173.7,145.3,84.6,128.0,101.5,74.8,44.4,...,10.3,14.1,18.4,10.1,8.0,67.2,20.3,M,30,2.9
990,8,25,F013,152.0,127.5,71.5,109.3,90.2,65.0,38.2,...,8.4,13.5,15.5,9.4,7.9,74.0,50.5,F,50.0,4.0
991,8,25,F014,160.2,134.7,73.6,117.6,96.5,68.4,40.3,...,9.6,13.9,17.6,10.3,7.9,51.2,25.0,F,57,2.4


# 3. 라벨 확인

In [4]:
combined_data.columns

Index(['측정월', '측정일', '모델번호', '키', '목뒤높이', '엉덩이높이', '겨드랑높이', '허리높이', '샅높이',
       '무릎높이', '머리둘레', '목둘레', '젖가슴둘레', '허리둘레', '배꼽수준허리둘레', '엉덩이둘레', '넙다리둘레',
       '무릎둘레', '장딴지둘레', '종아리최소둘레', '발목둘레', '편위팔둘레', '편팔꿈치둘레', '손목둘레', '위팔길이',
       '팔길이', '어깨사이너비', '머리수직길이', '얼굴수직길이', '발크기', '발너비', '얼굴너비', '손직선길이',
       '손바닥직선길이', '손안쪽가쪽직선길이', '몸무게', '체지방율', '성별', '나이', '신발굽높이'],
      dtype='object')

## 3-1 옷 사이즈를 결정하는 매개변수(무신사 참고)
- 상의: 총장, 어깨너비, 가슴단면, 소매길이
- 하의: 허리단면, 밑위, 엉덩이단면, 허벅지단면, 총장, 밑단단면

<img src="./상의사이즈.jpg" width="300" height="300">
<img src="./하의사이즈.jpg" width="300" height="300">

- 인풋 데이터 라벨: 몸무게, 키, 성별
- 아웃풋 데이터 라벨(상의): 
  1. 어깨사이너비(어깨너비) 
  2. 팔길이(소매길이) 
  3. 젖가슴둘레/2(가슴단면)  
  4. 목뒤높이-허리높이(총장)
- 아웃풋 데이터 라벨(하의): 
  1. 허리둘레/2(허리단면)  
  2. 엉덩이둘레/2(엉덩이단면)
  3. 넙다리둘레/2(허벅지단면) 
  4. 발목둘레/2(밑단단면)
  5. 허리높이(총장)
  6. 허리높이-샅높이(밑위)

In [5]:
# Extracting the required columns: '모델번호', '키', '몸무게', '성별'
input_data = combined_data[['모델번호', '키', '몸무게', '성별']]

# One-hot encoding for '성별' column
input_data = pd.get_dummies(input_data, columns=['성별'])

input_data.head()

Unnamed: 0,모델번호,키,몸무게,성별_F,성별_M
0,F004,164.1,49.4,True,False
1,F005,152.9,51.9,True,False
2,F006,159.7,47.6,True,False
3,F007,162.2,59.1,True,False
4,F008,157.6,61.6,True,False


In [6]:
def convert_to_numeric(df, columns):
    for col in columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    return df

numeric_columns = ['젖가슴둘레', '목뒤높이', '허리높이', '허리둘레', '엉덩이둘레', '넙다리둘레', '발목둘레', '샅높이']
numeric_combined_data = convert_to_numeric(combined_data, numeric_columns)

# Now, create the output labels based on calculations
output_data = pd.DataFrame({
    '모델번호': combined_data['모델번호'], # 모델번호
    '어깨너비': numeric_combined_data['어깨사이너비'],  # 1. 어깨사이너비(어깨너비)
    '소매길이': numeric_combined_data['팔길이'],        # 2. 팔길이(소매길이)
    '가슴단면': numeric_combined_data['젖가슴둘레'] / 2,  # 3. 젖가슴둘레/2(가슴단면)
    '상의총장': numeric_combined_data['목뒤높이'] - combined_data['허리높이'],  # 4. 목뒤높이-허리높이(상의총장)
    '허리단면': numeric_combined_data['허리둘레'] / 2,  # 5. 허리둘레/2(허리단면)
    '엉덩이단면': numeric_combined_data['엉덩이둘레'] / 2,  # 6. 엉덩이둘레/2(엉덩이단면)
    '허벅지단면': numeric_combined_data['넙다리둘레'] / 2,  # 7. 넙다리둘레/2(허벅지단면)
    '밑단단면': numeric_combined_data['발목둘레'] / 2,  # 8. 발목둘레/2(밑단단면)
    '하의총장': numeric_combined_data['허리높이'],       # 9. 허리높이(하의총장)
    '밑위': numeric_combined_data['허리높이'] - numeric_combined_data['샅높이']  # 10. 허리높이-샅높이(밑위)
})

output_data.head(10)

Unnamed: 0,모델번호,어깨너비,소매길이,가슴단면,상의총장,허리단면,엉덩이단면,허벅지단면,밑단단면,하의총장,밑위
0,F004,33.5,57.0,40.25,37.7,32.05,43.9,25.15,11.85,100.1,24.7
1,F005,33.1,49.0,40.25,33.9,33.1,45.25,27.5,12.4,93.8,30.5
2,F006,32.1,51.1,39.75,38.3,32.2,43.65,23.8,11.95,93.9,27.4
3,F007,33.6,53.1,46.5,35.8,37.4,46.05,26.1,12.35,100.8,30.2
4,F008,36.6,51.5,49.85,36.5,41.3,46.1,25.75,12.25,95.5,28.9
5,F009,35.4,52.1,45.65,31.9,37.95,48.9,28.9,12.1,98.6,27.0
6,F010,36.4,55.9,43.05,36.6,32.85,46.1,27.15,11.3,104.9,27.1
7,F011,34.4,52.0,41.1,35.4,34.35,45.95,26.75,12.25,100.6,30.5
8,F012,35.4,50.3,42.05,36.9,35.95,47.55,28.0,11.6,92.1,26.7
9,F016,35.5,52.3,45.3,37.5,39.0,51.35,30.85,12.45,98.9,27.9


In [7]:
# 모델 번호 목록 생성
model_numbers_to_remove = [f"F00{i}" for i in range(4, 9)] + [f"M00{i}" for i in range(4, 9)]

# 조건에 따라 데이터 삭제
input_data = input_data[~input_data['모델번호'].isin(model_numbers_to_remove)]
output_data = output_data[~output_data['모델번호'].isin(model_numbers_to_remove)]

In [304]:
# 모델 번호 목록 생성
model_numbers_to_remove = [f"F{i}" for i in range(309, 501)] + [f"M{i}" for i in range(309, 501)]

# 조건에 따라 데이터 삭제
input_data_filtered = input_data_filtered[~input_data_filtered['모델번호'].isin(model_numbers_to_remove)]
output_data_filtered = output_data_filtered[~output_data_filtered['모델번호'].isin(model_numbers_to_remove)]

In [24]:
input_data

Unnamed: 0,모델번호,키,몸무게,성별_F,성별_M
5,F009,161.1,59.2,True,False
6,F010,167.1,52.3,True,False
7,F011,160.8,52.7,True,False
8,F012,154,53,True,False
9,F016,164.1,66.4,True,False
...,...,...,...,...,...
988,M499,169.9,65.2,False,True
989,M500,173.7,67.2,False,True
990,F013,152.0,74.0,True,False
991,F014,160.2,51.2,True,False


In [23]:
output_data

Unnamed: 0,모델번호,어깨너비,소매길이,가슴단면,상의총장,허리단면,엉덩이단면,허벅지단면,밑단단면,하의총장,밑위
5,F009,35.4,52.1,45.65,31.9,37.95,48.90,28.90,12.10,98.6,27.0
6,F010,36.4,55.9,43.05,36.6,32.85,46.10,27.15,11.30,104.9,27.1
7,F011,34.4,52,41.10,35.4,34.35,45.95,26.75,12.25,100.6,30.5
8,F012,35.4,50.3,42.05,36.9,35.95,47.55,28.00,11.60,92.1,26.7
9,F016,35.5,52.3,45.30,37.5,39.00,51.35,30.85,12.45,98.9,27.9
...,...,...,...,...,...,...,...,...,...,...,...
988,M499,39.1,53.0,46.35,45.6,40.90,43.50,24.20,12.20,97.6,28.5
989,M500,39.0,56.1,45.85,43.8,36.15,46.95,27.65,13.10,101.5,26.7
990,F013,33.8,50.5,50.95,37.3,49.85,52.70,32.20,12.35,90.2,25.2
991,F014,35.8,54.8,40.50,38.2,34.25,44.50,26.10,12.35,96.5,28.1


In [28]:
import pandas as pd

# 숫자 형식이 아닌 값이 포함된 행을 삭제하고, 삭제된 행의 모델 번호를 반환하는 함수 정의
def remove_non_numeric_rows(df, model_column='모델번호'):
    # 원본 데이터에서 모델 번호와 데이터를 분리
    model_numbers = df[model_column]
    data_only = df.drop(columns=[model_column])
    
    # 데이터프레임의 모든 열을 숫자 형식으로 변환하고, 변환할 수 없는 값은 NaN으로 설정
    numeric_data = data_only.apply(pd.to_numeric, errors='coerce')
    
    # NaN 값이 포함된 행을 식별
    invalid_rows = numeric_data.isnull().any(axis=1)
    
    # 삭제된 행의 모델 번호를 출력
    removed_model_numbers = model_numbers[invalid_rows]
    print(f"제거된 행의 모델 번호: {removed_model_numbers.tolist()}")
    
    # NaN 값이 포함된 행을 삭제하여 숫자 형식이 아닌 값이 있는 행 제거
    cleaned_data = numeric_data.dropna().reset_index(drop=True)
    
    # 모델 번호를 맨 앞으로 다시 추가
    cleaned_data.insert(0, model_column, model_numbers[~invalid_rows].reset_index(drop=True))
    
    return cleaned_data

# 모델 번호를 제외하고 숫자 형식이 아닌 값이 포함된 행을 제거하고, 모델 번호를 출력
cleaned_input_data = remove_non_numeric_rows(input_data)
cleaned_output_data = remove_non_numeric_rows(output_data)


제거된 행의 모델 번호: ['M042']
제거된 행의 모델 번호: ['F044', 'F322', 'M072', 'M147', 'M368', 'M475']


In [31]:
# 제거할 모델 번호 리스트
input_remove_models = ['F044', 'F322', 'M072', 'M147', 'M368', 'M475']
output_remove_models = ['M042']

# input_data에서 특정 모델 번호에 해당하는 행을 제거
cleaned_input_data = cleaned_input_data[~cleaned_input_data['모델번호'].isin(input_remove_models)].reset_index(drop=True)

# output_data에서 특정 모델 번호에 해당하는 행을 제거
cleaned_output_data = cleaned_output_data[~cleaned_output_data['모델번호'].isin(output_remove_models)].reset_index(drop=True)

In [32]:
cleaned_input_data

Unnamed: 0,모델번호,키,몸무게,성별_F,성별_M
0,F009,161.1,59.2,True,False
1,F010,167.1,52.3,True,False
2,F011,160.8,52.7,True,False
3,F012,154.0,53.0,True,False
4,F016,164.1,66.4,True,False
...,...,...,...,...,...
971,M499,169.9,65.2,False,True
972,M500,173.7,67.2,False,True
973,F013,152.0,74.0,True,False
974,F014,160.2,51.2,True,False


In [33]:
cleaned_output_data

Unnamed: 0,모델번호,어깨너비,소매길이,가슴단면,상의총장,허리단면,엉덩이단면,허벅지단면,밑단단면,하의총장,밑위
0,F009,35.4,52.1,45.65,31.9,37.95,48.90,28.90,12.10,98.6,27.0
1,F010,36.4,55.9,43.05,36.6,32.85,46.10,27.15,11.30,104.9,27.1
2,F011,34.4,52.0,41.10,35.4,34.35,45.95,26.75,12.25,100.6,30.5
3,F012,35.4,50.3,42.05,36.9,35.95,47.55,28.00,11.60,92.1,26.7
4,F016,35.5,52.3,45.30,37.5,39.00,51.35,30.85,12.45,98.9,27.9
...,...,...,...,...,...,...,...,...,...,...,...
971,M499,39.1,53.0,46.35,45.6,40.90,43.50,24.20,12.20,97.6,28.5
972,M500,39.0,56.1,45.85,43.8,36.15,46.95,27.65,13.10,101.5,26.7
973,F013,33.8,50.5,50.95,37.3,49.85,52.70,32.20,12.35,90.2,25.2
974,F014,35.8,54.8,40.50,38.2,34.25,44.50,26.10,12.35,96.5,28.1


In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 모델 번호를 기준으로 input과 output 데이터프레임 정렬 (동일한 순서로 정렬)
cleaned_input_data = cleaned_input_data.sort_values(by="모델번호").reset_index(drop=True)
cleaned_output_data = cleaned_output_data.sort_values(by="모델번호").reset_index(drop=True)

# 두 데이터프레임에서 모델 번호가 같은지 확인
assert all(cleaned_input_data['모델번호'] == cleaned_output_data['모델번호']), "모델 번호가 일치하지 않습니다."

# 훈련 데이터와 검증 데이터를 80:20 비율로 분할
train_input, valid_input, train_output, valid_output = train_test_split(
    cleaned_input_data, cleaned_output_data, test_size=176/976, random_state=42, shuffle=True
)

# 결과 확인
print("Train Input Data Shape:", train_input.shape)
print("Valid Input Data Shape:", valid_input.shape)
print("Train Output Data Shape:", train_output.shape)
print("Valid Output Data Shape:", valid_output.shape)


Train Input Data Shape: (800, 5)
Valid Input Data Shape: (176, 5)
Train Output Data Shape: (800, 11)
Valid Output Data Shape: (176, 11)


In [36]:
# 저장할 경로 설정
train_input_path = './train_input.csv'
valid_input_path = './valid_input.csv'
train_output_path = './train_output.csv'
valid_output_path = './valid_output.csv'

# 데이터프레임 저장
train_input.to_csv(train_input_path, index=False)
valid_input.to_csv(valid_input_path, index=False)
train_output.to_csv(train_output_path, index=False)
valid_output.to_csv(valid_output_path, index=False)

train_input_path, valid_input_path, train_output_path, valid_output_path

('./train_input.csv',
 './valid_input.csv',
 './train_output.csv',
 './valid_output.csv')

In [273]:
import os
import random
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image

class BodyMeasurementDataset(Dataset):
    def __init__(self, input_data_path, output_data_path, image_folder, transform=None):
        # 데이터프레임 로드
        self.input_data = pd.read_csv(input_data_path)  # input_data 데이터프레임
        self.output_data = pd.read_csv(output_data_path)  # output_data 데이터프레임
        self.image_folder = image_folder  # 이미지가 저장된 폴더 경로
        self.transform = transform  # 이미지 전처리
        
    def __len__(self):
        return len(self.input_data)
    
    def __getitem__(self, idx):
        idx=0
        # 모델 번호를 통해 input과 output 데이터를 매칭
        model_number = self.input_data.iloc[idx]['모델번호']
        input_features = self.input_data.iloc[idx][['키', '몸무게', '성별_F', '성별_M']].values.astype(float)
        output_labels = self.output_data[self.output_data['모델번호'] == model_number].iloc[0, 1:].values.astype(float)
        # 이미지 폴더에서 임의로 이미지 선택
        image_dir = os.path.join(self.image_folder, model_number, "Image")
        image_file = random.choice(os.listdir(image_dir))  # 폴더에서 랜덤 이미지 선택
        image_path = os.path.join(image_dir, image_file)
        
        # 이미지 로드 및 전처리
        image = Image.open(image_path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        
        # Tensor로 변환
        input_features = torch.tensor(input_features, dtype=torch.float32)
        output_labels = torch.tensor(output_labels, dtype=torch.float32)
        
        return image, input_features, output_labels

# 이미지 전처리 (예시로, 이미지 크기 조정과 정규화 적용)
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Dataset과 DataLoader 생성
image_folder = './신체사진데이터'
dataset = BodyMeasurementDataset(input_data_path='./input_data.csv',
                                 output_data_path='./output_data.csv',
                                 image_folder=image_folder,
                                 transform=transform)

# DataLoader 설정
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


In [17]:
print(dataset[0])

(tensor([[[-0.7479, -0.7822, -0.8335,  ...,  1.2728,  1.2899,  1.2899],
         [-0.7479, -0.7822, -0.8335,  ...,  1.2557,  1.2557,  1.2728],
         [-0.7479, -0.7822, -0.8507,  ...,  1.2557,  1.2385,  1.2557],
         ...,
         [-1.3130, -1.2959, -1.3302,  ...,  1.3070,  1.2899,  1.2728],
         [-1.3302, -1.3302, -1.3473,  ...,  1.3242,  1.3070,  1.2728],
         [-1.3473, -1.3473, -1.3644,  ...,  1.3413,  1.3242,  1.3070]],

        [[-0.5301, -0.5651, -0.6176,  ...,  1.6232,  1.6232,  1.6232],
         [-0.5301, -0.5651, -0.6176,  ...,  1.6057,  1.6057,  1.6232],
         [-0.5301, -0.5651, -0.6176,  ...,  1.5882,  1.6057,  1.6057],
         ...,
         [-1.1078, -1.0728, -1.0728,  ...,  1.6232,  1.6057,  1.5882],
         [-1.1253, -1.0728, -1.0903,  ...,  1.6408,  1.6232,  1.5882],
         [-1.1429, -1.1078, -1.1078,  ...,  1.6583,  1.6408,  1.6232]],

        [[-0.0964, -0.1312, -0.1835,  ...,  2.0474,  2.0474,  2.0474],
         [-0.0964, -0.1312, -0.1835,  ...,  

In [297]:
import torch
import torch.nn as nn
import torchvision.models as models

class CustomModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(CustomModel, self).__init__()
        
        # Pretrained ResNet18 모델 로드 및 수정
        self.resnet = models.resnet18(pretrained=True)
        num_ftrs = self.resnet.fc.in_features
        self.resnet.fc = nn.Identity()  # ResNet18의 fully connected layer를 Identity로 변경
        
        # 추가 입력 데이터의 피처를 결합하기 위한 밀집층
        self.fc_input = nn.Sequential(
            nn.Linear(input_size, 32),
            nn.ReLU()
        )
        
        # 이미지 특징과 추가 입력 데이터를 결합한 후 최종 예측을 위한 레이어
        self.fc_combined = nn.Sequential(
            nn.Linear(num_ftrs + 32, 64),
            nn.ReLU(),
            nn.Linear(64, output_size)  # 아웃풋 데이터의 개수에 맞춰서 출력층 크기 설정
        )
    
    def forward(self, image, additional_input):
        # 이미지 데이터에서 특징 추출
        image_features = self.resnet(image)
        
        # 추가 입력 데이터 처리
        additional_features = self.fc_input(additional_input)
        
        # 이미지 특징과 추가 입력 데이터를 결합
        combined = torch.cat((image_features, additional_features), dim=1)
        
        # 최종 예측
        output = self.fc_combined(combined)
        return output

# 모델 초기화 (예시)
input_size = 4       # 추가 인풋 데이터의 크기 (키, 몸무게, 성별_F, 성별_M)
output_size = len(output_data.columns) - 1  # 아웃풋 데이터의 열 수에서 모델 번호를 제외한 나머지 개수
model = CustomModel(input_size=input_size, output_size=output_size)

# 손실 함수와 옵티마이저 설정
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


10




In [298]:
# 예시 학습 루프
num_epochs = 10
for epoch in range(num_epochs):
    for images, input_features, output_labels in dataloader:
        # 옵티마이저 초기화
        optimizer.zero_grad()
        
        # 모델 예측
        outputs = model(images, input_features)
        
        # 손실 계산
        loss = criterion(outputs, output_labels)
        print(f"Step Loss: {loss.item():.4f}")
        # 역전파
        loss.backward()
        optimizer.step()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

Step Loss: 2217.7896


KeyboardInterrupt: 