In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
from google.colab import files

In [None]:
# Upload files through google.colab's files.upload() and keep the returned
# object in `uploaded`; its keys are printed as file names in the next cell.
uploaded = files.upload()

Saving MicrobiomeMetadataDictionary.csv to MicrobiomeMetadataDictionary.csv
Saving MicrobiomeOTUtaxonomy.csv to MicrobiomeOTUtaxonomy.csv
Saving MicrobiomeWithMetadata.csv to MicrobiomeWithMetadata.csv


In [None]:
# Print one line per key of `uploaded` (the Korean label reads "uploaded file").
for filename in uploaded.keys():
    print(f'업로드된 파일: {filename}')

업로드된 파일: MicrobiomeMetadataDictionary.csv
업로드된 파일: MicrobiomeOTUtaxonomy.csv
업로드된 파일: MicrobiomeWithMetadata.csv


In [None]:
# Load the first file, MicrobiomeWithMetadata.csv, and preview its first rows.
df_metadata = pd.read_csv('MicrobiomeWithMetadata.csv')
print(
    "MicrobiomeWithMetadata.csv의 첫 5개 데이터:",
    df_metadata.head(),  # data preview
    sep="\n",
)

MicrobiomeWithMetadata.csv의 첫 5개 데이터:
   Diet  Source  Donor  CollectionMet  Sex          OTU0          OTU1  \
0     0       0      0              0    0  1.560000e-11  4.720000e-11   
1     0       1      0              0    0  2.360000e-11  9.530000e-11   
2     0       2      0              1    0  6.770000e-11  3.680000e-11   
3     0       2      0              0    0  5.520000e-11  9.890000e-11   
4     0       3      0              0    0  5.240000e-11  6.340000e-11   

           OTU2          OTU3          OTU4  ...       OTU6686       OTU6687  \
0  1.230000e-11  4.520000e-11  2.720000e-11  ...  6.660000e-11  3.020000e-11   
1  3.330000e-11  2.670000e-11  2.020000e-11  ...  3.260000e-12  5.390000e-11   
2  8.020000e-11  5.490000e-11  1.340000e-11  ...  7.230000e-11  6.300000e-12   
3  4.580000e-11  3.540000e-11  2.090000e-11  ...  9.100000e-11  2.940000e-11   
4  2.350000e-11  7.470000e-11  2.490000e-11  ...  1.500000e-11  4.900000e-11   

        OTU6688       OTU6689       

In [None]:
# Load the second file, MicrobiomeMetadataDictionary.csv.
df_metadata_dict = pd.read_csv('MicrobiomeMetadataDictionary.csv')
# Print the header line followed by the entire table (not just a preview).
print(
    "MicrobiomeMetadataDictionary.csv의 모든 데이터:",
    df_metadata_dict,
    sep="\n",
)

MicrobiomeMetadataDictionary.csv의 모든 데이터:
              Sex     Unnamed: 1
0               0           Male
1               1         Female
2             NaN            NaN
3           Donor            NaN
4               0     HMouseLFPP
5               1          CONVR
6               2          Human
7               3          Fresh
8               4         Frozen
9               5  HMouseWestern
10              6          CONVD
11            NaN            NaN
12           Diet            NaN
13              0           LFPP
14              1        Western
15              2          CARBR
16              3           FATR
17              4       Suckling
18              5          Human
19            NaN            NaN
20         Source            NaN
21              0         Cecum1
22              1         Cecum2
23              2         Colon1
24              3         Colon2
25              4          Feces
26              5            SI1
27              6           SI13
2

In [None]:
# Load the third file, MicrobiomeOTUtaxonomy.csv, and preview its first rows.
# NOTE(review): the printed preview shows a trailing tab in the '#OTU ID'
# header and in its values; consider stripping it before joining on OTU ids.
df_otu_taxonomy = pd.read_csv('MicrobiomeOTUtaxonomy.csv')
print(
    "MicrobiomeOTUtaxonomy.csv의 첫 5개 데이터:",
    df_otu_taxonomy.head(),  # data preview
    sep="\n",
)

MicrobiomeOTUtaxonomy.csv의 첫 5개 데이터:
  #OTU ID\t   Kingdom         Phylum          Class            Order  \
0    OTU0\t  Bacteria            NaN            NaN              NaN   
1    OTU1\t  Bacteria     Firmicutes     Clostridia    Clostridiales   
2    OTU2\t  Bacteria     Firmicutes        Bacilli  Lactobacillales   
3    OTU3\t  Bacteria  Bacteroidetes  Bacteroidetes    Bacteroidales   
4    OTU4\t  Bacteria  Bacteroidetes            NaN              NaN   

               Family            Genus Unnamed: 7 Unnamed: 8  
0                 NaN              NaN        NaN        NaN  
1     Ruminococcaceae              NaN        NaN        NaN  
2     Enterococcaceae     Enterococcus        NaN        NaN  
3  Porphyromonadaceae  Parabacteroides        NaN        NaN  
4                 NaN              NaN        NaN        NaN  


In [None]:
# The 'Diet' column is the target (y); every remaining column is a feature (X).
y = df_metadata['Diet']
X = df_metadata.drop('Diet', axis=1)

In [None]:
# Split first, then standardize (80:20 split, same test_size/random_state as before).
# The scaler must learn its mean/variance from the training rows only: fitting it
# on the full dataset lets test-set statistics leak into training and makes the
# reported test accuracy optimistic.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training split, then apply the same transform to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Convert to PyTorch tensors: float32 features, int64 class labels.
X_train = torch.tensor(X_train_scaled, dtype=torch.float32)
X_test = torch.tensor(X_test_scaled, dtype=torch.float32)
y_train = torch.tensor(y_train_raw.values, dtype=torch.long)
y_test = torch.tensor(y_test_raw.values, dtype=torch.long)

# Full-dataset variables are still defined in case another cell reads them;
# they use the scaler fitted on the training split above.
X_scaled = scaler.transform(X)
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.long)

In [None]:
# Print the shapes of the training and test feature tensors
# (the Korean labels read "training data size" / "test data size").
print(f'훈련 데이터 크기: {X_train.shape}, 테스트 데이터 크기: {X_test.shape}')

훈련 데이터 크기: torch.Size([540, 6700]), 테스트 데이터 크기: torch.Size([135, 6700])


In [None]:
# Model 1: BasicMLP
class BasicMLP(nn.Module):
    """Fully connected classifier with two hidden layers of equal width.

    Layout: input_size -> hidden_size -> hidden_size -> output_size, with a
    ReLU after each hidden layer. ``forward`` returns the output of the last
    linear layer as-is (no softmax is applied here).
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply both hidden layers (each followed by ReLU), then the output layer."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = self.relu(layer(hidden))
        return self.fc3(hidden)

In [None]:
# Model 2: LargerMLP
class LargerMLP(nn.Module):
    """Fully connected classifier with three hidden layers.

    Layout: input_size -> hidden_size -> hidden_size -> hidden_size // 2
    -> num_classes, with a ReLU after each hidden layer. ``forward`` returns
    the output of the last linear layer as-is (no softmax is applied here).

    Args:
        input_size: Number of input features.
        hidden_size: Width of the first two hidden layers; the third hidden
            layer is half as wide (at least 1). ``hidden_size=128`` gives the
            128 / 128 / 64 layout.
        num_classes: Number of output units.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # Previously the widths were hard-coded and `hidden_size` was ignored.
        narrow_size = max(1, hidden_size // 2)
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, narrow_size)
        self.fc4 = nn.Linear(narrow_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply the three hidden layers (each followed by ReLU), then the output layer."""
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        return self.fc4(x)

In [None]:
# Model 3: MLPWithDropoutAndBatchNorm
class MLPWithDropoutAndBatchNorm(nn.Module):
    """Three-hidden-layer classifier with batch normalization and dropout.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout(p=0.5).
    Layout: input_size -> hidden_size -> hidden_size -> hidden_size // 2
    -> num_classes. ``forward`` returns the output of the last linear layer
    as-is (no softmax is applied here).

    Args:
        input_size: Number of input features.
        hidden_size: Width of the first two hidden layers; the third hidden
            layer is half as wide (at least 1). ``hidden_size=128`` gives the
            128 / 128 / 64 layout.
        num_classes: Number of output units.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # Previously the widths were hard-coded and `hidden_size` was ignored.
        narrow_size = max(1, hidden_size // 2)

        self.fc1 = nn.Linear(input_size, hidden_size)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.dropout1 = nn.Dropout(p=0.5)

        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.bn2 = nn.BatchNorm1d(hidden_size)
        self.dropout2 = nn.Dropout(p=0.5)

        self.fc3 = nn.Linear(hidden_size, narrow_size)
        self.bn3 = nn.BatchNorm1d(narrow_size)
        self.dropout3 = nn.Dropout(p=0.5)

        self.fc4 = nn.Linear(narrow_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply the three normalized hidden stages with dropout, then the output layer."""
        x = self.relu(self.bn1(self.fc1(x)))
        x = self.dropout1(x)
        x = self.relu(self.bn2(self.fc2(x)))
        x = self.dropout2(x)
        x = self.relu(self.bn3(self.fc3(x)))
        x = self.dropout3(x)
        return self.fc4(x)

In [None]:
# Shared hyperparameters
input_size = X_train.shape[1]  # input width (number of features)
# Output width (number of 'Diet' classes). CrossEntropyLoss takes integer class
# indices in [0, C), so size the layer from the largest label id: counting
# distinct labels would under-size it if the ids are not contiguous from 0.
output_size = int(y.max()) + 1
num_epochs = 20  # number of training epochs
learning_rate = 0.001  # learning rate

In [None]:
# Shared training / evaluation routine
def train_and_evaluate(model, criterion, optimizer, X_train, y_train, X_test, y_test, num_epochs):
    """Train ``model`` for ``num_epochs`` epochs, then return its test accuracy.

    Every epoch is one full-batch step: a forward pass over all of
    ``X_train``, ``criterion(outputs, y_train)``, a backward pass and a single
    ``optimizer.step()``. The loss is printed on every 10th epoch.

    Returns:
        Accuracy in percent: 100 * (rows of ``X_test`` whose highest-scoring
        output index equals the matching entry of ``y_test``) / number of rows.
    """
    for epoch in range(num_epochs):
        model.train()  # training mode
        optimizer.zero_grad()  # clear gradients left over from the previous step
        loss = criterion(model(X_train), y_train)
        loss.backward()
        optimizer.step()

        is_report_epoch = (epoch + 1) % 10 == 0
        if is_report_epoch:
            print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

    # Evaluation: switch to eval mode and skip gradient tracking.
    model.eval()
    with torch.no_grad():
        predicted = torch.max(model(X_test), 1).indices
    num_correct = (predicted == y_test).sum().item()
    return 100 * num_correct / y_test.size(0)

In [None]:
# Seed PyTorch's global RNG before any layer is created so the random weight
# initialisation below is the same on every run of this cell; without it the
# reported accuracies change from run to run.
torch.manual_seed(42)

# Model 1: BasicMLP
model1 = BasicMLP(input_size, hidden_size=64, output_size=output_size)
criterion = nn.CrossEntropyLoss()  # shared by all three models
optimizer1 = optim.Adam(model1.parameters(), lr=learning_rate)

# Model 2: LargerMLP
model2 = LargerMLP(input_size, hidden_size=128, num_classes=output_size)
optimizer2 = optim.Adam(model2.parameters(), lr=learning_rate)

# Model 3: MLPWithDropoutAndBatchNorm
model3 = MLPWithDropoutAndBatchNorm(input_size, hidden_size=128, num_classes=output_size)
optimizer3 = optim.Adam(model3.parameters(), lr=learning_rate)

In [None]:
# Train model1 with optimizer1 for num_epochs epochs and print the accuracy
# that train_and_evaluate returns for (X_test, y_test).
print("\nTraining and evaluating BasicMLP...")
accuracy1 = train_and_evaluate(model1, criterion, optimizer1, X_train, y_train, X_test, y_test, num_epochs)
print(f"BasicMLP Test Accuracy: {accuracy1:.2f}%")


Training and evaluating BasicMLP...
Epoch [10/20], Loss: 0.1533
Epoch [20/20], Loss: 0.0120
BasicMLP Test Accuracy: 92.59%


In [None]:
# Train model2 with optimizer2 for num_epochs epochs and print the accuracy
# that train_and_evaluate returns for (X_test, y_test).
print("\nTraining and evaluating LargerMLP...")
accuracy2 = train_and_evaluate(model2, criterion, optimizer2, X_train, y_train, X_test, y_test, num_epochs)
print(f"LargerMLP Test Accuracy: {accuracy2:.2f}%")


Training and evaluating LargerMLP...
Epoch [10/20], Loss: 0.3973
Epoch [20/20], Loss: 0.0189
LargerMLP Test Accuracy: 94.81%


In [None]:
# Train model3 with optimizer3 for num_epochs epochs and print the accuracy
# that train_and_evaluate returns for (X_test, y_test).
print("\nTraining and evaluating MLPWithDropoutAndBatchNorm...")
accuracy3 = train_and_evaluate(model3, criterion, optimizer3, X_train, y_train, X_test, y_test, num_epochs)
print(f"MLPWithDropoutAndBatchNorm Test Accuracy: {accuracy3:.2f}%")


Training and evaluating MLPWithDropoutAndBatchNorm...
Epoch [10/20], Loss: 1.4897
Epoch [20/20], Loss: 0.9985
MLPWithDropoutAndBatchNorm Test Accuracy: 95.56%
