In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [None]:
from datasets import load_dataset

# Load only the "train" split of the "sst2" configuration of the "glue" dataset.
train_ds = load_dataset("glue", "sst2", split="train")

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

In [None]:
def transform_data(X_train, X_test, max_features=None):
    """
    Convert raw text into dense TF-IDF feature matrices.

    The vectorizer is fitted on the training text only and then applied,
    unchanged, to the test text. Fitting on the test text would derive the
    vocabulary and IDF weights from the evaluation data, which leaks test
    information into the features.

    Input:
    - X_train, X_test: Series containing the text data for training and testing respectively.
    - max_features: Optional cap on the vocabulary size, forwarded to
      TfidfVectorizer. The default (None) keeps every term seen in X_train.
      Because the results are densified, each returned array has shape
      (n_samples, vocabulary_size); set this to bound memory use on a large
      training corpus.

    Output:
    - X_train_tfidf, X_test_tfidf: Transformed text data in TF-IDF format for training and testing respectively.
    - vectorizer: TfidfVectorizer object fitted on X_train.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)
    # Learn vocabulary and IDF statistics from the training split only.
    X_train_tfidf = vectorizer.fit_transform(X_train).toarray()
    # Project the test split onto the vocabulary learned above; terms that
    # never appeared in training are ignored.
    X_test_tfidf = vectorizer.transform(X_test).toarray()
    return X_train_tfidf, X_test_tfidf, vectorizer

In [None]:
# Evaluation should be done using test_ds
# NOTE(review): the CSV path is an absolute Google Drive location, so this cell presumably
# needs Drive mounted at /content/drive first — no mount step is visible here; confirm.
# The CSV is loaded without a split name, so its rows are exposed under the 'train' key;
# that is why 'train' is indexed even though this is the held-out evaluation set.
test_ds = load_dataset("csv", data_files="/content/drive/MyDrive/인지개/test_dataset.csv")['train']
# Pull the text column and the label column out as separate variables.
X_test, y_test = test_ds['sentence'], test_ds['label']

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
class MLPmodel(nn.Module):
    """Fully connected network: Linear+ReLU per hidden width, then Linear+Sigmoid.

    Args:
        input_size: Number of input features.
        output_size: Number of output units.
        hidden_sizes: Iterable of hidden layer widths, applied in order.
            An empty iterable yields a single Linear layer followed by Sigmoid.
    """

    def __init__(self, input_size, output_size, hidden_sizes):
        super().__init__()

        # Consecutive entries of `widths` are the (in, out) sizes of each hidden layer.
        widths = [input_size, *hidden_sizes]

        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU()]

        # Output head: the sigmoid squashes every output unit into (0, 1).
        stack += [nn.Linear(widths[-1], output_size), nn.Sigmoid()]

        self.layers = nn.ModuleList(stack)

    def forward(self, x):
        """Pass `x` through every layer in order and return the final activation."""
        out = x
        for module in self.layers:
            out = module(out)
        return out

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, classification_report

X_train, y_train = train_ds['sentence'], train_ds['label']

X_train_tfidf, X_test_tfidf, vectorizer = transform_data(X_train, X_test)

X_train_tensor = torch.tensor(X_train_tfidf, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)

X_test_tensor = torch.tensor(X_test_tfidf, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Fix the RNG so weight initialisation (and therefore the reported metrics)
# is the same on every run of this cell.
torch.manual_seed(42)

input_size = X_train_tfidf.shape[1]
output_size = 1
hidden_sizes = [256, 128, 64, 32]
model = MLPmodel(input_size, output_size, hidden_sizes)

optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.BCELoss()

# Full-batch training: every epoch is one optimizer step over the whole training set.
num_epochs = 200
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    # The model emits shape (N, 1); drop only that trailing axis to match the (N,) targets.
    loss = criterion(outputs.squeeze(1), y_train_tensor)
    loss.backward()
    optimizer.step()
print("Training complete.")

model.eval()

# Model evaluation
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    # Threshold the sigmoid probabilities at 0.5 to obtain hard 0/1 predictions.
    test_predictions = (test_outputs > 0.5).float().flatten()

# classification_report expects (y_true, y_pred). With the arguments swapped,
# precision and recall trade places and "support" counts predictions instead
# of true labels.
print(classification_report(y_test, test_predictions))

Training complete.
              precision    recall  f1-score   support

         0.0       0.67      0.80      0.73        45
         1.0       0.80      0.67      0.73        55

    accuracy                           0.73       100
   macro avg       0.74      0.74      0.73       100
weighted avg       0.74      0.73      0.73       100



In [None]:

from sklearn.metrics import accuracy_score, classification_report

# Fix the RNG so weight initialisation (and therefore the reported metrics)
# is the same on every run of this cell.
torch.manual_seed(42)

# Same setup as the previous experiment, with an extra 512-unit hidden layer in front.
input_size = X_train_tfidf.shape[1]
output_size = 1
hidden_sizes = [512, 256, 128, 64, 32]
model = MLPmodel(input_size, output_size, hidden_sizes)

optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.BCELoss()

# Full-batch training: every epoch is one optimizer step over the whole training set.
num_epochs = 200
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    # The model emits shape (N, 1); drop only that trailing axis to match the (N,) targets.
    loss = criterion(outputs.squeeze(1), y_train_tensor)
    loss.backward()
    optimizer.step()
print("Training complete.")

model.eval()

# Model evaluation
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    # Threshold the sigmoid probabilities at 0.5 to obtain hard 0/1 predictions.
    test_predictions = (test_outputs > 0.5).float().flatten()

# classification_report expects (y_true, y_pred). With the arguments swapped,
# precision and recall trade places and "support" counts predictions instead
# of true labels.
print(classification_report(y_test, test_predictions))

Training complete.
              precision    recall  f1-score   support

         0.0       0.74      0.75      0.75        53
         1.0       0.72      0.70      0.71        47

    accuracy                           0.73       100
   macro avg       0.73      0.73      0.73       100
weighted avg       0.73      0.73      0.73       100



In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Fix the RNG so weight initialisation (and therefore the reported metrics)
# is the same on every run of this cell.
torch.manual_seed(42)

# Same five-hidden-layer architecture, trained with a larger learning rate
# for fewer epochs.
input_size = X_train_tfidf.shape[1]
output_size = 1
hidden_sizes = [512, 256, 128, 64, 32]
model = MLPmodel(input_size, output_size, hidden_sizes)

optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.BCELoss()

# Full-batch training: every epoch is one optimizer step over the whole training set.
num_epochs = 100
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    # The model emits shape (N, 1); drop only that trailing axis to match the (N,) targets.
    loss = criterion(outputs.squeeze(1), y_train_tensor)
    loss.backward()
    optimizer.step()
print("Training complete.")

model.eval()

# Model evaluation
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    # Threshold the sigmoid probabilities at 0.5 to obtain hard 0/1 predictions.
    test_predictions = (test_outputs > 0.5).float().flatten()

# classification_report expects (y_true, y_pred). With the arguments swapped,
# precision and recall trade places and "support" counts predictions instead
# of true labels.
print(classification_report(y_test, test_predictions))

Training complete.
              precision    recall  f1-score   support

         0.0       0.74      0.71      0.73        56
         1.0       0.65      0.68      0.67        44

    accuracy                           0.70       100
   macro avg       0.70      0.70      0.70       100
weighted avg       0.70      0.70      0.70       100

