In [1]:
!nvidia-smi

Fri Aug  2 23:10:41 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 552.22                 Driver Version: 552.22         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3050 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   66C    P8              6W /   35W |      14MiB /   4096MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import torch
print(torch.cuda.is_available())

print(torch.version.cuda)
print(torch.cuda.get_arch_list())
print(torch.cuda.get_device_capability())

True
11.7
['sm_37', 'sm_50', 'sm_60', 'sm_61', 'sm_70', 'sm_75', 'sm_80', 'sm_86', 'compute_37']
(8, 6)


In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import MinMaxScaler
from lstm_model import LSTMModel
from lstm_model import BiLSTMModel

In [4]:
import pandas as pd
import torch
from torch.utils.data import Dataset

def make_dataset_before_tokenizer(filename, window_size):
    # filename = r'sentence_file/2022gendai_info.csv'
    df = pd.read_csv(filename)

    # window_size = 1, 2, 3, 4
    
    if window_size == 4:
        df['b_text'] = df[['b4', 'b3', 'b2', 'b1']].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)
        df['a_text'] = df[['a1', 'a2', 'a3', 'a4']].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)
    elif window_size == 3:
        df['b_text'] = df[['b3', 'b2', 'b1']].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)
        df['a_text'] = df[['a1', 'a2', 'a3']].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)
    elif window_size == 2:
        df['b_text'] = df[['b2', 'b1']].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)
        df['a_text'] = df[['a1', 'a2']].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)
    else:
        df['b_text'] = df[['b1']].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)
        df['a_text'] = df[['a1']].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)
    
    df['pose'] = '@@@'
    
    # データの整形
    output = df[['b_text','pose','a_text', 'category']]
    
    return output

"""
Window Size 4
Maximum b_text length: 465 characters
Maximum a_text length: 352 characters
Window Size 3
Maximum b_text length: 338 characters
Maximum a_text length: 252 characters
Window Size 2
Maximum b_text length: 273 characters
Maximum a_text length: 202 characters
Window Size 1
Maximum b_text length: 146 characters
Maximum a_text length: 124 characters
"""

class ConjunctionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=468):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        b_text = self.texts.iloc[idx]['b_text']
        a_text = self.texts.iloc[idx]['a_text']
        text = f"{b_text} [SEP] {a_text}"
        label = self.labels.iloc[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [16]:
import glob
from transformers import BertTokenizer, BertJapaneseTokenizer
import torch
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split
file_paths = glob.glob('sentence_file/*')
window_size = 1 # TODO: window_size = 1, 2, 3, 4 試す
output_list = [make_dataset_before_tokenizer(file, window_size) for file in file_paths]
df = pd.concat(output_list, ignore_index=True)
# トレーニングとテストデータに分割
train_texts, valid_texts, train_labels, valid_labels = train_test_split(df[['b_text','a_text']], df['category'], test_size=0.2, random_state=42)

# Tokenizerの準備
tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-base-japanese')

# データローダーの準備
train_dataset = ConjunctionDataset(train_texts, train_labels, tokenizer)
valid_dataset = ConjunctionDataset(valid_texts, valid_labels, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=16, shuffle=False)

KeyError: "['@@@'] not in index"

In [14]:
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

In [15]:
from transformers import AdamW
from torch.optim.lr_scheduler import StepLR
import numpy as np
from lstm_model import LSTMModel
from lstm_model import BiLSTMModel
import tqdm

# モデルの準備
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BiLSTMModel(input_size=1, hidden_size=256, num_layers=2, output_size=6).to(device)  # 入力サイズはBERTの出力サイズ(768)、隠れ層サイズ、層数は例として指定

# オプティマイザーとスケジューラーの設定
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
scheduler = StepLR(optimizer, step_size=2, gamma=0.1)
loss_fn = nn.CrossEntropyLoss().to(device)

# 訓練用エポック関数
def train_epoch(model, dataloader, loss_fn, optimizer, device, scheduler):
    model = model.train()
    losses = []
    correct_predictions = 0

    for data in dataloader:
        input_ids = data['input_ids'].to(device)
        input_ids = input_ids.float()
        input_ids = input_ids.unsqueeze(2)
        labels = data['labels'].to(device)

        outputs = model(input_ids)
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, labels)

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / len(dataloader.dataset), np.mean(losses)

# 検証用エポック関数
def eval_model(model, dataloader, loss_fn, device):
    model = model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for data in dataloader:
            input_ids = data['input_ids'].to(device)
            input_ids = input_ids.float()
            input_ids = input_ids.unsqueeze(2) # バッチサイズ(16)*ウィンドウサイズ(465, 時系列のイメージ)*特徴量(1)
            labels = data['labels'].to(device)

            outputs = model(input_ids)
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, labels)

            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    return correct_predictions.double() / len(dataloader.dataset), np.mean(losses)

# トレーニングループ
epochs = 10

for epoch in tqdm.tqdm(range(epochs)): # TODO: tqdmをいい感じの場所に置ければ置き直す
    train_acc, train_loss = train_epoch(model, train_dataloader, loss_fn, optimizer, device, scheduler)
    print(f'Epoch {epoch + 1}/{epochs}')
    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(model, valid_dataloader, loss_fn, device)
    print(f'Validation loss {val_loss} accuracy {val_acc}')


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/10
Train loss 1.789623327922543 accuracy 0.21981028821597956


 10%|█         | 1/10 [01:15<11:16, 75.13s/it]

Validation loss 1.7897713004156601 accuracy 0.2137126185266229
Epoch 2/10
Train loss 1.7896208109730534 accuracy 0.21981028821597956


 20%|██        | 2/10 [02:27<09:46, 73.31s/it]

Validation loss 1.7897713004156601 accuracy 0.2137126185266229


 20%|██        | 2/10 [02:28<09:52, 74.12s/it]


KeyboardInterrupt: 

In [8]:
"""
import pandas as pd
from janome.tokenizer import Tokenizer
from transformers import BertJapaneseTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

def load_and_preprocess_data(file_list, category_mapping, n_sentences):
    dfs = []
    for filename in file_list:
        df = pd.read_csv(f'sentence_file/{filename}')
        # データの整形
        df['text'] = df[['b4', 'b3', 'b2', 'b1', 'a1', 'a2', 'a3', 'a4'][:n_sentences*2]].apply(
            lambda x: ' '.join(x.dropna().astype(str)), axis=1)
        df = df[['text', 'category']]
        dfs.append(df)
    
    # 全データフレームを結合
    combined_df = pd.concat(dfs, ignore_index=True)
    
    # カテゴリを数値に変換
    combined_df['category'] = combined_df['category'].map(category_mapping) # MEMO: これいらない気がする
    
    return combined_df

import os
# dataフォルダ内の全ファイルに対して適用
data_folder = 'sentence_file'
file_list = []
for filename in os.listdir(data_folder):
    if filename.endswith('.csv'):
        file_list.append(filename)

# 前後何文を使用するか指定
n_sentences = 4

# データの読み込みと前処理
df = load_and_preprocess_data(file_list, category_mapping, n_sentences)

# トレーニングとテストデータに分割
train_texts, test_texts, train_labels, test_labels = train_test_split(df['text'], df['category'], test_size=0.2, random_state=42)

# Tokenizerの準備
tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-base-japanese')

class ConjunctionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# データローダーの準備
train_dataset = ConjunctionDataset(train_texts, train_labels, tokenizer)
test_dataset = ConjunctionDataset(test_texts, test_labels, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)
"""


"\nimport pandas as pd\nfrom janome.tokenizer import Tokenizer\nfrom transformers import BertJapaneseTokenizer\nimport torch\nfrom torch.utils.data import Dataset, DataLoader\nfrom sklearn.model_selection import train_test_split\n\ndef load_and_preprocess_data(file_list, category_mapping, n_sentences):\n    dfs = []\n    for filename in file_list:\n        df = pd.read_csv(f'sentence_file/{filename}')\n        # データの整形\n        df['text'] = df[['b4', 'b3', 'b2', 'b1', 'a1', 'a2', 'a3', 'a4'][:n_sentences*2]].apply(\n            lambda x: ' '.join(x.dropna().astype(str)), axis=1)\n        df = df[['text', 'category']]\n        dfs.append(df)\n    \n    # 全データフレームを結合\n    combined_df = pd.concat(dfs, ignore_index=True)\n    \n    # カテゴリを数値に変換\n    combined_df['category'] = combined_df['category'].map(category_mapping) # MEMO: これいらない気がする\n    \n    return combined_df\n\nimport os\n# dataフォルダ内の全ファイルに対して適用\ndata_folder = 'sentence_file'\nfile_list = []\nfor filename in os.listdir(data_folde

In [9]:
"""
import torch.nn as nn
from transformers import BertModel

class ConjunctionClassifier(nn.Module):
    def __init__(self, n_classes):
        super(ConjunctionClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('cl-tohoku/bert-base-japanese')
        self.lstm = nn.LSTM(768, 128, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(128*2, n_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output, _ = self.lstm(outputs.last_hidden_state)
        avg_pool = torch.mean(sequence_output, 1)
        logits = self.fc(avg_pool)
        return logits

# モデルの初期化
model = ConjunctionClassifier(n_classes=6)
"""

"\nimport torch.nn as nn\nfrom transformers import BertModel\n\nclass ConjunctionClassifier(nn.Module):\n    def __init__(self, n_classes):\n        super(ConjunctionClassifier, self).__init__()\n        self.bert = BertModel.from_pretrained('cl-tohoku/bert-base-japanese')\n        self.lstm = nn.LSTM(768, 128, batch_first=True, bidirectional=True)\n        self.fc = nn.Linear(128*2, n_classes)\n\n    def forward(self, input_ids, attention_mask):\n        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)\n        sequence_output, _ = self.lstm(outputs.last_hidden_state)\n        avg_pool = torch.mean(sequence_output, 1)\n        logits = self.fc(avg_pool)\n        return logits\n\n# モデルの初期化\nmodel = ConjunctionClassifier(n_classes=6)\n"

In [10]:
"""
from transformers import AdamW
from torch.optim.lr_scheduler import StepLR
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
scheduler = StepLR(optimizer, step_size=2, gamma=0.1)
loss_fn = nn.CrossEntropyLoss().to(device)

def train_epoch(model, dataloader, loss_fn, optimizer, device, scheduler):
    model = model.train()
    losses = []
    correct_predictions = 0

    for data in dataloader:
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, labels)

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / len(dataloader.dataset), np.mean(losses)

def eval_model(model, dataloader, loss_fn, device):
    model = model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for data in dataloader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, labels)

            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    return correct_predictions.double() / len(dataloader.dataset), np.mean(losses)

# トレーニングループ
epochs = 10

for epoch in range(epochs):
    train_acc, train_loss = train_epoch(model, train_dataloader, loss_fn, optimizer, device, scheduler)
    print(f'Epoch {epoch + 1}/{epochs}')
    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(model, test_dataloader, loss_fn, device)
    print(f'Validation loss {val_loss} accuracy {val_acc}')
"""

'\nfrom transformers import AdamW\nfrom torch.optim.lr_scheduler import StepLR\nimport numpy as np\n\ndevice = torch.device("cuda" if torch.cuda.is_available() else "cpu")\nmodel = model.to(device)\n\noptimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)\nscheduler = StepLR(optimizer, step_size=2, gamma=0.1)\nloss_fn = nn.CrossEntropyLoss().to(device)\n\ndef train_epoch(model, dataloader, loss_fn, optimizer, device, scheduler):\n    model = model.train()\n    losses = []\n    correct_predictions = 0\n\n    for data in dataloader:\n        input_ids = data[\'input_ids\'].to(device)\n        attention_mask = data[\'attention_mask\'].to(device)\n        labels = data[\'labels\'].to(device)\n\n        outputs = model(input_ids=input_ids, attention_mask=attention_mask)\n        _, preds = torch.max(outputs, dim=1)\n        loss = loss_fn(outputs, labels)\n\n        correct_predictions += torch.sum(preds == labels)\n        losses.append(loss.item())\n\n        loss.backward()\n