In [None]:
import numpy as np
import pandas as pd
import os
# Force synchronous CUDA kernel launches so that a GPU-side error is reported
# at the call that caused it (debugging aid). This is set here, before torch is
# imported in a later cell.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# Files: mount Google Drive and work from the project directory.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/grad_comp

# Hyper-parameter constants.
# NOTE(review): in the cells visible here only BATCH_SIZE is actually used;
# FEATURE_NUM appears only in commented-out vectorizer settings and the
# training loop defines its own max_epoch instead of reading EPOCH - confirm
# whether these two are still needed.
FEATURE_NUM = 5000
BATCH_SIZE = 128
EPOCH = 50

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/grad_comp


In [None]:
# Directory (under preprocess/) holding the pre-tokenised text files.
prepro = 'sudachi-neo-small/A'


def load_tokenized_lines(path):
    """Read a pre-tokenised text file and return one token list per line.

    Each line of the file is expected to hold space-separated tokens.

    Args:
        path: Path of the text file to read.

    Returns:
        A list with one entry per line, each entry being the list of tokens
        obtained by splitting that line on a single space.
    """
    # The context manager closes the handle even if reading fails.
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
    # A file that ends with a newline produces one trailing empty string.
    # Drop it by inspecting the data instead of deleting a hard-coded index,
    # so the loader keeps working if the number of lines changes.
    if lines and lines[-1] == '':
        lines.pop()
    return [line.split(' ') for line in lines]


# setup data
train_data = load_tokenized_lines('preprocess/' + prepro + '/text.prep_train.txt')
test_data = load_tokenized_lines('preprocess/' + prepro + '/text.prep_test.txt')
dev_data = load_tokenized_lines('preprocess/' + prepro + '/text.prep_dev.txt')

# Labels: one numeric label per line, loaded as NumPy arrays.
y_train = np.loadtxt('/content/drive/MyDrive/grad_comp/data/label.train.txt')
y_dev = np.loadtxt('/content/drive/MyDrive/grad_comp/data/label.dev.txt')
# The test labels come from a dummy file (placeholder values).
y_test = np.loadtxt('/content/drive/MyDrive/grad_comp/data/label.dummy.txt')

# Shift every label array by +2, in place. The printed unique values of the
# training labels after the shift are 0..4; the shift is undone (-2) when the
# predictions are written to disk.
for labels in (y_train, y_dev, y_test):
    labels += 2

# Show the class distribution of the training labels.
u, counts = np.unique(y_train, return_counts=True)
print(u)
print(counts)

[0. 1. 2. 3. 4.]
[3543 5593 9227 7760 3877]


In [None]:
# Experiment: TF-IDF features built from the pre-tokenised documents.

from sklearn.feature_extraction.text import TfidfVectorizer
# Alternative settings tried earlier (kept for reference):
#vectorizer = TfidfVectorizer(max_features=FEATURE_NUM,norm='l1',analyzer='char',smooth_idf=True)
#vectorizer = TfidfVectorizer(max_features=FEATURE_NUM)
# Each document is already a list of tokens, so the analyzer is the identity
# function: the vectorizer uses the tokens exactly as given.
vectorizer = TfidfVectorizer(analyzer=lambda x: x)

# Fit the vocabulary / IDF weights on the training set only, then reuse them.
X_train = vectorizer.fit_transform(train_data)
X_dev = vectorizer.transform(dev_data)
X_test = vectorizer.transform(test_data)

# Densify for the MLP. The sparse matrices are cast to float32 *before*
# calling toarray(): the dense float64 training matrix alone would need
# roughly twice the memory, and the tensors built from these arrays are
# float32 anyway, so the resulting values are the same.
X_train = X_train.astype(np.float32).toarray()
X_dev = X_dev.astype(np.float32).toarray()
X_test = X_test.astype(np.float32).toarray()

In [None]:
# Sanity check: (number of training documents, vocabulary size).
# The second value has to match the input width hard-coded in Net.fc1 (24830).
X_train.shape

(30000, 24830)

## MLP(多層パーセプトロン)への適用

In [None]:
# ====================
# Data format conversion (ndarray --> Tensor)
# ====================

import torch

print("変換前：", type(X_train), type(y_train))

# Feature matrices become float32 tensors.
x_train, x_dev, x_test = (
    torch.tensor(features, dtype=torch.float32)
    for features in (X_train, X_dev, X_test)
)

# Label arrays become int64 tensors (class indices for the loss function).
y_train, y_dev, y_test = (
    torch.tensor(labels, dtype=torch.int64)
    for labels in (y_train, y_dev, y_test)
)

print("変換後：", type(x_train), type(y_train))

変換前： <class 'numpy.ndarray'> <class 'numpy.ndarray'>
変換後： <class 'torch.Tensor'> <class 'torch.Tensor'>


In [None]:
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

# Mini-batch size shared by the training and validation loaders.
batch_size = BATCH_SIZE

# Pair every feature row with its label.
train = TensorDataset(x_train, y_train)
val = TensorDataset(x_dev, y_dev)
test = TensorDataset(x_test, y_test)

# Only the training data is shuffled; validation and test keep file order.
train_loader = DataLoader(train, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val, batch_size=batch_size, shuffle=False)
# The test loader yields one example per batch.
test_loader = DataLoader(test, batch_size=1, shuffle=False)

In [None]:
# ====================
# ネットワークを定義
# ====================

import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Fully connected classifier (MLP) with four ReLU hidden layers.

    Layer widths are in_features -> 2000 -> 500 -> 100 -> 10 -> num_classes.
    The output is raw logits (no softmax), as expected by nn.CrossEntropyLoss.

    Args:
        in_features: Width of the input feature vector. Defaults to 24830,
            the vocabulary size of the TF-IDF features used in this notebook;
            pass another value when the feature extraction changes.
        num_classes: Number of output units. Defaults to 5. Pass 1 to turn
            the last layer into a single regression output.
    """

    # Model structure
    def __init__(self, in_features=24830, num_classes=5):
        super().__init__()
        self.fc1 = nn.Linear(in_features, 2000)
        self.fc2 = nn.Linear(2000, 500)
        self.fc3 = nn.Linear(500, 100)
        self.fc4 = nn.Linear(100, 10)
        self.fc5 = nn.Linear(10, num_classes)

    # Forward pass
    def forward(self, x):
        """Map a (batch, in_features) float tensor to (batch, num_classes) logits."""
        h1 = F.relu(self.fc1(x))
        h2 = F.relu(self.fc2(h1))
        h3 = F.relu(self.fc3(h2))
        h4 = F.relu(self.fc4(h3))
        y = self.fc5(h4)
        return y

# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Create the network instance and move its parameters to that device.
net = Net().to(device)

# Multi-class cross-entropy loss, optimised with plain SGD.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.1)

In [None]:
# ====================
# Training loop
# ====================

max_epoch = 100
# NOTE(review): min_val_loss and eps are not used anywhere in this loop; they
# are kept in case a later cell relies on them - confirm and remove if not.
min_val_loss = 100
eps = 1e-6
eps = torch.tensor(eps)
for epoch in range(max_epoch):

    # ---- Mini-batch training ----
    # Explicit training mode (has no effect on layers that behave the same in
    # both modes, but keeps the loop correct if dropout/batch-norm is added).
    net.train()
    for x, t in train_loader:
        # Move the current mini-batch to the compute device.
        x = x.to(device)
        t = t.to(device)

        # Reset gradients accumulated by the previous step.
        optimizer.zero_grad()
        # Forward pass and loss.
        y = net(x)
        loss = criterion(y, t)
        # Back-propagation and parameter update.
        loss.backward()
        optimizer.step()

    # ---- Validation (no parameter updates, no gradient tracking) ----
    net.eval()
    val_loss_sum = 0.0
    val_count = 0
    with torch.no_grad():
        for x, t in val_loader:
            x = x.to(device)
            t = t.to(device)
            y = net(x)
            loss = criterion(y, t)
            # The criterion returns the mean over the batch (assumes the
            # default 'mean' reduction). Weight it by the batch size so that a
            # smaller final batch does not bias the epoch average, and use
            # .item() so only Python floats are accumulated.
            n_batch = t.size(0)
            val_loss_sum += loss.item() * n_batch
            val_count += n_batch
    # Keep val_loss a 0-d tensor as before; NaN when the loader is empty.
    if val_count:
        val_loss = torch.tensor(val_loss_sum / val_count)
    else:
        val_loss = torch.tensor(float('nan'))
    print("Epoch: %02d  val_loss: %.3f" % (epoch+1, val_loss))

In [None]:
# ====================
# Obtain the predicted labels
# ====================

predicted_batches = []
with torch.no_grad():
    for x, t in test_loader:
        # Move the batch to the same device as the network.
        x, t = x.to(device), t.to(device)
        y = net(x)
        # For every example pick the class with the highest score.
        predicted_batches.append(y.argmax(axis=1))

# Join the per-batch predictions into a single 1-D tensor.
preds = torch.concat(predicted_batches)

preds

tensor([2, 2, 2,  ..., 2, 3, 2], device='cuda:0')

In [None]:
# Write one predicted label per line, subtracting 2 to undo the +2 shift that
# was applied to the labels when they were loaded.
with open('MLP/1BMLP-.txt','w') as f:
    f.writelines(str(int(y_pred) - 2) + '\n' for y_pred in preds)

In [None]:
from sklearn.metrics import classification_report
# Gather the reference labels in the same order the test loader yields them.
# NOTE(review): the test labels were loaded from label.dummy.txt (marked as
# dummy data), so this report is probably not a meaningful score - confirm,
# or evaluate on the dev set instead.
golds = torch.concat([labels for _, labels in test_loader]).to('cpu')
# scikit-learn works on CPU data, so bring the predictions back from the GPU.
preds = preds.to('cpu')
print(classification_report(golds, preds, digits=3))