In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
from gensim.models import KeyedVectors
from tqdm import tqdm
import pandas as pd
import re
import torch

In [6]:
PATH = "/content/drive/MyDrive/dataset/"
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using {} device".format(device))

Using cuda device


In [12]:
# 70
# CATEGORYをencodeする
# b = business, t = science and technology, e = entertainment, m = health
def EncoderNN(sign):
    if sign == "b":
        code = 0
    elif sign == "t":
        code = 1
    elif sign == "e":
        code = 2
    elif sign == "m":
        code = 3
    else:
        print("Error")
    return code

# テキストをベクトルに変換する
def Text2Vec(text):
    lines = text.split(" ")
    # sapceで分けて

    vec_sum = 0
    length = 0

    for line in lines:
      # 単語を一つずつ読み込む
        try:
            temp = model.get_vector(line)
            # 単語のベクトルをもらう
            vec_sum += temp
            # ベクトルを合計する
            length += 1
        except:
            pass
            # 単語が存在しない場合は無視する

    # 平均ベクトルを計算する
    return vec_sum/length

def TorchData(data):
    df = pd.read_table(PATH + "{}.txt".format(data))
    # dataを読み込む

    sign_regrex = re.compile('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`|＄＃＠£â€™]')
    f_regrex = lambda x:sign_regrex.sub("", x)
    df["TITLE"] = df["TITLE"].map(f_regrex)
    # TITLE内の記号など削除する


    X_torch = torch.tensor(df["TITLE"].apply(Text2Vec)).to(device)
    torch.save(X_torch, PATH + "X_{}.pt".format(data))
    df["CATEGORY"] = df["CATEGORY"].map(EncoderNN)
    Y_torch = torch.tensor(df["CATEGORY"]).to(device)
    torch.save(Y_torch, PATH + "Y_{}.pt".format(data))
    # TITLEとCATEGORYの単語のベクトルを計算して、保存する

model = KeyedVectors.load_word2vec_format(PATH + "GoogleNews-vectors-negative300.bin", binary=True)
TorchData("train")
TorchData("test")
TorchData("valid")

In [13]:
# 71
import torch.nn as nn
import torch

class NeuralNetwork(nn.Module):
    def __init__(self, input_feature, output):
      super(NeuralNetwork, self).__init__()
      self.fc1 = nn.Linear(input_feature, output, bias=False)
      # このレイヤーはinput_featureからoutputへ線形変換をほどこする
      # 300—>4

      self.fc2 = nn.Softmax(dim=1)
      # Softmax関数をほどこする

    def forward(self, x):
      x = self.fc1(x)
      x = self.fc2(x)
      return x

X_train = torch.load(PATH + "X_train.pt")
model = NeuralNetwork(300, 4).to(device)
model(X_train)

tensor([[0.2555, 0.2560, 0.2607, 0.2278],
        [0.2472, 0.2722, 0.2358, 0.2448],
        [0.2663, 0.2710, 0.2308, 0.2319],
        ...,
        [0.2378, 0.2584, 0.2691, 0.2346],
        [0.2466, 0.2713, 0.2293, 0.2528],
        [0.2555, 0.2550, 0.2402, 0.2493]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)

In [15]:
# 72
loss_function = nn.CrossEntropyLoss(reduction="mean")

Y_train = torch.load(PATH + "Y_train.pt")
# dataを読み込む

Y_pred = model(X_train).to(device)
# X_trainで予測したY_pred

loss = loss_function(Y_pred, Y_train).to(device)


model.zero_grad()
# モデルパラメータの勾配をリセットする
loss.backward()
# バックプロパゲーションを実行します。

print("loss", loss.item())
print("勾配", model.fc1.weight.grad)


loss 1.3871394395828247
勾配 tensor([[ 0.0005, -0.0011,  0.0007,  ..., -0.0020, -0.0029,  0.0033],
        [ 0.0007,  0.0010, -0.0017,  ..., -0.0002,  0.0023, -0.0006],
        [-0.0027, -0.0011,  0.0029,  ...,  0.0029, -0.0011, -0.0012],
        [ 0.0015,  0.0013, -0.0020,  ..., -0.0007,  0.0017, -0.0015]],
       device='cuda:0')


In [20]:
# 73
from torch.utils.data import TensorDataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim

training_data = TensorDataset(X_train, Y_train)
train_dataloader = DataLoader(training_data,batch_size=128, shuffle=True)

optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)

    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # 予測とlossの計算
        pred = model(X)
        loss = loss_fn(pred, y)

        # バックプロパゲーション
        optimizer.zero_grad()
        # モデルパラメータの勾配をリセットする
        loss.backward()
        # バックプロパゲーションを実行する
        optimizer.step()
        # パラメータの勾配を使用してパラメータの値を調整する

        # 10バッチごとに、現在のlossとトレーニングの進捗状況をprintして
        if batch % 10 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_function, optimizer)
    print("-------------------------------\n")

torch.save(model.state_dict(), PATH + "SigleLayer.pth")
#モデルの保存
print("Done!")

Epoch 1
-------------------------------
loss: 1.363063  [    0/10672]
loss: 1.365040  [ 1280/10672]
loss: 1.362398  [ 2560/10672]
loss: 1.365509  [ 3840/10672]
loss: 1.366102  [ 5120/10672]
loss: 1.364123  [ 6400/10672]
loss: 1.363882  [ 7680/10672]
loss: 1.365993  [ 8960/10672]
loss: 1.362200  [10240/10672]
-------------------------------

Epoch 2
-------------------------------
loss: 1.363236  [    0/10672]
loss: 1.362759  [ 1280/10672]
loss: 1.361163  [ 2560/10672]
loss: 1.365413  [ 3840/10672]
loss: 1.361068  [ 5120/10672]
loss: 1.362050  [ 6400/10672]
loss: 1.363461  [ 7680/10672]
loss: 1.362600  [ 8960/10672]
loss: 1.363866  [10240/10672]
-------------------------------

Epoch 3
-------------------------------
loss: 1.359041  [    0/10672]
loss: 1.363871  [ 1280/10672]
loss: 1.362170  [ 2560/10672]
loss: 1.360437  [ 3840/10672]
loss: 1.361659  [ 5120/10672]
loss: 1.365584  [ 6400/10672]
loss: 1.361628  [ 7680/10672]
loss: 1.361935  [ 8960/10672]
loss: 1.362888  [10240/10672]
----