## 必要なライブラリのインストール

In [2]:
!pip install graphviz
!pip install pgmpy



## ドライブのマウント

In [3]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# notebook can read files stored on Drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Make the `submit` folder on the mounted Drive the working directory;
# relative paths such as './data/...' are resolved against it.
%cd '/content/drive/MyDrive/Colab Notebooks/DLBasics2023_colab/submit/'

/content/drive/MyDrive/Colab Notebooks/DLBasics2023_colab/submit


## ライブラリのインポート

In [5]:
!pip install portalocker

import random
import numpy as np
import string
import re
from collections import Counter
from typing import List
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
import torchtext
from torchtext import data
from torchtext import datasets
from torchtext.vocab import vocab
from torchtext.data.utils import get_tokenizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score



import numpy as np
import os
import pandas as pd
from scipy.stats import chi2

# Seed PyTorch, NumPy and the standard-library RNG with one shared value so
# that repeated runs draw the same random numbers.
seed = 1234
for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
    seed_fn(seed)

Collecting portalocker
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Installing collected packages: portalocker
Successfully installed portalocker-2.7.0


## 単語の出現頻度把握

CSVからPyTorchのDatasetを作る方法
参考
- https://dreamer-uma.com/pytorch-dataset/
- https://discuss.pytorch.org/t/loading-a-csv-with-a-column-of-strings-and-a-column-of-integers/151080/4

In [10]:
import pandas as pd

# CSV file read by the word-counting cells below; the path is relative to the
# current working directory.
path_to_csv_file_wordcount = './data/labels/labels_sensitivity_evaluation.csv'


PyTorchのDataset形式でcsvデータを読み込み(headerは自動で認識)

In [11]:
class CustomDataset_trainvalid(torch.utils.data.Dataset):
    """Map-style dataset backed by a CSV file loaded into a pandas DataFrame.

    Each item is a ``(label, comment)`` tuple taken from fixed column
    positions of one CSV row. Columns are addressed by position, not by
    header name, so the CSV must have at least four columns.
    """

    # Zero-based positions of the columns returned by __getitem__.
    LABEL_COLUMN = 1
    COMMENT_COLUMN = 3

    def __init__(self, csv_file):
        """Read ``csv_file`` with ``pandas.read_csv`` (default options) and keep it in memory.

        Args:
            csv_file: Path (or any other input accepted by ``pandas.read_csv``)
                of the CSV file to load.
        """
        self.data = pd.read_csv(csv_file)

    def __len__(self):
        """Return the number of data rows in the loaded CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(label, comment)`` pair stored in row ``idx``.

        Args:
            idx: Zero-based row position.

        Returns:
            A tuple ``(label, comment)`` read from columns ``LABEL_COLUMN``
            and ``COMMENT_COLUMN`` of the row.

        Raises:
            IndexError: If ``idx`` is out of range. Plain ``for`` iteration
                over the dataset relies on this to terminate.
        """
        row = self.data.iloc[idx]
        # Use explicit positional access: integer keys in ``row[...]`` on a
        # Series indexed by column names only work through a deprecated
        # positional fallback in pandas.
        label = row.iloc[self.LABEL_COLUMN]
        comment = row.iloc[self.COMMENT_COLUMN]
        return (label, comment)

# Load the CSV into a dataset whose items are (label, comment) tuples.
trainvalid_dataset = CustomDataset_trainvalid(path_to_csv_file_wordcount)

単語数をカウント

In [14]:
# The "basic_english" tokenizer lower-cases the text and splits it into
# tokens; punctuation such as '.', ',' and "'" is kept as separate tokens.
# https://pytorch.org/text/stable/data_utils.html
tokenizer = get_tokenizer("basic_english")

# Tally how often each token occurs across all comments in the dataset.
counter = Counter()
for label, comment in trainvalid_dataset:
    tokens = tokenizer(comment)
    counter.update(tokens)

# Build the vocabulary from the counts, with the special tokens listed first.
special_tokens = ('<unk>', '<PAD>', '<BOS>', '<EOS>')
vocabulary = vocab(counter, min_freq=5, specials=special_tokens)

# With <unk> as the default index, any token that did not reach min_freq
# occurrences (and so is not in the vocabulary) is looked up as <unk>.
unk_index = vocabulary['<unk>']
vocabulary.set_default_index(unk_index)

word_num = len(vocabulary)

# Report the vocabulary size and list up to the first 200 entries.
print(f"単語種数: {word_num}")
first_entries = vocabulary.get_itos()[:200]
print(', '.join(first_entries))

単語種数: 113
<unk>, <PAD>, <BOS>, <EOS>, the, expression, is, scary, ., eyes, are, and, cute, cool, fur, it, ', s, a, bit, dark, ,, but, feels, like, at, being, flash, strong, looks, well, that, seems, to, be, something, in, sense, of, story, which, good, blurred, background, on, cat, makes, for, photo, well-resolved, also, face, hard, identify, color, light, even, see, black, i, don, t, pose, kitten, surrounding, text, posture, difficult, moment, natural, nice, as, not, from, make, soft, nicely, looking, pattern, because, beautiful, unnecessary, too, can, grass, with, feeling, aiming, has, green, staring, shadows, there, slightly, white, glowing, angle, feel, focus, first, glance, out, no, eye, an, gaze, targeting, resolved, lighting, sunlight, shining, eyecatch, catchlight
