In [24]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
cd /content/drive/My Drive/Colab Notebooks/NLP

/content/drive/My Drive/Colab Notebooks/NLP


In [26]:
from torchtext import data

In [4]:
from torchtext import data


class DataLoader(object):
    '''
    Data loader class to load text file using torchtext library.
    '''

    def __init__(
        self, train_fn,
        batch_size=64,
        valid_ratio=.2,
        device=-1,
        max_vocab=999999,
        min_freq=1,
        use_eos=False,
        shuffle=True
    ):
        '''
        DataLoader initialization.
        :param train_fn: Train-set filename
        :param batch_size: Batchify data fot certain batch size.
        :param device: Device-id to load data (-1 for CPU)
        :param max_vocab: Maximum vocabulary size
        :param min_freq: Minimum frequency for loaded word.
        :param use_eos: If it is True, put <EOS> after every end of sentence.
        :param shuffle: If it is True, random shuffle the input data.
        '''
        #super().__init__()

        # Define field of the input file.
        # The input file consists of two fields.
        self.label = data.Field(
            sequential=False,
            use_vocab=True,
            unk_token=None
        )
        self.text = data.Field(
            use_vocab=True,
            batch_first=True,
            include_lengths=False,
            eos_token='<EOS>' if use_eos else None
        )

        # Those defined two columns will be delimited by TAB.
        # Thus, we use TabularDataset to load two columns in the input file.
        # We would have two separate input file: train_fn, valid_fn
        # Files consist of two columns: label field and text field.
        train, valid = data.TabularDataset(
            path=train_fn,
            format='tsv', 
            fields=[
                ('label', self.label),
                ('text', self.text),
            ],
        ).split(split_ratio=(1 - valid_ratio))

        # Those loaded dataset would be feeded into each iterator:
        # train iterator and valid iterator.
        # We sort input sentences by length, to group similar lengths.
        self.train_loader, self.valid_loader = data.BucketIterator.splits(
            (train, valid),
            batch_size=batch_size,
            device='cuda:%d' % device if device >= 0 else 'cpu',
            shuffle=shuffle,
            sort_key=lambda x: len(x.text),
            sort_within_batch=True,
        )

        # At last, we make a vocabulary for label and text field.
        # It is making mapping table between words and indice.
        self.label.build_vocab(train)
        self.text.build_vocab(train, max_size=max_vocab, min_freq=min_freq)

In [27]:
max_length = 256 # sms(가사) 최대 길이

1. 데이터 불러오기

In [28]:
import pandas as pd
df = pd.read_csv('sms.tsv', sep='\t',)
df.columns = ['label', 'sms']
print(df.columns)
print(df.shape)

Index(['label', 'sms'], dtype='object')
(5574, 2)


In [29]:
df.head()

Unnamed: 0,label,sms
0,ham,Ok lar... Joking wif u oni...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,spam,"%^^×？×^×&#****,>,;//×&>>*(*^%=÷#~^&,****)"
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


데이터 전처리

In [30]:
# 클래스 파악
classes = sorted(set(df['label']))
class_to_idx = {}

for i, c in enumerate(classes): # 모든 클래스에 대해
  class_to_idx.update({c:i})

nclass = len(classes)

print("# of classes: %d" %nclass)
print(classes)
print(class_to_idx)

# of classes: 2
['ham', 'spam']
{'ham': 0, 'spam': 1}


2. 새로운 DataFrame

1) 'label,sms'만 남기기

2) 최대 텍스트 길이 만큼 자르기 # pandas.Series.str.slice


*   '성별,가사'만 남기려면?


In [31]:
new_df = pd.DataFrame({'label':df['label'],
                       'sms':df['sms'].str.slice( # 최대 가사 텍스트 만큼 자르기
                                                 start=0, stop=max_length)
                       })

3)중복 제거

In [32]:
new_df = pd.DataFrame(new_df.drop_duplicates())
len(new_df)

5171

4) 셔플

In [33]:
df_shuffled=new_df.sample(frac=1).reset_index(drop=True)
df_shuffled.head()

Unnamed: 0,label,sms
0,ham,Do you want bold 2 or bb torch
1,ham,Yes:)from last week itself i'm taking live call.
2,ham,Keep my payasam there if rinu brings
3,ham,How much did ur hdd casing cost.
4,ham,Take care and sleep well.you need to learn to ...


5) train, test 나누기

In [34]:
# train: test = 9:1
# train: test = 540: 60 -> train:valid:test = 432:108:60
train_ratio = 0.9

# train dataset
s, e = 0, int(df_shuffled.shape[0] * train_ratio) # # of rows
df_train = pd.DataFrame({'label':df_shuffled['label'][s:e],
                         'sms':df_shuffled['sms'][s:e]})
print("index for train: %d~%d" %(s,e))

# test dataset
s, e = e, e+int(df_shuffled.shape[0] * (1.0 - train_ratio))
print("index for test: %d~%d" %(s,e))
df_test = pd.DataFrame({'label':df_shuffled['label'][s:e],
                        'sms':df_shuffled['sms'][s:e]})

index for train: 0~4653
index for test: 4653~5170


In [35]:
# columns 수 확인
print(df_train.shape)
print(df_test.shape)

(4653, 2)
(517, 2)


4)저장

In [36]:
# new_df.columns : ['성별', '가사']
df_train.to_csv('./sms.maxlen.uniq.shuf.train.tsv',
                header=False, index=False, sep='\t')
df_test.to_csv('./sms.maxlen.uniq.shuf.test.tsv',
               header=False, index=False, sep='\t')

In [37]:
import torch
print(torch.__version__)

1.9.0+cu102


In [21]:
!pip install torchtext==0.4.0

Collecting torchtext==0.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/43/94/929d6bd236a4fb5c435982a7eb9730b78dcd8659acf328fd2ef9de85f483/torchtext-0.4.0-py3-none-any.whl (53kB)
[K     |██████▏                         | 10kB 15.5MB/s eta 0:00:01[K     |████████████▍                   | 20kB 13.2MB/s eta 0:00:01[K     |██████████████████▌             | 30kB 9.4MB/s eta 0:00:01[K     |████████████████████████▊       | 40kB 8.2MB/s eta 0:00:01[K     |██████████████████████████████▉ | 51kB 4.3MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 3.1MB/s 
Installing collected packages: torchtext
  Found existing installation: torchtext 0.10.0
    Uninstalling torchtext-0.10.0:
      Successfully uninstalled torchtext-0.10.0
Successfully installed torchtext-0.4.0


In [38]:
import torchtext
import numpy as np

3. 데이터 로드 함수

학습시킬 때 batch_size 단위로 끊어서 로드하기 위함

In [39]:
from data_loader import DataLoader