### 作業目的: 熟練自定義collate_fn與sampler進行資料讀取

本此作業主要會使用[IMDB](http://ai.stanford.edu/~amaas/data/sentiment/)資料集利用Pytorch的Dataset與DataLoader進行
客製化資料讀取。
下載後的資料有分成train與test，因為這份作業目的在讀取資料，所以我們取用train部分來進行練習。
(請同學先行至IMDB下載資料)

### 載入套件

In [1]:
# Import torch and other required modules
import glob
import torch
import re
import nltk
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.datasets import load_svmlight_file
from nltk.corpus import stopwords

nltk.download('stopwords') #下載stopwords
nltk.download('punkt') #下載word_tokenize需要的corpus

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/liaochifen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/liaochifen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### 探索資料與資料前處理
這份作業我們使用test資料中的pos與neg


In [2]:
# 讀取字典，這份字典為review內所有出現的字詞
with open("data/aclImdb/imdb.vocab", "r") as f:
    vocab = f.readlines()
    vocab = [word.strip() for word in vocab]

# 以nltk stopwords移除贅字，過多的贅字無法提供有用的訊息，也可能影響模型的訓練
print(f"vocab length before removing stopwords: {len(vocab)}")

stop_words = set(stopwords.words('English'))
vocab = set(vocab).difference(stop_words)
print(f"vocab length after removing stopwords: {len(vocab)}")

# 將字典轉換成dictionary
word2idx = {word: i for i, word in enumerate(vocab)}

vocab length before removing stopwords: 89527
vocab length after removing stopwords: 89356


In [11]:
# 將資料打包成(x, y)配對，其中x為review的檔案路徑，y為正評(1)或負評(0)
# 這裡將x以檔案路徑代表的原因是讓同學練習不一次將資料全讀取進來，若電腦記憶體夠大(所有資料檔案沒有很大)
# 可以將資料全一次讀取，可以減少在訓練時I/O時間，增加訓練速度

### <your code> ###
train_pos_review_file_paths = glob.glob("data/aclImdb/train/pos/*")
train_neg_review_file_paths = glob.glob("data/aclImdb/train/neg/*")
train_review_file_paths = train_pos_review_file_paths + train_neg_review_file_paths
labels = [1]*len(train_pos_review_file_paths) + [0]*len(train_neg_review_file_paths)

review_pairs = [(file, label) for file, label in zip(train_review_file_paths, labels)]

print(review_pairs[:2])
print(f"Total reviews: {len(review_pairs)}")

[('data/aclImdb/train/pos/4715_9.txt', 1), ('data/aclImdb/train/pos/12390_8.txt', 1)]
Total reviews: 25000


### 建立Dataset, DataLoader, Sampler與Collate_fn讀取資料
這裡我們會需要兩個helper functions，其中一個是讀取資料與清洗資料的函式(load_review)，另外一個是生成詞向量函式
(generate_vec)，注意這裡我們用來產生詞向量的方法是單純將文字tokenize(為了使產生的文本長度不同，而不使用BoW)

In [101]:
def load_review(review_path):
    ###<your code>###
    with open(review_path, "r") as f:
        review = f.read().lower()
        review = re.sub(r"<.*?>", "", review)
        review = re.sub(r"[^a-zA-Z]", " ", review)
        
    token_with_stop_words = nltk.word_tokenize(review)
    token_no_stop_words = [token for token in token_with_stop_words if token not in stop_words]
 
    return token_no_stop_words
    

def generate_vec(review, vocab_dic):
    ### <your code> ###s
    vector = []
    for token in review:
        if vocab_dic.get(token):
            vector.append(vocab_dic[token])
    
    return vector

In [111]:
#建立客製化dataset

class dataset(Dataset):
    '''custom dataset to load reviews and labels
    Parameters
    ----------
    data_pairs: list
        directory of all review-label pairs
    vocab: list
        list of vocabularies
    '''
    ### <your code> ###
    def __init__(self, data_pairs, vocab):
        self.data_pairs = data_pairs
        self.vocab = vocab
        
    def __getitem__(self, idx):
        token = load_review(self.data_pairs[idx][0])
        vector = generate_vec(token, self.vocab)
        label = self.data_pairs[idx][1]
        return torch.tensor(vector), torch.tensor(label)
        
    def __len__(self):
        return len(self.data_pairs)

In [156]:
#建立客製化collate_fn，將長度不一的文本pad 0 變成相同長度
def collate_fn(batch):
    ### <your code> ###
    vectors, labels = zip(*batch)
    lengths = [len(vector) for vector in vectors]
    max_length = max(lengths)
    
    sentence_batch = []
    for vector in vectors:
        temp_pad = torch.zeros(max_length)
        temp_pad[:len(vector)] = vector
        sentence_batch.append(temp_pad.reshape(-1, max_length))
    
    return torch.cat(sentence_batch, dim=0), torch.tensor(labels), torch.tensor(lengths)

In [158]:
# 使用Pytorch的RandomSampler來進行indice讀取並建立dataloader
### <your code> ###
custom_dst = dataset(review_pairs, word2idx)
custom_dataloader = DataLoader(custom_dst, collate_fn=collate_fn, shuffle=True, batch_size=2)
next(iter(custom_dataloader))

(tensor([[83798., 11514.,  8215., 19553., 85709., 30397., 77021., 62458., 76079.,
           7428., 30931., 73342., 52876., 10574., 81284., 16656., 20549., 68493.,
          47856., 54191., 81298., 54261.,  1830., 35745.,  4053., 35039., 52863.,
           1238., 30373., 54765., 58744., 71795., 61041., 77061., 10565., 10962.,
          12734., 40595., 28353.,   403., 30188., 67063., 60463., 69402., 44841.,
          59832., 11723., 57319., 66891., 30108., 45878., 83779., 75179., 40791.,
          12734., 79771., 20749.,  7447., 30108., 77029., 77295., 76416.,  6578.,
          65321.,  1292., 16554., 76572., 30931., 87852., 57613., 21869.,  1830.],
         [12734., 76539., 38241., 11723., 64633., 56014., 76807., 76572., 10024.,
          35039., 26598., 46811., 46620., 20549., 35039., 82116., 55366.,  5021.,
          68690., 66385., 65750., 78280., 37962., 49282., 62858., 67269., 46620.,
          24084., 74586., 13058., 12808., 76572., 76358., 52325., 80358., 58339.,
          79478