In [7]:
import pandas as pd 
# Load the Kaggle IMDb movie-ratings sentiment CSV (movie.csv).
# Source: https://www.kaggle.com/datasets/yasserh/imdb-movie-ratings-sentiment-analysis
# `basepath` is the folder holding movie.csv; the empty string means the
# current working directory.
basepath = ""
movie_path = basepath + "movie.csv"
movie_df = pd.read_csv(movie_path)
# Preview the first rows to check the loaded columns.
movie_df.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [10]:
# Run this cell after movie.csv and the aclImdb folder have been downloaded.
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import string
import collections
import matplotlib.pyplot as plt

def read_imdb(data_dir, train):
    """Read the IMDb review texts and their sentiment labels from disk.

    The expected layout is ``<data_dir>/<train|test>/<pos|neg>/<file>``,
    with one UTF-8 encoded review per file.

    Args:
        data_dir: Root folder of the extracted dataset.
        train: If truthy, read the ``train`` split; otherwise the ``test`` split.

    Returns:
        A ``(data, labels)`` tuple of two parallel lists: the review strings
        (with ``'\\n'`` characters removed) and integer labels (1 for ``pos``,
        0 for ``neg``). All ``pos`` reviews come first, then all ``neg``
        reviews; inside each folder files are read in sorted filename order.
    """
    # Imported locally because the notebook's import cell does not import
    # `os`; without it this function raises NameError on a fresh kernel.
    import os

    split = 'train' if train else 'test'
    data, labels = [], []
    for label in ('pos', 'neg'):
        folder_name = os.path.join(data_dir, split, label)
        # Progress output: shows which folder is being read.
        print(folder_name)
        # os.listdir returns entries in arbitrary order; sorting makes the
        # sample order reproducible across runs and machines.
        for file in sorted(os.listdir(folder_name)):
            # Binary read + explicit decode keeps the result independent of
            # the platform's default text encoding.
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '')
            data.append(review)
            labels.append(1 if label == 'pos' else 0)
    return data, labels

# Use a class so the vocabulary tables are computed once and reused.
class Vocab:
    """Vocabulary mapping tokens to integer indices and back.

    Kept tokens are stored in sorted order and occupy indices ``0..n-1``;
    the unknown-token marker ``'<unk>'`` always takes the last index.
    """

    def __init__(self, tokens=None, min_freq=0):
        """Build the vocabulary from a corpus.

        Args:
            tokens: A flat list of tokens, or a list of token lists (one per
                sentence), which is flattened. ``None`` or an empty list
                yields a vocabulary that contains only ``'<unk>'``.
            min_freq: Tokens occurring fewer than ``min_freq`` times are left
                out of the vocabulary and therefore resolve to ``'<unk>'``.
        """
        # `None` replaces the former mutable `[]` default, which is shared
        # between calls.
        if tokens is None:
            tokens = []
        # Flatten a list of token lists into a single token list.
        if tokens and isinstance(tokens[0], list):
            tokens = [token for line in tokens for token in line]
        counter = collections.Counter(tokens)
        # (token, count) pairs ordered from most to least frequent.
        self.freqs = sorted(counter.items(), key=lambda x: x[1],
                            reverse=True)
        # index -> token. A literal '<unk>' in the corpus is skipped here so
        # the marker appears exactly once, at the end, instead of twice.
        kept = sorted(token for token, freq in self.freqs
                      if freq >= min_freq and token != '<unk>')
        self.index_to_token = kept + ['<unk>']
        # token -> index, the inverse of index_to_token.
        self.token_to_index = {token: idx
                               for idx, token in enumerate(self.index_to_token)}

    def __len__(self):
        """Return the number of entries, including ``'<unk>'``."""
        return len(self.index_to_token)

    def __getitem__(self, tokens):
        """Map a token, or a (possibly nested) list/tuple of tokens, to indices.

        Tokens that are not in the vocabulary map to the index of ``'<unk>'``.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_index.get(tokens, self.token_to_index['<unk>'])
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index, or a list/tuple of indices, back to token(s)."""
        # Accept tuples as well as lists, mirroring __getitem__.
        if isinstance(indices, (list, tuple)):
            return [self.index_to_token[int(index)] for index in indices]
        return self.index_to_token[indices]

def token_nize(sentence_list):
    """Split each sentence into word tokens with surrounding punctuation removed.

    Args:
        sentence_list: Iterable of sentence strings.

    Returns:
        A list holding one token list per input sentence. Leading and
        trailing ``string.punctuation`` characters are stripped from every
        word; punctuation inside a word (e.g. the apostrophe in ``it's``) is
        kept. Empty tokens are never produced.
    """
    tokenized = []
    for sentence in sentence_list:
        # split() without an argument splits on any run of whitespace, so
        # repeated spaces, tabs and newlines do not create empty strings.
        stripped = (word.strip(string.punctuation) for word in sentence.split())
        # Words made only of punctuation strip down to '' and are dropped so
        # they do not end up in the vocabulary.
        tokenized.append([word for word in stripped if word])
    return tokenized

def truncate_pad(line, num_steps, padding_token):
    """Return `line` cut down or right-padded to exactly `num_steps` items.

    Sequences longer than `num_steps` keep only their first `num_steps`
    items; shorter ones get `padding_token` appended until they reach
    `num_steps`.
    """
    shortfall = num_steps - len(line)
    if shortfall < 0:
        # Too long: keep the leading part only.
        return line[:num_steps]
    # Short enough: append as many padding tokens as are missing.
    return line + [padding_token] * shortfall

def load_array(data_arrays, batch_size, is_train=True):
    """Wrap tensors in a PyTorch DataLoader.

    `data_arrays` is unpacked into a TensorDataset; the resulting loader
    yields batches of `batch_size` and shuffles only when `is_train` is true.
    """
    data_utils = torch.utils.data
    tensor_dataset = data_utils.TensorDataset(*data_arrays)
    loader = data_utils.DataLoader(
        tensor_dataset, batch_size=batch_size, shuffle=is_train)
    return loader

def load_data(batch_size, num_steps=500, imdb_data_dir="aclImdb", movie_data=None):
    """Build the train/test data iterators and the vocabulary.

    The aclImdb train/test folders are combined with a 50/50 split of the
    movie-review frame. Sentences are tokenized, mapped to vocabulary indices
    and truncated/padded to `num_steps` tokens.

    Args:
        batch_size: Batch size of both returned iterators.
        num_steps: Fixed length every review is truncated or padded to.
        imdb_data_dir: Root folder passed to `read_imdb`.
        movie_data: Two-column DataFrame (text, label). When ``None`` the
            notebook-level `movie_df` is used, as in the original version.

    Returns:
        ``(train_iter, test_iter, vocab)``: a shuffling DataLoader over the
        training data, a non-shuffling DataLoader over the test data, and
        the `Vocab` built from the training tokens only (``min_freq=5``).
    """
    if movie_data is None:
        # Backward-compatible default: fall back to the frame loaded earlier
        # in the notebook namespace.
        movie_data = movie_df

    def imdb_frame(train):
        # Build the frame column-wise; this keeps the label column numeric
        # instead of creating an all-object frame and transposing it.
        sentences, labels = read_imdb(imdb_data_dir, train)
        return pd.DataFrame({'sentence': sentences, 'label': labels})

    imdb_train = imdb_frame(True)
    imdb_test = imdb_frame(False)

    # NOTE(review): if movie_data shares reviews with aclImdb, the train and
    # test sets may overlap — verify against the data sources.
    movie_train, movie_test = train_test_split(movie_data, test_size=0.5, random_state=0)
    movie_train.columns = ['sentence','label']
    movie_test.columns = ['sentence','label']

    # ignore_index avoids duplicate row labels in the combined frames.
    train_data = pd.concat([movie_train, imdb_train], ignore_index=True)
    test_data = pd.concat([movie_test, imdb_test], ignore_index=True)
    train_data['label'] = train_data['label'].astype(int)
    test_data['label'] = test_data['label'].astype(int)

    train_tokens = token_nize(train_data["sentence"])
    test_tokens = token_nize(test_data["sentence"])
    vocab = Vocab(train_tokens, min_freq=5)

    # Looked up once instead of once per review.
    # NOTE(review): the Vocab class in this notebook adds no '<pad>' entry,
    # so this lookup falls back to the '<unk>' index and padding is
    # indistinguishable from unknown words.
    pad_index = vocab['<pad>']

    def encode(token_lists):
        # Map tokens to indices and force every review to num_steps entries.
        return torch.tensor([truncate_pad(vocab[line], num_steps, pad_index)
                             for line in token_lists])

    train_features = encode(train_tokens)
    test_features = encode(test_tokens)
    train_iter = load_array(
        (train_features, torch.tensor(train_data["label"].values)), batch_size)
    test_iter = load_array(
        (test_features, torch.tensor(test_data["label"].values)), batch_size,
        is_train=False)
    return train_iter, test_iter, vocab

In [11]:
from torch import nn
# Mini-batch size used for both the training and the test iterator.
batch_size = 64
# Build the iterators and the vocabulary; load_data prints every aclImdb
# folder it reads.
train_iter, test_iter, vocab = load_data(batch_size)

aclImdb/train/pos
aclImdb/train/neg
aclImdb/test/pos
aclImdb/test/neg


In [12]:
# Sanity check that the three objects exist; this prints their default reprs.
print(train_iter, test_iter, vocab)

<torch.utils.data.dataloader.DataLoader object at 0x7f7b695a2f70> <torch.utils.data.dataloader.DataLoader object at 0x7f7b695a2a30> <__main__.Vocab object at 0x7f7b6a1af3d0>
