In [63]:
from datasets import load_dataset

In [64]:
#!unzip ../data/drugsCom_raw.zip

## 数据的加载

In [65]:
!tree

[01;34m.[0m
├── 0.tokenizer.ipynb
├── 1.datasets.ipynb
├── 2.transformer.ipynb
├── [01;34mdata[0m
│   ├── drugsComTest_raw.tsv
│   └── drugsComTrain_raw.tsv
└── [01;34mimg[0m

2 directories, 5 files


In [66]:
# Map each split name to the TSV file that holds it.
data_file = {
    "train": "./data/drugsComTrain_raw.tsv",
    "test": "./data/drugsComTest_raw.tsv",
}

In [67]:
data_file

{'train': './data/drugsComTrain_raw.tsv',
 'test': './data/drugsComTest_raw.tsv'}

In [68]:
# Load both splits with the generic "csv" builder; the files are tab-separated, hence delimiter="\t".
# NOTE(review): the variable is named "squad_it" but data_file points at the drugsCom TSVs — consider renaming.
squad_it_dataset = load_dataset("csv", data_files=data_file, delimiter="\t")

Using custom data configuration default-ec8a77444714a9ed


Downloading and preparing dataset csv/default to /home/gavin/.cache/huggingface/datasets/csv/default-ec8a77444714a9ed/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)
  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)


Dataset csv downloaded and prepared to /home/gavin/.cache/huggingface/datasets/csv/default-ec8a77444714a9ed/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [69]:
# The loader returns a DatasetDict with one Dataset per split
squad_it_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [70]:
# Pick the training split out of the DatasetDict
train_data = squad_it_dataset["train"]
train_data

Dataset({
    features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
    num_rows: 161297
})

## 数据预处理

In [71]:
# Shuffle with a fixed seed, then keep the first 1000 rows as a random sample
train_sample = train_data.shuffle(seed=100).select(range(1000))

In [72]:
train_sample[:3]

{'Unnamed: 0': [164224, 22158, 101891],
 'drugName': ['Xiidra', 'Baclofen', 'Terbinafine'],
 'condition': ['Dry Eye Disease', 'Muscle Spasm', 'Onychomycosis, Toenail'],
 'review': ['"I have been experiencing the dry eye &amp; red eye problems for about 2yrs. I finally this year decided to go to a real eye specialist. They called in a prescription for Xiidra using a 90 day free coupon (happy) without using my health insurance as they would not cover it. I&#039;m glad to try something else besides clear eyes and or artificial drops. I use to where contacts for about 15 years and had to stop because of the dry eye problems.  Anyway I have been using Xiidra now for about 1 month and I must say the results is good. The only side effects I&#039;m having is the bad bitter taste, very light headaches periodically &amp; very mild cough here &amp; there, nothing major. I go back to see my eye specialist in July for a follow up."',
  '"I have use Baclofen for some time now but have only in the la

In [73]:
# Count the distinct values in the "Unnamed: 0" column
len(train_sample.unique("Unnamed: 0"))

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

1000

In [74]:
# Rename the "Unnamed: 0" column to "uuid" in every split
# NOTE(review): re-running this cell presumably fails once the column has been renamed — confirm
squad_it_dataset = squad_it_dataset.rename_column("Unnamed: 0", "uuid")

In [75]:
squad_it_dataset

DatasetDict({
    train: Dataset({
        features: ['uuid', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['uuid', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [79]:
def lowcase_conditation(example: dict):
    """Lower-case the ``condition`` field of a single example.

    Args:
        example: One dataset row as a mapping; must contain the key
            "condition", whose value is a string or None.

    Returns:
        A dict with the single key "condition" holding the lower-cased
        text, or None when the row has no condition.
    """
    condition = example["condition"]
    # Some rows carry no condition (None); pass those through instead of
    # raising AttributeError on ``None.lower()``.
    if condition is None:
        return {"condition": None}
    return {"condition": condition.lower()}

In [80]:
# Drop rows whose condition is missing (None)
train_sample = train_sample.filter(lambda x: x["condition"] is not None)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [81]:
# Rename the sample's "Unnamed: 0" column to "uuid"
train_sample = train_sample.rename_column("Unnamed: 0", "uuid")

In [82]:
# Map over the rows; the function returns one field ("condition"), which overwrites that column
train_sample = train_sample.map(lowcase_conditation)

  0%|          | 0/992 [00:00<?, ?ex/s]

In [28]:
train_sample[1]

{'uuid': 22158,
 'drugName': 'Baclofen',
 'condition': 'muscle spasm',
 'review': '"I have use Baclofen for some time now but have only in the last 2 to 3 months started to have side effects  such a hallucinations, and seeing and hearing things that are not there.  I also take Requip which also give almost the same side effects.  I also take coumidin(Warfarin)"',
 'rating': 5.0,
 'date': 'May 6, 2016',
 'usefulCount': 35}

In [29]:
def compute_review_length(example: dict):
    """Count the whitespace-separated words in a review.

    Args:
        example: One dataset row; its "review" value must be a string.

    Returns:
        A dict with the single key "review_length" mapped to the word count.
    """
    words = example["review"].split()
    word_count = len(words)
    return {"review_length": word_count}

In [83]:
# Create a new column:
# a key returned by the function that is not an existing column is added as a new one
train_sample = train_sample.map(compute_review_length)

  0%|          | 0/992 [00:00<?, ?ex/s]

In [85]:
# The row now carries the extra review_length field
train_sample[0]

{'uuid': 164224,
 'drugName': 'Xiidra',
 'condition': 'dry eye disease',
 'review': '"I have been experiencing the dry eye &amp; red eye problems for about 2yrs. I finally this year decided to go to a real eye specialist. They called in a prescription for Xiidra using a 90 day free coupon (happy) without using my health insurance as they would not cover it. I&#039;m glad to try something else besides clear eyes and or artificial drops. I use to where contacts for about 15 years and had to stop because of the dry eye problems.  Anyway I have been using Xiidra now for about 1 month and I must say the results is good. The only side effects I&#039;m having is the bad bitter taste, very light headaches periodically &amp; very mild cough here &amp; there, nothing major. I go back to see my eye specialist in July for a follow up."',
 'rating': 8.0,
 'date': 'May 11, 2017',
 'usefulCount': 6,
 'review_length': 140}

In [89]:
# Sort by review length (shortest first) and look at the first four rows
train_sample = train_sample.sort("review_length")
train_sample[0: 4]

{'uuid': [156055, 80431, 82710, 181107],
 'drugName': ['Cialis', 'Aleve', 'Paxil', 'Dramamine'],
 'condition': ['erectile dysfunction',
  'pain',
  'panic disorde',
  'motion sickness'],
 'review': ['"Wonderful"',
  '"Works well."',
  '"Lifesaver drug!!!"',
  '"Chapped hands"'],
 'rating': [10.0, 10.0, 10.0, 10.0],
 'date': ['February 2, 2014',
  'December 3, 2008',
  'July 2, 2017',
  'January 6, 2014'],
 'usefulCount': [15, 10, 5, 12],
 'review_length': [1, 2, 2, 2]}

In [98]:
# map操作

In [115]:
import html

In [116]:
# html.unescape turns HTML character references such as &#039; back into plain characters
text = "I&#039;m a transformer called BERT"
html.unescape(text)

"I'm a transformer called BERT"

In [117]:
squad_it_dataset

DatasetDict({
    train: Dataset({
        features: ['uuid', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['uuid', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [118]:
# Unescape the reviews one example at a time
# NOTE(review): the result is only displayed, not assigned, so squad_it_dataset keeps the escaped text
squad_it_dataset.map(lambda x: {"review": html.unescape(x["review"])})



DatasetDict({
    train: Dataset({
        features: ['uuid', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['uuid', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [121]:
# Batched processing (the batch_size can also be set explicitly).
# With batched=True the function receives a dict of lists, so x["review"] is a
# list of strings. html.unescape() works on a single string; called on the list
# itself it leaves the reviews escaped, so each review is unescaped individually.
squad_it_dataset.map(lambda x: {"review": [html.unescape(review) for review in x["review"]]}
                     , batched=True)



DatasetDict({
    train: Dataset({
        features: ['uuid', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['uuid', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [122]:
from transformers import AutoTokenizer

In [142]:
# Load the bert-base-cased tokenizer; use_fast=False selects the non-fast implementation
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased",use_fast=False)

In [143]:
# 批量分词操作

In [149]:
def tokenize_funcation(examples):
    """Tokenize the "review" entries of a batch with the notebook-level tokenizer.

    Args:
        examples: A batch from ``Dataset.map``; ``examples["review"]`` is
            handed to the tokenizer as-is.

    Returns:
        Whatever the tokenizer returns for those reviews, with truncation enabled.
    """
    reviews = examples["review"]
    encoded = tokenizer(reviews, truncation=True)
    return encoded

In [150]:
# Tokenize every split in batches, spread across 8 worker processes
tokenized_dataset = squad_it_dataset.map(tokenize_funcation, batched=True, num_proc=8)

             

#0:   0%|          | 0/21 [00:00<?, ?ba/s]

#1:   0%|          | 0/21 [00:00<?, ?ba/s]

#2:   0%|          | 0/21 [00:00<?, ?ba/s]

  

#3:   0%|          | 0/21 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/21 [00:00<?, ?ba/s]

#5:   0%|          | 0/21 [00:00<?, ?ba/s]

#7:   0%|          | 0/21 [00:00<?, ?ba/s]

#6:   0%|          | 0/21 [00:00<?, ?ba/s]

            

#7:   0%|          | 0/7 [00:00<?, ?ba/s]

  

#3:   0%|          | 0/7 [00:00<?, ?ba/s]

  

#2:   0%|          | 0/7 [00:00<?, ?ba/s]

#6:   0%|          | 0/7 [00:00<?, ?ba/s]

#5:   0%|          | 0/7 [00:00<?, ?ba/s]

#0:   0%|          | 0/7 [00:00<?, ?ba/s]

#4:   0%|          | 0/7 [00:00<?, ?ba/s]

#1:   0%|          | 0/7 [00:00<?, ?ba/s]

In [None]:
tokenized_dataset["train"][0]

In [None]:
# Try the tokenizer on a single review.
# num_proc is a Dataset.map() argument, not a tokenizer argument, so it is not passed here.
tokenizer('"It has no side effect,\
            I take it in combination of Bystolic 5 Mg and Fish Oil"'
        , truncation=True, return_length=True, return_tensors="pt"
         )

## TODO
1. 复习字典方法，包括默认字典