In [48]:
from fastNLP import DataSet
from copy import deepcopy

# Build a DataSet directly from a dict: every key becomes a field (column) and
# every value is a list with one entry per instance (row). The key names are
# arbitrary, but all value lists must share the same length (3 here).
data = {
    'raw_words': ["This is the first instance .", "Second instance .", "Third instance ."],
    'words': [['this', 'is', 'the', 'first', 'instance', '.'], ['Second', 'instance', '.'], ['Third', 'instance', '.']],
    'seq_len': [6, 3, 3],
}
dataset = DataSet(data)

print(dataset)
print(type(dataset))  # <class 'fastNLP.core.dataset.DataSet'>
print(len(dataset))  # 3

# Demonstrate delete_instance() on a deep copy so that `dataset` still holds
# all three instances for the cells that follow (the next cell iterates over
# it and shows three rows). Deleting from `dataset` itself would make the
# notebook's later output unreproducible on "Restart & Run All".
first_only = deepcopy(dataset)
# Walk the indices from the end (2, then 1) so earlier removals do not shift
# the positions still to be deleted; index 0 is kept.
for i in range(len(first_only) - 1, 0, -1):
    first_only.delete_instance(i)
print(first_only)

+------------------------------+------------------------------+---------+
| raw_words                    | words                        | seq_len |
+------------------------------+------------------------------+---------+
| This is the first instanc... | ['this', 'is', 'the', 'fi... | 6       |
| Second instance .            | ['Second', 'instance', '.... | 3       |
| Third instance .             | ['Third', 'instance', '.'... | 3       |
+------------------------------+------------------------------+---------+
<class 'fastNLP.core.dataset.DataSet'>
3
+------------------------------+------------------------------+---------+
| raw_words                    | words                        | seq_len |
+------------------------------+------------------------------+---------+
| This is the first instanc... | ['this', 'is', 'the', 'fi... | 6       |
+------------------------------+------------------------------+---------+


In [8]:
# Iterating over a DataSet yields one instance at a time; each instance
# prints as its own single-row table.
for row in dataset:
    print(row)

+------------------------------+------------------------------+---------+
| raw_words                    | words                        | seq_len |
+------------------------------+------------------------------+---------+
| This is the first instanc... | ['this', 'is', 'the', 'fi... | 6       |
+------------------------------+------------------------------+---------+
+-------------------+------------------------------+---------+
| raw_words         | words                        | seq_len |
+-------------------+------------------------------+---------+
| Second instance . | ['Second', 'instance', '.... | 3       |
+-------------------+------------------------------+---------+
+------------------+------------------------------+---------+
| raw_words        | words                        | seq_len |
+------------------+------------------------------+---------+
| Third instance . | ['Third', 'instance', '.'... | 3       |
+------------------+------------------------------+---------+


In [17]:
from fastNLP import DataSet
# Ten instances with two fields: 'a' runs from -5 to 4 and 'c' is all zeros.
dataset = DataSet(
    {
        'a': range(-5, 5),
        'c': [0] * 10,
    }
)
print(dataset)

+----+---+
| a  | c |
+----+---+
| -5 | 0 |
| -4 | 0 |
| -3 | 0 |
| -2 | 0 |
| -1 | 0 |
| 0  | 0 |
| 1  | 0 |
| 2  | 0 |
| 3  | 0 |
| 4  | 0 |
+----+---+


In [36]:
'''使用delete方法删除instance 和 field
'''
from fastNLP import DataSet
# Two fields: the raw sentences and a 'len' value for each sentence.
dataset = DataSet(
    {
        'raw_words': ["This is the first instance .", "Second instance .", "Third instance ."],
        'len': [5, 2, 2],
    }
)
print(dataset)

# Remove the whole 'len' field (column); only 'raw_words' remains.
dataset.delete_field("len")
print(dataset)

# Remove the second instance (row); indices start at 0.
dataset.delete_instance(1)
print(dataset)

+------------------------------+-----+
| raw_words                    | len |
+------------------------------+-----+
| This is the first instance . | 5   |
| Second instance .            | 2   |
| Third instance .             | 2   |
+------------------------------+-----+
+------------------------------+
| raw_words                    |
+------------------------------+
| This is the first instance . |
| Second instance .            |
| Third instance .             |
+------------------------------+
+------------------------------+
| raw_words                    |
+------------------------------+
| This is the first instance . |
| Third instance .             |
+------------------------------+


In [38]:
'''使用apply() 或 apply_field()方法处理文本
'''
from fastNLP import DataSet
# Start from a DataSet that only holds the raw sentences.
data = {
    'raw_words': ["This is the first instance .", "Second instance .", "Third instance ."],
}
dataset = DataSet(data)
print(dataset)

# DataSet.apply() passes each instance to the function; the returned values
# are stored in a new field named by `new_field_name`. The dataset is
# modified in place. Here every sentence is split into its words.
dataset.apply(lambda inst: inst['raw_words'].split(), new_field_name='words')
print(dataset)

# DataSet.apply_field() does the same, but the function receives only the
# value of `field_name` instead of the whole instance. Also in place.
dataset.apply_field(lambda text: text.split(), field_name='raw_words', new_field_name='voca')
print(dataset)

# Besides lambdas, a regular named function can be passed in as well.
def get_words(instance):
    """Return the whitespace-separated tokens of the instance's 'raw_words' value."""
    return instance['raw_words'].split()
# Pass the named function to apply() the same way as the lambda earlier in
# this cell. The target name 'words' was already used by that lambda call, so
# this presumably replaces that field with identical tokens (confirm the
# overwrite behavior against the fastNLP documentation).
dataset.apply(get_words, new_field_name='words')
print(dataset)

+------------------------------+
| raw_words                    |
+------------------------------+
| This is the first instance . |
| Second instance .            |
| Third instance .             |
+------------------------------+
+------------------------------+------------------------------------------+
| raw_words                    | words                                    |
+------------------------------+------------------------------------------+
| This is the first instance . | ['This', 'is', 'the', 'first', 'insta... |
| Second instance .            | ['Second', 'instance', '.']              |
| Third instance .             | ['Third', 'instance', '.']               |
+------------------------------+------------------------------------------+
+------------------------------+------------------------------+------------------------------+
| raw_words                    | words                        | voca                         |
+------------------------------+---------------

In [42]:
'''使用apply_field()方法
'''
from fastNLP import DataSet
# Sentences plus a numeric 'len' field to transform.
data = {
    'raw_words': ["This is the first instance .", "Second instance .", "Third instance ."],
    'len': [5, 3, 3],
}
dataset = DataSet(data)
print(dataset)

# Add 1 to every value of 'len' and store the results in a new field
# 'after_len'; the original 'len' field is left as it was.
dataset.apply_field(lambda n: n + 1, field_name='len', new_field_name='after_len')
print(dataset)


+------------------------------+-----+
| raw_words                    | len |
+------------------------------+-----+
| This is the first instance . | 5   |
| Second instance .            | 3   |
| Third instance .             | 3   |
+------------------------------+-----+
+------------------------------+-----+-----------+
| raw_words                    | len | after_len |
+------------------------------+-----+-----------+
| This is the first instanc... | 5   | 6         |
| Second instance .            | 3   | 4         |
| Third instance .             | 3   | 4         |
+------------------------------+-----+-----------+


In [33]:
'''获取某个field 的操作
'''
from fastNLP import DataSet
# A single-field DataSet to inspect.
data = {
    'raw_words': ["This is the first instance .", "Second instance .", "Third instance ."],
}
dataset = DataSet(data)
print(dataset)

# List the names of all fields (i.e. the column names) in the dataset.
field_names = dataset.get_field_names()
print(field_names)

# Fetch every value stored under one field; the result is iterable, so the
# values can be printed one by one.
raw_words_field = dataset.get_field('raw_words')
for value in raw_words_field:
    print(value)

+------------------------------+
| raw_words                    |
+------------------------------+
| This is the first instance . |
| Second instance .            |
| Third instance .             |
+------------------------------+
['raw_words']
This is the first instance .
Second instance .
Third instance .

