In [1]:
from datasets import load_from_disk, concatenate_datasets, Dataset, DatasetDict

# Load the textile patent documents dataset from its on-disk directory.
dataset = load_from_disk('../Textiles_patents')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Report the number of examples in each split of the loaded dataset.
# A single loop replaces three copy-pasted print statements; the printed
# text is unchanged ("Train ...", "Validation ...", "Test ...").
for split_name in ("train", "validation", "test"):
    print(f"{split_name.capitalize()} dataset size: {len(dataset[split_name])}")

Train dataset size: 10164
Validation dataset size: 565
Test dataset size: 565


In [3]:
# Display the loaded DatasetDict (splits, feature names and row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['description', 'abstract'],
        num_rows: 10164
    })
    validation: Dataset({
        features: ['description', 'abstract'],
        num_rows: 565
    })
    test: Dataset({
        features: ['description', 'abstract'],
        num_rows: 565
    })
})

In [4]:
# Merge the three existing splits (train, then test, then validation) back into
# one Dataset so that it can be re-split with different proportions.
splits_to_merge = [dataset[split_name] for split_name in ('train', 'test', 'validation')]
full_dataset = concatenate_datasets(splits_to_merge)

In [5]:
# Display the merged Dataset to confirm its features and total row count.
full_dataset

Dataset({
    features: ['description', 'abstract'],
    num_rows: 11294
})

In [6]:
import pandas as pd

# Convert the merged Dataset into a pandas DataFrame.
df = full_dataset.to_pandas()

In [7]:
# Preview the first rows of the merged DataFrame.
df.head()

Unnamed: 0,description,abstract
0,BACKGROUND OF THE INVENTION \n This invent...,A method of forming fiber mixtures from differ...
1,CROSS-REFERENCE TO RELATED APPLICATION \n ...,The fibers of recycled paper are cleaned by ag...
2,This is a division of application Ser. No. 922...,"Non-woven, bias laid fabrics, where the variou..."
3,FIELD OF THE INVENTION \n The present inve...,Multiple-color air jet looms successively inse...
4,[0001] This application claims the benefit...,A method of performing a spinning operation of...


In [8]:
from sklearn.model_selection import train_test_split

# Two-stage random split of the merged frame into
# ~70% train / ~20% test / ~10% validation.
HOLDOUT_FRACTION = 0.3               # share of all rows held out of training
VALIDATION_SHARE_OF_HOLDOUT = 1 / 3  # share of the held-out rows used for validation
SPLIT_SEED = 42                      # fixed seed so the split is reproducible

# Stage 1: keep 70% of the rows for training; the other 30% are held out.
train_df, remaining_df = train_test_split(
    df, test_size=HOLDOUT_FRACTION, random_state=SPLIT_SEED
)

# Stage 2: train_test_split returns (complement, `test_size` portion), so the
# FIRST result (2/3 of the hold-out, ~20% overall) becomes the test set and the
# SECOND result (1/3 of the hold-out, ~10% overall) becomes the validation set.
test_df, validation_df = train_test_split(
    remaining_df, test_size=VALIDATION_SHARE_OF_HOLDOUT, random_state=SPLIT_SEED
)

# Sanity check: the three splits together must account for every row of `df`.
assert len(train_df) + len(test_df) + len(validation_df) == len(df)


In [9]:
# Display the training frame; at this point it still carries the shuffled
# row labels from the merged DataFrame.
train_df

Unnamed: 0,description,abstract
3033,FIELD OF THE INVENTION \n [0001] T...,A head control device and its control method f...
8643,FIELD OF THE INVENTION \n The invention re...,An applicator device for applying a coating ag...
10938,This is a divisional application under 37 C.F....,A novel processing technique is reported to bo...
6282,BACKGROUND OF THE INVENTION \n [0001]...,A warp oriented woven sailcloth is provided in...
449,This application is a continuation in-part of ...,The movement of each of a plurality jacks or r...
...,...,...
11284,CROSS REFERENCE TO RELATED APPLICATION \n ...,The method for operating a fleece layer requir...
5191,This is a continuation of international applic...,The production of fibers is carried out by all...
5390,This is a continuation of application Ser. No....,Methods are disclosed for treating fibrous web...
860,FIELD \n [0001] The following discl...,The use of thermosetting binder systems in the...


In [10]:
# Display the test frame (row labels are still those of the merged DataFrame).
test_df

Unnamed: 0,description,abstract
11011,FIELD OF THE INVENTION \n The present inve...,Processes for preparing pulp from lignin-conta...
5885,BACKGROUND OF THE INVENTION \n 1. Technical ...,A durable erosion control blanket featuring a ...
6745,[0001] This is a Division of application Se...,A method for spinning a multifilament yarn fro...
5844,BACKGROUND OF THE INVENTION \n This invent...,A surgical repair device having a length to wi...
8291,CROSS REFERENCE TO RELATED APPLICATION \n ...,A transporting carriage for conveying a coiler...
...,...,...
415,BACKGROUND OF THE INVENTION \n 1. Field of...,A clothes washing machine includes a main wash...
633,BACKGROUND OF THE INVENTION \n 1. Field of...,A compressible cheese center for dyeing purpos...
3181,BACKGROUND OF THE INVENTION \n This invent...,A yarn feeder for a circular knitting machine ...
9736,BACKGROUND OF THE INVENTION \n The present i...,A method and an apparatus for stuffer box crim...


In [11]:
# Display the validation frame (row labels are still those of the merged DataFrame).
validation_df

Unnamed: 0,description,abstract
6840,This application is a U.S. National Phase unde...,A method for the production of a composite mat...
8046,BACKGROUND OF THE INVENTION \n 1. Field of...,A cam section block is provided with variously...
5999,FIELD OF THE INVENTION \n [0001] T...,A method for reducing a production of a slime ...
7775,BACKGROUND OF THE INVENTION \n The present...,"The device comprises drums, which are coaxial ..."
10004,BACKGROUND OF THE INVENTION \n 1. Field of...,A method of programming and controlling an aut...
...,...,...
4705,PRIOR APPLICATIONS \n [0001] This is ...,The feed system is for a continuous digester w...
1740,BACKGROUND OF THE INVENTION \n [0001] ...,Multiple groups of sensors are circumferential...
9302,BACKGROUND OF THE INVENTION \n 1. Field of...,The present invention relates a washing machin...
2874,FIELD OF THE INVENTION \n This invention r...,What is disclosed is an improvement in a braid...


In [12]:
# Replace the shuffled original row labels with a fresh 0..n-1 index on each
# split (drop=True discards the old labels instead of keeping them as a column).
# The result is rebound to the same name rather than using `inplace=True`, so
# no frame derived from `df` is mutated in place.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
validation_df = validation_df.reset_index(drop=True)


In [13]:
# Display the training frame again to confirm the index now runs from 0 upward.
train_df

Unnamed: 0,description,abstract
0,FIELD OF THE INVENTION \n [0001] T...,A head control device and its control method f...
1,FIELD OF THE INVENTION \n The invention re...,An applicator device for applying a coating ag...
2,This is a divisional application under 37 C.F....,A novel processing technique is reported to bo...
3,BACKGROUND OF THE INVENTION \n [0001]...,A warp oriented woven sailcloth is provided in...
4,This application is a continuation in-part of ...,The movement of each of a plurality jacks or r...
...,...,...
7900,CROSS REFERENCE TO RELATED APPLICATION \n ...,The method for operating a fleece layer requir...
7901,This is a continuation of international applic...,The production of fibers is carried out by all...
7902,This is a continuation of application Ser. No....,Methods are disclosed for treating fibrous web...
7903,FIELD \n [0001] The following discl...,The use of thermosetting binder systems in the...


In [14]:
# Turn each re-split pandas frame back into a `datasets.Dataset`,
# in the order train, test, validation.
train_ds, test_ds, validation_ds = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df, validation_df)
)


In [15]:
# Display the training Dataset to check its features and row count.
train_ds

Dataset({
    features: ['description', 'abstract'],
    num_rows: 7905
})

In [16]:
# Bundle the re-split datasets into a DatasetDict keyed by split name,
# keeping the key order train -> validation -> test.
split_names = ('train', 'validation', 'test')
split_datasets = (train_ds, validation_ds, test_ds)
new_dataset = DatasetDict(zip(split_names, split_datasets))

In [17]:
# Display the new DatasetDict to verify the row count of every split.
new_dataset

DatasetDict({
    train: Dataset({
        features: ['description', 'abstract'],
        num_rows: 7905
    })
    validation: Dataset({
        features: ['description', 'abstract'],
        num_rows: 1130
    })
    test: Dataset({
        features: ['description', 'abstract'],
        num_rows: 2259
    })
})

In [20]:
# Persist the re-split dataset to disk. The "(70-20-10)" suffix in the directory
# name corresponds to the train / test / validation proportions
# (7905 / 2259 / 1130 rows).
new_dataset.save_to_disk("../Data/Textile_Patents_(70-20-10)")

Saving the dataset (1/1 shards): 100%|██████████| 7905/7905 [00:00<00:00, 223015.74 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1130/1130 [00:00<00:00, 164453.97 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2259/2259 [00:00<00:00, 185684.69 examples/s]
