<a href="https://colab.research.google.com/github/Moon-Wrecker/Deep_Learning_Practices/blob/main/DLP_Week_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from datasets import load_dataset

In [6]:
imdb_dataset = load_dataset("stanfordnlp/imdb")
print(imdb_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [8]:
imdb_train_split = imdb_dataset['train']
print(imdb_train_split)

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})


In [9]:
imdb_dataset.pop('unsupervised')
print(imdb_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})


In [13]:
train_split = load_dataset("stanfordnlp/imdb", split="train")
print(train_split)

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})


In [15]:
small_ds = train_split.train_test_split(test_size=0.2)
print(small_ds)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
})


### ACCESSING SAMPLES

In [18]:
import pprint
idx = 1000
example = imdb_dataset['train'][idx]
pprint.pprint(example)

{'label': 0,
 'text': 'Although I have to admit I laughed more watching this movie than the '
         'last few comedies I saw.<br /><br />The budget must have consisted '
         'of pocket change from the actors. The production values are so low '
         'that they actual made it kind of fun to watch. Reminds me of the '
         'Robot Monster made up of a guy in a gorilla suit with a cardboard '
         'diving helmet on.<br /><br />In one scene a hapless victim gets '
         'their arm and leg cut off. Geez, hard to believe but the Black '
         'Knight scene from Holy Grail was more realistic. I kept wondering '
         'why the victim didn\'t start shouting " None Shall Pass" and " It\'s '
         'only a flesh wound, I\'ve had worse". It was one of the funniest '
         'scenes I\'ve seen in the past year.<br /><br />The "gladiator/demon" '
         'was a stitch too. Between the horribly cheap costume and the geeky '
         'look of the guy in it the end result

In [19]:
example = imdb_dataset['train'].select([idx])
print(example)

Dataset({
    features: ['text', 'label'],
    num_rows: 1
})


In [21]:
idx = range(0, 100, 2)
examples = imdb_dataset['train'].select(idx)
print(examples)

Dataset({
    features: ['text', 'label'],
    num_rows: 50
})


### Translation Datsets

https://huggingface.co/datasets/wmt/wtm14

In [23]:
from datasets import get_dataset_config_names, get_dataset_split_names
print(get_dataset_config_names("wmt/wmt14"))
print(get_dataset_split_names("wmt/wmt14", "hi-en"))

README.md: 0.00B [00:00, ?B/s]

['cs-en', 'de-en', 'fr-en', 'hi-en', 'ru-en']
['train', 'validation', 'test']


In [24]:
translation_dataset = load_dataset(path="wmt/wmt14", name='hi-en')
print(translation_dataset)

hi-en/train-00000-of-00001.parquet:   0%|          | 0.00/992k [00:00<?, ?B/s]

hi-en/validation-00000-of-00001.parquet:   0%|          | 0.00/85.8k [00:00<?, ?B/s]

hi-en/test-00000-of-00001.parquet:   0%|          | 0.00/506k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/32863 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/520 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2507 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 32863
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})


In [29]:
raw_dataset = load_dataset(path="wmt/wmt/14", name="hi-en", split="train+test+validation")

FileNotFoundError: Couldn't find any data file at /content/wmt/wmt/14.

### Features

In [34]:
pprint.pprint(translation_dataset['train'].features)

{'translation': Translation(languages=['hi', 'en'])}


In [37]:
mrpc_dataset = load_dataset('glue', 'mrpc', split='train')
pprint.pprint(mrpc_dataset.features)

{'idx': Value('int32'),
 'label': ClassLabel(names=['not_equivalent', 'equivalent']),
 'sentence1': Value('string'),
 'sentence2': Value('string')}


### Commom Methods

In [39]:
print('Before Filtering')
print(20*'-')
print(imdb_dataset)

Before Filtering
--------------------
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})


In [42]:
num_words = 100
imdb_filtered_dataset = imdb_dataset.filter(lambda example:len(example['text'].split(' '))>=num_words)
print('After Filtering')
print(20*'-')
print(imdb_filtered_dataset)

After Filtering
--------------------
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 22074
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 21909
    })
})


In [45]:
def add_prefix(example):
  example['text'] = "IMDB"+example["text"]
  return example

In [46]:
imdb_prefixed_dataset = imdb_dataset.map(add_prefix)
print(imdb_prefixed_dataset)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})


In [50]:
rt_dataset_whole = load_dataset('cornell-movie-review-data/rotten_tomatoes', split="all")
print(rt_dataset_whole)
print(rt_dataset_whole.features)

README.md: 0.00B [00:00, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 10662
})
{'text': Value('string'), 'label': ClassLabel(names=['neg', 'pos'])}


In [51]:
import datasets
concat_daatset = datasets.concatenate_datasets([imdb_dataset_whole, rt_dataset_whole], axis=0)

NameError: name 'imdb_dataset_whole' is not defined

### Interleaving Datasets

In [52]:
from datasets import interleave_datasets
inter_datasets = interleave_datasets([imdb_daatset_whole, rt_dataset_whole],
                                     probabilities=[0.6, 90.4])

print(inter_datasets)

NameError: name 'imdb_daatset_whole' is not defined

### Iterable Dataset

In [53]:
imdb_iter_dataset = load_dataset('imdb', split='train', streaming=True)
print(imdb_iter_dataset)

README.md: 0.00B [00:00, ?B/s]

IterableDataset({
    features: ['text', 'label'],
    num_shards: 1
})


In [55]:
for x in imdb_iter_dataset:
  print(x)
  break

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

QUESTIONS TO PONDER
Can we load a dataset directly from external links?
Can we load a dataset from zipped file directly?