In [2]:
from torchdata import datapipes as dp
from torchtext import transforms as T
import spacy
from torchtext.vocab import build_vocab_from_iterator

In [39]:
file_path = "data/netflix.txt"
data_pipe = dp.iter.IterableWrapper([file_path])
data_pipe = dp.iter.FileOpener(data_pipe, mode='rb')

In [40]:
type(data_pipe)

torch.utils.data.datapipes.iter.fileopener.FileOpenerIterDataPipe

In [48]:
print(data_pipe.datapipe)

IterableWrapperIterDataPipe


In [49]:
data_pipe = data_pipe.parse_csv(skip_lines=0,
                                delimiter=',',
                                as_tuple=True)

In [50]:
for ind, x in enumerate(data_pipe):
    print(x)
    if ind == 3:
        break

('show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added', 'release_year', 'rating', 'duration', 'listed_in', 'description')
('s1', 'Movie', 'Dick Johnson Is Dead', 'Kirsten Johnson', '', 'United States', 'September 25, 2021', '2020', 'PG-13', '90 min', 'Documentaries', 'As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.')
('s2', 'TV Show', 'Blood & Water', '', 'Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng', 'South Africa', 'September 24, 2021', '2021', 'TV-MA', '2 Seasons', 'International TV Shows, TV Dramas, TV Mysteries', 'After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school

In [None]:
netList = list(data_pipe)
netList[3:5]

In [54]:
# only keep type and description
def keepDescription(row):
    "keep type, duration, description as list"
    return [row[1], row[-3], row[-1]]

In [55]:
data_pipe = data_pipe.map(keepDescription)
trimList = list(data_pipe)
trimList[:3]

[['duration', 'type', 'description'],
 ['90 min',
  'Movie',
  'As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.'],
 ['2 Seasons',
  'TV Show',
  'After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth.']]

In [58]:
eng_tokeniser = spacy.load("en_core_web_md")

def tokenisedDesc(desc):
    "Tokenise the description"
    return [token.text for token in eng_tokeniser(desc)]

In [None]:
tokenisedDesc('As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.')

In [60]:
def createTokens(data_source):
    for row in data_source:
        yield tokenisedDesc(row[-1])

In [61]:
desc_vocab = build_vocab_from_iterator(
    createTokens(data_pipe),
    min_freq=2,
    specials=['<pad>','<sos>','<eos>','<unk>'],
    special_first=True
)
desc_vocab.set_default_index(desc_vocab['<unk>'])

In [63]:
# itos is a list
desc_vocab.get_itos()[:10]

['<pad>', '<sos>', '<eos>', '<unk>', 'a', '.', ',', 'the', 'to', 'and']

In [65]:
# stoi is a dictionary
desc_vocab.get_stoi()['wipes']

11002

In [None]:
T.VocabTransform(vocab=desc_vocab)(['wipes', '<sos>'])

In [70]:
text_transform = T.Sequential(
    T.VocabTransform(desc_vocab),
    T.AddToken(1, begin=True),
    T.AddToken(2, begin=False)
)

In [77]:
trimList[2][2]

'After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth.'

In [80]:
transformed_desc = text_transform(tokenisedDesc(trimList[2][2]))
transformed_desc

[1,
 42,
 3495,
 474,
 37,
 4,
 388,
 6,
 4,
 5118,
 3431,
 99,
 176,
 50,
 8,
 447,
 1255,
 4,
 875,
 12,
 66,
 4976,
 154,
 23,
 14,
 229,
 28,
 270,
 1738,
 37,
 891,
 5,
 2]

In [81]:
for x in transformed_desc:
    print(desc_vocab.get_itos()[x], end=' ')

<sos> After crossing paths at a party , a Cape Town teen sets out to prove whether a private - school swimming star is her sister who was abducted at birth . <eos> 

In [None]:
def applyTransform(row):
    return [row, text_transform(tokenisedDesc(row[2]))]

dpipe = data_pipe.map(applyTransform)
tokenedlist = list(dpipe)
tokenedlist[1]

In [100]:
def onlyTokens(row):
    return text_transform(tokenisedDesc(row[2]))

In [None]:
onlytoken_pipe = data_pipe.map(onlyTokens)
onlytoken_list = list(onlytoken_pipe)
onlytoken_list[1]

In [106]:
def sortBucket(buc):
    "Sort a bucket based length of target sequence"
    return sorted(buc, key = lambda x: len(x), reverse=True)

In [107]:
sorted_pipe = onlytoken_pipe.bucketbatch(
    batch_size=4, batch_num=5, bucket_num=1,
    use_in_batch_shuffle=False, sort_key=sortBucket
)

In [108]:
for dat in sorted_pipe:
    print(dat)
    break

[[1, 3371, 47, 131, 72, 522, 17, 2358, 5, 1192, 36, 1560, 19, 7, 2574, 4948, 6, 1486, 7, 6106, 49, 10, 616, 23, 443, 85, 634, 5, 2], [1, 21, 39, 56, 4078, 1444, 30, 7563, 32, 18, 577, 426, 4736, 20, 70, 6, 9, 24, 40, 461, 8, 3487, 816, 10, 7, 355, 913, 5, 2], [1, 156, 189, 150, 30, 3, 194, 67, 6, 1226, 2489, 261, 1366, 7597, 10, 599, 9, 6869, 11, 90, 516, 10, 47, 6, 641, 9, 5743, 5, 2], [1, 3, 26, 13, 414, 16, 1642, 6, 4, 1596, 12, 997, 2205, 68, 167, 303, 8, 1183, 19, 7, 375, 16, 538, 9, 181, 1348, 5, 2]]


In [110]:
def applyPadding(seq):
    return T.ToTensor(0)(list(seq[-1]))

padded_pipe = sorted_pipe.map(applyPadding)

padded_list = list(padded_pipe)
padded_list[1]

tensor([    1,  1092,     7,   174,     9,  9704,    10,  4189,   357,  3133,
            6,    22,  1011,  5219,  7060,     8,     7, 10953,  1680,    10,
         8280,     5,     2])

In [113]:
dits = desc_vocab.get_itos() 

def showSentence(data_pep):
    for it, x in enumerate(data_pep):
        desc = ""
        for ind in x:
            desc += " " + dits[ind]
        print(desc)
        if it == 3:
            break

In [114]:
showSentence(padded_pipe)

 <sos> Join the StoryBots and the space travelers of the historic Inspiration4 mission as they search for answers to kids ' questions about space . <eos>
 <sos> <unk> documents reveal the post - WWII life of Otto <unk> , a close Hitler ally who escaped to Spain and became an adviser to world presidents . <eos>
 <sos> The adventures of adolescent ninja Naruto <unk> continue as he 's tasked with protecting a <unk> from a demon – but to do so , he must die . <eos>
 <sos> When a good deed unwittingly endangers his clan , a <unk> - century Turkish warrior agrees to fight a <unk> 's enemies in exchange for new tribal land . <eos>
