In [None]:
"""
Movie reviews are stored as individual text files (one file per review) in the reviews folder.

The folder structure looks like this:

reviews

    |__ positive
        |__pos_1.txt
        |__pos_2.txt
        |__pos_3.txt
    |__ negative
        |__neg_1.txt
        |__neg_2.txt
        |__neg_3.txt
   
You need to read these reviews using tf.data.Dataset and perform the following transformations:

(1) Read each text review and generate a label from its folder name. Your dataset should contain the review text and the label as a tuple.

(2) Filter out blank text reviews. Two files in this dataset are blank.

(3) Do all of the above transformations in a single line of code. Also shuffle all the reviews.
"""

In [1]:
import tensorflow as tf

2023-06-02 21:31:03.205004: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# List every file one level below each label folder; shuffle=False keeps the
# listing order deterministic so later cells are reproducible.
reviews_ds = tf.data.Dataset.list_files(file_pattern='reviews/*/*', shuffle=False)

In [3]:
# Show each matched path; as_numpy_iterator() yields the paths as bytes objects.
for file_path in reviews_ds.as_numpy_iterator():
    print(file_path)

b'reviews/negative/neg_1.txt'
b'reviews/negative/neg_2.txt'
b'reviews/negative/neg_3.txt'
b'reviews/positive/pos_1.txt'
b'reviews/positive/pos_2.txt'
b'reviews/positive/pos_3.txt'


2023-06-02 21:31:20.153364: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [6]
	 [[{{node Placeholder/_0}}]]


In [4]:
# Extract review text from these files. Extract label from folder name

import os
def extract_review_and_label(file_path):
    """Load one review file and derive its label from the parent folder.

    Args:
        file_path: Path of a review file, e.g. ``reviews/positive/pos_1.txt``.

    Returns:
        A ``(review_text, label)`` tuple: the file contents as returned by
        ``tf.io.read_file`` and the second-to-last path component (the
        folder name, such as ``positive`` or ``negative``).
    """
    review_text = tf.io.read_file(file_path)
    # Split on the OS path separator; index -2 is the folder holding the file.
    path_parts = tf.strings.split(file_path, os.path.sep)
    label = path_parts[-2]
    return review_text, label

In [5]:
# Turn each file path into a (review text, label) pair.
reviews_ds_1 = reviews_ds.map(extract_review_and_label)

# Preview the first 50 bytes of every review together with its label.
for review, label in reviews_ds_1.as_numpy_iterator():
    print("Review: ", review[:50])
    print("Label: ", label)

Review:  b"Basically there's a family where a little boy (Jak"
Label:  b'negative'
Review:  b'This show was an amazing, fresh & innovative idea '
Label:  b'negative'
Review:  b''
Label:  b'negative'
Review:  b'One of the other reviewers has mentioned that afte'
Label:  b'positive'
Review:  b'A wonderful little production. <br /><br />The fil'
Label:  b'positive'
Review:  b''
Label:  b'positive'


2023-06-02 21:31:59.830763: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [6]
	 [[{{node Placeholder/_0}}]]


In [6]:
# Filter blank reviews

# Keep only the pairs whose review text is not the empty string.
reviews_ds_2 = reviews_ds_1.filter(lambda text, _label: tf.math.not_equal(text, ""))

# Preview the surviving reviews (first 50 bytes) and their labels.
for review, label in reviews_ds_2:
    print("Review: ", review.numpy()[:50])
    print("Label: ", label.numpy())

Review:  b"Basically there's a family where a little boy (Jak"
Label:  b'negative'
Review:  b'This show was an amazing, fresh & innovative idea '
Label:  b'negative'
Review:  b'One of the other reviewers has mentioned that afte'
Label:  b'positive'
Review:  b'A wonderful little production. <br /><br />The fil'
Label:  b'positive'


2023-06-02 21:32:20.921041: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [6]
	 [[{{node Placeholder/_0}}]]


In [7]:
# Perform map, filter and shuffle in a single line of code

# shuffle() only randomises within its buffer, so a buffer smaller than the
# number of reviews can never emit the last review first. The number of listed
# files is an upper bound on the number of reviews that survive the filter,
# so using it as the buffer size shuffles all of them uniformly.
final_ds = reviews_ds.map(extract_review_and_label).filter(lambda review, label: review!="").shuffle(reviews_ds.cardinality())

# Preview the shuffled, non-blank reviews (first 50 bytes) and their labels.
for review, label in final_ds.as_numpy_iterator():
    print("Review:",review[:50])
    print("Label:",label)

Review: b'One of the other reviewers has mentioned that afte'
Label: b'positive'
Review: b"Basically there's a family where a little boy (Jak"
Label: b'negative'
Review: b'A wonderful little production. <br /><br />The fil'
Label: b'positive'
Review: b'This show was an amazing, fresh & innovative idea '
Label: b'negative'


2023-06-02 21:32:44.328202: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [6]
	 [[{{node Placeholder/_0}}]]
2023-06-02 21:32:44.328500: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [6]
	 [[{{node Placeholder/_0}}]]
