#Alternative with tf.datasets

In [2]:
!pip install tensorflow-datasets > /dev/null

In [3]:
import tensorflow_datasets as tfds

In [4]:
(ds_train,ds_test),ds_info = tfds.load(
    name="imdb_reviews",
    split=["train","test"],
    shuffle_files=True,
    as_supervised=True,
    with_info=True
)

Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete4LS8ZJ/imdb_reviews-train.tfrecord*...…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete4LS8ZJ/imdb_reviews-test.tfrecord*...:…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete4LS8ZJ/imdb_reviews-unsupervised.tfrec…

Dataset imdb_reviews downloaded and prepared to ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [5]:
ds_info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset.
    This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_path='~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0',
    file_format=tfrecord,
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    supervised_keys=('text', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=25000, num_shards=1>,
        '

In [6]:
import pandas as pd
import numpy as np
import string
import unicodedata, sys
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Creating dataframe for training dataset

In [7]:
df_train = pd.DataFrame()        #Creating an empty dataframe
for i in ds_train.as_numpy_iterator():
     row = {'Review':i[0],'Review_label':i[1]}
     df_train = df_train.append(row, ignore_index=True)      #Storing reviews with labels in data frame row by row

In [8]:
df_train.shape

(25000, 2)

In [9]:
df_train.head()

Unnamed: 0,Review,Review_label
0,"b""This was an absolutely terrible movie. Don't...",0.0
1,b'I have been known to fall asleep during film...,0.0
2,b'Mann photographs the Alberta Rocky Mountains...,0.0
3,b'This is the kind of film for a snowy Sunday ...,1.0
4,"b'As others have mentioned, all the women that...",1.0


In [10]:
#To check for missing values
df_train.isnull().sum()

Review          0
Review_label    0
dtype: int64

In [11]:
#To check for count for review labels
df_train['Review_label'].value_counts()

0.0    12500
1.0    12500
Name: Review_label, dtype: int64

In [12]:
#Lets have a look at few reviews!
df_train['Review'][0]

b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."

In [13]:
df_train['Review'][1]

b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot development was constant. Constantly slow and boring. Things seemed to happen, but with no explanation of what was causing them or why. I admit, I may have missed part of the film, but i watched the majority of it and everything just seemed to happen of its own accord without any real concern for anything else. I cant recommend this film at all.'

In [14]:
df_train['Review'][2]

b'Mann photographs the Alberta Rocky Mountains in a superb fashion, and Jimmy Stewart and Walter Brennan give enjoyable performances as they always seem to do. <br /><br />But come on Hollywood - a Mountie telling the people of Dawson City, Yukon to elect themselves a marshal (yes a marshal!) and to enforce the law themselves, then gunfighters battling it out on the streets for control of the town? <br /><br />Nothing even remotely resembling that happened on the Canadian side of the border during the Klondike gold rush. Mr. Mann and company appear to have mistaken Dawson City for Deadwood, the Canadian North for the American Wild West.<br /><br />Canadian viewers be prepared for a Reefer Madness type of enjoyable howl with this ludicrous plot, or, to shake your head in disgust.'

In [15]:
df_train['Review'][3]

b'This is the kind of film for a snowy Sunday afternoon when the rest of the world can go ahead with its own business as you descend into a big arm-chair and mellow for a couple of hours. Wonderful performances from Cher and Nicolas Cage (as always) gently row the plot along. There are no rapids to cross, no dangerous waters, just a warm and witty paddle through New York life at its best. A family film in every sense and one that deserves the praise it received.'

#Data Preprocessing

Data Preprocessing involves following steps:

1. Remove byte string
2. Remove html tags
3. Remove URLs
4. Remove punctuation marks
5. Remove stop words

Training Data

In [16]:
df_train

Unnamed: 0,Review,Review_label
0,"b""This was an absolutely terrible movie. Don't...",0.0
1,b'I have been known to fall asleep during film...,0.0
2,b'Mann photographs the Alberta Rocky Mountains...,0.0
3,b'This is the kind of film for a snowy Sunday ...,1.0
4,"b'As others have mentioned, all the women that...",1.0
...,...,...
24995,"b'I have a severe problem with this show, seve...",0.0
24996,"b'The year is 1964. Ernesto ""Che"" Guevara, hav...",1.0
24997,b'Okay. So I just got back. Before I start my ...,0.0
24998,b'When I saw this trailer on TV I was surprise...,0.0


In [17]:
#To remove byte string
df_train['Review'] = df_train['Review'].apply(lambda x:x.decode("UTF-8"))

In [18]:
#To remove html tags 
df_train['Review'] = df_train['Review'].str.replace('<[^<]+?>','',regex = True)   

In [19]:
df_train['Review'][2]

'Mann photographs the Alberta Rocky Mountains in a superb fashion, and Jimmy Stewart and Walter Brennan give enjoyable performances as they always seem to do. But come on Hollywood - a Mountie telling the people of Dawson City, Yukon to elect themselves a marshal (yes a marshal!) and to enforce the law themselves, then gunfighters battling it out on the streets for control of the town? Nothing even remotely resembling that happened on the Canadian side of the border during the Klondike gold rush. Mr. Mann and company appear to have mistaken Dawson City for Deadwood, the Canadian North for the American Wild West.Canadian viewers be prepared for a Reefer Madness type of enjoyable howl with this ludicrous plot, or, to shake your head in disgust.'

In [20]:
#To remove URLs
df_train['Review'] = df_train['Review'].str.replace(r'https ? ://\s+|www\.\s+','', regex = True)

In [21]:
#To remove punctuation marks
punc = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P'))
df_train['Review'] = df_train['Review'].apply(lambda x:" ".join([string.translate(punc) for string in x.split()]))

In [22]:
df_train['Review'][2]

'Mann photographs the Alberta Rocky Mountains in a superb fashion and Jimmy Stewart and Walter Brennan give enjoyable performances as they always seem to do But come on Hollywood  a Mountie telling the people of Dawson City Yukon to elect themselves a marshal yes a marshal and to enforce the law themselves then gunfighters battling it out on the streets for control of the town Nothing even remotely resembling that happened on the Canadian side of the border during the Klondike gold rush Mr Mann and company appear to have mistaken Dawson City for Deadwood the Canadian North for the American Wild WestCanadian viewers be prepared for a Reefer Madness type of enjoyable howl with this ludicrous plot or to shake your head in disgust'

In [23]:
#Removing stopwords
stop_words = stopwords.words('english')   #Loading stopwords
df_train['Review'] = df_train['Review'].apply(lambda x:" ".join([string for string in x.split() if string not in stop_words]))

In [24]:
#Creating a dataframe with preprocessed data
df_train_clean = df_train

In [25]:
#Lets have a look at the processed data!
df_train_clean.head()

Unnamed: 0,Review,Review_label
0,This absolutely terrible movie Dont lured Chri...,0.0
1,I known fall asleep films usually due combinat...,0.0
2,Mann photographs Alberta Rocky Mountains super...,0.0
3,This kind film snowy Sunday afternoon rest wor...,1.0
4,As others mentioned women go nude film mostly ...,1.0


Test Data

In [26]:
df_test = pd.DataFrame()        #Creating an empty dataframe
for i in ds_test.as_numpy_iterator():
     row = {'Review':i[0],'Review_label':i[1]}
     df_test = df_test.append(row, ignore_index=True)

In [27]:
df_test.shape

(25000, 2)

In [28]:
df_test.head()

Unnamed: 0,Review,Review_label
0,"b""There are films that make careers. For Georg...",1.0
1,"b""A blackly comic tale of a down-trodden pries...",1.0
2,"b'Scary Movie 1-4, Epic Movie, Date Movie, Mee...",0.0
3,b'Poor Shirley MacLaine tries hard to lend som...,0.0
4,b'As a former Erasmus student I enjoyed this f...,1.0


In [29]:
#To check for missing values
df_test.isnull().sum()

Review          0
Review_label    0
dtype: int64

In [30]:
#To check for count for review labels
df_test['Review_label'].value_counts()

1.0    12500
0.0    12500
Name: Review_label, dtype: int64

In [31]:
#To remove byte string
df_test['Review'] = df_test['Review'].apply(lambda x:x.decode("UTF-8"))

In [32]:
#To remove html tags 
df_test['Review'] = df_test['Review'].str.replace('<[^<]+?>','',regex = True)      

In [33]:
#To remove URLs
df_test['Review'] = df_test['Review'].str.replace(r'https ? ://\s+|www\.\s+','', regex = True)

In [34]:
#To remove punctuation marks
df_test['Review'] = df_test['Review'].apply(lambda x:" ".join([string.translate(punc) for string in x.split()]))

In [35]:
#Removing stop words
df_test['Review'] = df_test['Review'].apply(lambda x:" ".join([string for string in x.split() if string not in stop_words]))

In [36]:
#Creating a test dataframe with preprocessed data
df_test_clean = df_test

In [37]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [38]:
df_train_clean.to_csv("/content/drive/MyDrive/IMDB_NLP/Train_reviews.csv")
df_test_clean.to_csv("/content/drive/MyDrive/IMDB_NLP/Test_reviews.csv")